瀏覽代碼

fix(detect_all_bboxes): remove small overlapping blocks by merging (#501)

Previously, small blocks that overlapped with larger ones were simply removed. This fix
merges each smaller block into the larger one instead, so that no information is lost
and the larger block fully encompasses all of the text content.
Xiaomeng Zhao 1 年之前
父節點
當前提交
9067cd31ca
共有 1 個文件被更改,包括 12 次插入和 3 次刪除
  1. 12 3
      magic_pdf/pre_proc/ocr_detect_all_bboxes.py

+ 12 - 3
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -133,6 +133,7 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
 
 
 def remove_overlaps_min_blocks(all_bboxes):
+    #  重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
     #  删除重叠blocks中较小的那些
     need_remove = []
     for block1 in all_bboxes:
@@ -142,9 +143,17 @@ def remove_overlaps_min_blocks(all_bboxes):
                 block2_bbox = block2[:4]
                 overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
                 if overlap_box is not None:
-                    bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
-                    if bbox_to_remove is not None and bbox_to_remove not in need_remove:
-                        need_remove.append(bbox_to_remove)
+                    block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
+                    if block_to_remove is not None and block_to_remove not in need_remove:
+                        large_block = block1 if block1 != block_to_remove else block2
+                        x1, y1, x2, y2 = large_block[:4]
+                        sx1, sy1, sx2, sy2 = block_to_remove[:4]
+                        x1 = min(x1, sx1)
+                        y1 = min(y1, sy1)
+                        x2 = max(x2, sx2)
+                        y2 = max(y2, sy2)
+                        large_block[:4] = [x1, y1, x2, y2]
+                        need_remove.append(block_to_remove)
 
     if len(need_remove) > 0:
         for block in need_remove: