Jelajahi Sumber

fix footnote overlap error

赵小蒙 1 tahun lalu
induk
melakukan
deb98fd0b1

+ 2 - 2
magic_pdf/pdf_parse_union_core.py

@@ -29,10 +29,10 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
         useful_blocks.append({
             "bbox": bbox[:4]
         })
-    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
+    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks)
     if is_useful_block_horz_overlap:
         logger.warning(
-            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
+            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
         for bbox in all_bboxes.copy():
             if smaller_bbox == bbox[:4]:
                 all_bboxes.remove(bbox)

+ 6 - 4
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -34,10 +34,6 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
     '''任何框体与舍弃框重叠,优先信任舍弃框'''
     all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
-    '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
-    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
-    '''将剩余的bbox做分离处理,防止后面分layout时出错'''
-    all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
 
     '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
     for discarded in discarded_blocks:
@@ -47,6 +43,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
         if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
             all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
 
+    '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
+    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
+    '''将剩余的bbox做分离处理,防止后面分layout时出错'''
+    all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
+
     return all_bboxes, all_discarded_blocks
 
 

+ 3 - 3
magic_pdf/pre_proc/resolve_bbox_conflict.py

@@ -184,8 +184,8 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
             area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
             if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
                 if area_i > area_j:
-                    return True, useful_bboxes[j]
+                    return True, useful_bboxes[j], useful_bboxes[i]
                 else:
-                    return True, useful_bboxes[i]
+                    return True, useful_bboxes[i], useful_bboxes[j]
 
-    return False, None
+    return False, None, None