فهرست منبع

Remove overlap between all blocks

赵小蒙 1 سال پیش
والد
کامیت
eebd976715

+ 2 - 3
magic_pdf/pdf_parse_by_ocr.py

@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
     adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
     remove_spans_by_bboxes_dict
-from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
 
 
 def parse_pdf_by_ocr(
@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
         spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
 
         '''bbox去除粘连'''
-        spans = remove_overlap_between_bbox(spans)
-
+        spans = remove_overlap_between_bbox_for_span(spans)
         '''
         对tpye=["interline_equation", "image", "table"]进行额外处理,
         如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0

+ 1 - 2
magic_pdf/pdf_parse_union_core.py

@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
     fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
-from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
 
 
@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     img_blocks = magic_model.get_imgs(page_id)
     table_blocks = magic_model.get_tables(page_id)
     discarded_blocks = magic_model.get_discarded(page_id)
-    text_blocks = remove_overlap_between_bbox(magic_model.get_text_blocks(page_id))
+    text_blocks = magic_model.get_text_blocks(page_id)
     title_blocks = magic_model.get_title_blocks(page_id)
     inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
 

+ 3 - 0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
     calculate_iou
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import BlockType
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
 
 
 def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
     '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
     all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    '''将剩余的bbox做分离处理,防止后面分layout时出错'''
+    all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
 
     '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
     for discarded in discarded_blocks:

+ 2 - 2
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
-from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
 
 
 # 将每一个line中的span从左到右排序
@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
         block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
 
         '''bbox去除粘连'''  # 去粘连会影响span的bbox,导致后续fill的时候出错
-        # block_spans = remove_overlap_between_bbox(block_spans)
+        # block_spans = remove_overlap_between_bbox_for_span(block_spans)
 
         block_dict['spans'] = block_spans
         block_with_spans.append(block_dict)

+ 59 - 5
magic_pdf/pre_proc/remove_bbox_overlap.py

@@ -1,7 +1,7 @@
 from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
 
 
-def _remove_overlap_between_bbox(spans):
+def _remove_overlap_between_bbox_for_span(spans):
     res = []
 
     keeps = [True] * len(spans)
@@ -17,7 +17,7 @@ def _remove_overlap_between_bbox(spans):
             continue
 
         for i in range(len(res)):
-            if  _is_in(v["bbox"], res[i]["bbox"]):
+            if _is_in(v["bbox"], res[i]["bbox"]):
                 continue
             if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
                 ix0, iy0, ix1, iy1 = res[i]["bbox"]
@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
                     else:
                         mid = (ix0 + x1) // 2
                         ix0 = max(mid + 0.25, ix0)
-                        x1 = min(mid -0.25, x1)
+                        x1 = min(mid - 0.25, x1)
                 else:
                     if y1 >= iy1:
                         mid = (y0 + iy1) // 2
@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
     return res
 
 
-def remove_overlap_between_bbox(spans):
-    return _remove_overlap_between_bbox(spans)
+def _remove_overlap_between_bbox_for_block(all_bboxes):  # Separate overlapping block bboxes; returns the kept entries.
+    res = []  # entries kept so far, with their first four values possibly adjusted
+    # Pass 1: flag every entry for which _is_in(entry, other) holds for some other entry (presumably "entry lies inside other" - confirm in boxbase).
+    keeps = [True] * len(all_bboxes)
+    for i in range(len(all_bboxes)):
+        for j in range(len(all_bboxes)):
+            if i == j:
+                continue
+            if _is_in(all_bboxes[i][:4], all_bboxes[j][:4]):  # only the first four items are treated as the bbox
+                keeps[i] = False
+    # Pass 2: compare each surviving entry against everything already kept and pull overlapping pairs apart.
+    for idx, v in enumerate(all_bboxes):
+        if not keeps[idx]:
+            continue
+        # NOTE(review): two identical bboxes may flag each other in pass 1 and both be dropped - depends on _is_in; verify.
+        for i in range(len(res)):
+            if _is_in(v[:4], res[i][:4]):  # leave this pair untouched when _is_in reports v against the kept box
+                continue
+            if _is_in_or_part_overlap(res[i][:4], v[:4]):
+                ix0, iy0, ix1, iy1 = res[i][:4]  # already-kept box
+                x0, y0, x1, y1 = v[:4]  # candidate box
+                # Extent of the intersection along each axis.
+                diff_x = min(x1, ix1) - max(x0, ix0)
+                diff_y = min(y1, iy1) - max(y0, iy0)
+                # Split along the axis with the smaller intersection extent.
+                if diff_y > diff_x:  # x-extent is smaller: move the facing vertical edges
+                    if x1 >= ix1:  # candidate reaches further right: kept box gives up its right edge, candidate its left
+                        mid = (x0 + ix1) // 2  # floor division
+                        ix1 = min(mid - 0.25, ix1)  # min/max clamps ensure a box only shrinks
+                        x0 = max(mid + 0.25, x0)
+                    else:  # otherwise: kept box gives up its left edge, candidate its right
+                        mid = (ix0 + x1) // 2
+                        ix0 = max(mid + 0.25, ix0)
+                        x1 = min(mid - 0.25, x1)
+                else:  # y-extent is smaller or equal: move the facing horizontal edges
+                    if y1 >= iy1:  # candidate reaches further down (larger y1): candidate gives up its top, kept box its bottom
+                        mid = (y0 + iy1) // 2
+                        y0 = max(mid + 0.25, y0)
+                        iy1 = min(iy1, mid-0.25)
+                    else:  # otherwise: candidate gives up its bottom, kept box its top
+                        mid = (iy0 + y1) // 2
+                        y1 = min(y1, mid-0.25)
+                        iy0 = max(mid + 0.25, iy0)
+                res[i][:4] = [ix0, iy0, ix1, iy1]  # slice assignment: mutates the caller's entry in place
+                v[:4] = [x0, y0, x1, y1]  # entries must therefore support slice assignment (e.g. lists)
+        # v is appended after being adjusted against every previously kept entry.
+        res.append(v)
+    return res
+
+
+def remove_overlap_between_bbox_for_span(spans):  # Public wrapper: delegates unchanged to the private span implementation.
+    return _remove_overlap_between_bbox_for_span(spans)
+
+
+def remove_overlap_between_bbox_for_block(all_bboxes):  # Public wrapper: delegates unchanged to the private block implementation.
+    return _remove_overlap_between_bbox_for_block(all_bboxes)