فهرست منبع

block type 字段名修复
增加remove_overlaps_min_blocks逻辑

赵小蒙 1 سال پیش
والد
کامیت
45ce99bf87

+ 1 - 0
magic_pdf/libs/drop_tag.py

@@ -16,3 +16,4 @@ class DropTag:
     FOOTNOTE = "footnote"
     NOT_IN_LAYOUT = "not_in_layout"
     SPAN_OVERLAP = "span_overlap"
+    BLOCK_OVERLAP = "block_overlap"

+ 0 - 4
magic_pdf/pdf_parse_by_ocr_v2.py

@@ -70,10 +70,6 @@ def parse_pdf_by_ocr(pdf_bytes,
         '''根据layout顺序,对当前页面所有需要留下的block进行排序'''
         sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
 
-        '''block嵌套问题解决'''
-        #@todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
-
-
         '''获取所有需要拼接的span资源'''
         spans = magic_model.get_all_spans(page_id)
         '''删除重叠spans中较小的那些'''

+ 25 - 0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -1,3 +1,5 @@
+from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
+from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import BlockType
 
 
@@ -31,5 +33,28 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
         if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
             all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
 
+    '''block嵌套问题解决'''
+    # @todo 1. text block大框套小框,删除小框 2. 图片或文本框与舍弃框重叠,优先信任舍弃框 3. 文本框与标题框重叠,优先信任文本框
+    all_bboxes, dropped_blocks = remove_overlaps_min_blocks(all_bboxes)
+
     return all_bboxes
 
+
+def remove_overlaps_min_blocks(all_bboxes):
+    """Remove, for each pair of heavily overlapping blocks, the box reported by the helper.
+
+    Every ordered pair of distinct blocks is passed to
+    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. Judging by its
+    name, the helper presumably returns the smaller of the two boxes when
+    they overlap by at least that ratio, and ``None`` otherwise -- TODO
+    confirm against ``magic_pdf.libs.boxbase``.
+
+    Args:
+        all_bboxes: list of block records; the first four items of each
+            record are the box coordinates ``x0, y0, x1, y1``. The list is
+            modified in place.
+
+    Returns:
+        A ``(all_bboxes, dropped_blocks)`` tuple. ``dropped_blocks`` holds one
+        dict per removed block with the keys ``bbox``, ``type`` and ``tag``
+        (``tag`` is always ``DropTag.BLOCK_OVERLAP``).
+    """
+    dropped_blocks = []
+    # Iterate over snapshots so that removing from all_bboxes does not
+    # disturb the two loops.
+    for block1 in all_bboxes.copy():
+        for block2 in all_bboxes.copy():
+            if block1 == block2:
+                continue
+            block1_box = block1[0], block1[1], block1[2], block1[3]
+            block2_box = block2[0], block2[1], block2[2], block2[3]
+            overlap_box = get_minbox_if_overlap_by_ratio(block1_box, block2_box, 0.8)
+            if overlap_box is None:
+                continue
+            # Normalise both sides to lists: a list never compares equal to a
+            # tuple, so comparing raw values could silently match nothing.
+            overlap_coords = list(overlap_box)
+            bbox_to_remove = next(
+                (block for block in all_bboxes if list(block[0:4]) == overlap_coords),
+                None)
+            if bbox_to_remove is None:
+                # Already removed while handling an earlier pair.
+                continue
+            all_bboxes.remove(bbox_to_remove)
+            # Block records are plain lists, so the drop tag cannot be stored
+            # on them with a string key; describe the dropped block in a dict.
+            dropped_blocks.append({
+                'bbox': list(bbox_to_remove[0:4]),
+                'type': bbox_to_remove[7] if len(bbox_to_remove) > 7 else None,
+                'tag': DropTag.BLOCK_OVERLAP,
+            })
+    return all_bboxes, dropped_blocks

+ 4 - 4
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -150,7 +150,7 @@ def fill_spans_in_blocks(blocks, spans):
         block_type = block[7]
         block_bbox = block[0:4]
         block_dict = {
-            'block_type': block_type,
+            'type': block_type,
             'bbox': block_bbox,
         }
         block_spans = []
@@ -190,7 +190,7 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
     '''
     fix_blocks = []
     for block in block_with_spans:
-        block_type = block['block_type']
+        block_type = block['type']
 
         if block_type == BlockType.Image:
             block = fix_image_block(block, img_blocks)
@@ -215,7 +215,7 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
     sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
     block = {
         'bbox': block_bbox,
-        'block_type': block_type,
+        'type': block_type,
         'lines': sort_block_lines
     }
     return block, block_spans
@@ -229,7 +229,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
     }
     body_block = {
         'bbox': block_bbox,
-        'block_type': block_type,
+        'type': block_type,
         'lines': [body_line]
     }
     return body_block

+ 3 - 3
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -222,10 +222,10 @@ def get_qa_need_list_v2(blocks):
     interline_equations = []
 
     for block in blocks:
-        if block["block_type"] == BlockType.Image:
+        if block["type"] == BlockType.Image:
             images.append(block)
-        elif block["block_type"] == BlockType.Table:
+        elif block["type"] == BlockType.Table:
             tables.append(block)
-        elif block["block_type"] == BlockType.InterlineEquation:
+        elif block["type"] == BlockType.InterlineEquation:
             interline_equations.append(block)
     return images, tables, interline_equations