Forráskód Böngészése

fix complicated layout logic

赵小蒙 1 éve
szülő
commit
442f36845f

+ 1 - 0
magic_pdf/libs/drop_reason.py

@@ -1,6 +1,7 @@
 
 class DropReason:
     TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
+    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
     COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
     TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
     COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。

+ 36 - 14
magic_pdf/pdf_parse_by_ocr_v2.py

@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
 
 
+def remove_horizontal_overlap_block_which_smaller(all_bboxes):
+    """Drop the bbox that check_useful_block_horizontal_overlap reports as the smaller one.
+
+    Mutates ``all_bboxes`` in place and returns ``(overlap_found, all_bboxes)``;
+    only the first four items of each entry are compared as coordinates.
+    """
+    useful_blocks = [{"bbox": bbox[:4]} for bbox in all_bboxes]
+    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
+    if is_useful_block_horz_overlap:
+        # Log the same reason the caller stores as the page's drop_reason.
+        logger.warning(f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
+        # Slice-assign so the caller's list object itself loses the entry.
+        all_bboxes[:] = [bbox for bbox in all_bboxes if smaller_bbox != bbox[:4]]
+
+    return is_useful_block_horz_overlap, all_bboxes
+
 def parse_pdf_by_ocr(pdf_bytes,
                      model_list,
                      imageWriter,
@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes,
                      end_page_id=None,
                      debug_mode=False,
                      ):
+    need_drop = False
+    drop_reason = ""
+
     pdf_bytes_md5 = compute_md5(pdf_bytes)
     pdf_docs = fitz.open("pdf", pdf_bytes)
 
@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
             interline_equations, page_w, page_h)
 
         """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
-        useful_blocks = []
-        for bbox in all_bboxes:
-            useful_blocks.append({
-                "bbox": bbox[:4]
-            })
-        is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
-        if is_useful_block_horz_overlap:
-            logger.warning(
-                f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
-            continue
+
+        while True:  # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
+            is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
+            if is_useful_block_horz_overlap:
+                need_drop = True
+                drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
+            else:
+                break
 
         '''根据区块信息计算layout'''
         page_boundry = [0, 0, page_w, page_h]
@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
         if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
 
         """以下去掉复杂的布局和超过2列的布局"""
         if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # 复杂的布局
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.COMPLICATED_LAYOUT
 
         layout_column_width = get_columns_cnt_of_layout(layout_tree)
         if layout_column_width > 2:  # 去掉超过2列的布局pdf
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
+
 
         '''根据layout顺序,对当前页面所有需要留下的block进行排序'''
         sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,
 
         '''构造pdf_info_dict'''
         page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                                    images, tables, interline_equations, discarded_blocks)
+                                                    images, tables, interline_equations, discarded_blocks,
+                                                    need_drop, drop_reason)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """分段"""

+ 35 - 13
magic_pdf/pdf_parse_by_txt_v2.py

@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal
 from magic_pdf.para.para_split_v2 import para_split
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
 
+def remove_horizontal_overlap_block_which_smaller(all_bboxes):
+    """Remove the bbox reported as the smaller one of an overlapping pair, if any.
+
+    Edits ``all_bboxes`` in place; returns ``(overlap_found, all_bboxes)``.
+    """
+    coords_only = [{"bbox": entry[:4]} for entry in all_bboxes]
+    has_overlap, loser = check_useful_block_horizontal_overlap(coords_only)
+    if not has_overlap:
+        return has_overlap, all_bboxes
+    logger.warning(f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
+    # Walk a snapshot so removing from the live list cannot skip entries.
+    for entry in list(all_bboxes):
+        if loser == entry[:4]:
+            all_bboxes.remove(entry)
+    return has_overlap, all_bboxes
+
 
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
@@ -91,6 +107,9 @@ def parse_pdf_by_txt(
     end_page_id=None,
     debug_mode=False,
 ):
+    need_drop = False
+    drop_reason = ""
+
     pdf_bytes_md5 = compute_md5(pdf_bytes)
     pdf_docs = fitz.open("pdf", pdf_bytes)
 
@@ -141,16 +160,14 @@ def parse_pdf_by_txt(
         )
 
         """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
-        useful_blocks = []
-        for bbox in all_bboxes:
-            useful_blocks.append({
-                "bbox": bbox[:4]
-            })
-        is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
-        if is_useful_block_horz_overlap:
-            logger.warning(
-                f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
-            continue
+
+        while True:  # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
+            is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
+            if is_useful_block_horz_overlap:
+                need_drop = True
+                drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
+            else:
+                break
 
         '''根据区块信息计算layout'''
         page_boundry = [0, 0, page_w, page_h]
@@ -159,19 +176,22 @@ def parse_pdf_by_txt(
         if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
 
         """以下去掉复杂的布局和超过2列的布局"""
         if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # 复杂的布局
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.COMPLICATED_LAYOUT
 
         layout_column_width = get_columns_cnt_of_layout(layout_tree)
         if layout_column_width > 2:  # 去掉超过2列的布局pdf
             logger.warning(
                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
-            continue
+            need_drop = True
+            drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
 
         """根据layout顺序,对当前页面所有需要留下的block进行排序"""
         sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
@@ -211,6 +231,8 @@ def parse_pdf_by_txt(
             tables,
             interline_equations,
             discarded_blocks,
+            need_drop,
+            drop_reason
         )
         pdf_info_dict[f"page_{page_id}"] = page_info
 

+ 3 - 1
magic_pdf/pre_proc/construct_page_dict.py

@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
 
 
 def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                    images, tables, interline_equations, discarded_blocks):
+                                    images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
     return_dict = {
         'preproc_blocks': blocks,
         'layout_bboxes': layout_bboxes,
@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
         'tables': tables,
         'interline_equations': interline_equations,
         'discarded_blocks': discarded_blocks,
+        'need_drop': need_drop,
+        'drop_reason': drop_reason,
     }
     return return_dict

+ 7 - 2
magic_pdf/pre_proc/resolve_bbox_conflict.py

@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
 
     for i in range(len(useful_bboxes)):
         for j in range(i + 1, len(useful_bboxes)):
+            area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
+            area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
             if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
-                return True
+                if area_i > area_j:
+                    return True, useful_bboxes[j]
+                else:
+                    return True, useful_bboxes[i]
 
-    return False
+    return False, None