1 year ago · 442f36845f
--- a/magic_pdf/libs/drop_reason.py
+++ b/magic_pdf/libs/drop_reason.py
@@ -1,6 +1,7 @@
 
				 
			
 
				 class DropReason:
			
 
				     TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖，导致无法准确定位文字顺序
			
 
				+    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
			
 
				     COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局，暂时不支持
			
 
				     TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
			
 
				     COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF，色块会改变阅读顺序，目前不支持带底色文字块的PDF。
			
--- a/magic_pdf/pdf_parse_by_ocr_v2.py
+++ b/magic_pdf/pdf_parse_by_ocr_v2.py
@@ -18,6 +18,22 @@ from magic_pdf.para.para_split_v2 import para_split
 
				 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
			
 
				 
			
 
				 
			
 
				+def remove_horizontal_overlap_block_which_smaller(all_bboxes):
				+    """Remove the smaller bbox of a horizontally overlapping pair, if any.
				+
				+    Only the first four items of every entry (the bbox coordinates) are
				+    handed to ``check_useful_block_horizontal_overlap``. When it reports an
				+    overlap, every entry whose coordinates equal the bbox it returned is
				+    removed from ``all_bboxes`` in place.
				+
				+    Args:
				+        all_bboxes: mutable list of entries whose first four items are the
				+            bbox coordinates. The list is modified in place.
				+
				+    Returns:
				+        A tuple ``(is_useful_block_horz_overlap, all_bboxes)``: the overlap
				+        flag reported by the checker and the same list object that was
				+        passed in.
				+    """
				+    useful_blocks = [{"bbox": bbox[:4]} for bbox in all_bboxes]
				+    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
				+    if is_useful_block_horz_overlap:
				+        # Log the same reason the caller records as drop_reason (and that the
				+        # txt pipeline's twin helper logs), not the text-block reason.
				+        logger.warning(
				+            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
				+        # Filter in place so callers holding a reference see the change.
				+        # NOTE(review): if no entry matches smaller_bbox the list is left
				+        # unchanged and a caller that loops until no overlap remains would
				+        # never terminate - confirm the checker returns bboxes verbatim.
				+        all_bboxes[:] = [bbox for bbox in all_bboxes if bbox[:4] != smaller_bbox]
				+
				+    return is_useful_block_horz_overlap, all_bboxes
			
 
				+
			
 
				 def parse_pdf_by_ocr(pdf_bytes,
			
 
				                      model_list,
			
 
				                      imageWriter,
			
@@ -25,6 +41,9 @@ def parse_pdf_by_ocr(pdf_bytes,
 
				                      end_page_id=None,
			
 
				                      debug_mode=False,
			
 
				                      ):
			
 
				+    need_drop = False
			
 
				+    drop_reason = ""
			
 
				+
			
 
				     pdf_bytes_md5 = compute_md5(pdf_bytes)
			
 
				     pdf_docs = fitz.open("pdf", pdf_bytes)
			
 
				 
			
@@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
 
				             interline_equations, page_w, page_h)
			
 
				 
			
 
				         """在切分之前，先检查一下bbox是否有左右重叠的情况，如果有，那么就认为这个pdf暂时没有能力处理好，这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
			
 
				-        useful_blocks = []
			
 
				-        for bbox in all_bboxes:
			
 
				-            useful_blocks.append({
			
 
				-                "bbox": bbox[:4]
			
 
				-            })
			
 
				-        is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
			
 
				-        if is_useful_block_horz_overlap:
			
 
				-            logger.warning(
			
 
				-                f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
			
 
				-            continue
			
 
				+
			
 
				+        while True:  # 循环检查左右重叠的情况，如果存在就删除掉较小的那个bbox，直到不存在左右重叠的情况
			
 
				+            is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
			
 
				+            if is_useful_block_horz_overlap:
			
 
				+                need_drop = True
			
 
				+                drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
			
 
				+            else:
			
 
				+                break
			
 
				 
			
 
				         '''根据区块信息计算layout'''
			
 
				         page_boundry = [0, 0, page_w, page_h]
			
@@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
 
				         if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
			
 
				 
			
 
				         """以下去掉复杂的布局和超过2列的布局"""
			
 
				         if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # 复杂的布局
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.COMPLICATED_LAYOUT
			
 
				 
			
 
				         layout_column_width = get_columns_cnt_of_layout(layout_tree)
			
 
				         if layout_column_width > 2:  # 去掉超过2列的布局pdf
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
			
 
				+
			
 
				 
			
 
				         '''根据layout顺序，对当前页面所有需要留下的block进行排序'''
			
 
				         sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
			
@@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,
 
				 
			
 
				         '''构造pdf_info_dict'''
			
 
				         page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
			
 
				-                                                    images, tables, interline_equations, discarded_blocks)
			
 
				+                                                    images, tables, interline_equations, discarded_blocks,
			
 
				+                                                    need_drop, drop_reason)
			
 
				         pdf_info_dict[f"page_{page_id}"] = page_info
			
 
				 
			
 
				     """分段"""
			
--- a/magic_pdf/pdf_parse_by_txt_v2.py
+++ b/magic_pdf/pdf_parse_by_txt_v2.py
@@ -32,6 +32,22 @@ from magic_pdf.libs.math import float_equal
 
				 from magic_pdf.para.para_split_v2 import para_split
			
 
				 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
			
 
				 
			
 
				+def remove_horizontal_overlap_block_which_smaller(all_bboxes):
				+    """Remove the smaller bbox of a horizontally overlapping pair, if any.
				+
				+    The first four items of each entry are wrapped as ``{"bbox": ...}`` and
				+    passed to ``check_useful_block_horizontal_overlap``. If an overlap is
				+    reported, a warning is logged and all entries whose first four items
				+    equal the returned bbox are removed from ``all_bboxes`` in place.
				+
				+    Returns:
				+        A tuple of the overlap flag reported by the checker and the same
				+        ``all_bboxes`` list object that was passed in.
				+    """
				+    candidates = [{"bbox": entry[:4]} for entry in all_bboxes]
				+    has_overlap, smaller_bbox = check_useful_block_horizontal_overlap(candidates)
				+    if not has_overlap:
				+        # Nothing to remove; hand the list back untouched.
				+        return has_overlap, all_bboxes
				+
				+    logger.warning(
				+        f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
				+    # Slice assignment keeps the caller's list object and drops every entry
				+    # whose coordinates match the smaller bbox.
				+    all_bboxes[:] = [entry for entry in all_bboxes if entry[:4] != smaller_bbox]
				+    return has_overlap, all_bboxes
			
 
				+
			
 
				 
			
 
				 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
			
 
				     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
			
@@ -91,6 +107,9 @@ def parse_pdf_by_txt(
 
				     end_page_id=None,
			
 
				     debug_mode=False,
			
 
				 ):
			
 
				+    need_drop = False
			
 
				+    drop_reason = ""
			
 
				+
			
 
				     pdf_bytes_md5 = compute_md5(pdf_bytes)
			
 
				     pdf_docs = fitz.open("pdf", pdf_bytes)
			
 
				 
			
@@ -141,16 +160,14 @@ def parse_pdf_by_txt(
 
				         )
			
 
				 
			
 
				         """在切分之前，先检查一下bbox是否有左右重叠的情况，如果有，那么就认为这个pdf暂时没有能力处理好，这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
			
 
				-        useful_blocks = []
			
 
				-        for bbox in all_bboxes:
			
 
				-            useful_blocks.append({
			
 
				-                "bbox": bbox[:4]
			
 
				-            })
			
 
				-        is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
			
 
				-        if is_useful_block_horz_overlap:
			
 
				-            logger.warning(
			
 
				-                f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
			
 
				-            continue
			
 
				+
			
 
				+        while True:  # 循环检查左右重叠的情况，如果存在就删除掉较小的那个bbox，直到不存在左右重叠的情况
			
 
				+            is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
			
 
				+            if is_useful_block_horz_overlap:
			
 
				+                need_drop = True
			
 
				+                drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
			
 
				+            else:
			
 
				+                break
			
 
				 
			
 
				         '''根据区块信息计算layout'''
			
 
				         page_boundry = [0, 0, page_w, page_h]
			
@@ -159,19 +176,22 @@ def parse_pdf_by_txt(
 
				         if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT
			
 
				 
			
 
				         """以下去掉复杂的布局和超过2列的布局"""
			
 
				         if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]):  # 复杂的布局
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.COMPLICATED_LAYOUT
			
 
				 
			
 
				         layout_column_width = get_columns_cnt_of_layout(layout_tree)
			
 
				         if layout_column_width > 2:  # 去掉超过2列的布局pdf
			
 
				             logger.warning(
			
 
				                 f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
			
 
				-            continue
			
 
				+            need_drop = True
			
 
				+            drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS
			
 
				 
			
 
				         """根据layout顺序，对当前页面所有需要留下的block进行排序"""
			
 
				         sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
			
@@ -211,6 +231,8 @@ def parse_pdf_by_txt(
 
				             tables,
			
 
				             interline_equations,
			
 
				             discarded_blocks,
			
 
				+            need_drop,
			
 
				+            drop_reason
			
 
				         )
			
 
				         pdf_info_dict[f"page_{page_id}"] = page_info
			
 
				 
			
--- a/magic_pdf/pre_proc/construct_page_dict.py
+++ b/magic_pdf/pre_proc/construct_page_dict.py
@@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
 
				 
			
 
				 
			
 
				 def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
			
 
				-                                    images, tables, interline_equations, discarded_blocks):
			
 
				+                                    images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
			
 
				     return_dict = {
			
 
				         'preproc_blocks': blocks,
			
 
				         'layout_bboxes': layout_bboxes,
			
@@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
 
				         'tables': tables,
			
 
				         'interline_equations': interline_equations,
			
 
				         'discarded_blocks': discarded_blocks,
			
 
				+        'need_drop': need_drop,
			
 
				+        'drop_reason': drop_reason,
			
 
				     }
			
 
				     return return_dict
			
--- a/magic_pdf/pre_proc/resolve_bbox_conflict.py
+++ b/magic_pdf/pre_proc/resolve_bbox_conflict.py
@@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
 
				 
			
 
				     for i in range(len(useful_bboxes)):
			
 
				         for j in range(i + 1, len(useful_bboxes)):
			
 
				+            area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
			
 
				+            area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
			
 
				             if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
			
 
				-                return True
			
 
				+                if area_i > area_j:
			
 
				+                    return True, useful_bboxes[j]
			
 
				+                else:
			
 
				+                    return True, useful_bboxes[i]
			
 
				 
			
 
				-    return False
			
 
				+    return False, None