Browse Source

add new drop scene

赵小蒙 1 năm trước
mục cha
commit
2f13b3a87c

+ 1 - 0
magic_pdf/libs/drop_reason.py

@@ -23,4 +23,5 @@ class DropReason:
     PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
     CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
     NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
+    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
     

+ 5 - 2
magic_pdf/pdf_parse_union_core.py

@@ -130,13 +130,16 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
     '''将所有区块的bbox整理到一起'''
     if len(interline_equation_blocks) > 0:
-        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
+        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
             interline_equation_blocks, page_w, page_h)
     else:
-        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
+        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
             interline_equations, page_w, page_h)
+    if len(drop_reasons) > 0:
+        need_drop = True
+        drop_reason = DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION
 
     '''先处理不需要排版的discarded_blocks'''
     discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)

+ 1 - 1
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -51,7 +51,7 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     '''将剩余的bbox做分离处理,防止后面分layout时出错'''
     all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
 
-    return all_bboxes, all_discarded_blocks
+    return all_bboxes, all_discarded_blocks, drop_reasons
 
 
 def fix_text_overlap_title_blocks(all_bboxes):