Selaa lähdekoodia

update discarded block and spans build logic

赵小蒙 1 vuosi sitten
vanhempi
commit
a817075b3c

+ 1 - 0
magic_pdf/libs/ocr_content_type.py

@@ -17,4 +17,5 @@ class BlockType:
     Title = "title"
     InterlineEquation = "interline_equation"
     Footnote = "footnote"
+    Discarded = "discarded"
 

+ 11 - 6
magic_pdf/pdf_parse_union_core.py

@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
     combine_chars_to_pymudict
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
-from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
+from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
+    fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
 from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
 
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
 
     '''将所有区块的bbox整理到一起'''
-    all_bboxes = ocr_prepare_bboxes_for_layout_split(
+    all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split(
         img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
         interline_equations, page_w, page_h)
 
+    '''先处理不需要排版的discarded_blocks'''
+    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
+    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
+
     '''如果当前页面没有bbox则跳过'''
     if len(all_bboxes) == 0:
-        logger.warning(f"skip this page, not found bbox, page_id: {page_id}")
+        logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
         return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
-                                               [], [], interline_equations, discarded_blocks,
+                                               [], [], interline_equations, fix_discarded_blocks,
                                                need_drop, drop_reason)
 
     """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
 
     '''将span填入排好序的blocks中'''
-    block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)
+    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6)
 
     '''对block进行fix操作'''
     fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
     '''构造pdf_info_dict'''
     page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                                images, tables, interline_equations, discarded_blocks,
+                                                images, tables, interline_equations, fix_discarded_blocks,
                                                 need_drop, drop_reason)
     return page_info
 

+ 4 - 1
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
 def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                         title_blocks, interline_equation_blocks, page_w, page_h):
     all_bboxes = []
+    all_discarded_blocks = []
     for image in img_blocks:
         x0, y0, x1, y1 = image['bbox']
         all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
     for discarded in discarded_blocks:
         x0, y0, x1, y1 = discarded['bbox']
+        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None])
+        # 将footnote加入到all_bboxes中,用来计算layout
         if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
             all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
 
-    return all_bboxes
+    return all_bboxes, all_discarded_blocks
 
 
 def fix_text_overlap_title_blocks(all_bboxes):

+ 11 - 3
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
     return sort_blocks
 
 
-def fill_spans_in_blocks(blocks, spans):
+def fill_spans_in_blocks(blocks, spans, radio):
     '''
     将allspans中的span按位置关系,放入blocks中
     '''
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
         block_spans = []
         for span in spans:
             span_bbox = span['bbox']
-            if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.6:
+            if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
                 block_spans.append(span)
 
         '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
             for span in block_spans:
                 spans.remove(span)
 
-    return block_with_spans
+    return block_with_spans, spans
 
 
 def fix_block_spans(block_with_spans, img_blocks, table_blocks):
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
     return fix_blocks
 
 
+def fix_discarded_block(discarded_block_with_spans):  # Run fix_text_block over each discarded block and return the results as a new list.
+    fix_discarded_blocks = []  # Collected results, kept in the same order as the input.
+    for block in discarded_block_with_spans:
+        block = fix_text_block(block)  # NOTE(review): fix_text_block is defined outside this hunk; presumably the same fixer applied to text blocks — confirm.
+        fix_discarded_blocks.append(block)
+    return fix_discarded_blocks  # One entry per input block; an empty input yields an empty list.
+
+
 def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
     block_spans = []
     # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中