Selaa lähdekoodia

Merge pull request #824 from myhloli/dev

fix(pdf_parse): optimize span processing by removing outside spans
Xiaomeng Zhao 1 vuosi sitten
vanhempi
commit
b2cd4aa029
1 muutettua tiedostoa jossa 56 lisäystä ja 21 poistoa
  1. 56 21
      magic_pdf/pdf_parse_union_core_v2.py

+ 56 - 21
magic_pdf/pdf_parse_union_core_v2.py

@@ -9,6 +9,7 @@ from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.dataset import Dataset, PageableData
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
@@ -381,6 +382,37 @@ def revert_group_blocks(blocks):
     return new_blocks
 
 
+def remove_outside_spans(spans, all_bboxes):
+    image_bboxes = []
+    table_bboxes = []
+    for block in all_bboxes:
+        block_type = block[7]
+        block_bbox = block[0:4]
+        if block_type == BlockType.ImageBody:
+            image_bboxes.append(block_bbox)
+        elif block_type == BlockType.TableBody:
+            table_bboxes.append(block_bbox)
+        else:
+            continue
+
+    new_spans = []
+    for span in spans:
+        if span['type'] == ContentType.Image:
+            for block_bbox in image_bboxes:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                    new_spans.append(span)
+                    break
+        elif span['type'] == ContentType.Table:
+            for block_bbox in table_bboxes:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                    new_spans.append(span)
+                    break
+        else:
+            new_spans.append(span)
+
+    return new_spans
+
+
 def parse_page_core(
     page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
 ):
@@ -411,27 +443,6 @@ def parse_page_core(
 
     page_w, page_h = magic_model.get_page_size(page_id)
 
-    spans = magic_model.get_all_spans(page_id)
-
-    """根据parse_mode,构造spans"""
-    if parse_mode == SupportedPdfParseMethod.TXT:
-        """ocr 中文本类的 span 用 pymu spans 替换!"""
-        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
-        spans = replace_text_span(pymu_spans, spans)
-    elif parse_mode == SupportedPdfParseMethod.OCR:
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
-
-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-    """对image和table截图"""
-    spans = ocr_cut_image_and_table(
-        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
-    )
-
     """将所有区块的bbox整理到一起"""
     # interline_equation_blocks参数不够准,后面切换到interline_equations上
     interline_equation_blocks = []
@@ -458,6 +469,30 @@ def parse_page_core(
             page_h,
         )
 
+    spans = magic_model.get_all_spans(page_id)
+
+    """根据parse_mode,构造spans"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
+        """ocr 中文本类的 span 用 pymu spans 替换!"""
+        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
+        spans = replace_text_span(pymu_spans, spans)
+    elif parse_mode == SupportedPdfParseMethod.OCR:
+        pass
+    else:
+        raise Exception('parse_mode must be txt or ocr')
+
+    """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
+    spans = remove_outside_spans(spans, all_bboxes)
+
+    """删除重叠spans中置信度较低的那些"""
+    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
+    """删除重叠spans中较小的那些"""
+    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+    """对image和table截图"""
+    spans = ocr_cut_image_and_table(
+        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
+    )
+
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4