Переглянути джерело

fix(pdf_parse): Fill text content into spans before discarded_block recognition, so that discarded blocks no longer end up with empty text.

myhloli 11 місяців тому
батько
коміт
0d3ef89fb9
1 змінених файлів з 21 додано та 20 видалено
  1. 21 20
      magic_pdf/pdf_parse_union_core_v2.py

+ 21 - 20
magic_pdf/pdf_parse_union_core_v2.py

@@ -682,6 +682,27 @@ def parse_page_core(
     """顺便删除大水印并保留abandon的span"""
     spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
 
+    """删除重叠spans中置信度较低的那些"""
+    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
+    """删除重叠spans中较小的那些"""
+    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+
+    """根据parse_mode,构造spans,主要是文本类的字符填充"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
+
+        """之前的公式替换方案"""
+        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
+        # spans = replace_text_span(pymu_spans, spans)
+
+        """使用新版本的混合ocr方案"""
+        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
+
+    elif parse_mode == SupportedPdfParseMethod.OCR:
+        pass
+    else:
+        raise Exception('parse_mode must be txt or ocr')
+
+
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -706,26 +727,6 @@ def parse_page_core(
             drop_reason,
         )
 
-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-
-    """根据parse_mode,构造spans,主要是文本类的字符填充"""
-    if parse_mode == SupportedPdfParseMethod.TXT:
-
-        """之前的公式替换方案"""
-        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
-        # spans = replace_text_span(pymu_spans, spans)
-
-        """ocr 中文本类的 span 用 pymu spans 替换!"""
-        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
-
-    elif parse_mode == SupportedPdfParseMethod.OCR:
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
-
     """对image和table截图"""
     spans = ocr_cut_image_and_table(
         spans, page_doc, page_id, pdf_bytes_md5, imageWriter