Просмотр исходного кода

feat: 更新 process_text_element 方法,改进预匹配 spans 的处理逻辑,支持 OCR 和 PDF 文本提取来源

zhch158_admin 4 дней назад
Родитель
Коммит
85f5dfa1f4
1 изменённый файл: 14 добавлено и 9 удалено
  1. 14 9
      ocr_tools/universal_doc_parser/core/element_processors.py

+ 14 - 9
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -116,7 +116,7 @@ class ElementProcessors:
             pdf_doc: PDF文档对象
             page_idx: 页码索引
             scale: 缩放比例
-            pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
+            pre_matched_spans: 预匹配的 OCR/PDF-TXT spans
             
         Returns:
             处理后的元素字典
@@ -129,22 +129,27 @@ class ElementProcessors:
         
         # 优先级1:使用预匹配的 spans(整页 OCR 结果)
         if pre_matched_spans and len(pre_matched_spans) > 0:
+            # 检查 spans 来源
+            span_source = pre_matched_spans[0].get('source', 'ocr')
+            
             text_content, sorted_spans = SpanMatcher.merge_spans_to_text(
                 pre_matched_spans, bbox
             )
+            
             if text_content.strip():
-                # spans 的坐标已经是绝对坐标,直接使用
                 ocr_details = sorted_spans
-                extraction_method = "fullpage_ocr"
-                logger.debug(f"📝 Text from full-page OCR: '{text_content[:30]}...'")
-        
-        # 优先级2:数字 PDF 字符提取
-        if not text_content.strip() and pdf_type == 'txt' and pdf_doc is not None:
+                extraction_method = f"fullpage_{span_source}"  # 'fullpage_pdf' 或 'fullpage_ocr'
+                logger.debug(f"📝 Text from {span_source}: '{text_content[:30]}...'")        
+
+        # 优先级2:PDF 字符提取(如果没有预匹配的 spans)
+        # 注意: 如果 pdf_type='txt' 且没有 pre_matched_spans,说明 pipeline 跳过了整页识别,必须走这里
+        elif pdf_type == 'txt' and pdf_doc is not None:
             try:
-                text_content, extraction_success = PDFUtils.extract_text_from_pdf(
+                extraction_text, extraction_success = PDFUtils.extract_text_from_pdf(
                     pdf_doc, page_idx, bbox, scale
                 )
-                if extraction_success and text_content.strip():
+                if extraction_success and extraction_text.strip():
+                    text_content = extraction_text
                     extraction_method = "pdf_extract"
                     logger.debug(f"📝 Text extracted from PDF: '{text_content[:30]}...'")
             except Exception as e: