5 месяцев назад · 85f5dfa1f4
--- a/ocr_tools/universal_doc_parser/core/element_processors.py
+++ b/ocr_tools/universal_doc_parser/core/element_processors.py
@@ -116,7 +116,7 @@ class ElementProcessors:
 
				             pdf_doc: PDF文档对象
			
 
				             page_idx: 页码索引
			
 
				             scale: 缩放比例
			
 
				-            pre_matched_spans: 预匹配的 OCR spans（来自整页 OCR）
			
 
				+            pre_matched_spans: 预匹配的 OCR/PDF-TXT spans
			
 
				             
			
 
				         Returns:
			
 
				             处理后的元素字典
			
@@ -129,22 +129,27 @@ class ElementProcessors:
 
				         
			
 
				         # 优先级1：使用预匹配的 spans（整页 OCR 结果）
			
 
				         if pre_matched_spans and len(pre_matched_spans) > 0:
			
 
				+            # 检查 spans 来源
			
 
				+            span_source = pre_matched_spans[0].get('source', 'ocr')
			
 
				+            
			
 
				             text_content, sorted_spans = SpanMatcher.merge_spans_to_text(
			
 
				                 pre_matched_spans, bbox
			
 
				             )
			
 
				+            
			
 
				             if text_content.strip():
			
 
				-                # spans 的坐标已经是绝对坐标，直接使用
			
 
				                 ocr_details = sorted_spans
			
 
				-                extraction_method = "fullpage_ocr"
			
 
				-                logger.debug(f"📝 Text from full-page OCR: '{text_content[:30]}...'")
			
 
				-        
			
 
				-        # 优先级2：数字 PDF 字符提取
			
 
				-        if not text_content.strip() and pdf_type == 'txt' and pdf_doc is not None:
			
 
				+                extraction_method = f"fullpage_{span_source}"  # 'fullpage_pdf' 或 'fullpage_ocr'
			
 
				+                logger.debug(f"📝 Text from {span_source}: '{text_content[:30]}...'")        
			
 
				+
			
 
				+        # 优先级2：PDF 字符提取（如果没有预匹配的 spans）
			
 
				+        # 注意: 如果 pdf_type='txt' 且没有 pre_matched_spans，说明 pipeline 跳过了整页识别，必须走这里
			
 
				+        elif pdf_type == 'txt' and pdf_doc is not None:
			
 
				             try:
			
 
				-                text_content, extraction_success = PDFUtils.extract_text_from_pdf(
			
 
				+                extraction_text, extraction_success = PDFUtils.extract_text_from_pdf(
			
 
				                     pdf_doc, page_idx, bbox, scale
			
 
				                 )
			
 
				-                if extraction_success and text_content.strip():
			
 
				+                if extraction_success and extraction_text.strip():
			
 
				+                    text_content = extraction_text
			
 
				                     extraction_method = "pdf_extract"
			
 
				                     logger.debug(f"📝 Text extracted from PDF: '{text_content[:30]}...'")
			
 
				             except Exception as e: