|
|
@@ -116,7 +116,7 @@ class ElementProcessors:
|
|
|
pdf_doc: PDF文档对象
|
|
|
page_idx: 页码索引
|
|
|
scale: 缩放比例
|
|
|
- pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
|
|
|
+ pre_matched_spans: 预匹配的 OCR/PDF-TXT spans
|
|
|
|
|
|
Returns:
|
|
|
处理后的元素字典
|
|
|
@@ -129,22 +129,27 @@ class ElementProcessors:
|
|
|
|
|
|
# 优先级1:使用预匹配的 spans(整页 OCR 结果)
|
|
|
if pre_matched_spans and len(pre_matched_spans) > 0:
|
|
|
+ # 检查 spans 来源
|
|
|
+ span_source = pre_matched_spans[0].get('source', 'ocr')
|
|
|
+
|
|
|
text_content, sorted_spans = SpanMatcher.merge_spans_to_text(
|
|
|
pre_matched_spans, bbox
|
|
|
)
|
|
|
+
|
|
|
if text_content.strip():
|
|
|
- # spans 的坐标已经是绝对坐标,直接使用
|
|
|
ocr_details = sorted_spans
|
|
|
- extraction_method = "fullpage_ocr"
|
|
|
- logger.debug(f"📝 Text from full-page OCR: '{text_content[:30]}...'")
|
|
|
-
|
|
|
- # 优先级2:数字 PDF 字符提取
|
|
|
- if not text_content.strip() and pdf_type == 'txt' and pdf_doc is not None:
|
|
|
+ extraction_method = f"fullpage_{span_source}" # 'fullpage_pdf' 或 'fullpage_ocr'
|
|
|
+ logger.debug(f"📝 Text from {span_source}: '{text_content[:30]}...'")
|
|
|
+
|
|
|
+ # 优先级2:PDF 字符提取(如果没有预匹配的 spans)
|
|
|
+ # 注意: 如果 pdf_type='txt' 且没有 pre_matched_spans,说明 pipeline 跳过了整页识别,必须走这里
|
|
|
+ elif pdf_type == 'txt' and pdf_doc is not None:
|
|
|
try:
|
|
|
- text_content, extraction_success = PDFUtils.extract_text_from_pdf(
|
|
|
+ extraction_text, extraction_success = PDFUtils.extract_text_from_pdf(
|
|
|
pdf_doc, page_idx, bbox, scale
|
|
|
)
|
|
|
- if extraction_success and text_content.strip():
|
|
|
+ if extraction_success and extraction_text.strip():
|
|
|
+ text_content = extraction_text
|
|
|
extraction_method = "pdf_extract"
|
|
|
logger.debug(f"📝 Text extracted from PDF: '{text_content[:30]}...'")
|
|
|
except Exception as e:
|