|
@@ -272,7 +272,7 @@ class EnhancedDocPipeline:
|
|
|
self,
|
|
self,
|
|
|
image_dict: Dict[str, Any],
|
|
image_dict: Dict[str, Any],
|
|
|
page_idx: int,
|
|
page_idx: int,
|
|
|
- pdf_type: str,
|
|
|
|
|
|
|
+ pdf_type: str, # 'ocr' 或 'txt'
|
|
|
pdf_doc: Optional[Any] = None,
|
|
pdf_doc: Optional[Any] = None,
|
|
|
page_name: Optional[str] = None,
|
|
page_name: Optional[str] = None,
|
|
|
output_dir: Optional[str] = None
|
|
output_dir: Optional[str] = None
|
|
@@ -362,25 +362,35 @@ class EnhancedDocPipeline:
|
|
|
all_text_spans = []
|
|
all_text_spans = []
|
|
|
all_ocr_spans = []
|
|
all_ocr_spans = []
|
|
|
should_run_ocr = True
|
|
should_run_ocr = True
|
|
|
|
|
+ actual_page_type = None
|
|
|
|
|
|
|
|
if pdf_type == 'txt' and pdf_doc is not None:
|
|
if pdf_type == 'txt' and pdf_doc is not None:
|
|
|
# 文字 PDF:直接从 PDF 提取文本块
|
|
# 文字 PDF:直接从 PDF 提取文本块
|
|
|
try:
|
|
try:
|
|
|
- pdf_text_blocks, rotation_angle = PDFUtils.extract_all_text_blocks(
|
|
|
|
|
- pdf_doc, page_idx, scale=scale
|
|
|
|
|
- )
|
|
|
|
|
- # 保存rotation角度
|
|
|
|
|
- page_result['angle'] = rotation_angle
|
|
|
|
|
- if rotation_angle != 0:
|
|
|
|
|
- logger.info(f"📐 Page {page_idx}: PDF rotation {rotation_angle}°")
|
|
|
|
|
|
|
+ # 页级别检测:该页是否真的有文字
|
|
|
|
|
+ actual_page_type = PDFUtils.detect_page_type(pdf_doc, page_idx)
|
|
|
|
|
+
|
|
|
|
|
+ if actual_page_type == 'txt':
|
|
|
|
|
+ # 正常提取文字
|
|
|
|
|
+ all_text_spans, rotation = PDFUtils.extract_all_text_blocks(
|
|
|
|
|
+ pdf_doc, page_idx, scale=scale
|
|
|
|
|
+ )
|
|
|
|
|
+ # 保存rotation角度
|
|
|
|
|
+ page_result['angle'] = rotation
|
|
|
|
|
+ if rotation != 0:
|
|
|
|
|
+ logger.info(f"📐 Page {page_idx}: PDF rotation {rotation}°")
|
|
|
|
|
|
|
|
- # 将 PDF 文本块转换为 span 格式
|
|
|
|
|
- all_text_spans = self._convert_pdf_blocks_to_spans(
|
|
|
|
|
- pdf_text_blocks, detection_image.shape
|
|
|
|
|
- )
|
|
|
|
|
- logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
|
|
|
|
|
- if self.debug_mode:
|
|
|
|
|
- # 调试模式:同时运行 OCR 对比
|
|
|
|
|
|
|
+ # 将 PDF 文本块转换为 span 格式
|
|
|
|
|
+ all_text_spans = self._convert_pdf_blocks_to_spans(
|
|
|
|
|
+ all_text_spans, detection_image.shape
|
|
|
|
|
+ )
|
|
|
|
|
+ logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
|
|
|
|
|
+ if self.debug_mode:
|
|
|
|
|
+ # 调试模式:同时运行 OCR 对比
|
|
|
|
|
+ should_run_ocr = True
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 该页是扫描页,使用OCR
|
|
|
|
|
+ logger.info(f"Page {page_idx}: detected as scanned page, using OCR")
|
|
|
should_run_ocr = True
|
|
should_run_ocr = True
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.warning(f"⚠️ PDF text extraction failed, fallback to OCR: {e}")
|
|
logger.warning(f"⚠️ PDF text extraction failed, fallback to OCR: {e}")
|
|
@@ -405,7 +415,10 @@ class EnhancedDocPipeline:
|
|
|
self._compare_ocr_and_pdf_text(
|
|
self._compare_ocr_and_pdf_text(
|
|
|
page_idx, pdf_doc, all_ocr_spans, detection_image, output_dir, page_name, scale
|
|
page_idx, pdf_doc, all_ocr_spans, detection_image, output_dir, page_name, scale
|
|
|
)
|
|
)
|
|
|
-
|
|
|
|
|
|
|
+ # 根据实际情况决定使用 OCR 结果还是 PDF 提取结果
|
|
|
|
|
+ if actual_page_type and actual_page_type == 'ocr':
|
|
|
|
|
+ pdf_type = 'ocr'
|
|
|
|
|
+
|
|
|
if pdf_type == 'ocr':
|
|
if pdf_type == 'ocr':
|
|
|
all_text_spans = all_ocr_spans
|
|
all_text_spans = all_ocr_spans
|
|
|
|
|
|
|
@@ -683,7 +696,7 @@ class EnhancedDocPipeline:
|
|
|
self,
|
|
self,
|
|
|
detection_image: np.ndarray,
|
|
detection_image: np.ndarray,
|
|
|
classified_elements: Dict[str, List[Dict[str, Any]]],
|
|
classified_elements: Dict[str, List[Dict[str, Any]]],
|
|
|
- pdf_type: str,
|
|
|
|
|
|
|
+ pdf_type: str, # 'ocr' 或 'txt'
|
|
|
pdf_doc: Optional[Any],
|
|
pdf_doc: Optional[Any],
|
|
|
page_idx: int,
|
|
page_idx: int,
|
|
|
scale: float,
|
|
scale: float,
|
|
@@ -778,7 +791,7 @@ class EnhancedDocPipeline:
|
|
|
# 有线表格路径:UNet 识别
|
|
# 有线表格路径:UNet 识别
|
|
|
logger.info(f"🔷 Using wired UNet table recognition (configured)")
|
|
logger.info(f"🔷 Using wired UNet table recognition (configured)")
|
|
|
element = self.element_processors.process_table_element_wired(
|
|
element = self.element_processors.process_table_element_wired(
|
|
|
- detection_image, item, scale, pre_matched_spans=spans,
|
|
|
|
|
|
|
+ detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
|
|
|
output_dir=output_dir, basename=f"{basename}_{idx}"
|
|
output_dir=output_dir, basename=f"{basename}_{idx}"
|
|
|
)
|
|
)
|
|
|
|
|
|