5 месяцев назад · e4304a8c0e
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
@@ -272,7 +272,7 @@ class EnhancedDocPipeline:
 
															             self,
														
 
															             image_dict: Dict[str, Any],
														
 
															             page_idx: int,
														
 
															-            pdf_type: str,
														
 
															+            pdf_type: str, # 'ocr' 或 'txt'
														
 
															             pdf_doc: Optional[Any] = None,
														
 
															             page_name: Optional[str] = None,
														
 
															             output_dir: Optional[str] = None
														
@@ -362,25 +362,35 @@ class EnhancedDocPipeline:
 
															         all_text_spans = []
														
 
															         all_ocr_spans = []
														
 
															         should_run_ocr = True
														
 
															+        actual_page_type = None
														
 
															         if pdf_type == 'txt' and pdf_doc is not None:
														
 
															             # 文字 PDF：直接从 PDF 提取文本块
														
 
															             try:
														
 
															-                pdf_text_blocks, rotation_angle = PDFUtils.extract_all_text_blocks(
														
 
															-                    pdf_doc, page_idx, scale=scale
														
 
															-                )
														
 
															-                # 保存rotation角度
														
 
															-                page_result['angle'] = rotation_angle
														
 
															-                if rotation_angle != 0:
														
 
															-                    logger.info(f"📐 Page {page_idx}: PDF rotation {rotation_angle}°")
														
 
															+                # 页级别检测：该页是否真的有文字
														
 
															+                actual_page_type = PDFUtils.detect_page_type(pdf_doc, page_idx)
														
 
															+                
														
 
															+                if actual_page_type == 'txt':
														
 
															+                    # 正常提取文字
														
 
															+                    all_text_spans, rotation = PDFUtils.extract_all_text_blocks(
														
 
															+                        pdf_doc, page_idx, scale=scale
														
 
															+                    )
														
 
															+                    # 保存rotation角度
														
 
															+                    page_result['angle'] = rotation
														
 
															+                    if rotation != 0:
														
 
															+                        logger.info(f"📐 Page {page_idx}: PDF rotation {rotation}°")
														
 
															-                # 将 PDF 文本块转换为 span 格式
														
 
															-                all_text_spans = self._convert_pdf_blocks_to_spans(
														
 
															-                    pdf_text_blocks, detection_image.shape
														
 
															-                )
														
 
															-                logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
														
 
															-                if self.debug_mode:
														
 
															-                    # 调试模式：同时运行 OCR 对比
														
 
															+                    # 将 PDF 文本块转换为 span 格式
														
 
															+                    all_text_spans = self._convert_pdf_blocks_to_spans(
														
 
															+                        all_text_spans, detection_image.shape
														
 
															+                    )
														
 
															+                    logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
														
 
															+                    if self.debug_mode:
														
 
															+                        # 调试模式：同时运行 OCR 对比
														
 
															+                        should_run_ocr = True
														
 
															+                else:
														
 
															+                    # 该页是扫描页，使用OCR
														
 
															+                    logger.info(f"Page {page_idx}: detected as scanned page, using OCR")
														
 
															                     should_run_ocr = True
														
 
															             except Exception as e:
														
 
															                 logger.warning(f"⚠️ PDF text extraction failed, fallback to OCR: {e}")
														
@@ -405,7 +415,10 @@ class EnhancedDocPipeline:
 
															                 self._compare_ocr_and_pdf_text(
														
 
															                     page_idx, pdf_doc, all_ocr_spans, detection_image, output_dir, page_name, scale
														
 
															                 )
														
 
															-        
														
 
															+        # 根据实际情况决定使用 OCR 结果还是 PDF 提取结果
														
 
															+        if actual_page_type and actual_page_type == 'ocr':
														
 
															+            pdf_type = 'ocr'
														
 
															+            
														
 
															         if pdf_type == 'ocr':
														
 
															             all_text_spans = all_ocr_spans
														
@@ -683,7 +696,7 @@ class EnhancedDocPipeline:
 
															         self,
														
 
															         detection_image: np.ndarray,
														
 
															         classified_elements: Dict[str, List[Dict[str, Any]]],
														
 
															-        pdf_type: str,
														
 
															+        pdf_type: str,  # 'ocr' 或 'txt'
														
 
															         pdf_doc: Optional[Any],
														
 
															         page_idx: int,
														
 
															         scale: float,
														
@@ -778,7 +791,7 @@ class EnhancedDocPipeline:
 
															                     # 有线表格路径：UNet 识别
														
 
															                     logger.info(f"🔷 Using wired UNet table recognition (configured)")
														
 
															                     element = self.element_processors.process_table_element_wired(
														
 
															-                        detection_image, item, scale, pre_matched_spans=spans,
														
 
															+                        detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
														
 
															                         output_dir=output_dir, basename=f"{basename}_{idx}"
														
 
															                     )