пре 5 месеци · e4304a8c0e
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
@@ -272,7 +272,7 @@ class EnhancedDocPipeline:
 
				             self,
			
 
				             image_dict: Dict[str, Any],
			
 
				             page_idx: int,
			
 
				-            pdf_type: str,
			
 
				+            pdf_type: str, # 'ocr' 或 'txt'
			
 
				             pdf_doc: Optional[Any] = None,
			
 
				             page_name: Optional[str] = None,
			
 
				             output_dir: Optional[str] = None
			
@@ -362,25 +362,35 @@ class EnhancedDocPipeline:
 
				         all_text_spans = []
			
 
				         all_ocr_spans = []
			
 
				         should_run_ocr = True
			
 
				+        actual_page_type = None
			
 
				         
			
 
				         if pdf_type == 'txt' and pdf_doc is not None:
			
 
				             # 文字 PDF：直接从 PDF 提取文本块
			
 
				             try:
			
 
				-                pdf_text_blocks, rotation_angle = PDFUtils.extract_all_text_blocks(
			
 
				-                    pdf_doc, page_idx, scale=scale
			
 
				-                )
			
 
				-                # 保存rotation角度
			
 
				-                page_result['angle'] = rotation_angle
			
 
				-                if rotation_angle != 0:
			
 
				-                    logger.info(f"📐 Page {page_idx}: PDF rotation {rotation_angle}°")
			
 
				+                # 页级别检测：该页是否真的有文字
			
 
				+                actual_page_type = PDFUtils.detect_page_type(pdf_doc, page_idx)
			
 
				+                
			
 
				+                if actual_page_type == 'txt':
			
 
				+                    # 正常提取文字
			
 
				+                    all_text_spans, rotation = PDFUtils.extract_all_text_blocks(
			
 
				+                        pdf_doc, page_idx, scale=scale
			
 
				+                    )
			
 
				+                    # 保存rotation角度
			
 
				+                    page_result['angle'] = rotation
			
 
				+                    if rotation != 0:
			
 
				+                        logger.info(f"📐 Page {page_idx}: PDF rotation {rotation}°")
			
 
				 
			
 
				-                # 将 PDF 文本块转换为 span 格式
			
 
				-                all_text_spans = self._convert_pdf_blocks_to_spans(
			
 
				-                    pdf_text_blocks, detection_image.shape
			
 
				-                )
			
 
				-                logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
			
 
				-                if self.debug_mode:
			
 
				-                    # 调试模式：同时运行 OCR 对比
			
 
				+                    # 将 PDF 文本块转换为 span 格式
			
 
				+                    all_text_spans = self._convert_pdf_blocks_to_spans(
			
 
				+                        all_text_spans, detection_image.shape
			
 
				+                    )
			
 
				+                    logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
			
 
				+                    if self.debug_mode:
			
 
				+                        # 调试模式：同时运行 OCR 对比
			
 
				+                        should_run_ocr = True
			
 
				+                else:
			
 
				+                    # 该页是扫描页，使用OCR
			
 
				+                    logger.info(f"Page {page_idx}: detected as scanned page, using OCR")
			
 
				                     should_run_ocr = True
			
 
				             except Exception as e:
			
 
				                 logger.warning(f"⚠️ PDF text extraction failed, fallback to OCR: {e}")
			
@@ -405,7 +415,10 @@ class EnhancedDocPipeline:
 
				                 self._compare_ocr_and_pdf_text(
			
 
				                     page_idx, pdf_doc, all_ocr_spans, detection_image, output_dir, page_name, scale
			
 
				                 )
			
 
				-        
			
 
				+        # 根据实际情况决定使用 OCR 结果还是 PDF 提取结果
			
 
				+        if actual_page_type and actual_page_type == 'ocr':
			
 
				+            pdf_type = 'ocr'
			
 
				+            
			
 
				         if pdf_type == 'ocr':
			
 
				             all_text_spans = all_ocr_spans
			
 
				             
			
@@ -683,7 +696,7 @@ class EnhancedDocPipeline:
 
				         self,
			
 
				         detection_image: np.ndarray,
			
 
				         classified_elements: Dict[str, List[Dict[str, Any]]],
			
 
				-        pdf_type: str,
			
 
				+        pdf_type: str,  # 'ocr' 或 'txt'
			
 
				         pdf_doc: Optional[Any],
			
 
				         page_idx: int,
			
 
				         scale: float,
			
@@ -778,7 +791,7 @@ class EnhancedDocPipeline:
 
				                     # 有线表格路径：UNet 识别
			
 
				                     logger.info(f"🔷 Using wired UNet table recognition (configured)")
			
 
				                     element = self.element_processors.process_table_element_wired(
			
 
				-                        detection_image, item, scale, pre_matched_spans=spans,
			
 
				+                        detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
			
 
				                         output_dir=output_dir, basename=f"{basename}_{idx}"
			
 
				                     )