Преглед изворног кода

feat: 增强 PDF 文本提取逻辑,添加页级别类型检测,优化 OCR 使用条件

zhch158_admin пре 5 дана
родитељ
комит
e4304a8c0e
1 измењен фајл: 31 додато и 18 уклоњено
  1. 31 18
      ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

+ 31 - 18
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -272,7 +272,7 @@ class EnhancedDocPipeline:
             self,
             image_dict: Dict[str, Any],
             page_idx: int,
-            pdf_type: str,
+            pdf_type: str, # 'ocr' 或 'txt'
             pdf_doc: Optional[Any] = None,
             page_name: Optional[str] = None,
             output_dir: Optional[str] = None
@@ -362,25 +362,35 @@ class EnhancedDocPipeline:
         all_text_spans = []
         all_ocr_spans = []
         should_run_ocr = True
+        actual_page_type = None
         
         if pdf_type == 'txt' and pdf_doc is not None:
             # 文字 PDF:直接从 PDF 提取文本块
             try:
-                pdf_text_blocks, rotation_angle = PDFUtils.extract_all_text_blocks(
-                    pdf_doc, page_idx, scale=scale
-                )
-                # 保存rotation角度
-                page_result['angle'] = rotation_angle
-                if rotation_angle != 0:
-                    logger.info(f"📐 Page {page_idx}: PDF rotation {rotation_angle}°")
+                # 页级别检测:该页是否真的有文字
+                actual_page_type = PDFUtils.detect_page_type(pdf_doc, page_idx)
+                
+                if actual_page_type == 'txt':
+                    # 正常提取文字
+                    all_text_spans, rotation = PDFUtils.extract_all_text_blocks(
+                        pdf_doc, page_idx, scale=scale
+                    )
+                    # 保存rotation角度
+                    page_result['angle'] = rotation
+                    if rotation != 0:
+                        logger.info(f"📐 Page {page_idx}: PDF rotation {rotation}°")
 
-                # 将 PDF 文本块转换为 span 格式
-                all_text_spans = self._convert_pdf_blocks_to_spans(
-                    pdf_text_blocks, detection_image.shape
-                )
-                logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
-                if self.debug_mode:
-                    # 调试模式:同时运行 OCR 对比
+                    # 将 PDF 文本块转换为 span 格式
+                    all_text_spans = self._convert_pdf_blocks_to_spans(
+                        all_text_spans, detection_image.shape
+                    )
+                    logger.info(f"📝 Page {page_idx}: PDF extracted {len(all_text_spans)} text blocks")
+                    if self.debug_mode:
+                        # 调试模式:同时运行 OCR 对比
+                        should_run_ocr = True
+                else:
+                    # 该页是扫描页,使用OCR
+                    logger.info(f"Page {page_idx}: detected as scanned page, using OCR")
                     should_run_ocr = True
             except Exception as e:
                 logger.warning(f"⚠️ PDF text extraction failed, fallback to OCR: {e}")
@@ -405,7 +415,10 @@ class EnhancedDocPipeline:
                 self._compare_ocr_and_pdf_text(
                     page_idx, pdf_doc, all_ocr_spans, detection_image, output_dir, page_name, scale
                 )
-        
+        # 根据实际情况决定使用 OCR 结果还是 PDF 提取结果
+        if actual_page_type and actual_page_type == 'ocr':
+            pdf_type = 'ocr'
+            
         if pdf_type == 'ocr':
             all_text_spans = all_ocr_spans
             
@@ -683,7 +696,7 @@ class EnhancedDocPipeline:
         self,
         detection_image: np.ndarray,
         classified_elements: Dict[str, List[Dict[str, Any]]],
-        pdf_type: str,
+        pdf_type: str,  # 'ocr' 或 'txt'
         pdf_doc: Optional[Any],
         page_idx: int,
         scale: float,
@@ -778,7 +791,7 @@ class EnhancedDocPipeline:
                     # 有线表格路径:UNet 识别
                     logger.info(f"🔷 Using wired UNet table recognition (configured)")
                     element = self.element_processors.process_table_element_wired(
-                        detection_image, item, scale, pre_matched_spans=spans,
+                        detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
                         output_dir=output_dir, basename=f"{basename}_{idx}"
                     )