浏览代码

feat: Implement wired table processing with grid recovery and skew detection, and improve HuggingFace model caching.

zhch158_admin 3 天之前
父节点
当前提交
a4ad1d803a

+ 20 - 15
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -185,7 +185,8 @@ class EnhancedDocPipeline:
     def process_document(
         self, 
         document_path: str,
-        page_range: Optional[str] = None
+        page_range: Optional[str] = None,
+        output_dir: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         处理文档主流程
@@ -233,7 +234,8 @@ class EnhancedDocPipeline:
                     page_idx=page_idx,
                     pdf_type=pdf_type,
                     pdf_doc=pdf_doc,
-                    page_name=page_name
+                    page_name=page_name,
+                    output_dir=output_dir,
                 )
                 results['pages'].append(page_result)
             
@@ -252,13 +254,14 @@ class EnhancedDocPipeline:
             raise
     
     def _process_single_page(
-        self,
-        image_dict: Dict[str, Any],
-        page_idx: int,
-        pdf_type: str,
-        pdf_doc: Optional[Any] = None,
-        page_name: Optional[str] = None
-    ) -> Dict[str, Any]:
+            self,
+            image_dict: Dict[str, Any],
+            page_idx: int,
+            pdf_type: str,
+            pdf_doc: Optional[Any] = None,
+            page_name: Optional[str] = None,
+            output_dir: Optional[str] = None
+        ) -> Dict[str, Any]:
         """
         处理单页文档
         
@@ -351,9 +354,6 @@ class EnhancedDocPipeline:
             logger.info(f"📝 Page {page_idx}: OCR detected {len(all_ocr_spans)} text spans")
         except Exception as e:
             logger.warning(f"⚠️ Full-page OCR failed: {e}")
-
-        skew_angle = BBoxExtractor.calculate_skew_angle(all_ocr_spans)
-        logger.info(f"📊 Wired table skew angle: {skew_angle:.3f}°")
         
         # 4. 将 OCR spans 匹配到 layout blocks
         matched_spans = SpanMatcher.match_spans_to_blocks(
@@ -372,7 +372,9 @@ class EnhancedDocPipeline:
             page_idx=page_idx,
             scale=scale,
             matched_spans=matched_spans,
-            layout_results=layout_results
+            layout_results=layout_results,
+            output_dir=output_dir,
+            basename=page_name
         )
         
         # 7. 按阅读顺序排序
@@ -512,7 +514,9 @@ class EnhancedDocPipeline:
         page_idx: int,
         scale: float,
         matched_spans: Optional[Dict[int, List[Dict[str, Any]]]] = None,
-        layout_results: Optional[List[Dict[str, Any]]] = None
+        layout_results: Optional[List[Dict[str, Any]]] = None,
+        output_dir: Optional[str] = None,
+        basename: Optional[str] = None,
     ) -> tuple:
         """
         处理所有分类后的元素
@@ -600,7 +604,8 @@ class EnhancedDocPipeline:
                     # 有线表格路径:UNet 识别
                     logger.info(f"🔷 Using wired UNet table recognition (configured)")
                     element = self.element_processors.process_table_element_wired(
-                        detection_image, item, scale, pre_matched_spans=spans
+                        detection_image, item, scale, pre_matched_spans=spans,
+                        output_dir=output_dir, basename=basename
                     )
                     
                     # 如果有线识别失败(返回空 HTML),fallback 到 VLM

+ 2 - 1
ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py

@@ -157,7 +157,8 @@ class StreamingDocPipeline(EnhancedDocPipeline):
                     page_idx=page_idx,
                     pdf_type=pdf_type,
                     pdf_doc=pdf_doc,
-                    page_name=page_name
+                    page_name=page_name,
+                    output_dir=self.output_dir
                 )
                 
                 # 立即保存该页结果(使用 OutputFormatterV2 的方法,保持输出一致)