Browse Source

feat(pipeline_manager): 添加 normalize_numbers 参数以支持表格内容的金额标准化处理

zhch158_admin 2 tuần trước
mục cha
commit
38f373384f
1 tập tin đã thay đổi với 13 bổ sung và 7 xóa
  1. 13 7
      ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

+ 13 - 7
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -546,6 +546,7 @@ class EnhancedDocPipeline:
         classified_elements = self._classify_elements(layout_results, page_idx)
         
         # 6. 处理各类元素(传入匹配的 spans)
+        normalize_numbers = self.config.get('output', {}).get('normalize_numbers', True)
         processed_elements, discarded_elements = self._process_all_elements(
             detection_image=detection_image,
             classified_elements=classified_elements,
@@ -556,7 +557,8 @@ class EnhancedDocPipeline:
             matched_spans=matched_spans,
             layout_results=layout_results,
             output_dir=output_dir,
-            basename=page_name
+            basename=page_name,
+            normalize_numbers=normalize_numbers,
         )
         
         # 7. 按阅读顺序排序
@@ -820,10 +822,11 @@ class EnhancedDocPipeline:
         layout_results: Optional[List[Dict[str, Any]]] = None,
         output_dir: Optional[str] = None,
         basename: Optional[str] = None,
+        normalize_numbers: bool = True,
     ) -> tuple:
         """
         处理所有分类后的元素
-        
+
         Args:
             detection_image: 检测用图像
             classified_elements: 分类后的元素
@@ -833,7 +836,8 @@ class EnhancedDocPipeline:
             scale: 缩放比例
             matched_spans: 匹配的 OCR spans {block_idx: [spans]}
             layout_results: 原始 layout 检测结果(用于索引匹配)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             (processed_elements, discarded_elements)
         """
@@ -939,20 +943,22 @@ class EnhancedDocPipeline:
                     logger.info(f"🔷 Table {idx}: Using wired UNet recognition")
                     element = self.element_processors.process_table_element_wired(
                         detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
-                        output_dir=output_dir, basename=f"{basename}_{idx}"
+                        output_dir=output_dir, basename=f"{basename}_{idx}",
+                        normalize_numbers=normalize_numbers,
                     )
-                    
                     # 如果有线识别失败(返回空 HTML),fallback 到 VLM
                     if not element['content'].get('html') and not element['content'].get('cells'):
                         logger.warning(f"⚠️ Wired recognition failed for table {idx}, fallback to VLM")
                         element = self.element_processors.process_table_element_vlm(
-                            detection_image, item, scale, pre_matched_spans=spans
+                            detection_image, item, scale, pre_matched_spans=spans,
+                            normalize_numbers=normalize_numbers,
                         )
                 else:
                     # VLM 无线表格路径(默认)
                     logger.info(f"🔷 Table {idx}: Using VLM recognition")
                     element = self.element_processors.process_table_element_vlm(
-                        detection_image, item, scale, pre_matched_spans=spans
+                        detection_image, item, scale, pre_matched_spans=spans,
+                        normalize_numbers=normalize_numbers,
                     )
                 
                 processed_elements.append(element)