Selaa lähdekoodia

feat(element_processors): 添加金额标准化功能,支持表格内容的财务数字规范化处理

zhch158_admin 3 viikkoa sitten
vanhempi
commit
2b8ed01af2
1 muutettua tiedostoa jossa 78 lisäystä ja 12 poistoa
  1. 78 12
      ocr_tools/universal_doc_parser/core/element_processors.py

+ 78 - 12
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -15,6 +15,7 @@ from loguru import logger
 
 from ocr_utils.coordinate_utils import CoordinateUtils
 from ocr_utils import PDFUtils
+from ocr_utils import normalize_financial_numbers
 from .table_coordinate_utils import TableCoordinateUtils
 
 # 导入 SpanMatcher(用于 spans 合并)
@@ -358,24 +359,26 @@ class ElementProcessors:
         layout_item: Dict[str, Any],
         scale: float,
         pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
-        pdf_type: str = 'ocr', # 'ocr' 或 'txt'
+        pdf_type: str = 'ocr',  # 'ocr' 或 'txt'
         output_dir: Optional[str] = None,
-        basename: Optional[str] = None
+        basename: Optional[str] = None,
+        normalize_numbers: bool = True,
     ) -> Dict[str, Any]:
         """
         使用 UNet 有线表格识别处理表格元素
-        
+
         流程:
         1. OCR检测获取文本框坐标
         2. UNet 有线表格识别
         3. 坐标逆向转换回原图坐标
-        
+
         Args:
             image: 页面图像
             layout_item: 布局检测项
             scale: 缩放比例
             pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             处理后的元素字典
         """
@@ -446,7 +449,7 @@ class ElementProcessors:
             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
-        return {
+        result = {
             'type': 'table',
             'bbox': bbox,
             'confidence': layout_item.get('confidence', 0.0),
@@ -461,29 +464,34 @@ class ElementProcessors:
                 'recognition_method': 'wired_unet',
             },
         }
-    
+        if normalize_numbers:
+            self._normalize_table_content(result['content'])
+        return result
+
     def process_table_element_vlm(
         self,
         image: np.ndarray,
         layout_item: Dict[str, Any],
         scale: float,
-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
+        normalize_numbers: bool = True,
     ) -> Dict[str, Any]:
         """
         使用 VLM 无线表格识别处理表格元素
-        
+
         流程:
         1. OCR检测获取文本框坐标
         2. VLM识别获取表格结构HTML
         3. 匹配OCR坐标与VLM结构
         4. 坐标逆向转换回原图坐标
-        
+
         Args:
             image: 页面图像
             layout_item: 布局检测项
             scale: 缩放比例
             pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             处理后的元素字典
         """
@@ -557,7 +565,7 @@ class ElementProcessors:
             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
-        return {
+        result = {
             'type': 'table',
             'bbox': bbox,
             'confidence': layout_item.get('confidence', 0.0),
@@ -572,6 +580,64 @@ class ElementProcessors:
                 'recognition_method': 'vlm',
             },
         }
+        if normalize_numbers:
+            self._normalize_table_content(result['content'])
+        return result
+
+    def _normalize_table_content(self, content: Dict[str, Any]) -> None:
+        """
+        对表格 content 中的 html 与 cells 做金额标准化(就地修改),
+        并记录变更到 content['number_normalization_changes'],供 JSON/MD 输出写入。
+        """
+        if not content or not content.get('html'):
+            return
+        table_changes: List[Dict[str, Any]] = []
+        try:
+            from ast import literal_eval
+            from bs4 import BeautifulSoup, Tag
+            html = content['html']
+            soup = BeautifulSoup(html, 'html.parser')
+            for table in soup.find_all('table'):
+                if not isinstance(table, Tag):
+                    continue
+                for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
+                    for col_idx, cell in enumerate(tr.find_all(['td', 'th'])):  # type: ignore[reportAttributeAccessIssue]
+                        if not isinstance(cell, Tag):
+                            continue
+                        raw = cell.get_text()
+                        norm = normalize_financial_numbers(raw)
+                        if norm != raw:
+                            change: Dict[str, Any] = {
+                                "row": row_idx,
+                                "col": col_idx,
+                                "old": raw,
+                                "new": norm,
+                            }
+                            bbox_attr = cell.get("data-bbox")
+                            if isinstance(bbox_attr, str):
+                                try:
+                                    change["bbox"] = literal_eval(bbox_attr)
+                                except Exception:
+                                    change["bbox"] = bbox_attr
+                            table_changes.append(change)
+                            cell.string = norm
+            content['html'] = str(soup)
+        except Exception as e:
+            logger.warning(f"表格 HTML 金额标准化失败: {e}")
+        cells = content.get('cells')
+        if isinstance(cells, list):
+            try:
+                for cell in cells:
+                    if not isinstance(cell, dict):
+                        continue
+                    # for key in ['text', 'matched_text']:
+                    for key in ['text']:
+                        if key in cell and isinstance(cell[key], str):
+                            cell[key] = normalize_financial_numbers(cell[key])
+            except Exception as e:
+                logger.warning(f"表格 cells 金额标准化失败: {e}")
+        if table_changes:
+            content['number_normalization_changes'] = table_changes
     
     def _create_empty_table_result(
         self,