3 kuukautta sitten · 2b8ed01af2
--- a/ocr_tools/universal_doc_parser/core/element_processors.py
+++ b/ocr_tools/universal_doc_parser/core/element_processors.py
@@ -15,6 +15,7 @@ from loguru import logger
 
				 
			
 
				 from ocr_utils.coordinate_utils import CoordinateUtils
			
 
				 from ocr_utils import PDFUtils
			
 
				+from ocr_utils import normalize_financial_numbers
			
 
				 from .table_coordinate_utils import TableCoordinateUtils
			
 
				 
			
 
				 # 导入 SpanMatcher（用于 spans 合并）
			
@@ -358,24 +359,26 @@ class ElementProcessors:
 
				         layout_item: Dict[str, Any],
			
 
				         scale: float,
			
 
				         pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
			
 
				-        pdf_type: str = 'ocr', # 'ocr' 或 'txt'
			
 
				+        pdf_type: str = 'ocr',  # 'ocr' 或 'txt'
			
 
				         output_dir: Optional[str] = None,
			
 
				-        basename: Optional[str] = None
			
 
				+        basename: Optional[str] = None,
			
 
				+        normalize_numbers: bool = True,
			
 
				     ) -> Dict[str, Any]:
			
 
				         """
			
 
				         使用 UNet 有线表格识别处理表格元素
			
 
				-        
			
 
				+
			
 
				         流程：
			
 
				         1. OCR检测获取文本框坐标
			
 
				         2. UNet 有线表格识别
			
 
				         3. 坐标逆向转换回原图坐标
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             image: 页面图像
			
 
				             layout_item: 布局检测项
			
 
				             scale: 缩放比例
			
 
				             pre_matched_spans: 预匹配的 OCR spans（来自整页 OCR）
			
 
				-            
			
 
				+            normalize_numbers: 是否对表格内容做金额标准化
			
 
				+
			
 
				         Returns:
			
 
				             处理后的元素字典
			
 
				         """
			
@@ -446,7 +449,7 @@ class ElementProcessors:
 
				             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				-        return {
			
 
				+        result = {
			
 
				             'type': 'table',
			
 
				             'bbox': bbox,
			
 
				             'confidence': layout_item.get('confidence', 0.0),
			
@@ -461,29 +464,34 @@ class ElementProcessors:
 
				                 'recognition_method': 'wired_unet',
			
 
				             },
			
 
				         }
			
 
				-    
			
 
				+        if normalize_numbers:
			
 
				+            self._normalize_table_content(result['content'])
			
 
				+        return result
			
 
				+
			
 
				     def process_table_element_vlm(
			
 
				         self,
			
 
				         image: np.ndarray,
			
 
				         layout_item: Dict[str, Any],
			
 
				         scale: float,
			
 
				-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
			
 
				+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
			
 
				+        normalize_numbers: bool = True,
			
 
				     ) -> Dict[str, Any]:
			
 
				         """
			
 
				         使用 VLM 无线表格识别处理表格元素
			
 
				-        
			
 
				+
			
 
				         流程：
			
 
				         1. OCR检测获取文本框坐标
			
 
				         2. VLM识别获取表格结构HTML
			
 
				         3. 匹配OCR坐标与VLM结构
			
 
				         4. 坐标逆向转换回原图坐标
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             image: 页面图像
			
 
				             layout_item: 布局检测项
			
 
				             scale: 缩放比例
			
 
				             pre_matched_spans: 预匹配的 OCR spans（来自整页 OCR）
			
 
				-            
			
 
				+            normalize_numbers: 是否对表格内容做金额标准化
			
 
				+
			
 
				         Returns:
			
 
				             处理后的元素字典
			
 
				         """
			
@@ -557,7 +565,7 @@ class ElementProcessors:
 
				             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				-        return {
			
 
				+        result = {
			
 
				             'type': 'table',
			
 
				             'bbox': bbox,
			
 
				             'confidence': layout_item.get('confidence', 0.0),
			
@@ -572,6 +580,64 @@ class ElementProcessors:
 
				                 'recognition_method': 'vlm',
			
 
				             },
			
 
				         }
			
 
				+        if normalize_numbers:
			
 
				+            self._normalize_table_content(result['content'])
			
 
				+        return result
			
 
				+
			
 
    def _normalize_table_content(self, content: Dict[str, Any]) -> None:
        """
        Normalize financial numbers in a table ``content`` dict, in place.

        Two independent best-effort passes are made. A failure in either pass
        is logged as a warning and never propagated to the caller.

        1. ``content['html']``: the text of every ``<td>``/``<th>`` is passed
           through ``normalize_financial_numbers``. Each cell whose text
           changes is rewritten and a record
           ``{"row", "col", "old", "new"[, "bbox"]}`` is collected. ``row`` and
           ``col`` are indices within the cell's own table. ``bbox`` is the
           parsed ``data-bbox`` attribute, or the raw attribute string when it
           cannot be parsed as a Python literal.
        2. ``content['cells']``: for every dict entry, a string ``'text'``
           value is replaced by its normalized form.

        The collected records are stored under
        ``content['number_normalization_changes']`` only when at least one
        HTML cell was changed and the HTML pass completed successfully, so the
        change log always matches the HTML that was actually written back.

        Args:
            content: Table content dict. Nothing is done when it is empty or
                has no non-empty ``'html'`` entry.
        """
        if not content or not content.get('html'):
            return

        table_changes: List[Dict[str, Any]] = []
        try:
            # Imported lazily inside the try block so a missing bs4 degrades to
            # a warning instead of breaking table processing.
            from ast import literal_eval
            from bs4 import BeautifulSoup, Tag

            soup = BeautifulSoup(content['html'], 'html.parser')
            html_changes: List[Dict[str, Any]] = []
            for table in soup.find_all('table'):
                if not isinstance(table, Tag):
                    continue
                # Keep only rows owned by this table. Rows of a nested table
                # are handled when that nested table is visited, which keeps
                # row indices correct and avoids reporting a cell twice.
                own_rows = [
                    tr for tr in table.find_all('tr')
                    if isinstance(tr, Tag) and tr.find_parent('table') is table
                ]
                for row_idx, tr in enumerate(own_rows):
                    # Direct children only, so cells of a nested table do not
                    # shift the column index of this row.
                    for col_idx, cell in enumerate(
                        tr.find_all(['td', 'th'], recursive=False)
                    ):
                        if not isinstance(cell, Tag):
                            continue
                        if cell.find('table') is not None:
                            # Assigning ``cell.string`` would flatten the
                            # nested table; its cells are normalized on their
                            # own iteration.
                            continue
                        raw = cell.get_text()
                        norm = normalize_financial_numbers(raw)
                        if norm == raw:
                            continue
                        change: Dict[str, Any] = {
                            "row": row_idx,
                            "col": col_idx,
                            "old": raw,
                            "new": norm,
                        }
                        bbox_attr = cell.get("data-bbox")
                        if isinstance(bbox_attr, str):
                            try:
                                change["bbox"] = literal_eval(bbox_attr)
                            except (ValueError, TypeError, SyntaxError,
                                    MemoryError, RecursionError):
                                # Not a Python literal: keep the raw attribute.
                                change["bbox"] = bbox_attr
                        html_changes.append(change)
                        # NOTE: replaces all children of the cell with a single
                        # text node, so inline markup inside a changed cell is
                        # collapsed to plain text.
                        cell.string = norm
            if html_changes:
                # Re-serialize only when something changed, so untouched HTML
                # is not rewritten by the parser round trip.
                content['html'] = str(soup)
                table_changes = html_changes
        except Exception as e:
            # ``table_changes`` stays empty here: the HTML was not written
            # back, so no change must be reported for it.
            logger.warning(f"表格 HTML 金额标准化失败: {e}")

        cells = content.get('cells')
        if isinstance(cells, list):
            try:
                for cell in cells:
                    if not isinstance(cell, dict):
                        continue
                    # Only 'text' is normalized; 'matched_text' is left as is
                    # (presumably to keep the raw OCR match — confirm).
                    text = cell.get('text')
                    if isinstance(text, str):
                        cell['text'] = normalize_financial_numbers(text)
            except Exception as e:
                logger.warning(f"表格 cells 金额标准化失败: {e}")

        if table_changes:
            content['number_normalization_changes'] = table_changes
			
 
				     
			
 
				     def _create_empty_table_result(
			
 
				         self,