7 Commits 2e8ed5fad9 ... 11668f177e

Auteur SHA1 Message Date
  zhch158_admin 11668f177e refactor(output_formatter_v2): 移除冗余的金额标准化逻辑,更新保存中间JSON的注释以反映标准化已在处理管道中完成 il y a 2 semaines
  zhch158_admin 6e96478c23 feat(ocr_utils): 增强金额标准化功能,支持欧洲格式小数和JSON表格内容的标准化处理 il y a 2 semaines
  zhch158_admin 40bad356ee refactor(markdown_generator): 移除冗余的金额标准化逻辑,更新Markdown生成以包含标准化变更说明 il y a 2 semaines
  zhch158_admin 1ce742a4ef refactor(json_formatters): 移除冗余的金额标准化逻辑,更新JSON输出以包含标准化变更记录 il y a 2 semaines
  zhch158_admin 38f373384f feat(pipeline_manager): 添加 normalize_numbers 参数以支持表格内容的金额标准化处理 il y a 2 semaines
  zhch158_admin 2b8ed01af2 feat(element_processors): 添加金额标准化功能,支持表格内容的财务数字规范化处理 il y a 2 semaines
  zhch158_admin 35c6e6cf36 feat(ocr_utils): 增强财务数字标准化功能,添加金额 token 纠错逻辑,支持逗号和小数点的正确用法 il y a 2 semaines

+ 78 - 12
ocr_tools/universal_doc_parser/core/element_processors.py

@@ -15,6 +15,7 @@ from loguru import logger
 
 from ocr_utils.coordinate_utils import CoordinateUtils
 from ocr_utils import PDFUtils
+from ocr_utils import normalize_financial_numbers
 from .table_coordinate_utils import TableCoordinateUtils
 
 # 导入 SpanMatcher(用于 spans 合并)
@@ -358,24 +359,26 @@ class ElementProcessors:
         layout_item: Dict[str, Any],
         scale: float,
         pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
-        pdf_type: str = 'ocr', # 'ocr' 或 'txt'
+        pdf_type: str = 'ocr',  # 'ocr' 或 'txt'
         output_dir: Optional[str] = None,
-        basename: Optional[str] = None
+        basename: Optional[str] = None,
+        normalize_numbers: bool = True,
     ) -> Dict[str, Any]:
         """
         使用 UNet 有线表格识别处理表格元素
-        
+
         流程:
         1. OCR检测获取文本框坐标
         2. UNet 有线表格识别
         3. 坐标逆向转换回原图坐标
-        
+
         Args:
             image: 页面图像
             layout_item: 布局检测项
             scale: 缩放比例
             pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             处理后的元素字典
         """
@@ -446,7 +449,7 @@ class ElementProcessors:
             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
-        return {
+        result = {
             'type': 'table',
             'bbox': bbox,
             'confidence': layout_item.get('confidence', 0.0),
@@ -461,29 +464,34 @@ class ElementProcessors:
                 'recognition_method': 'wired_unet',
             },
         }
-    
+        if normalize_numbers:
+            self._normalize_table_content(result['content'])
+        return result
+
     def process_table_element_vlm(
         self,
         image: np.ndarray,
         layout_item: Dict[str, Any],
         scale: float,
-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
+        normalize_numbers: bool = True,
     ) -> Dict[str, Any]:
         """
         使用 VLM 无线表格识别处理表格元素
-        
+
         流程:
         1. OCR检测获取文本框坐标
         2. VLM识别获取表格结构HTML
         3. 匹配OCR坐标与VLM结构
         4. 坐标逆向转换回原图坐标
-        
+
         Args:
             image: 页面图像
             layout_item: 布局检测项
             scale: 缩放比例
             pre_matched_spans: 预匹配的 OCR spans(来自整页 OCR)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             处理后的元素字典
         """
@@ -557,7 +565,7 @@ class ElementProcessors:
             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
         
-        return {
+        result = {
             'type': 'table',
             'bbox': bbox,
             'confidence': layout_item.get('confidence', 0.0),
@@ -572,6 +580,64 @@ class ElementProcessors:
                 'recognition_method': 'vlm',
             },
         }
+        if normalize_numbers:
+            self._normalize_table_content(result['content'])
+        return result
+
+    def _normalize_table_content(self, content: Dict[str, Any]) -> None:
+        """
+        Normalize financial numbers in the table content's 'html' and 'cells' in place, and
+        record cell changes in content['number_normalization_changes'] for JSON/MD output.
+        """
+        if not content or not content.get('html'):
+            return  # without HTML nothing is normalized, including 'cells'
+        table_changes: List[Dict[str, Any]] = []
+        try:
+            from ast import literal_eval
+            from bs4 import BeautifulSoup, Tag
+            html = content['html']
+            soup = BeautifulSoup(html, 'html.parser')
+            for table in soup.find_all('table'):
+                if not isinstance(table, Tag):
+                    continue
+                for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
+                    for col_idx, cell in enumerate(tr.find_all(['td', 'th'])):  # type: ignore[reportAttributeAccessIssue]
+                        if not isinstance(cell, Tag):
+                            continue
+                        raw = cell.get_text()
+                        norm = normalize_financial_numbers(raw)
+                        if norm != raw:
+                            change: Dict[str, Any] = {  # row/col are plain enumerate indices (spans not expanded)
+                                "row": row_idx,
+                                "col": col_idx,
+                                "old": raw,
+                                "new": norm,
+                            }
+                            bbox_attr = cell.get("data-bbox")
+                            if isinstance(bbox_attr, str):
+                                try:
+                                    change["bbox"] = literal_eval(bbox_attr)
+                                except Exception:
+                                    change["bbox"] = bbox_attr  # keep the raw attribute string if it is not a literal
+                            table_changes.append(change)
+                            cell.string = norm  # replaces the cell's children with the normalized plain text
+            content['html'] = str(soup)  # re-serialized whenever parsing succeeded, even with no changes
+        except Exception as e:
+            logger.warning(f"表格 HTML 金额标准化失败: {e}")
+        cells = content.get('cells')
+        if isinstance(cells, list):
+            try:
+                for cell in cells:
+                    if not isinstance(cell, dict):
+                        continue
+                    # Only 'text' is normalized here; 'matched_text' is not in the key list.
+                    for key in ['text']:
+                        if key in cell and isinstance(cell[key], str):
+                            cell[key] = normalize_financial_numbers(cell[key])
+            except Exception as e:
+                logger.warning(f"表格 cells 金额标准化失败: {e}")
+        if table_changes:  # the key is only set when at least one HTML cell changed
+            content['number_normalization_changes'] = table_changes
     
     def _create_empty_table_result(
         self,

+ 13 - 7
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -546,6 +546,7 @@ class EnhancedDocPipeline:
         classified_elements = self._classify_elements(layout_results, page_idx)
         
         # 6. 处理各类元素(传入匹配的 spans)
+        normalize_numbers = self.config.get('output', {}).get('normalize_numbers', True)
         processed_elements, discarded_elements = self._process_all_elements(
             detection_image=detection_image,
             classified_elements=classified_elements,
@@ -556,7 +557,8 @@ class EnhancedDocPipeline:
             matched_spans=matched_spans,
             layout_results=layout_results,
             output_dir=output_dir,
-            basename=page_name
+            basename=page_name,
+            normalize_numbers=normalize_numbers,
         )
         
         # 7. 按阅读顺序排序
@@ -820,10 +822,11 @@ class EnhancedDocPipeline:
         layout_results: Optional[List[Dict[str, Any]]] = None,
         output_dir: Optional[str] = None,
         basename: Optional[str] = None,
+        normalize_numbers: bool = True,
     ) -> tuple:
         """
         处理所有分类后的元素
-        
+
         Args:
             detection_image: 检测用图像
             classified_elements: 分类后的元素
@@ -833,7 +836,8 @@ class EnhancedDocPipeline:
             scale: 缩放比例
             matched_spans: 匹配的 OCR spans {block_idx: [spans]}
             layout_results: 原始 layout 检测结果(用于索引匹配)
-            
+            normalize_numbers: 是否对表格内容做金额标准化
+
         Returns:
             (processed_elements, discarded_elements)
         """
@@ -939,20 +943,22 @@ class EnhancedDocPipeline:
                     logger.info(f"🔷 Table {idx}: Using wired UNet recognition")
                     element = self.element_processors.process_table_element_wired(
                         detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
-                        output_dir=output_dir, basename=f"{basename}_{idx}"
+                        output_dir=output_dir, basename=f"{basename}_{idx}",
+                        normalize_numbers=normalize_numbers,
                     )
-                    
                     # 如果有线识别失败(返回空 HTML),fallback 到 VLM
                     if not element['content'].get('html') and not element['content'].get('cells'):
                         logger.warning(f"⚠️ Wired recognition failed for table {idx}, fallback to VLM")
                         element = self.element_processors.process_table_element_vlm(
-                            detection_image, item, scale, pre_matched_spans=spans
+                            detection_image, item, scale, pre_matched_spans=spans,
+                            normalize_numbers=normalize_numbers,
                         )
                 else:
                     # VLM 无线表格路径(默认)
                     logger.info(f"🔷 Table {idx}: Using VLM recognition")
                     element = self.element_processors.process_table_element_vlm(
-                        detection_image, item, scale, pre_matched_spans=spans
+                        detection_image, item, scale, pre_matched_spans=spans,
+                        normalize_numbers=normalize_numbers,
                     )
                 
                 processed_elements.append(element)

+ 7 - 18
ocr_utils/json_formatters.py

@@ -14,10 +14,6 @@ from pathlib import Path
 from typing import Dict, Any, List, Optional
 from loguru import logger
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_json_table
-
-
 class NumpyEncoder(json.JSONEncoder):
     """自定义JSON编码器,处理numpy类型"""
     def default(self, obj):
@@ -263,20 +259,9 @@ class JSONFormatters:
                 if converted:
                     page_elements.append(converted)
             
-            # 转换为 JSON 字符串
+            # 转换为 JSON 字符串(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
             json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
-            
-            # 金额数字标准化
-            if normalize_numbers:
-                original_content = json_content
-                json_content = normalize_json_table(json_content)
-                
-                if json_content != original_content:
-                    original_path = output_dir / f"{page_name}_original.json"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.debug(f"📄 Original page JSON saved: {original_path}")
-            
+
             # 保存 JSON
             json_path = output_dir / f"{page_name}.json"
             with open(json_path, 'w', encoding='utf-8') as f:
@@ -338,7 +323,11 @@ class JSONFormatters:
             cells = content.get('cells', [])
             if cells:
                 result['table_cells'] = JSONFormatters.format_table_cells(cells)
-            
+            # 金额标准化变更记录(来自 element_processors._normalize_table_content)
+            changes = content.get('number_normalization_changes', [])
+            if changes:
+                result['number_normalization_changes'] = changes
+
             # 旋转和倾斜信息
             if 'table_angle' in content:
                 result['image_rotation_angle'] = float(content['table_angle'])

+ 19 - 41
ocr_utils/markdown_generator.py

@@ -29,10 +29,6 @@ except ImportError:
         MM_MD = 'mm_md'
         NLP_MD = 'nlp_md'
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_markdown_table
-
-
 class MarkdownGenerator:
     """Markdown 生成器类"""
     
@@ -80,18 +76,7 @@ class MarkdownGenerator:
                     
                     header = MarkdownGenerator._generate_header(results)
                     markdown_content = header + str(markdown_content)
-                    
-                    # 金额数字标准化
-                    if normalize_numbers:
-                        original_content = markdown_content
-                        markdown_content = normalize_markdown_table(markdown_content)
-                        
-                        if markdown_content != original_content:
-                            original_path = output_dir / f"{doc_name}_original.md"
-                            with open(original_path, 'w', encoding='utf-8') as f:
-                                f.write(original_content)
-                            logger.info(f"📝 Original Markdown saved: {original_path}")
-                    
+                    # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
                     with open(md_path, 'w', encoding='utf-8') as f:
                         f.write(markdown_content)
                     
@@ -103,18 +88,7 @@ class MarkdownGenerator:
         
         # 使用自定义实现,确保所有元素类型都被处理
         markdown_content = MarkdownGenerator._generate_full_markdown(results)
-        
-        # 金额数字标准化
-        if normalize_numbers:
-            original_content = markdown_content
-            markdown_content = normalize_markdown_table(markdown_content)
-            
-            if markdown_content != original_content:
-                original_path = output_dir / f"{doc_name}_original.md"
-                with open(original_path, 'w', encoding='utf-8') as f:
-                    f.write(original_content)
-                logger.info(f"📝 Original Markdown saved: {original_path}")
-        
+        # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
         with open(md_path, 'w', encoding='utf-8') as f:
             f.write(markdown_content)
         
@@ -163,20 +137,9 @@ class MarkdownGenerator:
             else:
                 page_name = doc_name
             
-            # 生成单页 Markdown
+            # 生成单页 Markdown(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
             md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
-            
-            # 金额数字标准化
-            if normalize_numbers:
-                original_content = md_content
-                md_content = normalize_markdown_table(md_content)
-                
-                if md_content != original_content:
-                    original_path = output_dir / f"{page_name}_original.md"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.debug(f"📝 Original page Markdown saved: {original_path}")
-            
+
             # 保存
             md_path = output_dir / f"{page_name}.md"
             with open(md_path, 'w', encoding='utf-8') as f:
@@ -245,7 +208,14 @@ pages: {len(results.get('pages', []))}
                     html = content.get('html', '')
                     if html:
                         md_lines.append(f"\n{html}\n")
+                    changes = content.get('number_normalization_changes', [])
+                    if changes:
                         md_lines.append("")
+                        md_lines.append("<!-- 数字标准化说明:")
+                        for ch in changes:
+                            md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
+                        md_lines.append("-->")
+                    md_lines.append("")
                 
                 elif elem_type in ['image', 'image_body', 'figure']:
                     img_filename = content.get('image_path', '')
@@ -343,6 +313,14 @@ pages: {len(results.get('pages', []))}
                 html = content.get('html', '')
                 if html:
                     md_lines.append(f"\n{html}\n")
+                # 金额标准化说明(来自 element_processors._normalize_table_content)
+                changes = content.get('number_normalization_changes', [])
+                if changes:
+                    md_lines.append("")
+                    md_lines.append("<!-- 数字标准化说明:")
+                    for ch in changes:
+                        md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
+                    md_lines.append("-->")
                 md_lines.append("")
             
             elif elem_type in ['image', 'image_body', 'figure']:

+ 300 - 132
ocr_utils/normalize_financial_numbers.py

@@ -1,16 +1,90 @@
 import re
 import os
 from pathlib import Path
+from decimal import Decimal, InvalidOperation
+
+
+def _normalize_amount_token(token: str) -> str:
+    """
+    Fix comma/decimal-point misuse inside a single amount token (e.g. 301,55 -> 301.55).
+    Tokens that match none of the rules below are returned unchanged.
+    """
+    if not token:
+        return token
+
+    # Only handle tokens made of an optional sign, digits, ',' and '.', starting and ending with a digit
+    if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
+        return token
+
+    sign = ""
+    core = token
+    if core[0] in "+-":
+        sign, core = core[0], core[1:]
+
+    has_dot = "." in core
+    has_comma = "," in core
+
+    # Helper: True if s parses as a Decimal once commas are removed; a candidate that fails is discarded
+    def _safe_decimal(s: str) -> bool:
+        try:
+            Decimal(s.replace(",", ""))
+            return True
+        except (InvalidOperation, ValueError):
+            return False
+
+    # Rule A: both '.' and ',' present, last separator is a comma followed by 1-2 digits (1.234,56 -> 1,234.56)
+    if has_dot and has_comma:
+        last_comma = core.rfind(",")
+        last_dot = core.rfind(".")
+        if last_comma > last_dot and last_comma != -1:
+            frac = core[last_comma + 1 :]
+            if 1 <= len(frac) <= 2 and frac.isdigit():
+                # Turn every dot into a thousands comma, then turn the last comma into the decimal point
+                temp = core.replace(".", ",")
+                idx = temp.rfind(",")
+                if idx != -1:
+                    candidate = temp[:idx] + "." + temp[idx + 1 :]
+                    if _safe_decimal(candidate):
+                        return sign + candidate
+
+    # Rule B: dots only, 2+ dots, 1-2 digit tail, 3-digit groups after the first (1.234.56 -> 1,234.56)
+    if has_dot and not has_comma:
+        parts = core.split(".")
+        if len(parts) >= 3:
+            last = parts[-1]
+            ints = parts[:-1]
+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
+                candidate = ",".join(ints) + "." + last
+                if _safe_decimal(candidate):
+                    return sign + candidate
+
+    # Rule C: commas only, 2+ commas, 1-2 digit tail, 3-digit groups after the first (1,234,56 -> 1,234.56)
+    if has_comma and not has_dot:
+        parts = core.split(",")
+        if len(parts) >= 3:
+            last = parts[-1]
+            ints = parts[:-1]
+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
+                # Treat the last comma as the decimal point
+                idx = core.rfind(",")
+                candidate = core[:idx] + "." + core[idx + 1 :]
+                if _safe_decimal(candidate):
+                    return sign + candidate
+        # Rule D: exactly one comma followed by 1-2 digits -> European-style decimal, e.g. 301,55 -> 301.55
+        elif len(parts) == 2:
+            left, right = parts[0], parts[1]
+            if 1 <= len(right) <= 2 and right.isdigit() and left.isdigit():
+                candidate = left + "." + right
+                if _safe_decimal(candidate):
+                    return sign + candidate
+
+    # No rule applied: return the original token as-is
+    return token
+
 
 def normalize_financial_numbers(text: str) -> str:
     """
-    标准化财务数字:将全角字符转换为半角字符
-    
-    Args:
-        text: 原始文本
-    
-    Returns:
-        标准化后的文本
+    标准化财务数字:将全角字符转换为半角字符,并纠正常见的逗号/小数点错用。
     """
     if not text:
         return text
@@ -31,30 +105,30 @@ def normalize_financial_numbers(text: str) -> str:
         '%': '%',  # 全角百分号转半角百分号
     }
     
-    # 第一步:执行基础字符替换
+    # 第一步:执行基础字符替换(全角 -> 半角)
     normalized_text = text
     for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
         normalized_text = normalized_text.replace(fullwidth, halfwidth)
     
-    # 第二步:处理数字序列中的空格和分隔符
-    # 修改正则表达式以匹配完整的数字序列,包括空格
-    # 匹配模式:数字 + (空格? + 逗号 + 空格? + 数字)* + (空格? + 小数点 + 数字+)?
+    # 第二步:处理数字序列中的空格和分隔符(保留原有逻辑)
     number_sequence_pattern = r'(\d+(?:\s*[,,]\s*\d+)*(?:\s*[。..]\s*\d+)?)'
     
     def normalize_number_sequence(match):
         sequence = match.group(1)
-        
-        # 处理千分位分隔符周围的空格
-        # 将 "数字 + 空格 + 逗号 + 空格 + 数字" 标准化为 "数字,数字"
         sequence = re.sub(r'(\d)\s*[,,]\s*(\d)', r'\1,\2', sequence)
-        
-        # 处理小数点周围的空格
-        # 将 "数字 + 空格 + 小数点 + 空格 + 数字" 标准化为 "数字.数字"
         sequence = re.sub(r'(\d)\s*[。..]\s*(\d)', r'\1.\2', sequence)
-        
         return sequence
     
     normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
+
+    # 第三步:对疑似金额 token 做逗号/小数点纠错
+    amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
+
+    def _amount_sub(m: re.Match) -> str:
+        tok = m.group('tok')
+        return _normalize_amount_token(tok)
+
+    normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
     return normalized_text
     
 def normalize_markdown_table(markdown_content: str) -> str:
@@ -78,7 +152,7 @@ def normalize_markdown_table(markdown_content: str) -> str:
     table_pattern = r'(<table[^>]*>.*?</table>)'
     
     def normalize_table_match(match):
-        """处理单个表格匹配,保留原始格式"""
+        """处理单个表格匹配,保留原始格式,并追加数字标准化说明注释。"""
         table_html = match.group(1)
         original_table_html = table_html  # 保存原始HTML用于比较
         
@@ -86,133 +160,163 @@ def normalize_markdown_table(markdown_content: str) -> str:
         soup = BeautifulSoup(table_html, 'html.parser')
         tables = soup.find_all('table')
         
-        # 记录所有需要替换的文本(原始文本 -> 标准化文本)
-        replacements = []
+        # 记录本表格中所有数值修改
+        changes: list[dict] = []
         
         for table in tables:
-            if isinstance(table, Tag):
-                cells = table.find_all(['td', 'th'])
-                for cell in cells:
-                    if isinstance(cell, Tag):
-                        # 获取单元格的纯文本内容
-                        original_text = cell.get_text()
-                        normalized_text = normalize_financial_numbers(original_text)
-                        
-                        # 如果内容发生了变化,记录替换
-                        if original_text != normalized_text:
-                            # 找到单元格中所有文本节点并替换
-                            from bs4.element import NavigableString
-                            for text_node in cell.find_all(string=True, recursive=True):
-                                if isinstance(text_node, NavigableString):
-                                    text_str = str(text_node)
-                                    if text_str.strip():
-                                        normalized = normalize_financial_numbers(text_str.strip())
-                                        if normalized != text_str.strip():
-                                            # 保留原始文本节点的前后空白
-                                            if text_str.strip() == text_str:
-                                                # 纯文本节点,直接替换
-                                                text_node.replace_with(normalized)
-                                            else:
-                                                # 有前后空白,需要保留
-                                                leading_ws = text_str[:len(text_str) - len(text_str.lstrip())]
-                                                trailing_ws = text_str[len(text_str.rstrip()):]
-                                                text_node.replace_with(leading_ws + normalized + trailing_ws)
+            if not isinstance(table, Tag):
+                continue
+            # 通过 tr / td(th) 计算行列位置
+            for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
+                cells = tr.find_all(['td', 'th'])  # type: ignore[reportAttributeAccessIssue]
+                for col_idx, cell in enumerate(cells):
+                    if not isinstance(cell, Tag):
+                        continue
+                    # 与 normalize_json_table 一致:整格取文本、只标准化一次、再写回
+                    original_text = cell.get_text()
+                    normalized_text = normalize_financial_numbers(original_text)
+                    if original_text == normalized_text:
+                        continue
+                    # 记录一条修改
+                    changes.append(
+                        {
+                            "row": row_idx,
+                            "col": col_idx,
+                            "old": original_text,
+                            "new": normalized_text,
+                        }
+                    )
+                    # 整格替换为标准化后的文本(与 normalize_json_table 的 cell.string = normalized_text 一致)
+                    cell.string = normalized_text
+        
+        # 如果没有任何数值修改,直接返回原始 HTML
+        if not changes:
+            return original_table_html
         
         # 获取修改后的HTML
         modified_html = str(soup)
         
-        # 如果内容没有变化,返回原始HTML(保持原始格式)
-        # 检查是否只是格式变化(换行、空格等)
-        original_text_only = re.sub(r'\s+', '', original_table_html)
-        modified_text_only = re.sub(r'\s+', '', modified_html)
+        # 在表格后追加注释,说明哪些单元格被修改
+        lines = ["<!-- 数字标准化说明:"]
+        for ch in changes:
+            lines.append(
+                f"  - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
+            )
+        lines.append("-->")
+        comment = "\n".join(lines)
         
-        if original_text_only == modified_text_only:
-            # 只有格式变化,返回原始HTML以保留换行符
-            return original_table_html
-        
-        # 有实际内容变化,返回修改后的HTML
-        return modified_html
+        return modified_html + "\n\n" + comment
     
     # 使用正则替换,只替换表格内容,保留其他部分(包括换行符)不变
     normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
     
     return normalized_content
 
-def normalize_json_table(json_content: str) -> str:
+def normalize_json_table(
+    json_content: str,
+    *,
+    table_type_key: str = "category",
+    table_type_value: str = "Table",
+    html_key: str = "text",
+    cells_key: str | None = None,
+) -> str:
     """
-    专门处理JSON格式OCR结果中表格的数字标准化
-    
+    专门处理JSON格式OCR结果中表格的数字标准化。
+    通过参数指定提取用的 key,以兼容不同 OCR 工具的 JSON 结构。
+
     Args:
-        json_content: JSON格式的OCR结果内容
-    
+        json_content: JSON格式的OCR结果内容(字符串或已解析的 list)
+        table_type_key: 用于判断“是否为表格”的字段名,如 "type" 或 "category"
+        table_type_value: 上述字段等于该值时视为表格,如 "table" 或 "Table"
+        html_key: 存放表格 HTML 的字段名,如 "table_body" 或 "text"
+        cells_key: 存放单元格列表的字段名,如 "table_cells";为 None 则不处理 cells,
+                   仅标准化 html_key 中的表格
+
     Returns:
-        标准化后的JSON内容
-    """
-    """
-    json_content 示例:
-    [
-        {
-            "category": "Table",
-            "text": "<table>...</table>"
-        },
-        {
-            "category": "Text",
-            "text": "Some other text"
-        }
-    ]
+        标准化后的JSON内容(字符串)
+
+    常见格式示例:
+        - 旧格式: category="Table", html 在 "text"
+          normalize_json_table(s)  # 默认即此
+        - mineru_vllm_results_cell_bbox: type="table", html 在 "table_body", cells 在 "table_cells"
+          normalize_json_table(s, table_type_key="type", table_type_value="table",
+                               html_key="table_body", cells_key="table_cells")
     """
     import json
-    
+    from ast import literal_eval
+
     try:
-        # 解析JSON内容
         data = json.loads(json_content) if isinstance(json_content, str) else json_content
-        
-        # 确保data是列表格式
         if not isinstance(data, list):
             return json_content
-        
-        # 遍历所有OCR结果项
+
         for item in data:
             if not isinstance(item, dict):
                 continue
-                
-            # 检查是否是表格类型
-            if item.get('category') == 'Table' and 'text' in item:
-                table_html = item['text']
-                
-                # 使用BeautifulSoup处理HTML表格
-                from bs4 import BeautifulSoup, Tag
-                
-                soup = BeautifulSoup(table_html, 'html.parser')
-                tables = soup.find_all('table')
-                
-                for table in tables:
-                    if isinstance(table, Tag):
-                        cells = table.find_all(['td', 'th'])
-                        for cell in cells:
-                            if isinstance(cell, Tag):
-                                original_text = cell.get_text()
-                                
-                                # 应用数字标准化
-                                normalized_text = normalize_financial_numbers(original_text)
-                                
-                                # 如果内容发生了变化,更新单元格内容
-                                if original_text != normalized_text:
-                                    cell.string = normalized_text
-                
-                # 更新item中的表格内容
-                item['text'] = str(soup)
-            
-            # 同时标准化普通文本中的数字(如果需要)
-            # elif 'text' in item:
-            #     original_text = item['text']
-            #     normalized_text = normalize_financial_numbers(original_text)
-            #     if original_text != normalized_text:
-            #         item['text'] = normalized_text
-        
-        # 返回标准化后的JSON字符串
+            # 按参数判断是否为表格项,且包含 HTML
+            if item.get(table_type_key) != table_type_value or html_key not in item:
+                continue
+
+            table_html = item[html_key]
+            if not table_html or not isinstance(table_html, str):
+                continue
+
+            from bs4 import BeautifulSoup, Tag
+
+            soup = BeautifulSoup(table_html, "html.parser")
+            tables = soup.find_all("table")
+            table_changes: list[dict] = []
+
+            for table in tables:
+                if not isinstance(table, Tag):
+                    continue
+                for row_idx, tr in enumerate(table.find_all("tr")):  # type: ignore[reportAttributeAccessIssue]
+                    cells_tag = tr.find_all(["td", "th"])  # type: ignore[reportAttributeAccessIssue]
+                    for col_idx, cell in enumerate(cells_tag):
+                        if not isinstance(cell, Tag):
+                            continue
+                        original_text = cell.get_text()
+                        normalized_text = normalize_financial_numbers(original_text)
+                        if original_text == normalized_text:
+                            continue
+                        change: dict[str, object] = {
+                            "row": row_idx,
+                            "col": col_idx,
+                            "old": original_text,
+                            "new": normalized_text,
+                        }
+                        bbox_attr = cell.get("data-bbox")
+                        if isinstance(bbox_attr, str):
+                            try:
+                                change["bbox"] = literal_eval(bbox_attr)
+                            except Exception:
+                                change["bbox"] = bbox_attr
+                        table_changes.append(change)
+                        cell.string = normalized_text
+
+            # 写回 HTML
+            item[html_key] = str(soup)
+            if table_changes:
+                item["number_normalization_changes"] = table_changes
+
+            # 若指定了 cells_key,同时标准化 cells 中每格的 text(及 matched_text)
+            # for key in ("text", "matched_text"):
+            table_cell_text_keys = ["text"]
+            if cells_key and cells_key in item and isinstance(item[cells_key], list):
+                for cell in item[cells_key]:
+                    if not isinstance(cell, dict):
+                        continue
+
+                    for key in table_cell_text_keys:
+                        if key not in cell or not isinstance(cell[key], str):
+                            continue
+                        orig = cell[key]
+                        norm = normalize_financial_numbers(orig)
+                        if norm != orig:
+                            cell[key] = norm
+
         return json.dumps(data, ensure_ascii=False, indent=2)
-        
+
     except json.JSONDecodeError as e:
         print(f"⚠️ JSON解析失败: {e}")
         return json_content
@@ -220,31 +324,48 @@ def normalize_json_table(json_content: str) -> str:
         print(f"⚠️ JSON表格标准化失败: {e}")
         return json_content
 
-def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
+def normalize_json_file(
+    file_path: str,
+    output_path: str | None = None,
+    *,
+    table_type_key: str = "category",
+    table_type_value: str = "Table",
+    html_key: str = "text",
+    cells_key: str | None = None,
+) -> str:
     """
-    标准化JSON文件中的表格数字
-    
+    标准化JSON文件中的表格数字。
+    提取表格时使用的 key 可通过参数指定,以兼容不同 OCR 工具。
+
     Args:
         file_path: 输入JSON文件路径
         output_path: 输出文件路径,如果为None则覆盖原文件
-    
+        table_type_key: 判断表格的字段名(见 normalize_json_table)
+        table_type_value: 判断表格的字段值
+        html_key: 表格 HTML 所在字段名
+        cells_key: 单元格列表所在字段名,None 表示不处理 cells
+
     Returns:
         标准化后的JSON内容
     """
     input_file = Path(file_path)
     output_file = Path(output_path) if output_path else input_file
-    
+
     if not input_file.exists():
         raise FileNotFoundError(f"找不到文件: {file_path}")
-    
-    # 读取原始JSON文件
-    with open(input_file, 'r', encoding='utf-8') as f:
+
+    with open(input_file, "r", encoding="utf-8") as f:
         original_content = f.read()
-    
+
     print(f"🔧 正在标准化JSON文件: {input_file.name}")
-    
-    # 标准化内容
-    normalized_content = normalize_json_table(original_content)
+
+    normalized_content = normalize_json_table(
+        original_content,
+        table_type_key=table_type_key,
+        table_type_value=table_type_value,
+        html_key=html_key,
+        cells_key=cells_key,
+    )
     
     # 保存标准化后的文件
     with open(output_file, 'w', encoding='utf-8') as f:
@@ -266,4 +387,51 @@ def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
     
     print(f"📄 标准化结果已保存到: {output_file}")
     return normalized_content
+    
+
+if __name__ == "__main__":
+    # Quick manual check: build JSON / Markdown samples whose thousands
+    # separators and decimal points are deliberately scrambled, then print
+    # each sample before and after normalization so the result can be
+    # inspected by eye. Nothing is asserted; output goes to stdout only.
+    import json
+
+    print("=== JSON 示例:金额格式纠错 + 变更记录 ===")
+    demo_json_data = [
+        {
+            "category": "Table",
+            "text": (
+                "<table><tbody>"
+                "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
+                "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
+                # Deliberately scrambled amounts; intended values: 12,123,456.00 and 1,234,567.89
+                "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
+                "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
+                "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
+                "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
+                "<tr><td data-bbox=\"[0,30,10,40]\">测试金额C</td>"
+                "<td data-bbox=\"[10,30,20,40]\">301,55</td></tr>"
+                "</tbody></table>"
+            ),
+        }
+    ]
+    demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
+    print("原始 JSON:")
+    print(demo_json_str)
+    normalized_json_str = normalize_json_table(demo_json_str)
+    print("\n标准化后 JSON:")
+    print(normalized_json_str)
 
+    print("\n=== Markdown 示例:金额格式纠错 + 注释说明 ===")
+    demo_md = """<table><tbody>
+<tr><td>项目</td><td>2023 年12 月31 日</td></tr>
+<tr><td>测试金额A</td><td>12.123,456,00</td></tr>
+<tr><td>测试金额B</td><td>1,234,567,89</td></tr>
+<tr><td>测试金额C</td><td>301,55</td></tr>
+</tbody></table>
+"""
+    print("原始 Markdown:")
+    print(demo_md)
+    normalized_md = normalize_markdown_table(demo_md)
+    print("\n标准化后 Markdown:")
+    print(normalized_md)

+ 1 - 21
ocr_utils/output_formatter_v2.py

@@ -30,10 +30,6 @@ from .markdown_generator import MarkdownGenerator
 from .html_generator import HTMLGenerator
 from .visualization_utils import VisualizationUtils
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
-
-
 class NumpyEncoder(json.JSONEncoder):
     """自定义JSON编码器,处理numpy类型"""
     def default(self, obj):
@@ -171,26 +167,10 @@ class OutputFormatterV2:
         # 2. 转换为 MinerU middle.json 格式
         middle_json = JSONFormatters.convert_to_middle_json(results)
         
-        # 3. 保存 middle.json
+        # 3. 保存 middle.json(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
         if output_config.get('save_json', True):
             json_path = doc_output_dir / f"{doc_name}_middle.json"
             json_content = json.dumps(middle_json, ensure_ascii=False, indent=2, cls=NumpyEncoder)
-            
-            # 金额数字标准化
-            normalize_numbers = output_config.get('normalize_numbers', True)
-            if normalize_numbers:
-                original_content = json_content
-                json_content = normalize_json_table(json_content)
-                
-                # 检查是否有变化
-                if json_content != original_content:
-                    # 保存原始文件
-                    original_path = doc_output_dir / f"{doc_name}_middle_original.json"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.info(f"📄 Original middle JSON saved: {original_path}")
-                    output_paths['middle_json_original'] = str(original_path)
-            
             with open(json_path, 'w', encoding='utf-8') as f:
                 f.write(json_content)
             output_paths['middle_json'] = str(json_path)