SHA1
--- a/ocr_tools/universal_doc_parser/core/element_processors.py
+++ b/ocr_tools/universal_doc_parser/core/element_processors.py
@@ -15,6 +15,7 @@ from loguru import logger
 
				 
			
 
				 from ocr_utils.coordinate_utils import CoordinateUtils
			
 
				 from ocr_utils import PDFUtils
			
 
				+from ocr_utils import normalize_financial_numbers
			
 
				 from .table_coordinate_utils import TableCoordinateUtils
			
 
				 
			
 
				 # 导入 SpanMatcher（用于 spans 合并）
			
@@ -358,24 +359,26 @@ class ElementProcessors:
 
				         layout_item: Dict[str, Any],
			
 
				         scale: float,
			
 
				         pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
			
 
				-        pdf_type: str = 'ocr', # 'ocr' 或 'txt'
			
 
				+        pdf_type: str = 'ocr',  # 'ocr' 或 'txt'
			
 
				         output_dir: Optional[str] = None,
			
 
				-        basename: Optional[str] = None
			
 
				+        basename: Optional[str] = None,
			
 
				+        normalize_numbers: bool = True,
			
 
				     ) -> Dict[str, Any]:
			
 
				         """
			
 
				         使用 UNet 有线表格识别处理表格元素
			
 
				-        
			
 
				+
			
 
				         流程：
			
 
				         1. OCR检测获取文本框坐标
			
 
				         2. UNet 有线表格识别
			
 
				         3. 坐标逆向转换回原图坐标
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             image: 页面图像
			
 
				             layout_item: 布局检测项
			
 
				             scale: 缩放比例
			
 
				             pre_matched_spans: 预匹配的 OCR spans（来自整页 OCR）
			
 
				-            
			
 
				+            normalize_numbers: 是否对表格内容做金额标准化
			
 
				+
			
 
				         Returns:
			
 
				             处理后的元素字典
			
 
				         """
			
@@ -446,7 +449,7 @@ class ElementProcessors:
 
				             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				-        return {
			
 
				+        result = {
			
 
				             'type': 'table',
			
 
				             'bbox': bbox,
			
 
				             'confidence': layout_item.get('confidence', 0.0),
			
@@ -461,29 +464,34 @@ class ElementProcessors:
 
				                 'recognition_method': 'wired_unet',
			
 
				             },
			
 
				         }
			
 
				-    
			
 
				+        if normalize_numbers:
			
 
				+            self._normalize_table_content(result['content'])
			
 
				+        return result
			
 
				+
			
 
				     def process_table_element_vlm(
			
 
				         self,
			
 
				         image: np.ndarray,
			
 
				         layout_item: Dict[str, Any],
			
 
				         scale: float,
			
 
				-        pre_matched_spans: Optional[List[Dict[str, Any]]] = None
			
 
				+        pre_matched_spans: Optional[List[Dict[str, Any]]] = None,
			
 
				+        normalize_numbers: bool = True,
			
 
				     ) -> Dict[str, Any]:
			
 
				         """
			
 
				         使用 VLM 无线表格识别处理表格元素
			
 
				-        
			
 
				+
			
 
				         流程：
			
 
				         1. OCR检测获取文本框坐标
			
 
				         2. VLM识别获取表格结构HTML
			
 
				         3. 匹配OCR坐标与VLM结构
			
 
				         4. 坐标逆向转换回原图坐标
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             image: 页面图像
			
 
				             layout_item: 布局检测项
			
 
				             scale: 缩放比例
			
 
				             pre_matched_spans: 预匹配的 OCR spans（来自整页 OCR）
			
 
				-            
			
 
				+            normalize_numbers: 是否对表格内容做金额标准化
			
 
				+
			
 
				         Returns:
			
 
				             处理后的元素字典
			
 
				         """
			
@@ -557,7 +565,7 @@ class ElementProcessors:
 
				             enhanced_html = TableCoordinateUtils.add_table_offset_to_html(enhanced_html, cropped_offset_bbox)
			
 
				             ocr_boxes = TableCoordinateUtils.add_table_offset_to_ocr_boxes(ocr_boxes, cropped_offset_bbox)
			
 
				         
			
 
				-        return {
			
 
				+        result = {
			
 
				             'type': 'table',
			
 
				             'bbox': bbox,
			
 
				             'confidence': layout_item.get('confidence', 0.0),
			
@@ -572,6 +580,64 @@ class ElementProcessors:
 
				                 'recognition_method': 'vlm',
			
 
				             },
			
 
				         }
			
 
				+        if normalize_numbers:
			
 
				+            self._normalize_table_content(result['content'])
			
 
				+        return result
			
 
				+
			
 
    def _normalize_table_content(self, content: Dict[str, Any]) -> None:
        """
        Normalize financial numbers in a table ``content`` dict, in place.

        Two parts of ``content`` are rewritten:

        * ``content['html']`` - every ``<td>``/``<th>`` whose text changes under
          ``normalize_financial_numbers`` is updated, and one change record
          (``row``, ``col``, ``old``, ``new`` and, when the cell carries a
          ``data-bbox`` attribute, ``bbox``) is collected per modified cell.
        * ``content['cells']`` - the ``text`` field of every dict entry is
          normalized.

        Collected change records are stored under
        ``content['number_normalization_changes']`` so the JSON/Markdown writers
        can report them. Nothing is stored when no cell changed.

        Args:
            content: Table content dict; returns immediately when it is empty
                or has no ``html`` entry.

        Returns:
            None. ``content`` is mutated in place.
        """
        if not content or not content.get('html'):
            return
        table_changes: List[Dict[str, Any]] = []
        try:
            from ast import literal_eval
            from bs4 import BeautifulSoup, NavigableString, Tag

            soup = BeautifulSoup(content['html'], 'html.parser')
            for table in soup.find_all('table'):
                if not isinstance(table, Tag):
                    continue
                for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
                    for col_idx, cell in enumerate(tr.find_all(['td', 'th'])):  # type: ignore[reportAttributeAccessIssue]
                        if not isinstance(cell, Tag):
                            continue
                        raw = cell.get_text()
                        norm = normalize_financial_numbers(raw)
                        if norm == raw:
                            continue

                        if any(isinstance(child, Tag) for child in cell.children):
                            # The cell contains nested markup (<br>, <b>, <span> ...).
                            # Assigning ``cell.string`` would delete those tags, so
                            # rewrite each text node individually instead.
                            for node in list(cell.find_all(string=True)):
                                # Exact type check on purpose: Comment/CData are
                                # NavigableString subclasses and must stay untouched.
                                if type(node) is not NavigableString:
                                    continue
                                node_text = str(node)
                                fixed = normalize_financial_numbers(node_text)
                                if fixed != node_text:
                                    node.replace_with(fixed)
                            norm = cell.get_text()
                            if norm == raw:
                                # Nothing could be fixed without breaking markup.
                                continue
                        else:
                            cell.string = norm

                        change: Dict[str, Any] = {
                            "row": row_idx,
                            "col": col_idx,
                            "old": raw,
                            "new": norm,
                        }
                        bbox_attr = cell.get("data-bbox")
                        if isinstance(bbox_attr, str):
                            try:
                                change["bbox"] = literal_eval(bbox_attr)
                            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                                # Not a Python literal: keep the raw attribute text.
                                change["bbox"] = bbox_attr
                        table_changes.append(change)
            content['html'] = str(soup)
        except Exception as e:
            logger.warning(f"表格 HTML 金额标准化失败: {e}")
            # content['html'] was not replaced, so the records gathered so far
            # describe edits that never reached the output; drop them.
            table_changes.clear()

        cells = content.get('cells')
        if isinstance(cells, list):
            try:
                for cell in cells:
                    if not isinstance(cell, dict):
                        continue
                    text = cell.get('text')
                    if isinstance(text, str):
                        cell['text'] = normalize_financial_numbers(text)
            except Exception as e:
                logger.warning(f"表格 cells 金额标准化失败: {e}")

        if table_changes:
            content['number_normalization_changes'] = table_changes
			
 
				     
			
 
				     def _create_empty_table_result(
			
 
				         self,
			
--- a/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
+++ b/ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py
@@ -546,6 +546,7 @@ class EnhancedDocPipeline:
 
				         classified_elements = self._classify_elements(layout_results, page_idx)
			
 
				         
			
 
				         # 6. 处理各类元素（传入匹配的 spans）
			
 
				+        normalize_numbers = self.config.get('output', {}).get('normalize_numbers', True)
			
 
				         processed_elements, discarded_elements = self._process_all_elements(
			
 
				             detection_image=detection_image,
			
 
				             classified_elements=classified_elements,
			
@@ -556,7 +557,8 @@ class EnhancedDocPipeline:
 
				             matched_spans=matched_spans,
			
 
				             layout_results=layout_results,
			
 
				             output_dir=output_dir,
			
 
				-            basename=page_name
			
 
				+            basename=page_name,
			
 
				+            normalize_numbers=normalize_numbers,
			
 
				         )
			
 
				         
			
 
				         # 7. 按阅读顺序排序
			
@@ -820,10 +822,11 @@ class EnhancedDocPipeline:
 
				         layout_results: Optional[List[Dict[str, Any]]] = None,
			
 
				         output_dir: Optional[str] = None,
			
 
				         basename: Optional[str] = None,
			
 
				+        normalize_numbers: bool = True,
			
 
				     ) -> tuple:
			
 
				         """
			
 
				         处理所有分类后的元素
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             detection_image: 检测用图像
			
 
				             classified_elements: 分类后的元素
			
@@ -833,7 +836,8 @@ class EnhancedDocPipeline:
 
				             scale: 缩放比例
			
 
				             matched_spans: 匹配的 OCR spans {block_idx: [spans]}
			
 
				             layout_results: 原始 layout 检测结果（用于索引匹配）
			
 
				-            
			
 
				+            normalize_numbers: 是否对表格内容做金额标准化
			
 
				+
			
 
				         Returns:
			
 
				             (processed_elements, discarded_elements)
			
 
				         """
			
@@ -939,20 +943,22 @@ class EnhancedDocPipeline:
 
				                     logger.info(f"🔷 Table {idx}: Using wired UNet recognition")
			
 
				                     element = self.element_processors.process_table_element_wired(
			
 
				                         detection_image, item, scale, pre_matched_spans=spans, pdf_type=pdf_type,
			
 
				-                        output_dir=output_dir, basename=f"{basename}_{idx}"
			
 
				+                        output_dir=output_dir, basename=f"{basename}_{idx}",
			
 
				+                        normalize_numbers=normalize_numbers,
			
 
				                     )
			
 
				-                    
			
 
				                     # 如果有线识别失败（返回空 HTML），fallback 到 VLM
			
 
				                     if not element['content'].get('html') and not element['content'].get('cells'):
			
 
				                         logger.warning(f"⚠️ Wired recognition failed for table {idx}, fallback to VLM")
			
 
				                         element = self.element_processors.process_table_element_vlm(
			
 
				-                            detection_image, item, scale, pre_matched_spans=spans
			
 
				+                            detection_image, item, scale, pre_matched_spans=spans,
			
 
				+                            normalize_numbers=normalize_numbers,
			
 
				                         )
			
 
				                 else:
			
 
				                     # VLM 无线表格路径（默认）
			
 
				                     logger.info(f"🔷 Table {idx}: Using VLM recognition")
			
 
				                     element = self.element_processors.process_table_element_vlm(
			
 
				-                        detection_image, item, scale, pre_matched_spans=spans
			
 
				+                        detection_image, item, scale, pre_matched_spans=spans,
			
 
				+                        normalize_numbers=normalize_numbers,
			
 
				                     )
			
 
				                 
			
 
				                 processed_elements.append(element)
			
--- a/ocr_utils/json_formatters.py
+++ b/ocr_utils/json_formatters.py
@@ -14,10 +14,6 @@ from pathlib import Path
 
				 from typing import Dict, Any, List, Optional
			
 
				 from loguru import logger
			
 
				 
			
 
				-# 导入数字标准化工具
			
 
				-from .normalize_financial_numbers import normalize_json_table
			
 
				-
			
 
				-
			
 
				 class NumpyEncoder(json.JSONEncoder):
			
 
				     """自定义JSON编码器，处理numpy类型"""
			
 
				     def default(self, obj):
			
@@ -263,20 +259,9 @@ class JSONFormatters:
 
				                 if converted:
			
 
				                     page_elements.append(converted)
			
 
				             
			
 
				-            # 转换为 JSON 字符串
			
 
				+            # 转换为 JSON 字符串（金额标准化已在 pipeline element_processors 中完成，此处不再重复）
			
 
				             json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
			
 
				-            
			
 
				-            # 金额数字标准化
			
 
				-            if normalize_numbers:
			
 
				-                original_content = json_content
			
 
				-                json_content = normalize_json_table(json_content)
			
 
				-                
			
 
				-                if json_content != original_content:
			
 
				-                    original_path = output_dir / f"{page_name}_original.json"
			
 
				-                    with open(original_path, 'w', encoding='utf-8') as f:
			
 
				-                        f.write(original_content)
			
 
				-                    logger.debug(f"📄 Original page JSON saved: {original_path}")
			
 
				-            
			
 
				+
			
 
				             # 保存 JSON
			
 
				             json_path = output_dir / f"{page_name}.json"
			
 
				             with open(json_path, 'w', encoding='utf-8') as f:
			
@@ -338,7 +323,11 @@ class JSONFormatters:
 
				             cells = content.get('cells', [])
			
 
				             if cells:
			
 
				                 result['table_cells'] = JSONFormatters.format_table_cells(cells)
			
 
				-            
			
 
				+            # 金额标准化变更记录（来自 element_processors._normalize_table_content）
			
 
				+            changes = content.get('number_normalization_changes', [])
			
 
				+            if changes:
			
 
				+                result['number_normalization_changes'] = changes
			
 
				+
			
 
				             # 旋转和倾斜信息
			
 
				             if 'table_angle' in content:
			
 
				                 result['image_rotation_angle'] = float(content['table_angle'])
			
--- a/ocr_utils/markdown_generator.py
+++ b/ocr_utils/markdown_generator.py
@@ -29,10 +29,6 @@ except ImportError:
 
				         MM_MD = 'mm_md'
			
 
				         NLP_MD = 'nlp_md'
			
 
				 
			
 
				-# 导入数字标准化工具
			
 
				-from .normalize_financial_numbers import normalize_markdown_table
			
 
				-
			
 
				-
			
 
				 class MarkdownGenerator:
			
 
				     """Markdown 生成器类"""
			
 
				     
			
@@ -80,18 +76,7 @@ class MarkdownGenerator:
 
				                     
			
 
				                     header = MarkdownGenerator._generate_header(results)
			
 
				                     markdown_content = header + str(markdown_content)
			
 
				-                    
			
 
				-                    # 金额数字标准化
			
 
				-                    if normalize_numbers:
			
 
				-                        original_content = markdown_content
			
 
				-                        markdown_content = normalize_markdown_table(markdown_content)
			
 
				-                        
			
 
				-                        if markdown_content != original_content:
			
 
				-                            original_path = output_dir / f"{doc_name}_original.md"
			
 
				-                            with open(original_path, 'w', encoding='utf-8') as f:
			
 
				-                                f.write(original_content)
			
 
				-                            logger.info(f"📝 Original Markdown saved: {original_path}")
			
 
				-                    
			
 
				+                    # 金额标准化已在 pipeline element_processors 中完成，此处不再重复
			
 
				                     with open(md_path, 'w', encoding='utf-8') as f:
			
 
				                         f.write(markdown_content)
			
 
				                     
			
@@ -103,18 +88,7 @@ class MarkdownGenerator:
 
				         
			
 
				         # 使用自定义实现，确保所有元素类型都被处理
			
 
				         markdown_content = MarkdownGenerator._generate_full_markdown(results)
			
 
				-        
			
 
				-        # 金额数字标准化
			
 
				-        if normalize_numbers:
			
 
				-            original_content = markdown_content
			
 
				-            markdown_content = normalize_markdown_table(markdown_content)
			
 
				-            
			
 
				-            if markdown_content != original_content:
			
 
				-                original_path = output_dir / f"{doc_name}_original.md"
			
 
				-                with open(original_path, 'w', encoding='utf-8') as f:
			
 
				-                    f.write(original_content)
			
 
				-                logger.info(f"📝 Original Markdown saved: {original_path}")
			
 
				-        
			
 
				+        # 金额标准化已在 pipeline element_processors 中完成，此处不再重复
			
 
				         with open(md_path, 'w', encoding='utf-8') as f:
			
 
				             f.write(markdown_content)
			
 
				         
			
@@ -163,20 +137,9 @@ class MarkdownGenerator:
 
				             else:
			
 
				                 page_name = doc_name
			
 
				             
			
 
				-            # 生成单页 Markdown
			
 
				+            # 生成单页 Markdown（金额标准化已在 pipeline element_processors 中完成，此处不再重复）
			
 
				             md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
			
 
				-            
			
 
				-            # 金额数字标准化
			
 
				-            if normalize_numbers:
			
 
				-                original_content = md_content
			
 
				-                md_content = normalize_markdown_table(md_content)
			
 
				-                
			
 
				-                if md_content != original_content:
			
 
				-                    original_path = output_dir / f"{page_name}_original.md"
			
 
				-                    with open(original_path, 'w', encoding='utf-8') as f:
			
 
				-                        f.write(original_content)
			
 
				-                    logger.debug(f"📝 Original page Markdown saved: {original_path}")
			
 
				-            
			
 
				+
			
 
				             # 保存
			
 
				             md_path = output_dir / f"{page_name}.md"
			
 
				             with open(md_path, 'w', encoding='utf-8') as f:
			
@@ -245,7 +208,14 @@ pages: {len(results.get('pages', []))}
 
				                     html = content.get('html', '')
			
 
				                     if html:
			
 
				                         md_lines.append(f"\n{html}\n")
			
 
				+                    changes = content.get('number_normalization_changes', [])
			
 
				+                    if changes:
			
 
				                         md_lines.append("")
			
 
				+                        md_lines.append("<!-- 数字标准化说明：")
			
 
				+                        for ch in changes:
			
 
				+                            md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
			
 
				+                        md_lines.append("-->")
			
 
				+                    md_lines.append("")
			
 
				                 
			
 
				                 elif elem_type in ['image', 'image_body', 'figure']:
			
 
				                     img_filename = content.get('image_path', '')
			
@@ -343,6 +313,14 @@ pages: {len(results.get('pages', []))}
 
				                 html = content.get('html', '')
			
 
				                 if html:
			
 
				                     md_lines.append(f"\n{html}\n")
			
 
				+                # 金额标准化说明（来自 element_processors._normalize_table_content）
			
 
				+                changes = content.get('number_normalization_changes', [])
			
 
				+                if changes:
			
 
				+                    md_lines.append("")
			
 
				+                    md_lines.append("<!-- 数字标准化说明：")
			
 
				+                    for ch in changes:
			
 
				+                        md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
			
 
				+                    md_lines.append("-->")
			
 
				                 md_lines.append("")
			
 
				             
			
 
				             elif elem_type in ['image', 'image_body', 'figure']:
			
--- a/ocr_utils/normalize_financial_numbers.py
+++ b/ocr_utils/normalize_financial_numbers.py
@@ -1,16 +1,90 @@
 
				 import re
			
 
				 import os
			
 
				 from pathlib import Path
			
 
				+from decimal import Decimal, InvalidOperation
			
 
				+
			
 
				+
			
 
				+def _normalize_amount_token(token: str) -> str:
			
 
				+    """
			
 
				+    规范单个金额 token 中逗号/小数点的用法。
			
 
				+    仅在形态明显为金额时进行纠错，其他情况原样返回。
			
 
				+    """
			
 
				+    if not token:
			
 
				+        return token
			
 
				+
			
 
				+    # 只处理包含数字的简单 token，避免带字母/其他符号的误改
			
 
				+    if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
			
 
				+        return token
			
 
				+
			
 
				+    sign = ""
			
 
				+    core = token
			
 
				+    if core[0] in "+-":
			
 
				+        sign, core = core[0], core[1:]
			
 
				+
			
 
				+    has_dot = "." in core
			
 
				+    has_comma = "," in core
			
 
				+
			
 
				+    # 辅助: 尝试解析为 Decimal；失败则认为不安全，回退原值
			
 
				+    def _safe_decimal(s: str) -> bool:
			
 
				+        try:
			
 
				+            Decimal(s.replace(",", ""))
			
 
				+            return True
			
 
				+        except (InvalidOperation, ValueError):
			
 
				+            return False
			
 
				+
			
 
				+    # 规则A：同时包含 . 和 ,，最后一个分隔符是逗号，且其后为 1-2 位数字
			
 
				+    if has_dot and has_comma:
			
 
				+        last_comma = core.rfind(",")
			
 
				+        last_dot = core.rfind(".")
			
 
				+        if last_comma > last_dot and last_comma != -1:
			
 
				+            frac = core[last_comma + 1 :]
			
 
				+            if 1 <= len(frac) <= 2 and frac.isdigit():
			
 
				+                # 先把所有点当作千分位逗号，再把最后一个逗号当作小数点
			
 
				+                temp = core.replace(".", ",")
			
 
				+                idx = temp.rfind(",")
			
 
				+                if idx != -1:
			
 
				+                    candidate = temp[:idx] + "." + temp[idx + 1 :]
			
 
				+                    if _safe_decimal(candidate):
			
 
				+                        return sign + candidate
			
 
				+
			
 
				+    # 规则B：只有 .，多个点，最后一段视为小数，其余为千分位
			
 
				+    if has_dot and not has_comma:
			
 
				+        parts = core.split(".")
			
 
				+        if len(parts) >= 3:
			
 
				+            last = parts[-1]
			
 
				+            ints = parts[:-1]
			
 
				+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
			
 
				+                candidate = ",".join(ints) + "." + last
			
 
				+                if _safe_decimal(candidate):
			
 
				+                    return sign + candidate
			
 
				+
			
 
				+    # 规则C：只有 ,，多个逗号，最后一段长度为 1-2 且前面为 3 位分组
			
 
				+    if has_comma and not has_dot:
			
 
				+        parts = core.split(",")
			
 
				+        if len(parts) >= 3:
			
 
				+            last = parts[-1]
			
 
				+            ints = parts[:-1]
			
 
				+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
			
 
				+                # 将最后一个逗号视为小数点
			
 
				+                idx = core.rfind(",")
			
 
				+                candidate = core[:idx] + "." + core[idx + 1 :]
			
 
				+                if _safe_decimal(candidate):
			
 
				+                    return sign + candidate
			
 
				+        # 规则D：只有 ,，且仅有一个逗号、逗号后 1-2 位数字 → 欧洲格式小数，如 301,55 → 301.55
			
 
				+        elif len(parts) == 2:
			
 
				+            left, right = parts[0], parts[1]
			
 
				+            if 1 <= len(right) <= 2 and right.isdigit() and left.isdigit():
			
 
				+                candidate = left + "." + right
			
 
				+                if _safe_decimal(candidate):
			
 
				+                    return sign + candidate
			
 
				+
			
 
				+    # 没有需要纠错的典型形态，直接返回原 token
			
 
				+    return token
			
 
				+
			
 
				 
			
 
				 def normalize_financial_numbers(text: str) -> str:
			
 
				     """
			
 
				-    标准化财务数字：将全角字符转换为半角字符
			
 
				-    
			
 
				-    Args:
			
 
				-        text: 原始文本
			
 
				-    
			
 
				-    Returns:
			
 
				-        标准化后的文本
			
 
				+    标准化财务数字：将全角字符转换为半角字符，并纠正常见的逗号/小数点错用。
			
 
				     """
			
 
				     if not text:
			
 
				         return text
			
@@ -31,30 +105,30 @@ def normalize_financial_numbers(text: str) -> str:
 
				         '％': '%',  # 全角百分号转半角百分号
			
 
				     }
			
 
				     
			
 
				-    # 第一步：执行基础字符替换
			
 
				+    # 第一步：执行基础字符替换（全角 -> 半角）
			
 
				     normalized_text = text
			
 
				     for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
			
 
				         normalized_text = normalized_text.replace(fullwidth, halfwidth)
			
 
				     
			
 
				-    # 第二步：处理数字序列中的空格和分隔符
			
 
				-    # 修改正则表达式以匹配完整的数字序列，包括空格
			
 
				-    # 匹配模式：数字 + (空格? + 逗号 + 空格? + 数字)* + (空格? + 小数点 + 数字+)?
			
 
				+    # 第二步：处理数字序列中的空格和分隔符（保留原有逻辑）
			
 
				     number_sequence_pattern = r'(\d+(?:\s*[，,]\s*\d+)*(?:\s*[。．.]\s*\d+)?)'
			
 
				     
			
 
				     def normalize_number_sequence(match):
			
 
				         sequence = match.group(1)
			
 
				-        
			
 
				-        # 处理千分位分隔符周围的空格
			
 
				-        # 将 "数字 + 空格 + 逗号 + 空格 + 数字" 标准化为 "数字,数字"
			
 
				         sequence = re.sub(r'(\d)\s*[，,]\s*(\d)', r'\1,\2', sequence)
			
 
				-        
			
 
				-        # 处理小数点周围的空格
			
 
				-        # 将 "数字 + 空格 + 小数点 + 空格 + 数字" 标准化为 "数字.数字"
			
 
				         sequence = re.sub(r'(\d)\s*[。．.]\s*(\d)', r'\1.\2', sequence)
			
 
				-        
			
 
				         return sequence
			
 
				     
			
 
				     normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
			
 
				+
			
 
				+    # 第三步：对疑似金额 token 做逗号/小数点纠错
			
 
				+    amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
			
 
				+
			
 
				+    def _amount_sub(m: re.Match) -> str:
			
 
				+        tok = m.group('tok')
			
 
				+        return _normalize_amount_token(tok)
			
 
				+
			
 
				+    normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
			
 
				     return normalized_text
			
 
				     
			
 
				 def normalize_markdown_table(markdown_content: str) -> str:
			
@@ -78,7 +152,7 @@ def normalize_markdown_table(markdown_content: str) -> str:
 
				     table_pattern = r'(<table[^>]*>.*?</table>)'
			
 
				     
			
 
				     def normalize_table_match(match):
			
 
				-        """处理单个表格匹配，保留原始格式"""
			
 
				+        """处理单个表格匹配，保留原始格式，并追加数字标准化说明注释。"""
			
 
				         table_html = match.group(1)
			
 
				         original_table_html = table_html  # 保存原始HTML用于比较
			
 
				         
			
@@ -86,133 +160,163 @@ def normalize_markdown_table(markdown_content: str) -> str:
 
				         soup = BeautifulSoup(table_html, 'html.parser')
			
 
				         tables = soup.find_all('table')
			
 
				         
			
 
				-        # 记录所有需要替换的文本（原始文本 -> 标准化文本）
			
 
				-        replacements = []
			
 
				+        # 记录本表格中所有数值修改
			
 
				+        changes: list[dict] = []
			
 
				         
			
 
				         for table in tables:
			
 
				-            if isinstance(table, Tag):
			
 
				-                cells = table.find_all(['td', 'th'])
			
 
				-                for cell in cells:
			
 
				-                    if isinstance(cell, Tag):
			
 
				-                        # 获取单元格的纯文本内容
			
 
				-                        original_text = cell.get_text()
			
 
				-                        normalized_text = normalize_financial_numbers(original_text)
			
 
				-                        
			
 
				-                        # 如果内容发生了变化，记录替换
			
 
				-                        if original_text != normalized_text:
			
 
				-                            # 找到单元格中所有文本节点并替换
			
 
				-                            from bs4.element import NavigableString
			
 
				-                            for text_node in cell.find_all(string=True, recursive=True):
			
 
				-                                if isinstance(text_node, NavigableString):
			
 
				-                                    text_str = str(text_node)
			
 
				-                                    if text_str.strip():
			
 
				-                                        normalized = normalize_financial_numbers(text_str.strip())
			
 
				-                                        if normalized != text_str.strip():
			
 
				-                                            # 保留原始文本节点的前后空白
			
 
				-                                            if text_str.strip() == text_str:
			
 
				-                                                # 纯文本节点，直接替换
			
 
				-                                                text_node.replace_with(normalized)
			
 
				-                                            else:
			
 
				-                                                # 有前后空白，需要保留
			
 
				-                                                leading_ws = text_str[:len(text_str) - len(text_str.lstrip())]
			
 
				-                                                trailing_ws = text_str[len(text_str.rstrip()):]
			
 
				-                                                text_node.replace_with(leading_ws + normalized + trailing_ws)
			
 
				+            if not isinstance(table, Tag):
			
 
				+                continue
			
 
				+            # 通过 tr / td(th) 计算行列位置
			
 
				+            for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
			
 
				+                cells = tr.find_all(['td', 'th'])  # type: ignore[reportAttributeAccessIssue]
			
 
				+                for col_idx, cell in enumerate(cells):
			
 
				+                    if not isinstance(cell, Tag):
			
 
				+                        continue
			
 
				+                    # 与 normalize_json_table 一致：整格取文本、只标准化一次、再写回
			
 
				+                    original_text = cell.get_text()
			
 
				+                    normalized_text = normalize_financial_numbers(original_text)
			
 
				+                    if original_text == normalized_text:
			
 
				+                        continue
			
 
				+                    # 记录一条修改
			
 
				+                    changes.append(
			
 
				+                        {
			
 
				+                            "row": row_idx,
			
 
				+                            "col": col_idx,
			
 
				+                            "old": original_text,
			
 
				+                            "new": normalized_text,
			
 
				+                        }
			
 
				+                    )
			
 
				+                    # 整格替换为标准化后的文本（与 normalize_json_table 的 cell.string = normalized_text 一致）
			
 
				+                    cell.string = normalized_text
			
 
				+        
			
 
				+        # 如果没有任何数值修改，直接返回原始 HTML
			
 
				+        if not changes:
			
 
				+            return original_table_html
			
 
				         
			
 
				         # 获取修改后的HTML
			
 
				         modified_html = str(soup)
			
 
				         
			
 
				-        # 如果内容没有变化，返回原始HTML（保持原始格式）
			
 
				-        # 检查是否只是格式变化（换行、空格等）
			
 
				-        original_text_only = re.sub(r'\s+', '', original_table_html)
			
 
				-        modified_text_only = re.sub(r'\s+', '', modified_html)
			
 
				+        # 在表格后追加注释，说明哪些单元格被修改
			
 
				+        lines = ["<!-- 数字标准化说明："]
			
 
				+        for ch in changes:
			
 
				+            lines.append(
			
 
				+                f"  - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
			
 
				+            )
			
 
				+        lines.append("-->")
			
 
				+        comment = "\n".join(lines)
			
 
				         
			
 
				-        if original_text_only == modified_text_only:
			
 
				-            # 只有格式变化，返回原始HTML以保留换行符
			
 
				-            return original_table_html
			
 
				-        
			
 
				-        # 有实际内容变化，返回修改后的HTML
			
 
				-        return modified_html
			
 
				+        return modified_html + "\n\n" + comment
			
 
				     
			
 
				     # 使用正则替换，只替换表格内容，保留其他部分（包括换行符）不变
			
 
				     normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
			
 
				     
			
 
				     return normalized_content
			
 
				 
			
 
				-def normalize_json_table(json_content: str) -> str:
			
 
				+def normalize_json_table(
			
 
				+    json_content: str,
			
 
				+    *,
			
 
				+    table_type_key: str = "category",
			
 
				+    table_type_value: str = "Table",
			
 
				+    html_key: str = "text",
			
 
				+    cells_key: str | None = None,
			
 
				+) -> str:
			
 
				     """
			
 
				-    专门处理JSON格式OCR结果中表格的数字标准化
			
 
				-    
			
 
				+    专门处理JSON格式OCR结果中表格的数字标准化。
			
 
				+    通过参数指定提取用的 key，以兼容不同 OCR 工具的 JSON 结构。
			
 
				+
			
 
				     Args:
			
 
				-        json_content: JSON格式的OCR结果内容
			
 
				-    
			
 
				+        json_content: JSON格式的OCR结果内容（字符串或已解析的 list）
			
 
				+        table_type_key: 用于判断“是否为表格”的字段名，如 "type" 或 "category"
			
 
				+        table_type_value: 上述字段等于该值时视为表格，如 "table" 或 "Table"
			
 
				+        html_key: 存放表格 HTML 的字段名，如 "table_body" 或 "text"
			
 
				+        cells_key: 存放单元格列表的字段名，如 "table_cells"；为 None 则不处理 cells，
			
 
				+                   仅标准化 html_key 中的表格
			
 
				+
			
 
				     Returns:
			
 
				-        标准化后的JSON内容
			
 
				-    """
			
 
				-    """
			
 
				-    json_content 示例:
			
 
				-    [
			
 
				-        {
			
 
				-            "category": "Table",
			
 
				-            "text": "<table>...</table>"
			
 
				-        },
			
 
				-        {
			
 
				-            "category": "Text",
			
 
				-            "text": "Some other text"
			
 
				-        }
			
 
				-    ]
			
 
				+        标准化后的JSON内容（字符串）
			
 
				+
			
 
				+    常见格式示例:
			
 
				+        - 旧格式: category="Table", html 在 "text"
			
 
				+          normalize_json_table(s)  # 默认即此
			
 
				+        - mineru_vllm_results_cell_bbox: type="table", html 在 "table_body", cells 在 "table_cells"
			
 
				+          normalize_json_table(s, table_type_key="type", table_type_value="table",
			
 
				+                               html_key="table_body", cells_key="table_cells")
			
 
				     """
			
 
				     import json
			
 
				-    
			
 
				+    from ast import literal_eval
			
 
				+
			
 
				     try:
			
 
				-        # 解析JSON内容
			
 
				         data = json.loads(json_content) if isinstance(json_content, str) else json_content
			
 
				-        
			
 
				-        # 确保data是列表格式
			
 
				         if not isinstance(data, list):
			
 
				             return json_content
			
 
				-        
			
 
				-        # 遍历所有OCR结果项
			
 
				+
			
 
				         for item in data:
			
 
				             if not isinstance(item, dict):
			
 
				                 continue
			
 
				-                
			
 
				-            # 检查是否是表格类型
			
 
				-            if item.get('category') == 'Table' and 'text' in item:
			
 
				-                table_html = item['text']
			
 
				-                
			
 
				-                # 使用BeautifulSoup处理HTML表格
			
 
				-                from bs4 import BeautifulSoup, Tag
			
 
				-                
			
 
				-                soup = BeautifulSoup(table_html, 'html.parser')
			
 
				-                tables = soup.find_all('table')
			
 
				-                
			
 
				-                for table in tables:
			
 
				-                    if isinstance(table, Tag):
			
 
				-                        cells = table.find_all(['td', 'th'])
			
 
				-                        for cell in cells:
			
 
				-                            if isinstance(cell, Tag):
			
 
				-                                original_text = cell.get_text()
			
 
				-                                
			
 
				-                                # 应用数字标准化
			
 
				-                                normalized_text = normalize_financial_numbers(original_text)
			
 
				-                                
			
 
				-                                # 如果内容发生了变化，更新单元格内容
			
 
				-                                if original_text != normalized_text:
			
 
				-                                    cell.string = normalized_text
			
 
				-                
			
 
				-                # 更新item中的表格内容
			
 
				-                item['text'] = str(soup)
			
 
				-            
			
 
				-            # 同时标准化普通文本中的数字（如果需要）
			
 
				-            # elif 'text' in item:
			
 
				-            #     original_text = item['text']
			
 
				-            #     normalized_text = normalize_financial_numbers(original_text)
			
 
				-            #     if original_text != normalized_text:
			
 
				-            #         item['text'] = normalized_text
			
 
				-        
			
 
				-        # 返回标准化后的JSON字符串
			
 
				+            # 按参数判断是否为表格项，且包含 HTML
			
 
				+            if item.get(table_type_key) != table_type_value or html_key not in item:
			
 
				+                continue
			
 
				+
			
 
				+            table_html = item[html_key]
			
 
				+            if not table_html or not isinstance(table_html, str):
			
 
				+                continue
			
 
				+
			
 
				+            from bs4 import BeautifulSoup, Tag
			
 
				+
			
 
				+            soup = BeautifulSoup(table_html, "html.parser")
			
 
				+            tables = soup.find_all("table")
			
 
				+            table_changes: list[dict] = []
			
 
				+
			
 
				+            for table in tables:
			
 
				+                if not isinstance(table, Tag):
			
 
				+                    continue
			
 
				+                for row_idx, tr in enumerate(table.find_all("tr")):  # type: ignore[reportAttributeAccessIssue]
			
 
				+                    cells_tag = tr.find_all(["td", "th"])  # type: ignore[reportAttributeAccessIssue]
			
 
				+                    for col_idx, cell in enumerate(cells_tag):
			
 
				+                        if not isinstance(cell, Tag):
			
 
				+                            continue
			
 
				+                        original_text = cell.get_text()
			
 
				+                        normalized_text = normalize_financial_numbers(original_text)
			
 
				+                        if original_text == normalized_text:
			
 
				+                            continue
			
 
				+                        change: dict[str, object] = {
			
 
				+                            "row": row_idx,
			
 
				+                            "col": col_idx,
			
 
				+                            "old": original_text,
			
 
				+                            "new": normalized_text,
			
 
				+                        }
			
 
				+                        bbox_attr = cell.get("data-bbox")
			
 
				+                        if isinstance(bbox_attr, str):
			
 
				+                            try:
			
 
				+                                change["bbox"] = literal_eval(bbox_attr)
			
 
				+                            except Exception:
			
 
				+                                change["bbox"] = bbox_attr
			
 
				+                        table_changes.append(change)
			
 
				+                        cell.string = normalized_text
			
 
				+
			
 
				+            # 写回 HTML
			
 
				+            item[html_key] = str(soup)
			
 
				+            if table_changes:
			
 
				+                item["number_normalization_changes"] = table_changes
			
 
				+
			
 
				+            # 若指定了 cells_key，同时标准化 cells 中每格的 text（及 matched_text）
			
 
				+            # for key in ("text", "matched_text"):
			
 
				+            table_cell_text_keys = ["text"]
			
 
				+            if cells_key and cells_key in item and isinstance(item[cells_key], list):
			
 
				+                for cell in item[cells_key]:
			
 
				+                    if not isinstance(cell, dict):
			
 
				+                        continue
			
 
				+
			
 
				+                    for key in table_cell_text_keys:
			
 
				+                        if key not in cell or not isinstance(cell[key], str):
			
 
				+                            continue
			
 
				+                        orig = cell[key]
			
 
				+                        norm = normalize_financial_numbers(orig)
			
 
				+                        if norm != orig:
			
 
				+                            cell[key] = norm
			
 
				+
			
 
				         return json.dumps(data, ensure_ascii=False, indent=2)
			
 
				-        
			
 
				+
			
 
				     except json.JSONDecodeError as e:
			
 
				         print(f"⚠️ JSON解析失败: {e}")
			
 
				         return json_content
			
@@ -220,31 +324,48 @@ def normalize_json_table(json_content: str) -> str:
 
				         print(f"⚠️ JSON表格标准化失败: {e}")
			
 
				         return json_content
			
 
				 
			
 
				-def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
			
 
				+def normalize_json_file(
			
 
				+    file_path: str,
			
 
				+    output_path: str | None = None,
			
 
				+    *,
			
 
				+    table_type_key: str = "category",
			
 
				+    table_type_value: str = "Table",
			
 
				+    html_key: str = "text",
			
 
				+    cells_key: str | None = None,
			
 
				+) -> str:
			
 
				     """
			
 
				-    标准化JSON文件中的表格数字
			
 
				-    
			
 
				+    标准化JSON文件中的表格数字。
			
 
				+    提取表格时使用的 key 可通过参数指定，以兼容不同 OCR 工具。
			
 
				+
			
 
				     Args:
			
 
				         file_path: 输入JSON文件路径
			
 
				         output_path: 输出文件路径，如果为None则覆盖原文件
			
 
				-    
			
 
				+        table_type_key: 判断表格的字段名（见 normalize_json_table）
			
 
				+        table_type_value: 判断表格的字段值
			
 
				+        html_key: 表格 HTML 所在字段名
			
 
				+        cells_key: 单元格列表所在字段名，None 表示不处理 cells
			
 
				+
			
 
				     Returns:
			
 
				         标准化后的JSON内容
			
 
				     """
			
 
				     input_file = Path(file_path)
			
 
				     output_file = Path(output_path) if output_path else input_file
			
 
				-    
			
 
				+
			
 
				     if not input_file.exists():
			
 
				         raise FileNotFoundError(f"找不到文件: {file_path}")
			
 
				-    
			
 
				-    # 读取原始JSON文件
			
 
				-    with open(input_file, 'r', encoding='utf-8') as f:
			
 
				+
			
 
				+    with open(input_file, "r", encoding="utf-8") as f:
			
 
				         original_content = f.read()
			
 
				-    
			
 
				+
			
 
				     print(f"🔧 正在标准化JSON文件: {input_file.name}")
			
 
				-    
			
 
				-    # 标准化内容
			
 
				-    normalized_content = normalize_json_table(original_content)
			
 
				+
			
 
				+    normalized_content = normalize_json_table(
			
 
				+        original_content,
			
 
				+        table_type_key=table_type_key,
			
 
				+        table_type_value=table_type_value,
			
 
				+        html_key=html_key,
			
 
				+        cells_key=cells_key,
			
 
				+    )
			
 
				     
			
 
				     # 保存标准化后的文件
			
 
				     with open(output_file, 'w', encoding='utf-8') as f:
			
@@ -266,4 +387,51 @@ def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
 
				     
			
 
				     print(f"📄 标准化结果已保存到: {output_file}")
			
 
				     return normalized_content
			
 
				+    
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    """
			
 
				+    简单验证：构造一份“故意打乱逗号/小数点”的 JSON / Markdown 示例，
			
 
				+    并打印标准化前后的差异。
			
 
				+    """
			
 
				+    import json
			
 
				+
			
 
				+    print("=== JSON 示例：金额格式纠错 + 变更记录 ===")
			
 
				+    demo_json_data = [
			
 
				+        {
			
 
				+            "category": "Table",
			
 
				+            "text": (
			
 
				+                "<table><tbody>"
			
 
				+                "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
			
 
				+                "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
			
 
				+                # 故意打乱的数字：应为 12,123,456.00 和 1,234,567.89
			
 
				+                "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
			
 
				+                "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
			
 
				+                "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
			
 
				+                "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
			
 
				+                "<tr><td data-bbox=\"[0,20,10,40]\">测试金额C</td>"
			
 
				+                "<td data-bbox=\"[10,20,20,40]\">301,55</td></tr>"
			
 
				+                "</tbody></table>"
			
 
				+            ),
			
 
				+        }
			
 
				+    ]
			
 
				+    demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
			
 
				+    print("原始 JSON：")
			
 
				+    print(demo_json_str)
			
 
				+    normalized_json_str = normalize_json_table(demo_json_str)
			
 
				+    print("\n标准化后 JSON：")
			
 
				+    print(normalized_json_str)
			
 
				 
			
 
				+    print("\n=== Markdown 示例：金额格式纠错 + 注释说明 ===")
			
 
				+    demo_md = """<table><tbody>
			
 
				+<tr><td>项目</td><td>2023 年12 月31 日</td></tr>
			
 
				+<tr><td>测试金额A</td><td>12.123,456,00</td></tr>
			
 
				+<tr><td>测试金额B</td><td>1,234,567,89</td></tr>
			
 
				+<tr><td>测试金额C</td><td>301,55</td></tr>
			
 
				+</tbody></table>
			
 
				+"""
			
 
				+    print("原始 Markdown：")
			
 
				+    print(demo_md)
			
 
				+    normalized_md = normalize_markdown_table(demo_md)
			
 
				+    print("\n标准化后 Markdown：")
			
 
				+    print(normalized_md)
			
--- a/ocr_utils/output_formatter_v2.py
+++ b/ocr_utils/output_formatter_v2.py
@@ -30,10 +30,6 @@ from .markdown_generator import MarkdownGenerator
 
				 from .html_generator import HTMLGenerator
			
 
				 from .visualization_utils import VisualizationUtils
			
 
				 
			
 
				-# 导入数字标准化工具
			
 
				-from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
			
 
				-
			
 
				-
			
 
				 class NumpyEncoder(json.JSONEncoder):
			
 
				     """自定义JSON编码器，处理numpy类型"""
			
 
				     def default(self, obj):
			
@@ -171,26 +167,10 @@ class OutputFormatterV2:
 
				         # 2. 转换为 MinerU middle.json 格式
			
 
				         middle_json = JSONFormatters.convert_to_middle_json(results)
			
 
				         
			
 
				-        # 3. 保存 middle.json
			
 
				+        # 3. 保存 middle.json（金额标准化已在 pipeline element_processors 中完成，此处不再重复）
			
 
				         if output_config.get('save_json', True):
			
 
				             json_path = doc_output_dir / f"{doc_name}_middle.json"
			
 
				             json_content = json.dumps(middle_json, ensure_ascii=False, indent=2, cls=NumpyEncoder)
			
 
				-            
			
 
				-            # 金额数字标准化
			
 
				-            normalize_numbers = output_config.get('normalize_numbers', True)
			
 
				-            if normalize_numbers:
			
 
				-                original_content = json_content
			
 
				-                json_content = normalize_json_table(json_content)
			
 
				-                
			
 
				-                # 检查是否有变化
			
 
				-                if json_content != original_content:
			
 
				-                    # 保存原始文件
			
 
				-                    original_path = doc_output_dir / f"{doc_name}_middle_original.json"
			
 
				-                    with open(original_path, 'w', encoding='utf-8') as f:
			
 
				-                        f.write(original_content)
			
 
				-                    logger.info(f"📄 Original middle JSON saved: {original_path}")
			
 
				-                    output_paths['middle_json_original'] = str(original_path)
			
 
				-            
			
 
				             with open(json_path, 'w', encoding='utf-8') as f:
			
 
				                 f.write(json_content)
			
 
				             output_paths['middle_json'] = str(json_path)
Autors	SHA1 Ziņojums	Datums
zhch158_admin	11668f177e refactor(output_formatter_v2): 移除冗余的金额标准化逻辑，更新保存中间JSON的注释以反映标准化已在处理管道中完成	2 nedēļas atpakaļ
zhch158_admin	6e96478c23 feat(ocr_utils): 增强金额标准化功能，支持欧洲格式小数和JSON表格内容的标准化处理	2 nedēļas atpakaļ
zhch158_admin	40bad356ee refactor(markdown_generator): 移除冗余的金额标准化逻辑，更新Markdown生成以包含标准化变更说明	2 nedēļas atpakaļ
zhch158_admin	1ce742a4ef refactor(json_formatters): 移除冗余的金额标准化逻辑，更新JSON输出以包含标准化变更记录	2 nedēļas atpakaļ
zhch158_admin	38f373384f feat(pipeline_manager): 添加 normalize_numbers 参数以支持表格内容的金额标准化处理	2 nedēļas atpakaļ
zhch158_admin	2b8ed01af2 feat(element_processors): 添加金额标准化功能，支持表格内容的财务数字规范化处理	2 nedēļas atpakaļ
zhch158_admin	35c6e6cf36 feat(ocr_utils): 增强财务数字标准化功能，添加金额 token 纠错逻辑，支持逗号和小数点的正确用法	2 nedēļas atpakaļ