vor 3 Monaten · 35c6e6cf36
--- a/ocr_utils/normalize_financial_numbers.py
+++ b/ocr_utils/normalize_financial_numbers.py
@@ -1,16 +1,83 @@
 
				 import re
			
 
				 import os
			
 
				 from pathlib import Path
			
 
				+from decimal import Decimal, InvalidOperation
			
 
				+
			
 
				+
			
 
				+def _normalize_amount_token(token: str) -> str:
			
 
				+    """
			
 
				+    规范单个金额 token 中逗号/小数点的用法。
			
 
				+    仅在形态明显为金额时进行纠错，其他情况原样返回。
			
 
				+    """
			
 
				+    if not token:
			
 
				+        return token
			
 
				+
			
 
				+    # 只处理包含数字的简单 token，避免带字母/其他符号的误改
			
 
				+    if not re.fullmatch(r"[+-]?\d[\d,\.]*\d", token):
			
 
				+        return token
			
 
				+
			
 
				+    sign = ""
			
 
				+    core = token
			
 
				+    if core[0] in "+-":
			
 
				+        sign, core = core[0], core[1:]
			
 
				+
			
 
				+    has_dot = "." in core
			
 
				+    has_comma = "," in core
			
 
				+
			
 
				+    # 辅助: 尝试解析为 Decimal；失败则认为不安全，回退原值
			
 
				+    def _safe_decimal(s: str) -> bool:
			
 
				+        try:
			
 
				+            Decimal(s.replace(",", ""))
			
 
				+            return True
			
 
				+        except (InvalidOperation, ValueError):
			
 
				+            return False
			
 
				+
			
 
				+    # 规则A：同时包含 . 和 ,，最后一个分隔符是逗号，且其后为 1-2 位数字
			
 
				+    if has_dot and has_comma:
			
 
				+        last_comma = core.rfind(",")
			
 
				+        last_dot = core.rfind(".")
			
 
				+        if last_comma > last_dot and last_comma != -1:
			
 
				+            frac = core[last_comma + 1 :]
			
 
				+            if 1 <= len(frac) <= 2 and frac.isdigit():
			
 
				+                # 先把所有点当作千分位逗号，再把最后一个逗号当作小数点
			
 
				+                temp = core.replace(".", ",")
			
 
				+                idx = temp.rfind(",")
			
 
				+                if idx != -1:
			
 
				+                    candidate = temp[:idx] + "." + temp[idx + 1 :]
			
 
				+                    if _safe_decimal(candidate):
			
 
				+                        return sign + candidate
			
 
				+
			
 
				+    # 规则B：只有 .，多个点，最后一段视为小数，其余为千分位
			
 
				+    if has_dot and not has_comma:
			
 
				+        parts = core.split(".")
			
 
				+        if len(parts) >= 3:
			
 
				+            last = parts[-1]
			
 
				+            ints = parts[:-1]
			
 
				+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
			
 
				+                candidate = ",".join(ints) + "." + last
			
 
				+                if _safe_decimal(candidate):
			
 
				+                    return sign + candidate
			
 
				+
			
 
				+    # 规则C：只有 ,，多个逗号，最后一段长度为 1-2 且前面为 3 位分组
			
 
				+    if has_comma and not has_dot:
			
 
				+        parts = core.split(",")
			
 
				+        if len(parts) >= 3:
			
 
				+            last = parts[-1]
			
 
				+            ints = parts[:-1]
			
 
				+            if 1 <= len(last) <= 2 and all(len(p) == 3 for p in ints[1:]):
			
 
				+                # 将最后一个逗号视为小数点
			
 
				+                idx = core.rfind(",")
			
 
				+                candidate = core[:idx] + "." + core[idx + 1 :]
			
 
				+                if _safe_decimal(candidate):
			
 
				+                    return sign + candidate
			
 
				+
			
 
				+    # 没有需要纠错的典型形态，直接返回原 token
			
 
				+    return token
			
 
				+
			
 
				 
			
 
				 def normalize_financial_numbers(text: str) -> str:
			
 
				     """
			
 
				-    标准化财务数字：将全角字符转换为半角字符
			
 
				-    
			
 
				-    Args:
			
 
				-        text: 原始文本
			
 
				-    
			
 
				-    Returns:
			
 
				-        标准化后的文本
			
 
				+    标准化财务数字：将全角字符转换为半角字符，并纠正常见的逗号/小数点错用。
			
 
				     """
			
 
				     if not text:
			
 
				         return text
			
@@ -31,30 +98,30 @@ def normalize_financial_numbers(text: str) -> str:
 
				         '％': '%',  # 全角百分号转半角百分号
			
 
				     }
			
 
				     
			
 
				-    # 第一步：执行基础字符替换
			
 
				+    # 第一步：执行基础字符替换（全角 -> 半角）
			
 
				     normalized_text = text
			
 
				     for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
			
 
				         normalized_text = normalized_text.replace(fullwidth, halfwidth)
			
 
				     
			
 
				-    # 第二步：处理数字序列中的空格和分隔符
			
 
				-    # 修改正则表达式以匹配完整的数字序列，包括空格
			
 
				-    # 匹配模式：数字 + (空格? + 逗号 + 空格? + 数字)* + (空格? + 小数点 + 数字+)?
			
 
				+    # 第二步：处理数字序列中的空格和分隔符（保留原有逻辑）
			
 
				     number_sequence_pattern = r'(\d+(?:\s*[，,]\s*\d+)*(?:\s*[。．.]\s*\d+)?)'
			
 
				     
			
 
				     def normalize_number_sequence(match):
			
 
				         sequence = match.group(1)
			
 
				-        
			
 
				-        # 处理千分位分隔符周围的空格
			
 
				-        # 将 "数字 + 空格 + 逗号 + 空格 + 数字" 标准化为 "数字,数字"
			
 
				         sequence = re.sub(r'(\d)\s*[，,]\s*(\d)', r'\1,\2', sequence)
			
 
				-        
			
 
				-        # 处理小数点周围的空格
			
 
				-        # 将 "数字 + 空格 + 小数点 + 空格 + 数字" 标准化为 "数字.数字"
			
 
				         sequence = re.sub(r'(\d)\s*[。．.]\s*(\d)', r'\1.\2', sequence)
			
 
				-        
			
 
				         return sequence
			
 
				     
			
 
				     normalized_text = re.sub(number_sequence_pattern, normalize_number_sequence, normalized_text)
			
 
				+
			
 
				+    # 第三步：对疑似金额 token 做逗号/小数点纠错
			
 
				+    amount_pattern = r'(?P<tok>[+-]?\d[\d,\.]*\d)'
			
 
				+
			
 
				+    def _amount_sub(m: re.Match) -> str:
			
 
				+        tok = m.group('tok')
			
 
				+        return _normalize_amount_token(tok)
			
 
				+
			
 
				+    normalized_text = re.sub(amount_pattern, _amount_sub, normalized_text)
			
 
				     return normalized_text
			
 
				     
			
 
				 def normalize_markdown_table(markdown_content: str) -> str:
			
@@ -78,7 +145,7 @@ def normalize_markdown_table(markdown_content: str) -> str:
 
				     table_pattern = r'(<table[^>]*>.*?</table>)'
			
 
				     
			
 
				     def normalize_table_match(match):
			
 
				-        """处理单个表格匹配，保留原始格式"""
			
 
				+        """处理单个表格匹配，保留原始格式，并追加数字标准化说明注释。"""
			
 
				         table_html = match.group(1)
			
 
				         original_table_html = table_html  # 保存原始HTML用于比较
			
 
				         
			
@@ -86,52 +153,65 @@ def normalize_markdown_table(markdown_content: str) -> str:
 
				         soup = BeautifulSoup(table_html, 'html.parser')
			
 
				         tables = soup.find_all('table')
			
 
				         
			
 
				-        # 记录所有需要替换的文本（原始文本 -> 标准化文本）
			
 
				-        replacements = []
			
 
				+        # 记录本表格中所有数值修改
			
 
				+        changes: list[dict] = []
			
 
				         
			
 
				         for table in tables:
			
 
				-            if isinstance(table, Tag):
			
 
				-                cells = table.find_all(['td', 'th'])
			
 
				-                for cell in cells:
			
 
				-                    if isinstance(cell, Tag):
			
 
				-                        # 获取单元格的纯文本内容
			
 
				-                        original_text = cell.get_text()
			
 
				-                        normalized_text = normalize_financial_numbers(original_text)
			
 
				-                        
			
 
				-                        # 如果内容发生了变化，记录替换
			
 
				-                        if original_text != normalized_text:
			
 
				-                            # 找到单元格中所有文本节点并替换
			
 
				-                            from bs4.element import NavigableString
			
 
				-                            for text_node in cell.find_all(string=True, recursive=True):
			
 
				-                                if isinstance(text_node, NavigableString):
			
 
				-                                    text_str = str(text_node)
			
 
				-                                    if text_str.strip():
			
 
				-                                        normalized = normalize_financial_numbers(text_str.strip())
			
 
				-                                        if normalized != text_str.strip():
			
 
				-                                            # 保留原始文本节点的前后空白
			
 
				-                                            if text_str.strip() == text_str:
			
 
				-                                                # 纯文本节点，直接替换
			
 
				-                                                text_node.replace_with(normalized)
			
 
				-                                            else:
			
 
				-                                                # 有前后空白，需要保留
			
 
				-                                                leading_ws = text_str[:len(text_str) - len(text_str.lstrip())]
			
 
				-                                                trailing_ws = text_str[len(text_str.rstrip()):]
			
 
				-                                                text_node.replace_with(leading_ws + normalized + trailing_ws)
			
 
				+            if not isinstance(table, Tag):
			
 
				+                continue
			
 
				+            # 通过 tr / td(th) 计算行列位置
			
 
				+            for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
			
 
				+                cells = tr.find_all(['td', 'th'])  # type: ignore[reportAttributeAccessIssue]
			
 
				+                for col_idx, cell in enumerate(cells):
			
 
				+                    if not isinstance(cell, Tag):
			
 
				+                        continue
			
 
				+                    # 获取单元格纯文本
			
 
				+                    original_text = cell.get_text()
			
 
				+                    normalized_text = normalize_financial_numbers(original_text)
			
 
				+                    if original_text == normalized_text:
			
 
				+                        continue
			
 
				+                    # 记录一条修改
			
 
				+                    changes.append(
			
 
				+                        {
			
 
				+                            "row": row_idx,
			
 
				+                            "col": col_idx,
			
 
				+                            "old": original_text,
			
 
				+                            "new": normalized_text,
			
 
				+                        }
			
 
				+                    )
			
 
				+                    # 具体替换：保持原有逻辑，按文本节点逐个替换以保留空白
			
 
				+                    from bs4.element import NavigableString
			
 
				+                    for text_node in cell.find_all(string=True, recursive=True):
			
 
				+                        if isinstance(text_node, NavigableString):
			
 
				+                            text_str = str(text_node)
			
 
				+                            if not text_str.strip():
			
 
				+                                continue
			
 
				+                            normalized = normalize_financial_numbers(text_str.strip())
			
 
				+                            if normalized != text_str.strip():
			
 
				+                                if text_str.strip() == text_str:
			
 
				+                                    text_node.replace_with(normalized)
			
 
				+                                else:
			
 
				+                                    leading_ws = text_str[: len(text_str) - len(text_str.lstrip())]
			
 
				+                                    trailing_ws = text_str[len(text_str.rstrip()) :]
			
 
				+                                    text_node.replace_with(leading_ws + normalized + trailing_ws)
			
 
				+        
			
 
				+        # 如果没有任何数值修改，直接返回原始 HTML
			
 
				+        if not changes:
			
 
				+            return original_table_html
			
 
				         
			
 
				         # 获取修改后的HTML
			
 
				         modified_html = str(soup)
			
 
				         
			
 
				-        # 如果内容没有变化，返回原始HTML（保持原始格式）
			
 
				-        # 检查是否只是格式变化（换行、空格等）
			
 
				-        original_text_only = re.sub(r'\s+', '', original_table_html)
			
 
				-        modified_text_only = re.sub(r'\s+', '', modified_html)
			
 
				-        
			
 
				-        if original_text_only == modified_text_only:
			
 
				-            # 只有格式变化，返回原始HTML以保留换行符
			
 
				-            return original_table_html
			
 
				+        # 在表格后追加注释，说明哪些单元格被修改
			
 
				+        lines = ["<!-- 数字标准化说明："]
			
 
				+        for ch in changes:
			
 
				+            lines.append(
			
 
				+                f"  - [row={ch['row']},col={ch['col']}] {ch['old']} -> {ch['new']}"
			
 
				+            )
			
 
				+        lines.append("-->")
			
 
				+        comment = "\n".join(lines)
			
 
				         
			
 
				-        # 有实际内容变化，返回修改后的HTML
			
 
				-        return modified_html
			
 
				+        return modified_html + "\n\n" + comment
			
 
				     
			
 
				     # 使用正则替换，只替换表格内容，保留其他部分（包括换行符）不变
			
 
				     normalized_content = re.sub(table_pattern, normalize_table_match, markdown_content, flags=re.DOTALL)
			
@@ -162,6 +242,7 @@ def normalize_json_table(json_content: str) -> str:
 
				     ]
			
 
				     """
			
 
				     import json
			
 
				+    from ast import literal_eval
			
 
				     
			
 
				     try:
			
 
				         # 解析JSON内容
			
@@ -185,23 +266,43 @@ def normalize_json_table(json_content: str) -> str:
 
				                 
			
 
				                 soup = BeautifulSoup(table_html, 'html.parser')
			
 
				                 tables = soup.find_all('table')
			
 
				+
			
 
				+                table_changes: list[dict] = []
			
 
				                 
			
 
				                 for table in tables:
			
 
				-                    if isinstance(table, Tag):
			
 
				-                        cells = table.find_all(['td', 'th'])
			
 
				-                        for cell in cells:
			
 
				-                            if isinstance(cell, Tag):
			
 
				-                                original_text = cell.get_text()
			
 
				-                                
			
 
				-                                # 应用数字标准化
			
 
				-                                normalized_text = normalize_financial_numbers(original_text)
			
 
				-                                
			
 
				-                                # 如果内容发生了变化，更新单元格内容
			
 
				-                                if original_text != normalized_text:
			
 
				-                                    cell.string = normalized_text
			
 
				+                    if not isinstance(table, Tag):
			
 
				+                        continue
			
 
				+                    # 通过 tr / td(th) 计算行列位置
			
 
				+                    for row_idx, tr in enumerate(table.find_all('tr')):  # type: ignore[reportAttributeAccessIssue]
			
 
				+                        cells = tr.find_all(['td', 'th'])  # type: ignore[reportAttributeAccessIssue]
			
 
				+                        for col_idx, cell in enumerate(cells):
			
 
				+                            if not isinstance(cell, Tag):
			
 
				+                                continue
			
 
				+                            original_text = cell.get_text()
			
 
				+                            normalized_text = normalize_financial_numbers(original_text)
			
 
				+                            if original_text == normalized_text:
			
 
				+                                continue
			
 
				+                            # 记录本单元格的变更
			
 
				+                            change: dict[str, object] = {
			
 
				+                                "row": row_idx,
			
 
				+                                "col": col_idx,
			
 
				+                                "old": original_text,
			
 
				+                                "new": normalized_text,
			
 
				+                            }
			
 
				+                            bbox_attr = cell.get("data-bbox")
			
 
				+                            if isinstance(bbox_attr, str):
			
 
				+                                try:
			
 
				+                                    change["bbox"] = literal_eval(bbox_attr)
			
 
				+                                except Exception:
			
 
				+                                    change["bbox"] = bbox_attr
			
 
				+                            table_changes.append(change)
			
 
				+                            # 更新单元格内容（简单覆盖文本即可）
			
 
				+                            cell.string = normalized_text
			
 
				                 
			
 
				-                # 更新item中的表格内容
			
 
				+                # 更新 item 中的表格内容
			
 
				                 item['text'] = str(soup)
			
 
				+                if table_changes:
			
 
				+                    item['number_normalization_changes'] = table_changes
			
 
				             
			
 
				             # 同时标准化普通文本中的数字（如果需要）
			
 
				             # elif 'text' in item:
			
@@ -266,4 +367,48 @@ def normalize_json_file(file_path: str, output_path: str | None = None) -> str:
 
				     
			
 
				     print(f"📄 标准化结果已保存到: {output_file}")
			
 
				     return normalized_content
			
 
				+    
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    """
			
 
				+    简单验证：构造一份“故意打乱逗号/小数点”的 JSON / Markdown 示例，
			
 
				+    并打印标准化前后的差异。
			
 
				+    """
			
 
				+    import json
			
 
				+
			
 
				+    print("=== JSON 示例：金额格式纠错 + 变更记录 ===")
			
 
				+    demo_json_data = [
			
 
				+        {
			
 
				+            "category": "Table",
			
 
				+            "text": (
			
 
				+                "<table><tbody>"
			
 
				+                "<tr><td data-bbox=\"[0,0,10,10]\">项目</td>"
			
 
				+                "<td data-bbox=\"[10,0,20,10]\">2023 年12 月31 日</td></tr>"
			
 
				+                # 故意打乱的数字：应为 12,123,456.00 和 1,234,567.89
			
 
				+                "<tr><td data-bbox=\"[0,10,10,20]\">测试金额A</td>"
			
 
				+                "<td data-bbox=\"[10,10,20,20]\">12.123,456,00</td></tr>"
			
 
				+                "<tr><td data-bbox=\"[0,20,10,30]\">测试金额B</td>"
			
 
				+                "<td data-bbox=\"[10,20,20,30]\">1,234,567,89</td></tr>"
			
 
				+                "</tbody></table>"
			
 
				+            ),
			
 
				+        }
			
 
				+    ]
			
 
				+    demo_json_str = json.dumps(demo_json_data, ensure_ascii=False, indent=2)
			
 
				+    print("原始 JSON：")
			
 
				+    print(demo_json_str)
			
 
				+    normalized_json_str = normalize_json_table(demo_json_str)
			
 
				+    print("\n标准化后 JSON：")
			
 
				+    print(normalized_json_str)
			
 
				 
			
 
				+    print("\n=== Markdown 示例：金额格式纠错 + 注释说明 ===")
			
 
				+    demo_md = """<table><tbody>
			
 
				+<tr><td>项目</td><td>2023 年12 月31 日</td></tr>
			
 
				+<tr><td>测试金额A</td><td>12.123,456,00</td></tr>
			
 
				+<tr><td>测试金额B</td><td>1,234,567,89</td></tr>
			
 
				+</tbody></table>
			
 
				+"""
			
 
				+    print("原始 Markdown：")
			
 
				+    print(demo_md)
			
 
				+    normalized_md = normalize_markdown_table(demo_md)
			
 
				+    print("\n标准化后 Markdown：")
			
 
				+    print(normalized_md)