Преглед изворни кода

refactor(json_formatters): 移除冗余的金额标准化逻辑,更新JSON输出以包含标准化变更记录

zhch158_admin пре 3 недеља
родитељ
комит
1ce742a4ef
1 измењених фајлова са 7 додато и 18 уклоњено
  1. 7 18
      ocr_utils/json_formatters.py

+ 7 - 18
ocr_utils/json_formatters.py

@@ -14,10 +14,6 @@ from pathlib import Path
 from typing import Dict, Any, List, Optional
 from typing import Dict, Any, List, Optional
 from loguru import logger
 from loguru import logger
 
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_json_table
-
-
 class NumpyEncoder(json.JSONEncoder):
 class NumpyEncoder(json.JSONEncoder):
     """自定义JSON编码器,处理numpy类型"""
     """自定义JSON编码器,处理numpy类型"""
     def default(self, obj):
     def default(self, obj):
@@ -263,20 +259,9 @@ class JSONFormatters:
                 if converted:
                 if converted:
                     page_elements.append(converted)
                     page_elements.append(converted)
             
             
-            # 转换为 JSON 字符串
+            # 转换为 JSON 字符串(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
             json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
             json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
-            
-            # 金额数字标准化
-            if normalize_numbers:
-                original_content = json_content
-                json_content = normalize_json_table(json_content)
-                
-                if json_content != original_content:
-                    original_path = output_dir / f"{page_name}_original.json"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.debug(f"📄 Original page JSON saved: {original_path}")
-            
+
             # 保存 JSON
             # 保存 JSON
             json_path = output_dir / f"{page_name}.json"
             json_path = output_dir / f"{page_name}.json"
             with open(json_path, 'w', encoding='utf-8') as f:
             with open(json_path, 'w', encoding='utf-8') as f:
@@ -338,7 +323,11 @@ class JSONFormatters:
             cells = content.get('cells', [])
             cells = content.get('cells', [])
             if cells:
             if cells:
                 result['table_cells'] = JSONFormatters.format_table_cells(cells)
                 result['table_cells'] = JSONFormatters.format_table_cells(cells)
-            
+            # 金额标准化变更记录(来自 element_processors._normalize_table_content)
+            changes = content.get('number_normalization_changes', [])
+            if changes:
+                result['number_normalization_changes'] = changes
+
             # 旋转和倾斜信息
             # 旋转和倾斜信息
             if 'table_angle' in content:
             if 'table_angle' in content:
                 result['image_rotation_angle'] = float(content['table_angle'])
                 result['image_rotation_angle'] = float(content['table_angle'])