Browse Source

refactor(output_formatter_v2): 移除冗余的金额标准化逻辑,更新保存中间JSON的注释以反映标准化已在处理管道中完成

zhch158_admin 3 tuần trước
mục cha
commit
11668f177e
1 tập tin đã thay đổi với 1 bổ sung và 21 xóa
  1. ocr_utils/output_formatter_v2.py (+1 −21)

+ 1 - 21
ocr_utils/output_formatter_v2.py

@@ -30,10 +30,6 @@ from .markdown_generator import MarkdownGenerator
 from .html_generator import HTMLGenerator
 from .visualization_utils import VisualizationUtils
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
-
-
 class NumpyEncoder(json.JSONEncoder):
     """自定义JSON编码器,处理numpy类型"""
     def default(self, obj):
@@ -171,26 +167,10 @@ class OutputFormatterV2:
         # 2. 转换为 MinerU middle.json 格式
         middle_json = JSONFormatters.convert_to_middle_json(results)
         
-        # 3. 保存 middle.json
+        # 3. 保存 middle.json(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
         if output_config.get('save_json', True):
             json_path = doc_output_dir / f"{doc_name}_middle.json"
             json_content = json.dumps(middle_json, ensure_ascii=False, indent=2, cls=NumpyEncoder)
-            
-            # 金额数字标准化
-            normalize_numbers = output_config.get('normalize_numbers', True)
-            if normalize_numbers:
-                original_content = json_content
-                json_content = normalize_json_table(json_content)
-                
-                # 检查是否有变化
-                if json_content != original_content:
-                    # 保存原始文件
-                    original_path = doc_output_dir / f"{doc_name}_middle_original.json"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.info(f"📄 Original middle JSON saved: {original_path}")
-                    output_paths['middle_json_original'] = str(original_path)
-            
             with open(json_path, 'w', encoding='utf-8') as f:
                 f.write(json_content)
             output_paths['middle_json'] = str(json_path)