|
|
@@ -30,10 +30,6 @@ from .markdown_generator import MarkdownGenerator
|
|
|
from .html_generator import HTMLGenerator
|
|
|
from .visualization_utils import VisualizationUtils
|
|
|
|
|
|
-# 导入数字标准化工具
|
|
|
-from .normalize_financial_numbers import normalize_markdown_table, normalize_json_table
|
|
|
-
|
|
|
-
|
|
|
class NumpyEncoder(json.JSONEncoder):
|
|
|
"""自定义JSON编码器,处理numpy类型"""
|
|
|
def default(self, obj):
|
|
|
@@ -171,26 +167,10 @@ class OutputFormatterV2:
|
|
|
# 2. 转换为 MinerU middle.json 格式
|
|
|
middle_json = JSONFormatters.convert_to_middle_json(results)
|
|
|
|
|
|
- # 3. 保存 middle.json
|
|
|
+ # 3. 保存 middle.json(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
|
|
|
if output_config.get('save_json', True):
|
|
|
json_path = doc_output_dir / f"{doc_name}_middle.json"
|
|
|
json_content = json.dumps(middle_json, ensure_ascii=False, indent=2, cls=NumpyEncoder)
|
|
|
-
|
|
|
- # 金额数字标准化
|
|
|
- normalize_numbers = output_config.get('normalize_numbers', True)
|
|
|
- if normalize_numbers:
|
|
|
- original_content = json_content
|
|
|
- json_content = normalize_json_table(json_content)
|
|
|
-
|
|
|
- # 检查是否有变化
|
|
|
- if json_content != original_content:
|
|
|
- # 保存原始文件
|
|
|
- original_path = doc_output_dir / f"{doc_name}_middle_original.json"
|
|
|
- with open(original_path, 'w', encoding='utf-8') as f:
|
|
|
- f.write(original_content)
|
|
|
- logger.info(f"📄 Original middle JSON saved: {original_path}")
|
|
|
- output_paths['middle_json_original'] = str(original_path)
|
|
|
-
|
|
|
with open(json_path, 'w', encoding='utf-8') as f:
|
|
|
f.write(json_content)
|
|
|
output_paths['middle_json'] = str(json_path)
|