|
@@ -14,10 +14,6 @@ from pathlib import Path
|
|
|
from typing import Dict, Any, List, Optional
|
|
from typing import Dict, Any, List, Optional
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
|
|
|
|
|
-# 导入数字标准化工具
|
|
|
|
|
-from .normalize_financial_numbers import normalize_json_table
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
class NumpyEncoder(json.JSONEncoder):
|
|
class NumpyEncoder(json.JSONEncoder):
|
|
|
"""自定义JSON编码器,处理numpy类型"""
|
|
"""自定义JSON编码器,处理numpy类型"""
|
|
|
def default(self, obj):
|
|
def default(self, obj):
|
|
@@ -263,20 +259,9 @@ class JSONFormatters:
|
|
|
if converted:
|
|
if converted:
|
|
|
page_elements.append(converted)
|
|
page_elements.append(converted)
|
|
|
|
|
|
|
|
- # 转换为 JSON 字符串
|
|
|
|
|
|
|
+ # 转换为 JSON 字符串(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
|
|
|
json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
|
|
json_content = json.dumps(page_elements, ensure_ascii=False, indent=2, cls=NumpyEncoder)
|
|
|
-
|
|
|
|
|
- # 金额数字标准化
|
|
|
|
|
- if normalize_numbers:
|
|
|
|
|
- original_content = json_content
|
|
|
|
|
- json_content = normalize_json_table(json_content)
|
|
|
|
|
-
|
|
|
|
|
- if json_content != original_content:
|
|
|
|
|
- original_path = output_dir / f"{page_name}_original.json"
|
|
|
|
|
- with open(original_path, 'w', encoding='utf-8') as f:
|
|
|
|
|
- f.write(original_content)
|
|
|
|
|
- logger.debug(f"📄 Original page JSON saved: {original_path}")
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 保存 JSON
|
|
# 保存 JSON
|
|
|
json_path = output_dir / f"{page_name}.json"
|
|
json_path = output_dir / f"{page_name}.json"
|
|
|
with open(json_path, 'w', encoding='utf-8') as f:
|
|
with open(json_path, 'w', encoding='utf-8') as f:
|
|
@@ -338,7 +323,11 @@ class JSONFormatters:
|
|
|
cells = content.get('cells', [])
|
|
cells = content.get('cells', [])
|
|
|
if cells:
|
|
if cells:
|
|
|
result['table_cells'] = JSONFormatters.format_table_cells(cells)
|
|
result['table_cells'] = JSONFormatters.format_table_cells(cells)
|
|
|
-
|
|
|
|
|
|
|
+ # 金额标准化变更记录(来自 element_processors._normalize_table_content)
|
|
|
|
|
+ changes = content.get('number_normalization_changes', [])
|
|
|
|
|
+ if changes:
|
|
|
|
|
+ result['number_normalization_changes'] = changes
|
|
|
|
|
+
|
|
|
# 旋转和倾斜信息
|
|
# 旋转和倾斜信息
|
|
|
if 'table_angle' in content:
|
|
if 'table_angle' in content:
|
|
|
result['image_rotation_angle'] = float(content['table_angle'])
|
|
result['image_rotation_angle'] = float(content['table_angle'])
|