|
@@ -10,7 +10,8 @@ import numpy as np
|
|
|
|
|
|
|
|
from utils import (
|
|
from utils import (
|
|
|
load_images_from_pdf,
|
|
load_images_from_pdf,
|
|
|
- normalize_markdown_table
|
|
|
|
|
|
|
+ normalize_markdown_table,
|
|
|
|
|
+ normalize_financial_numbers
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
|
|
def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
|
|
@@ -273,16 +274,23 @@ def save_output_images(output_images: Dict[str, Any], output_dir: str, output_fi
|
|
|
return saved_images
|
|
return saved_images
|
|
|
|
|
|
|
|
def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
|
|
def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
|
|
|
- filename: str, normalize_numbers: bool = True, key_text: str = 'text', key_images: str = 'images') -> str:
|
|
|
|
|
|
|
+ filename: str, normalize_numbers: bool = True,
|
|
|
|
|
+ key_text: str = 'text', key_images: str = 'images',
|
|
|
|
|
+ json_data: Dict[str, Any] = None) -> str:
|
|
|
"""
|
|
"""
|
|
|
- 保存Markdown内容,支持数字标准化
|
|
|
|
|
|
|
+ 保存Markdown内容,支持数字标准化和表格补全
|
|
|
"""
|
|
"""
|
|
|
- if not markdown_data:
|
|
|
|
|
|
|
+ if not markdown_data and not json_data:
|
|
|
return ""
|
|
return ""
|
|
|
|
|
+
|
|
|
output_path = Path(output_dir).resolve()
|
|
output_path = Path(output_dir).resolve()
|
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
- # 保存Markdown文本
|
|
|
|
|
|
|
+ # 🎯 优先使用json_data生成完整内容
|
|
|
|
|
+ if json_data:
|
|
|
|
|
+ return save_markdown_content_enhanced(json_data, str(output_path), filename, normalize_numbers)
|
|
|
|
|
+
|
|
|
|
|
+ # 原有逻辑保持不变
|
|
|
markdown_text = markdown_data.get(key_text, '')
|
|
markdown_text = markdown_data.get(key_text, '')
|
|
|
|
|
|
|
|
# 数字标准化处理
|
|
# 数字标准化处理
|
|
@@ -316,4 +324,73 @@ def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"❌ Error saving Markdown image {img_path}: {e}")
|
|
print(f"❌ Error saving Markdown image {img_path}: {e}")
|
|
|
|
|
|
|
|
|
|
+ return str(md_file_path)
|
|
|
|
|
+
|
|
|
|
|
def _render_table_block(table_html: str, normalize_numbers: bool) -> str:
    """Wrap *table_html* in a centering ``<div>``, normalizing it first if requested."""
    if normalize_numbers:
        # normalize_markdown_table comes from the project's utils module.
        table_html = normalize_markdown_table(table_html)
    return f'<div style="text-align: center;">{table_html}</div>'


def save_markdown_content_enhanced(json_data: Dict[str, Any], output_dir: str,
                                   filename: str, normalize_numbers: bool = True) -> str:
    """
    Save Markdown built from both ``parsing_res_list`` and ``table_res_list``.

    Blocks of ``json_data['parsing_res_list']`` are emitted in order. Each block
    labelled ``'table'`` is paired positionally with the next entry of
    ``json_data['table_res_list']`` and rendered from that entry's ``pred_html``;
    when the entry has no usable ``pred_html`` (missing, ``None`` or empty) the
    block's own ``block_content`` is used instead. Entries of ``table_res_list``
    that were not consumed by a table block are appended at the end, skipping
    the ones without HTML.

    Args:
        json_data: Result dictionary holding ``parsing_res_list`` and/or
            ``table_res_list``. Missing or ``None`` lists are treated as empty.
        output_dir: Directory the file is written to (created if needed).
        filename: Output file name without the ``.md`` suffix.
        normalize_numbers: When true, tables go through
            ``normalize_markdown_table`` and all other blocks through
            ``normalize_financial_numbers`` before being written.

    Returns:
        The absolute path of the written ``.md`` file as a string, or ``""``
        when ``json_data`` is empty.
    """
    if not json_data:
        return ""

    output_path = Path(output_dir).resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    # ``or []`` also covers keys that are present but explicitly set to None.
    parsing_res_list = json_data.get('parsing_res_list') or []
    table_res_list = json_data.get('table_res_list') or []

    markdown_content = []
    table_index = 0  # position of the next unconsumed table_res_list entry

    for item in parsing_res_list:
        # A None block_content would otherwise break the final str.join.
        block_content = item.get('block_content') or ''

        if item.get('block_label', '') != 'table':
            # Non-table content is added as-is (after optional normalization).
            if normalize_numbers:
                block_content = normalize_financial_numbers(block_content)
            markdown_content.append(block_content)
            continue

        if table_index < len(table_res_list):
            # Prefer the detailed HTML; fall back to the parsed block when the
            # table entry carries no HTML, so the table is never emitted empty.
            table_html = table_res_list[table_index].get('pred_html') or block_content
            table_index += 1
        else:
            # No matching table_res_list entry: use the parsed block content.
            table_html = block_content
        markdown_content.append(_render_table_block(table_html, normalize_numbers))

    # Append tables that exist only in table_res_list (more entries there than
    # table blocks in parsing_res_list). Count only what is actually written so
    # the summary below is accurate.
    remaining_added = 0
    for table_item in table_res_list[table_index:]:
        detailed_html = table_item.get('pred_html')
        if not detailed_html:
            continue
        markdown_content.append(_render_table_block(detailed_html, normalize_numbers))
        remaining_added += 1

    final_markdown = '\n\n'.join(markdown_content)

    md_file_path = output_path / f"{filename}.md"
    with open(md_file_path, 'w', encoding='utf-8') as f:
        f.write(final_markdown)

    parsed_table_count = sum(
        1 for item in parsing_res_list if item.get('block_label') == 'table'
    )
    print(f"📄 Enhanced Markdown saved: {md_file_path}")
    print(f" - parsing_res_list tables: {parsed_table_count}")
    print(f" - table_res_list tables: {len(table_res_list)}")
    print(f" - remaining tables added: {remaining_added}")

    return str(md_file_path)
|