|
|
@@ -29,10 +29,6 @@ except ImportError:
|
|
|
MM_MD = 'mm_md'
|
|
|
NLP_MD = 'nlp_md'
|
|
|
|
|
|
-# 导入数字标准化工具
|
|
|
-from .normalize_financial_numbers import normalize_markdown_table
|
|
|
-
|
|
|
-
|
|
|
class MarkdownGenerator:
|
|
|
"""Markdown 生成器类"""
|
|
|
|
|
|
@@ -80,18 +76,7 @@ class MarkdownGenerator:
|
|
|
|
|
|
header = MarkdownGenerator._generate_header(results)
|
|
|
markdown_content = header + str(markdown_content)
|
|
|
-
|
|
|
- # 金额数字标准化
|
|
|
- if normalize_numbers:
|
|
|
- original_content = markdown_content
|
|
|
- markdown_content = normalize_markdown_table(markdown_content)
|
|
|
-
|
|
|
- if markdown_content != original_content:
|
|
|
- original_path = output_dir / f"{doc_name}_original.md"
|
|
|
- with open(original_path, 'w', encoding='utf-8') as f:
|
|
|
- f.write(original_content)
|
|
|
- logger.info(f"📝 Original Markdown saved: {original_path}")
|
|
|
-
|
|
|
+ # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
|
|
|
with open(md_path, 'w', encoding='utf-8') as f:
|
|
|
f.write(markdown_content)
|
|
|
|
|
|
@@ -103,18 +88,7 @@ class MarkdownGenerator:
|
|
|
|
|
|
# 使用自定义实现,确保所有元素类型都被处理
|
|
|
markdown_content = MarkdownGenerator._generate_full_markdown(results)
|
|
|
-
|
|
|
- # 金额数字标准化
|
|
|
- if normalize_numbers:
|
|
|
- original_content = markdown_content
|
|
|
- markdown_content = normalize_markdown_table(markdown_content)
|
|
|
-
|
|
|
- if markdown_content != original_content:
|
|
|
- original_path = output_dir / f"{doc_name}_original.md"
|
|
|
- with open(original_path, 'w', encoding='utf-8') as f:
|
|
|
- f.write(original_content)
|
|
|
- logger.info(f"📝 Original Markdown saved: {original_path}")
|
|
|
-
|
|
|
+ # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
|
|
|
with open(md_path, 'w', encoding='utf-8') as f:
|
|
|
f.write(markdown_content)
|
|
|
|
|
|
@@ -163,20 +137,9 @@ class MarkdownGenerator:
|
|
|
else:
|
|
|
page_name = doc_name
|
|
|
|
|
|
- # 生成单页 Markdown
|
|
|
+ # 生成单页 Markdown(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
|
|
|
md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
|
|
|
-
|
|
|
- # 金额数字标准化
|
|
|
- if normalize_numbers:
|
|
|
- original_content = md_content
|
|
|
- md_content = normalize_markdown_table(md_content)
|
|
|
-
|
|
|
- if md_content != original_content:
|
|
|
- original_path = output_dir / f"{page_name}_original.md"
|
|
|
- with open(original_path, 'w', encoding='utf-8') as f:
|
|
|
- f.write(original_content)
|
|
|
- logger.debug(f"📝 Original page Markdown saved: {original_path}")
|
|
|
-
|
|
|
+
|
|
|
# 保存
|
|
|
md_path = output_dir / f"{page_name}.md"
|
|
|
with open(md_path, 'w', encoding='utf-8') as f:
|
|
|
@@ -245,7 +208,14 @@ pages: {len(results.get('pages', []))}
|
|
|
html = content.get('html', '')
|
|
|
if html:
|
|
|
md_lines.append(f"\n{html}\n")
|
|
|
+ changes = content.get('number_normalization_changes', [])
|
|
|
+ if changes:
|
|
|
md_lines.append("")
|
|
|
+ md_lines.append("<!-- 数字标准化说明:")
|
|
|
+ for ch in changes:
|
|
|
+ md_lines.append(f" - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
|
|
|
+ md_lines.append("-->")
|
|
|
+ md_lines.append("")
|
|
|
|
|
|
elif elem_type in ['image', 'image_body', 'figure']:
|
|
|
img_filename = content.get('image_path', '')
|
|
|
@@ -343,6 +313,14 @@ pages: {len(results.get('pages', []))}
|
|
|
html = content.get('html', '')
|
|
|
if html:
|
|
|
md_lines.append(f"\n{html}\n")
|
|
|
+ # 金额标准化说明(来自 element_processors._normalize_table_content)
|
|
|
+ changes = content.get('number_normalization_changes', [])
|
|
|
+ if changes:
|
|
|
+ md_lines.append("")
|
|
|
+ md_lines.append("<!-- 数字标准化说明:")
|
|
|
+ for ch in changes:
|
|
|
+ md_lines.append(f" - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
|
|
|
+ md_lines.append("-->")
|
|
|
md_lines.append("")
|
|
|
|
|
|
elif elem_type in ['image', 'image_body', 'figure']:
|