Bläddra i källkod

refactor(markdown_generator): 移除冗余的金额标准化逻辑,更新Markdown生成以包含标准化变更说明

zhch158_admin 2 veckor sedan
förälder
incheckning
40bad356ee
1 ändrad fil med 19 tillägg och 41 borttagningar
  1. ocr_utils/markdown_generator.py (+19 -41)

+ 19 - 41
ocr_utils/markdown_generator.py

@@ -29,10 +29,6 @@ except ImportError:
         MM_MD = 'mm_md'
         NLP_MD = 'nlp_md'
 
-# 导入数字标准化工具
-from .normalize_financial_numbers import normalize_markdown_table
-
-
 class MarkdownGenerator:
     """Markdown 生成器类"""
     
@@ -80,18 +76,7 @@ class MarkdownGenerator:
                     
                     header = MarkdownGenerator._generate_header(results)
                     markdown_content = header + str(markdown_content)
-                    
-                    # 金额数字标准化
-                    if normalize_numbers:
-                        original_content = markdown_content
-                        markdown_content = normalize_markdown_table(markdown_content)
-                        
-                        if markdown_content != original_content:
-                            original_path = output_dir / f"{doc_name}_original.md"
-                            with open(original_path, 'w', encoding='utf-8') as f:
-                                f.write(original_content)
-                            logger.info(f"📝 Original Markdown saved: {original_path}")
-                    
+                    # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
                     with open(md_path, 'w', encoding='utf-8') as f:
                         f.write(markdown_content)
                     
@@ -103,18 +88,7 @@ class MarkdownGenerator:
         
         # 使用自定义实现,确保所有元素类型都被处理
         markdown_content = MarkdownGenerator._generate_full_markdown(results)
-        
-        # 金额数字标准化
-        if normalize_numbers:
-            original_content = markdown_content
-            markdown_content = normalize_markdown_table(markdown_content)
-            
-            if markdown_content != original_content:
-                original_path = output_dir / f"{doc_name}_original.md"
-                with open(original_path, 'w', encoding='utf-8') as f:
-                    f.write(original_content)
-                logger.info(f"📝 Original Markdown saved: {original_path}")
-        
+        # 金额标准化已在 pipeline element_processors 中完成,此处不再重复
         with open(md_path, 'w', encoding='utf-8') as f:
             f.write(markdown_content)
         
@@ -163,20 +137,9 @@ class MarkdownGenerator:
             else:
                 page_name = doc_name
             
-            # 生成单页 Markdown
+            # 生成单页 Markdown(金额标准化已在 pipeline element_processors 中完成,此处不再重复)
             md_content = MarkdownGenerator._generate_page_markdown(page, doc_name, page_idx)
-            
-            # 金额数字标准化
-            if normalize_numbers:
-                original_content = md_content
-                md_content = normalize_markdown_table(md_content)
-                
-                if md_content != original_content:
-                    original_path = output_dir / f"{page_name}_original.md"
-                    with open(original_path, 'w', encoding='utf-8') as f:
-                        f.write(original_content)
-                    logger.debug(f"📝 Original page Markdown saved: {original_path}")
-            
+
             # 保存
             md_path = output_dir / f"{page_name}.md"
             with open(md_path, 'w', encoding='utf-8') as f:
@@ -245,7 +208,14 @@ pages: {len(results.get('pages', []))}
                     html = content.get('html', '')
                     if html:
                         md_lines.append(f"\n{html}\n")
+                    changes = content.get('number_normalization_changes', [])
+                    if changes:
                         md_lines.append("")
+                        md_lines.append("<!-- 数字标准化说明:")
+                        for ch in changes:
+                            md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
+                        md_lines.append("-->")
+                    md_lines.append("")
                 
                 elif elem_type in ['image', 'image_body', 'figure']:
                     img_filename = content.get('image_path', '')
@@ -343,6 +313,14 @@ pages: {len(results.get('pages', []))}
                 html = content.get('html', '')
                 if html:
                     md_lines.append(f"\n{html}\n")
+                # 金额标准化说明(来自 element_processors._normalize_table_content)
+                changes = content.get('number_normalization_changes', [])
+                if changes:
+                    md_lines.append("")
+                    md_lines.append("<!-- 数字标准化说明:")
+                    for ch in changes:
+                        md_lines.append(f"  - [row={ch.get('row')},col={ch.get('col')}] {ch.get('old', '')} -> {ch.get('new', '')}")
+                    md_lines.append("-->")
                 md_lines.append("")
             
             elif elem_type in ['image', 'image_body', 'figure']: