Forráskód Böngészése

feat: 更新文档方向分类参数为启用状态,并增强Markdown内容保存功能

zhch158_admin 1 hónapja
szülő
commit
ffb5c38dfe

+ 5 - 2
zhch/ppstructurev3_single_client.py

@@ -48,7 +48,7 @@ def call_api_for_image(image_path: str, api_url: str, timeout: int = 300) -> Dic
             "file": image_data,
             "fileType": 1,
             # 添加管道参数设置
-            "useDocOrientationClassify": False,
+            "useDocOrientationClassify": True,
             "useDocUnwarping": False,
             "useSealRecognition": True,
             "useTableRecognition": True,
@@ -153,7 +153,10 @@ def process_images_via_api(image_paths: List[str],
                         markdown_content, 
                         output_dir, 
                         output_filename,
-                        normalize_numbers=normalize_numbers
+                        normalize_numbers=normalize_numbers,
+                        key_text='markdown_texts',
+                        key_images='markdown_images',
+                        json_data=converted_json  # 🎯 新增参数
                     )
                     
                     # 记录处理结果

+ 3 - 2
zhch/ppstructurev3_single_process.py

@@ -97,7 +97,7 @@ def process_images_unified(image_paths: List[str],
                     # 使用pipeline预测单个图像
                     results = pipeline.predict(
                         img_path,
-                        use_doc_orientation_classify=False,
+                        use_doc_orientation_classify=True,
                         use_doc_unwarping=False,
                         use_layout_detection=True,
                         use_seal_recognition=True,
@@ -147,7 +147,8 @@ def process_images_unified(image_paths: List[str],
                                 output_filename,
                                 normalize_numbers=normalize_numbers,
                                 key_text='markdown_texts',
-                                key_images='markdown_images'
+                                key_images='markdown_images',
+                                json_data=converted_json  # 🎯 新增参数
                             )
                             
                             # 记录处理结果

+ 82 - 5
zhch/ppstructurev3_utils.py

@@ -10,7 +10,8 @@ import numpy as np
 
 from utils import (
     load_images_from_pdf,
-    normalize_markdown_table
+    normalize_markdown_table,
+    normalize_financial_numbers
 )
 
 def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int = 200) -> List[str]:
@@ -273,16 +274,23 @@ def save_output_images(output_images: Dict[str, Any], output_dir: str, output_fi
     return saved_images
 
 def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str, 
-                         filename: str, normalize_numbers: bool = True, key_text: str = 'text', key_images: str = 'images') -> str:
+                         filename: str, normalize_numbers: bool = True, 
+                         key_text: str = 'text', key_images: str = 'images',
+                         json_data: Dict[str, Any] = None) -> str:
     """
-    保存Markdown内容,支持数字标准化
+    保存Markdown内容,支持数字标准化和表格补全
     """
-    if not markdown_data:
+    if not markdown_data and not json_data:
         return ""
+    
     output_path = Path(output_dir).resolve()
     output_path.mkdir(parents=True, exist_ok=True)
     
-    # 保存Markdown文本
+    # 🎯 优先使用json_data生成完整内容
+    if json_data:
+        return save_markdown_content_enhanced(json_data, str(output_path), filename, normalize_numbers)
+    
+    # 原有逻辑保持不变
     markdown_text = markdown_data.get(key_text, '')
     
     # 数字标准化处理
@@ -316,4 +324,73 @@ def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
         except Exception as e:
             print(f"❌ Error saving Markdown image {img_path}: {e}")
 
+    return str(md_file_path)
+
+def save_markdown_content_enhanced(json_data: Dict[str, Any], output_dir: str, 
+                         filename: str, normalize_numbers: bool = True) -> str:
+    """
+    Write "<filename>.md" under output_dir from json_data: 'parsing_res_list' blocks in order, table blocks taking 'pred_html' from 'table_res_list' by position, leftover tables appended last; returns the written path, or an empty string when json_data is empty.
+    """
+    if not json_data:
+        return ""
+    
+    output_path = Path(output_dir).resolve()
+    output_path.mkdir(parents=True, exist_ok=True)
+    
+    markdown_content = []
+    
+    # Walk parsing_res_list in order; table_res_list supplies the detailed table HTML
+    parsing_res_list = json_data.get('parsing_res_list', [])
+    table_res_list = json_data.get('table_res_list', [])
+    
+    table_index = 0  # next unused table_res_list index; tables are paired by order of appearance
+    
+    for item in parsing_res_list:
+        block_label = item.get('block_label', '')
+        block_content = item.get('block_content', '')
+        
+        if block_label == 'table':
+            # Table block: prefer the detailed HTML from table_res_list
+            if table_index < len(table_res_list):
+                detailed_html = table_res_list[table_index].get('pred_html', block_content)  # falls back to block_content when 'pred_html' is absent
+                if normalize_numbers:
+                    detailed_html = normalize_markdown_table(detailed_html)
+                
+                # Wrap the table HTML in a centered <div>
+                markdown_content.append(f'<div style="text-align: center;">{detailed_html}</div>')
+                table_index += 1
+            else:
+                # No table_res_list entry left: use the block's own content from parsing_res_list
+                if normalize_numbers:
+                    block_content = normalize_markdown_table(block_content)
+                markdown_content.append(f'<div style="text-align: center;">{block_content}</div>')
+        else:
+            # Non-table block: appended as-is, after optional number normalization
+            if normalize_numbers:
+                block_content = normalize_financial_numbers(block_content)
+            markdown_content.append(block_content)
+    
+    # Key fix: handle the table_res_list entries that were not consumed above
+    # (table_res_list holds more tables than parsing_res_list has table blocks); they are appended at the end
+    remaining_tables = table_res_list[table_index:]
+    for table_item in remaining_tables:
+        detailed_html = table_item.get('pred_html', '')
+        if detailed_html:
+            if normalize_numbers:
+                detailed_html = normalize_markdown_table(detailed_html)
+            markdown_content.append(f'<div style="text-align: center;">{detailed_html}</div>')
+    
+    # Join all blocks, separated by a blank line
+    final_markdown = '\n\n'.join(markdown_content)
+    
+    # Write the result to <filename>.md as UTF-8
+    md_file_path = output_path / f"{filename}.md"
+    with open(md_file_path, 'w', encoding='utf-8') as f:
+        f.write(final_markdown)
+    
+    print(f"📄 Enhanced Markdown saved: {md_file_path}")
+    print(f"   - parsing_res_list tables: {sum(1 for item in parsing_res_list if item.get('block_label') == 'table')}")
+    print(f"   - table_res_list tables: {len(table_res_list)}")
+    print(f"   - remaining tables added: {len(remaining_tables)}")  # NOTE(review): this count includes entries skipped above for empty 'pred_html'
+    
     return str(md_file_path)