ソースを参照

feat: 修改合并函数参数名称,将output_format更改为output_type,统一输出为MinerU格式

zhch158_admin 4週間前
コミット
184ba0d988
1 ファイル変更、29 行追加、21 行削除
  1. 29 21
      merger/merge_paddleocr_vl_paddleocr.py

+ 29 - 21
merger/merge_paddleocr_vl_paddleocr.py

@@ -13,7 +13,7 @@ except ImportError:
 
 
 def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Path, 
-                     output_format: str, merger: PaddleOCRVLMerger) -> bool:
+                     output_type: str, merger: PaddleOCRVLMerger) -> bool:
     """
     合并单个文件
     
@@ -34,29 +34,33 @@ def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Pa
     merged_json_path = output_dir / f"{paddleocr_vl_file.stem}.json"
     
     try:
-        # 合并数据
+        # 合并数据 (统一输出为MinerU格式)
         merged_data = merger.merge_table_with_bbox(
             str(paddleocr_vl_file),
-            str(paddle_file)
+            str(paddle_file),
+            data_format='mineru'  # 强制使用MinerU格式
         )
         
-        # 生成 Markdown
-        if output_format in ['markdown', 'both']:
-            merger.generate_enhanced_markdown(
-                merged_data, str(merged_md_path), str(paddleocr_vl_file)
+        # ✅ 生成 Markdown (基于MinerU格式)
+        if output_type in ['markdown', 'both']:
+            markdown = merger.generate_enhanced_markdown(
+                merged_data, 
+                str(merged_md_path), 
+                str(paddleocr_vl_file),
+                data_format='mineru'  # 强制使用MinerU格式
             )
         
-        # 保存 JSON
-        if output_format in ['json', 'both']:
+        # 保存 JSON (MinerU格式)
+        if output_type in ['json', 'both']:
             with open(merged_json_path, 'w', encoding='utf-8') as f:
                 json.dump(merged_data, f, ensure_ascii=False, indent=2)
 
-        print(f"  ✅ 合并完成")
+        print(f"  ✅ 合并完成 (MinerU格式)")
         print(f"  📊 共处理了 {len(merged_data)} 个对象")
         print(f"  💾 输出文件:")
-        if output_format in ['markdown', 'both']:
+        if output_type in ['markdown', 'both']:
             print(f"    - {merged_md_path.name}")
-        if output_format in ['json', 'both']:
+        if output_type in ['json', 'both']:
             print(f"    - {merged_json_path.name}")
 
         return True
@@ -69,7 +73,7 @@ def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Pa
 
 
 def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir: str,
-                             output_format: str = 'both',
+                             output_type: str = 'both',
                              look_ahead_window: int = 10, 
                              similarity_threshold: int = 80):
     """
@@ -113,7 +117,7 @@ def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir:
             failed_count += 1
             continue
 
-        if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_format, merger):
+        if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_type, merger):
             success_count += 1
         else:
             failed_count += 1
@@ -132,7 +136,7 @@ def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir:
 def main():
     """主函数"""
     parser = argparse.ArgumentParser(
-        description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
+        description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,统一输出为MinerU格式',
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 示例用法:
@@ -148,6 +152,10 @@ def main():
          --paddleocr-vl-file /path/to/file_page_001.json \\
          --paddle-file /path/to/file_page_001.json \\
          --output-dir /path/to/output
+        
+输出格式说明:
+  - JSON: 统一的MinerU格式JSON文件
+  - Markdown: 基于MinerU格式生成的Markdown文件
         """
     )
     
@@ -185,7 +193,7 @@ def main():
         help='输出目录(必需)'
     )
     output_group.add_argument(
-        '-f', '--format', 
+        '-f', '--output-type', 
         choices=['json', 'markdown', 'both'], 
         default='both', 
         help='输出格式'
@@ -207,7 +215,7 @@ def main():
     )
     
     args = parser.parse_args()
-    output_format = args.format.lower()
+    output_type = args.output_type.lower()
     
     # 验证参数
     if args.paddleocr_vl_file and args.paddle_file:
@@ -236,7 +244,7 @@ def main():
             similarity_threshold=args.threshold
         )
         
-        success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_format, merger)
+        success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_type, merger)
         
         if success:
             print("\n✅ 处理完成!")
@@ -259,7 +267,7 @@ def main():
             args.paddleocr_vl_dir,
             args.paddle_dir,
             args.output_dir,
-            output_format=output_format,
+            output_type=output_type,
             look_ahead_window=args.window,
             similarity_threshold=args.threshold
         )
@@ -270,7 +278,7 @@ def main():
 
 
 if __name__ == "__main__":
-    print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序...")
+    print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序 (统一输出MinerU格式)...")
     
     import sys
     
@@ -280,7 +288,7 @@ if __name__ == "__main__":
             "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results/对公_招商银行图_page_001.json",
             "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
             "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results_cell_bbox",
-            "format": "both",
+            "output-type": "both",
             "window": "15",
             "threshold": "85"
         }