| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309 |
- """
- 合并 MinerU 和 PaddleOCR 的结果
- 主程序入口
- """
- import json
- import argparse
- from pathlib import Path
- try:
- from .merger_core import MinerUPaddleOCRMerger
- except ImportError:
- from merger_core import MinerUPaddleOCRMerger
def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
                      output_type: str, merger: "MinerUPaddleOCRMerger") -> bool:
    """Merge one MinerU result file with its PaddleOCR counterpart.

    The outputs are named after the stem of ``mineru_file`` and written
    into ``output_dir`` (``<stem>.md`` and/or ``<stem>.json``).

    Args:
        mineru_file: Path of the MinerU JSON file.
        paddle_file: Path of the PaddleOCR JSON file.
        output_dir: Directory receiving the merged output; it is created
            (including parents) when it does not exist yet.
        output_type: ``'markdown'``, ``'json'`` or ``'both'``. Any other
            value writes no file.
        merger: Merger instance; its ``merge_table_with_bbox`` and
            ``generate_enhanced_markdown`` methods are used.

    Returns:
        True when every step completed, False when any step raised. The
        error and its traceback are printed instead of being propagated.
    """
    print(f"📄 处理: {mineru_file.name}")

    want_markdown = output_type in ('markdown', 'both')
    want_json = output_type in ('json', 'both')

    # Output file paths share the stem of the MinerU input file.
    merged_md_path = output_dir / f"{mineru_file.stem}.md"
    merged_json_path = output_dir / f"{mineru_file.stem}.json"

    try:
        # Merge the two recognition results.
        merged_data = merger.merge_table_with_bbox(
            str(mineru_file),
            str(paddle_file)
        )

        # Do not rely on the caller having created the directory already.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Write the Markdown rendering.
        if want_markdown:
            merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)

        # Write the merged JSON.
        if want_json:
            with open(merged_json_path, 'w', encoding='utf-8') as f:
                json.dump(merged_data, f, ensure_ascii=False, indent=2)

        print(" ✅ 合并完成")
        print(f" 📊 共处理了 {len(merged_data)} 个对象")
        print(" 💾 输出文件:")
        if want_markdown:
            print(f" - {merged_md_path.name}")
        if want_json:
            print(f" - {merged_json_path.name}")
        return True

    except Exception as e:
        # Per-file boundary: report the failure and let the caller carry on
        # with the remaining files.
        print(f" ❌ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
                              output_type: str = 'both',
                              look_ahead_window: int = 10,
                              similarity_threshold: int = 80):
    """Merge every MinerU page result in a directory with its PaddleOCR twin.

    MinerU files are discovered with the glob ``*_page_*[0-9].json`` and
    paired with the file of the same name inside ``paddle_dir``. A MinerU
    file without a counterpart is reported and counted as failed.

    Args:
        mineru_dir: Directory holding the MinerU results.
        paddle_dir: Directory holding the PaddleOCR results.
        output_dir: Directory for the merged output (created if missing).
        output_type: Output format forwarded to ``merge_single_file``.
        look_ahead_window: Look-ahead window size passed to the merger.
        similarity_threshold: Similarity threshold passed to the merger.
    """
    source_root = Path(mineru_dir)
    ocr_root = Path(paddle_dir)
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    merger = MinerUPaddleOCRMerger(look_ahead_window, similarity_threshold)

    # Collect the MinerU page files in a stable, sorted order.
    mineru_files = sorted(source_root.glob('*_page_*[0-9].json'))
    total = len(mineru_files)

    print(f"\n🔍 找到 {total} 个 MinerU 文件")
    print(f"📂 MinerU 目录: {mineru_dir}")
    print(f"📂 PaddleOCR 目录: {paddle_dir}")
    print(f"📂 输出目录: {output_dir}")
    print(f"⚙️ 查找窗口: {look_ahead_window}")
    print(f"⚙️ 相似度阈值: {similarity_threshold}%\n")

    succeeded = 0
    for page_file in mineru_files:
        # The PaddleOCR result is expected under the same file name.
        counterpart = ocr_root / page_file.name

        if counterpart.exists():
            if merge_single_file(page_file, counterpart, target_root, output_type, merger):
                succeeded += 1
            print()  # blank line between files
        else:
            print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {counterpart.name}\n")

    # Every file that did not succeed (skipped or errored) counts as failed.
    failed = total - succeeded

    # Summary
    separator = "=" * 60
    print(separator)
    print("✅ 处理完成!")
    print("📊 统计信息:")
    print(f" - 总文件数: {total}")
    print(f" - 成功: {succeeded}")
    print(f" - 失败: {failed}")
    print(separator)
def main() -> int:
    """Parse the command line and run the merge in single-file or batch mode.

    Single-file mode is used when both ``--mineru-file`` and
    ``--paddle-file`` are given; otherwise batch mode is used when both
    ``--mineru-dir`` and ``--paddle-dir`` are given; otherwise the help
    text and an error are printed.

    Returns:
        Exit status for ``sys.exit``: 0 when the single-file merge succeeded
        or a batch run was started, 1 when an input path is missing, the
        single-file merge failed, or no complete mode was selected. Batch
        mode reports per-file failures only in its printed summary.
    """
    parser = argparse.ArgumentParser(
        description='合并 MinerU 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
1. 批量处理整个目录:
python merge_mineru_paddle_ocr.py \\
--mineru-dir /path/to/mineru/results \\
--paddle-dir /path/to/paddle/results \\
--output-dir /path/to/output
2. 处理单个文件:
python merge_mineru_paddle_ocr.py \\
--mineru-file /path/to/file_page_001.json \\
--paddle-file /path/to/file_page_001.json \\
--output-dir /path/to/output
3. 自定义参数:
python merge_mineru_paddle_ocr.py \\
--mineru-dir /path/to/mineru \\
--paddle-dir /path/to/paddle \\
--output-dir /path/to/output \\
--window 15 \\
--threshold 85
"""
    )

    # Single-file inputs
    file_group = parser.add_argument_group('文件参数')
    file_group.add_argument(
        '--mineru-file',
        type=str,
        help='MinerU 输出的 JSON 文件路径(单文件模式)'
    )
    file_group.add_argument(
        '--paddle-file',
        type=str,
        help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
    )

    # Batch inputs
    dir_group = parser.add_argument_group('目录参数')
    dir_group.add_argument(
        '--mineru-dir',
        type=str,
        help='MinerU 结果目录(批量模式)'
    )
    dir_group.add_argument(
        '--paddle-dir',
        type=str,
        help='PaddleOCR 结果目录(批量模式)'
    )

    # Output options
    output_group = parser.add_argument_group('输出参数')
    output_group.add_argument(
        '-o', '--output-dir',
        type=str,
        required=True,
        help='输出目录(必需)'
    )
    output_group.add_argument(
        '-f', '--output-type',
        choices=['json', 'markdown', 'both'],
        default='both', help='输出格式'
    )

    # Algorithm options. The help text interpolates the real default so the
    # documented value cannot drift from the configured one.
    algo_group = parser.add_argument_group('算法参数')
    algo_group.add_argument(
        '-w', '--window',
        type=int,
        default=15,
        help='向前查找的窗口大小(默认: %(default)s)'
    )
    algo_group.add_argument(
        '-t', '--threshold',
        type=int,
        default=80,
        help='文本相似度阈值(0-100,默认: %(default)s)'
    )

    args = parser.parse_args()
    # ``choices`` already restricts the value to the lowercase names.
    output_type = args.output_type

    if args.mineru_file and args.paddle_file:
        # Single-file mode
        mineru_file = Path(args.mineru_file)
        paddle_file = Path(args.paddle_file)
        output_dir = Path(args.output_dir)

        if not mineru_file.exists():
            print(f"❌ 错误: MinerU 文件不存在: {mineru_file}")
            return 1

        if not paddle_file.exists():
            print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
            return 1

        output_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔧 单文件处理模式")
        print(f"📄 MinerU 文件: {mineru_file}")
        print(f"📄 PaddleOCR 文件: {paddle_file}")
        print(f"📂 输出目录: {output_dir}")
        print(f"⚙️ 查找窗口: {args.window}")
        print(f"⚙️ 相似度阈值: {args.threshold}%\n")

        merger = MinerUPaddleOCRMerger(
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )

        success = merge_single_file(mineru_file, paddle_file, output_dir, output_type, merger)

        if success:
            print("\n✅ 处理完成!")
            return 0
        print("\n❌ 处理失败!")
        return 1

    if args.mineru_dir and args.paddle_dir:
        # Batch mode
        if not Path(args.mineru_dir).exists():
            print(f"❌ 错误: MinerU 目录不存在: {args.mineru_dir}")
            return 1

        if not Path(args.paddle_dir).exists():
            print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
            return 1

        print("\n🔧 批量处理模式")

        merge_mineru_paddle_batch(
            args.mineru_dir,
            args.paddle_dir,
            args.output_dir,
            output_type=output_type,
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )
        return 0

    # Neither a complete file pair nor a complete directory pair was given.
    parser.print_help()
    print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
    print(" 单文件模式: --mineru-file 和 --paddle-file")
    print(" 批量模式: --mineru-dir 和 --paddle-dir")
    return 1
if __name__ == "__main__":
    import sys

    print("🚀 启动 MinerU + PaddleOCR 合并程序...")

    if len(sys.argv) == 1:
        # No command-line arguments were given: fall back to the built-in
        # default configuration and feed it to the parser through sys.argv.
        print("ℹ️ 未提供命令行参数,使用默认配置运行...")

        # Default configuration (single-file mode). The commented-out entries
        # are the batch-mode alternative.
        default_config = {
            "mineru-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/mineru-vlm-2.5.3_Results/对公_招商银行图_page_001.json",
            "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
            "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/merged_results",
            # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
            # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
            # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
            "output-type": "both",
            "window": "15",
            "threshold": "85",
        }

        print("⚙️ 默认参数:")
        injected_args = []
        for key, value in default_config.items():
            print(f" --{key}: {value}")
            injected_args += [f"--{key}", str(value)]

        # Rebuild argv as: program name followed by "--key value" pairs.
        sys.argv = [sys.argv[0], *injected_args]

    sys.exit(main())
|