|
|
@@ -0,0 +1,297 @@
|
|
|
+"""
|
|
|
+合并 PaddleOCR_VL 和 PaddleOCR 的结果
|
|
|
+主程序入口
|
|
|
+"""
|
|
|
+import json
|
|
|
+import argparse
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+try:
|
|
|
+ from .paddleocr_vl_merger import PaddleOCRVLMerger
|
|
|
+except ImportError:
|
|
|
+ from paddleocr_vl_merger import PaddleOCRVLMerger
|
|
|
+
|
|
|
+
|
|
|
def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Path,
                      output_format: str, merger: "PaddleOCRVLMerger") -> bool:
    """
    Merge one PaddleOCR_VL result file with its PaddleOCR counterpart.

    The merged data is produced by ``merger.merge_table_with_bbox`` and then
    written to ``output_dir`` as ``<stem>.md`` and/or ``<stem>.json``, where
    ``<stem>`` is the stem of ``paddleocr_vl_file``.

    Args:
        paddleocr_vl_file: Path of the PaddleOCR_VL JSON file.
        paddle_file: Path of the PaddleOCR JSON file.
        output_dir: Directory that receives the output files; it is created
            (including parents) if it does not exist yet.
        output_format: ``'markdown'``, ``'json'`` or ``'both'``; any other
            value writes nothing but still reports success.
        merger: Object providing ``merge_table_with_bbox`` and
            ``generate_enhanced_markdown``.

    Returns:
        True when merging and writing succeeded, False when any exception was
        raised (the error and its traceback are printed, not re-raised).
    """
    print(f"📄 处理: {paddleocr_vl_file.name}")

    # Output paths share the stem of the PaddleOCR_VL input file.
    merged_md_path = output_dir / f"{paddleocr_vl_file.stem}.md"
    merged_json_path = output_dir / f"{paddleocr_vl_file.stem}.json"

    write_markdown = output_format in ('markdown', 'both')
    write_json = output_format in ('json', 'both')

    try:
        merged_data = merger.merge_table_with_bbox(
            str(paddleocr_vl_file),
            str(paddle_file),
        )

        # Do not rely on the caller having created the directory: opening the
        # JSON file below would otherwise fail with FileNotFoundError.
        output_dir.mkdir(parents=True, exist_ok=True)

        if write_markdown:
            merger.generate_enhanced_markdown(
                merged_data, str(merged_md_path), str(paddleocr_vl_file)
            )

        if write_json:
            with open(merged_json_path, 'w', encoding='utf-8') as f:
                json.dump(merged_data, f, ensure_ascii=False, indent=2)

        print(" ✅ 合并完成")
        print(f" 📊 共处理了 {len(merged_data)} 个对象")
        print(" 💾 输出文件:")
        if write_markdown:
            print(f" - {merged_md_path.name}")
        if write_json:
            print(f" - {merged_json_path.name}")

        return True

    except Exception as e:
        # Per-file boundary: report the failure and let the caller continue
        # with the next file instead of aborting the whole run.
        print(f" ❌ 处理失败: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
+
|
|
|
+
|
|
|
def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir: str,
                             output_format: str = 'both',
                             look_ahead_window: int = 10,
                             similarity_threshold: int = 80) -> tuple:
    """
    Merge every PaddleOCR_VL page result in a directory with its PaddleOCR counterpart.

    Input files are those in ``paddleocr_vl_dir`` matching
    ``*_page_*[0-9].json``, processed in sorted order. The counterpart of each
    file is the file with the same name inside ``paddle_dir``; when it does
    not exist the file is skipped and counted as failed.

    Args:
        paddleocr_vl_dir: Directory holding the PaddleOCR_VL results.
        paddle_dir: Directory holding the PaddleOCR results.
        output_dir: Output directory; created (including parents) if missing.
        output_format: Output format forwarded to ``merge_single_file``.
        look_ahead_window: Look-ahead window size passed to the merger.
        similarity_threshold: Similarity threshold passed to the merger.

    Returns:
        ``(success_count, failed_count)`` so that callers can detect partial
        failures programmatically; the same numbers are printed as a summary.
    """
    paddleocr_vl_path = Path(paddleocr_vl_dir)
    paddle_path = Path(paddle_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # One merger instance is shared by all files of the batch.
    merger = PaddleOCRVLMerger(look_ahead_window, similarity_threshold)

    # Collect all PaddleOCR_VL page JSON files in a deterministic order.
    paddleocr_vl_files = sorted(paddleocr_vl_path.glob('*_page_*[0-9].json'))

    print(f"\n🔍 找到 {len(paddleocr_vl_files)} 个 PaddleOCR_VL 文件")
    print(f"📂 PaddleOCR_VL 目录: {paddleocr_vl_dir}")
    print(f"📂 PaddleOCR 目录: {paddle_dir}")
    print(f"📂 输出目录: {output_dir}")
    print(f"⚙️ 查找窗口: {look_ahead_window}")
    print(f"⚙️ 相似度阈值: {similarity_threshold}%\n")

    success_count = 0
    failed_count = 0

    for paddleocr_vl_file in paddleocr_vl_files:
        # The PaddleOCR counterpart is looked up by identical file name.
        paddle_file = paddle_path / paddleocr_vl_file.name

        if not paddle_file.exists():
            print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
            failed_count += 1
            continue

        if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_format, merger):
            success_count += 1
        else:
            failed_count += 1

        print()

    print("=" * 60)
    print("✅ 处理完成!")
    print("📊 统计信息:")
    print(f" - 总文件数: {len(paddleocr_vl_files)}")
    print(f" - 成功: {success_count}")
    print(f" - 失败: {failed_count}")
    print("=" * 60)

    return success_count, failed_count
|
|
|
+
|
|
|
+
|
|
|
def main() -> int:
    """
    Command-line entry point: parse the arguments and run one of two modes.

    Single-file mode is selected when both ``--paddleocr-vl-file`` and
    ``--paddle-file`` are given; otherwise batch mode is selected when both
    ``--paddleocr-vl-dir`` and ``--paddle-dir`` are given. If neither pair is
    complete, the help text and an error message are printed.

    Returns:
        Process exit status suitable for ``sys.exit``: 0 on success, 1 when an
        input path does not exist, the single-file merge failed, or no
        complete mode was specified. Batch mode returns 0 once the batch has
        run; per-file results are reported in the printed summary.
    """
    parser = argparse.ArgumentParser(
        description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:

 1. 批量处理整个目录:
 python merge_paddleocr_vl_paddleocr.py \\
 --paddleocr-vl-dir /path/to/paddleocr_vl/results \\
 --paddle-dir /path/to/paddle/results \\
 --output-dir /path/to/output

 2. 处理单个文件:
 python merge_paddleocr_vl_paddleocr.py \\
 --paddleocr-vl-file /path/to/file_page_001.json \\
 --paddle-file /path/to/file_page_001.json \\
 --output-dir /path/to/output
 """
    )

    # File arguments (single-file mode)
    file_group = parser.add_argument_group('文件参数')
    file_group.add_argument(
        '--paddleocr-vl-file',
        type=str,
        help='PaddleOCR_VL 输出的 JSON 文件路径(单文件模式)'
    )
    file_group.add_argument(
        '--paddle-file',
        type=str,
        help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
    )

    # Directory arguments (batch mode)
    dir_group = parser.add_argument_group('目录参数')
    dir_group.add_argument(
        '--paddleocr-vl-dir',
        type=str,
        help='PaddleOCR_VL 结果目录(批量模式)'
    )
    dir_group.add_argument(
        '--paddle-dir',
        type=str,
        help='PaddleOCR 结果目录(批量模式)'
    )

    # Output arguments
    output_group = parser.add_argument_group('输出参数')
    output_group.add_argument(
        '-o', '--output-dir',
        type=str,
        required=True,
        help='输出目录(必需)'
    )
    output_group.add_argument(
        '-f', '--format',
        choices=['json', 'markdown', 'both'],
        default='both',
        help='输出格式'
    )

    # Algorithm arguments
    algo_group = parser.add_argument_group('算法参数')
    algo_group.add_argument(
        '-w', '--window',
        type=int,
        default=15,
        help='向前查找的窗口大小(默认: 15)'
    )
    algo_group.add_argument(
        '-t', '--threshold',
        type=int,
        default=80,
        help='文本相似度阈值(0-100,默认: 80)'
    )

    args = parser.parse_args()
    output_format = args.format.lower()

    if args.paddleocr_vl_file and args.paddle_file:
        # Single-file mode
        paddleocr_vl_file = Path(args.paddleocr_vl_file)
        paddle_file = Path(args.paddle_file)
        output_dir = Path(args.output_dir)

        if not paddleocr_vl_file.exists():
            print(f"❌ 错误: PaddleOCR_VL 文件不存在: {paddleocr_vl_file}")
            return 1

        if not paddle_file.exists():
            print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
            return 1

        # Created only after both inputs were validated, so a bad invocation
        # leaves no empty output directory behind.
        output_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔧 单文件处理模式")
        print(f"📄 PaddleOCR_VL 文件: {paddleocr_vl_file}")
        print(f"📄 PaddleOCR 文件: {paddle_file}")
        print(f"📂 输出目录: {output_dir}\n")

        merger = PaddleOCRVLMerger(
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )

        success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_format, merger)

        if success:
            print("\n✅ 处理完成!")
            return 0

        print("\n❌ 处理失败!")
        return 1

    if args.paddleocr_vl_dir and args.paddle_dir:
        # Batch mode
        if not Path(args.paddleocr_vl_dir).exists():
            print(f"❌ 错误: PaddleOCR_VL 目录不存在: {args.paddleocr_vl_dir}")
            return 1

        if not Path(args.paddle_dir).exists():
            print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
            return 1

        print("\n🔧 批量处理模式")

        merge_paddleocr_vl_batch(
            args.paddleocr_vl_dir,
            args.paddle_dir,
            args.output_dir,
            output_format=output_format,
            look_ahead_window=args.window,
            similarity_threshold=args.threshold
        )
        return 0

    # Neither a complete file pair nor a complete directory pair was given.
    parser.print_help()
    print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
    return 1
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    import sys

    print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序...")

    # Started without any command-line argument: fall back to the built-in
    # sample configuration by rewriting sys.argv before main() parses it.
    launched_without_args = len(sys.argv) == 1
    if launched_without_args:
        default_config = {
            "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results/对公_招商银行图_page_001.json",
            "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
            "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results_cell_bbox",
            "format": "both",
            "window": "15",
            "threshold": "85"
        }

        print("ℹ️ 未提供命令行参数,使用默认配置运行...")
        print("⚙️ 默认参数:")

        # Echo each default and, in the same pass, collect it as a
        # "--option value" pair for the synthetic argument vector.
        synthetic_argv = [sys.argv[0]]
        for option, setting in default_config.items():
            print(f" --{option}: {setting}")
            synthetic_argv += [f"--{option}", str(setting)]

        sys.argv = synthetic_argv

    # The return value of main() becomes the process exit status.
    sys.exit(main())
|