merge_paddleocr_vl_paddleocr.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. """
  2. 合并 PaddleOCR_VL 和 PaddleOCR 的结果
  3. 主程序入口
  4. """
  5. import json
  6. import argparse
  7. from pathlib import Path
  8. try:
  9. from .paddleocr_vl_merger import PaddleOCRVLMerger
  10. except ImportError:
  11. from paddleocr_vl_merger import PaddleOCRVLMerger
  12. def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Path,
  13. output_format: str, merger: PaddleOCRVLMerger) -> bool:
  14. """
  15. 合并单个文件
  16. Args:
  17. paddleocr_vl_file: PaddleOCR_VL JSON 文件路径
  18. paddle_file: PaddleOCR JSON 文件路径
  19. output_dir: 输出目录
  20. output_format: 输出格式
  21. merger: 合并器实例
  22. Returns:
  23. 是否成功
  24. """
  25. print(f"📄 处理: {paddleocr_vl_file.name}")
  26. # 输出文件路径
  27. merged_md_path = output_dir / f"{paddleocr_vl_file.stem}.md"
  28. merged_json_path = output_dir / f"{paddleocr_vl_file.stem}.json"
  29. try:
  30. # 合并数据
  31. merged_data = merger.merge_table_with_bbox(
  32. str(paddleocr_vl_file),
  33. str(paddle_file)
  34. )
  35. # 生成 Markdown
  36. if output_format in ['markdown', 'both']:
  37. merger.generate_enhanced_markdown(
  38. merged_data, str(merged_md_path), str(paddleocr_vl_file)
  39. )
  40. # 保存 JSON
  41. if output_format in ['json', 'both']:
  42. with open(merged_json_path, 'w', encoding='utf-8') as f:
  43. json.dump(merged_data, f, ensure_ascii=False, indent=2)
  44. print(f" ✅ 合并完成")
  45. print(f" 📊 共处理了 {len(merged_data)} 个对象")
  46. print(f" 💾 输出文件:")
  47. if output_format in ['markdown', 'both']:
  48. print(f" - {merged_md_path.name}")
  49. if output_format in ['json', 'both']:
  50. print(f" - {merged_json_path.name}")
  51. return True
  52. except Exception as e:
  53. print(f" ❌ 处理失败: {e}")
  54. import traceback
  55. traceback.print_exc()
  56. return False
  57. def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir: str,
  58. output_format: str = 'both',
  59. look_ahead_window: int = 10,
  60. similarity_threshold: int = 80):
  61. """
  62. 批量合并 PaddleOCR_VL 和 PaddleOCR 的结果
  63. Args:
  64. paddleocr_vl_dir: PaddleOCR_VL 结果目录
  65. paddle_dir: PaddleOCR 结果目录
  66. output_dir: 输出目录
  67. output_format: 输出格式
  68. look_ahead_window: 向前查找窗口大小
  69. similarity_threshold: 相似度阈值
  70. """
  71. paddleocr_vl_path = Path(paddleocr_vl_dir)
  72. paddle_path = Path(paddle_dir)
  73. output_path = Path(output_dir)
  74. output_path.mkdir(parents=True, exist_ok=True)
  75. merger = PaddleOCRVLMerger(look_ahead_window, similarity_threshold)
  76. # 查找所有 PaddleOCR_VL 的 JSON 文件
  77. paddleocr_vl_files = list(paddleocr_vl_path.glob('*_page_*[0-9].json'))
  78. paddleocr_vl_files.sort()
  79. print(f"\n🔍 找到 {len(paddleocr_vl_files)} 个 PaddleOCR_VL 文件")
  80. print(f"📂 PaddleOCR_VL 目录: {paddleocr_vl_dir}")
  81. print(f"📂 PaddleOCR 目录: {paddle_dir}")
  82. print(f"📂 输出目录: {output_dir}")
  83. print(f"⚙️ 查找窗口: {look_ahead_window}")
  84. print(f"⚙️ 相似度阈值: {similarity_threshold}%\n")
  85. success_count = 0
  86. failed_count = 0
  87. for paddleocr_vl_file in paddleocr_vl_files:
  88. # 查找对应的 PaddleOCR 文件
  89. paddle_file = paddle_path / paddleocr_vl_file.name
  90. if not paddle_file.exists():
  91. print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
  92. failed_count += 1
  93. continue
  94. if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_format, merger):
  95. success_count += 1
  96. else:
  97. failed_count += 1
  98. print()
  99. print("=" * 60)
  100. print(f"✅ 处理完成!")
  101. print(f"📊 统计信息:")
  102. print(f" - 总文件数: {len(paddleocr_vl_files)}")
  103. print(f" - 成功: {success_count}")
  104. print(f" - 失败: {failed_count}")
  105. print("=" * 60)
  106. def main():
  107. """主函数"""
  108. parser = argparse.ArgumentParser(
  109. description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
  110. formatter_class=argparse.RawDescriptionHelpFormatter,
  111. epilog="""
  112. 示例用法:
  113. 1. 批量处理整个目录:
  114. python merge_paddleocr_vl_paddleocr.py \\
  115. --paddleocr-vl-dir /path/to/paddleocr_vl/results \\
  116. --paddle-dir /path/to/paddle/results \\
  117. --output-dir /path/to/output
  118. 2. 处理单个文件:
  119. python merge_paddleocr_vl_paddleocr.py \\
  120. --paddleocr-vl-file /path/to/file_page_001.json \\
  121. --paddle-file /path/to/file_page_001.json \\
  122. --output-dir /path/to/output
  123. """
  124. )
  125. # 文件/目录参数
  126. file_group = parser.add_argument_group('文件参数')
  127. file_group.add_argument(
  128. '--paddleocr-vl-file',
  129. type=str,
  130. help='PaddleOCR_VL 输出的 JSON 文件路径(单文件模式)'
  131. )
  132. file_group.add_argument(
  133. '--paddle-file',
  134. type=str,
  135. help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
  136. )
  137. dir_group = parser.add_argument_group('目录参数')
  138. dir_group.add_argument(
  139. '--paddleocr-vl-dir',
  140. type=str,
  141. help='PaddleOCR_VL 结果目录(批量模式)'
  142. )
  143. dir_group.add_argument(
  144. '--paddle-dir',
  145. type=str,
  146. help='PaddleOCR 结果目录(批量模式)'
  147. )
  148. # 输出参数
  149. output_group = parser.add_argument_group('输出参数')
  150. output_group.add_argument(
  151. '-o', '--output-dir',
  152. type=str,
  153. required=True,
  154. help='输出目录(必需)'
  155. )
  156. output_group.add_argument(
  157. '-f', '--format',
  158. choices=['json', 'markdown', 'both'],
  159. default='both',
  160. help='输出格式'
  161. )
  162. # 算法参数
  163. algo_group = parser.add_argument_group('算法参数')
  164. algo_group.add_argument(
  165. '-w', '--window',
  166. type=int,
  167. default=15,
  168. help='向前查找的窗口大小(默认: 15)'
  169. )
  170. algo_group.add_argument(
  171. '-t', '--threshold',
  172. type=int,
  173. default=80,
  174. help='文本相似度阈值(0-100,默认: 80)'
  175. )
  176. args = parser.parse_args()
  177. output_format = args.format.lower()
  178. # 验证参数
  179. if args.paddleocr_vl_file and args.paddle_file:
  180. # 单文件模式
  181. paddleocr_vl_file = Path(args.paddleocr_vl_file)
  182. paddle_file = Path(args.paddle_file)
  183. output_dir = Path(args.output_dir)
  184. if not paddleocr_vl_file.exists():
  185. print(f"❌ 错误: PaddleOCR_VL 文件不存在: {paddleocr_vl_file}")
  186. return
  187. if not paddle_file.exists():
  188. print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
  189. return
  190. output_dir.mkdir(parents=True, exist_ok=True)
  191. print("\n🔧 单文件处理模式")
  192. print(f"📄 PaddleOCR_VL 文件: {paddleocr_vl_file}")
  193. print(f"📄 PaddleOCR 文件: {paddle_file}")
  194. print(f"📂 输出目录: {output_dir}\n")
  195. merger = PaddleOCRVLMerger(
  196. look_ahead_window=args.window,
  197. similarity_threshold=args.threshold
  198. )
  199. success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_format, merger)
  200. if success:
  201. print("\n✅ 处理完成!")
  202. else:
  203. print("\n❌ 处理失败!")
  204. elif args.paddleocr_vl_dir and args.paddle_dir:
  205. # 批量模式
  206. if not Path(args.paddleocr_vl_dir).exists():
  207. print(f"❌ 错误: PaddleOCR_VL 目录不存在: {args.paddleocr_vl_dir}")
  208. return
  209. if not Path(args.paddle_dir).exists():
  210. print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
  211. return
  212. print("\n🔧 批量处理模式")
  213. merge_paddleocr_vl_batch(
  214. args.paddleocr_vl_dir,
  215. args.paddle_dir,
  216. args.output_dir,
  217. output_format=output_format,
  218. look_ahead_window=args.window,
  219. similarity_threshold=args.threshold
  220. )
  221. else:
  222. parser.print_help()
  223. print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
  224. if __name__ == "__main__":
  225. print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序...")
  226. import sys
  227. if len(sys.argv) == 1:
  228. # 默认配置
  229. default_config = {
  230. "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results/对公_招商银行图_page_001.json",
  231. "paddle-file": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/data_PPStructureV3_Results/对公_招商银行图_page_001.json",
  232. "output-dir": "/Users/zhch158/workspace/data/流水分析/对公_招商银行图/PaddleOCR_VL_Results_cell_bbox",
  233. "format": "both",
  234. "window": "15",
  235. "threshold": "85"
  236. }
  237. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  238. print("⚙️ 默认参数:")
  239. for key, value in default_config.items():
  240. print(f" --{key}: {value}")
  241. sys.argv = [sys.argv[0]]
  242. for key, value in default_config.items():
  243. sys.argv.extend([f"--{key}", str(value)])
  244. sys.exit(main())