merge_paddleocr_vl_paddleocr.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. """
  2. 合并 PaddleOCR_VL 和 PaddleOCR 的结果
  3. 主程序入口
  4. """
  5. import json
  6. import argparse
  7. from pathlib import Path
  8. import sys
  9. from pathlib import Path
  10. # 添加 ocr_platform 根目录到 Python 路径
  11. ocr_platform_root = Path(__file__).parents[2] # ocr_merger -> ocr_tools -> ocr_platform -> repository.git
  12. if str(ocr_platform_root) not in sys.path:
  13. sys.path.insert(0, str(ocr_platform_root))
  14. try:
  15. from ocr_tools.ocr_merger.paddleocr_vl_merger import PaddleOCRVLMerger
  16. except ImportError:
  17. from paddleocr_vl_merger import PaddleOCRVLMerger
  18. def merge_single_file(paddleocr_vl_file: Path, paddle_file: Path, output_dir: Path,
  19. output_type: str, merger: PaddleOCRVLMerger) -> bool:
  20. """
  21. 合并单个文件
  22. Args:
  23. paddleocr_vl_file: PaddleOCR_VL JSON 文件路径
  24. paddle_file: PaddleOCR JSON 文件路径
  25. output_dir: 输出目录
  26. output_format: 输出格式
  27. merger: 合并器实例
  28. Returns:
  29. 是否成功
  30. """
  31. print(f"📄 处理: {paddleocr_vl_file.name}")
  32. # 输出文件路径
  33. merged_md_path = output_dir / f"{paddleocr_vl_file.stem}.md"
  34. merged_json_path = output_dir / f"{paddleocr_vl_file.stem}.json"
  35. try:
  36. # ✅ 合并数据 (统一输出为MinerU格式)
  37. merged_data = merger.merge_table_with_bbox(
  38. str(paddleocr_vl_file),
  39. str(paddle_file),
  40. data_format='mineru' # 强制使用MinerU格式
  41. )
  42. # ✅ 生成 Markdown (基于MinerU格式)
  43. if output_type in ['markdown', 'both']:
  44. markdown = merger.generate_enhanced_markdown(
  45. merged_data,
  46. str(merged_md_path),
  47. str(paddleocr_vl_file),
  48. data_format='mineru' # 强制使用MinerU格式
  49. )
  50. # ✅ 保存 JSON (MinerU格式)
  51. if output_type in ['json', 'both']:
  52. with open(merged_json_path, 'w', encoding='utf-8') as f:
  53. json.dump(merged_data, f, ensure_ascii=False, indent=2)
  54. print(f" ✅ 合并完成 (MinerU格式)")
  55. print(f" 📊 共处理了 {len(merged_data)} 个对象")
  56. print(f" 💾 输出文件:")
  57. if output_type in ['markdown', 'both']:
  58. print(f" - {merged_md_path.name}")
  59. if output_type in ['json', 'both']:
  60. print(f" - {merged_json_path.name}")
  61. return True
  62. except Exception as e:
  63. print(f" ❌ 处理失败: {e}")
  64. import traceback
  65. traceback.print_exc()
  66. return False
  67. def merge_paddleocr_vl_batch(paddleocr_vl_dir: str, paddle_dir: str, output_dir: str,
  68. output_type: str = 'both',
  69. look_ahead_window: int = 10,
  70. similarity_threshold: int = 80):
  71. """
  72. 批量合并 PaddleOCR_VL 和 PaddleOCR 的结果
  73. Args:
  74. paddleocr_vl_dir: PaddleOCR_VL 结果目录
  75. paddle_dir: PaddleOCR 结果目录
  76. output_dir: 输出目录
  77. output_format: 输出格式
  78. look_ahead_window: 向前查找窗口大小
  79. similarity_threshold: 相似度阈值
  80. """
  81. paddleocr_vl_path = Path(paddleocr_vl_dir)
  82. paddle_path = Path(paddle_dir)
  83. output_path = Path(output_dir)
  84. output_path.mkdir(parents=True, exist_ok=True)
  85. merger = PaddleOCRVLMerger(look_ahead_window, similarity_threshold)
  86. # 查找所有 PaddleOCR_VL 的 JSON 文件
  87. paddleocr_vl_files = list(paddleocr_vl_path.glob('*_page_*[0-9].json'))
  88. paddleocr_vl_files.sort()
  89. print(f"\n🔍 找到 {len(paddleocr_vl_files)} 个 PaddleOCR_VL 文件")
  90. print(f"📂 PaddleOCR_VL 目录: {paddleocr_vl_dir}")
  91. print(f"📂 PaddleOCR 目录: {paddle_dir}")
  92. print(f"📂 输出目录: {output_dir}")
  93. print(f"⚙️ 查找窗口: {look_ahead_window}")
  94. print(f"⚙️ 相似度阈值: {similarity_threshold}%\n")
  95. success_count = 0
  96. failed_count = 0
  97. for paddleocr_vl_file in paddleocr_vl_files:
  98. # 查找对应的 PaddleOCR 文件
  99. paddle_file = paddle_path / paddleocr_vl_file.name
  100. if not paddle_file.exists():
  101. print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
  102. failed_count += 1
  103. continue
  104. if merge_single_file(paddleocr_vl_file, paddle_file, output_path, output_type, merger):
  105. success_count += 1
  106. else:
  107. failed_count += 1
  108. print()
  109. print("=" * 60)
  110. print(f"✅ 处理完成!")
  111. print(f"📊 统计信息:")
  112. print(f" - 总文件数: {len(paddleocr_vl_files)}")
  113. print(f" - 成功: {success_count}")
  114. print(f" - 失败: {failed_count}")
  115. print("=" * 60)
  116. def main():
  117. """主函数"""
  118. parser = argparse.ArgumentParser(
  119. description='合并 PaddleOCR_VL 和 PaddleOCR 的识别结果,统一输出为MinerU格式',
  120. formatter_class=argparse.RawDescriptionHelpFormatter,
  121. epilog="""
  122. 示例用法:
  123. 1. 批量处理整个目录:
  124. python merge_paddleocr_vl_paddleocr.py \\
  125. --paddleocr-vl-dir /path/to/paddleocr_vl/results \\
  126. --paddle-dir /path/to/paddle/results \\
  127. --output-dir /path/to/output
  128. 2. 处理单个文件:
  129. python merge_paddleocr_vl_paddleocr.py \\
  130. --paddleocr-vl-file /path/to/file_page_001.json \\
  131. --paddle-file /path/to/file_page_001.json \\
  132. --output-dir /path/to/output
  133. 输出格式说明:
  134. - JSON: 统一的MinerU格式JSON文件
  135. - Markdown: 基于MinerU格式生成的Markdown文件
  136. """
  137. )
  138. # 文件/目录参数
  139. file_group = parser.add_argument_group('文件参数')
  140. file_group.add_argument(
  141. '--paddleocr-vl-file',
  142. type=str,
  143. help='PaddleOCR_VL 输出的 JSON 文件路径(单文件模式)'
  144. )
  145. file_group.add_argument(
  146. '--paddle-file',
  147. type=str,
  148. help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
  149. )
  150. dir_group = parser.add_argument_group('目录参数')
  151. dir_group.add_argument(
  152. '--paddleocr-vl-dir',
  153. type=str,
  154. help='PaddleOCR_VL 结果目录(批量模式)'
  155. )
  156. dir_group.add_argument(
  157. '--paddle-dir',
  158. type=str,
  159. help='PaddleOCR 结果目录(批量模式)'
  160. )
  161. # 输出参数
  162. output_group = parser.add_argument_group('输出参数')
  163. output_group.add_argument(
  164. '-o', '--output-dir',
  165. type=str,
  166. required=True,
  167. help='输出目录(必需)'
  168. )
  169. output_group.add_argument(
  170. '-f', '--output-type',
  171. choices=['json', 'markdown', 'both'],
  172. default='both',
  173. help='输出格式'
  174. )
  175. # 算法参数
  176. algo_group = parser.add_argument_group('算法参数')
  177. algo_group.add_argument(
  178. '-w', '--window',
  179. type=int,
  180. default=15,
  181. help='向前查找的窗口大小(默认: 15)'
  182. )
  183. algo_group.add_argument(
  184. '-t', '--threshold',
  185. type=int,
  186. default=80,
  187. help='文本相似度阈值(0-100,默认: 80)'
  188. )
  189. args = parser.parse_args()
  190. output_type = args.output_type.lower()
  191. # 验证参数
  192. if args.paddleocr_vl_file and args.paddle_file:
  193. # 单文件模式
  194. paddleocr_vl_file = Path(args.paddleocr_vl_file)
  195. paddle_file = Path(args.paddle_file)
  196. output_dir = Path(args.output_dir)
  197. if not paddleocr_vl_file.exists():
  198. print(f"❌ 错误: PaddleOCR_VL 文件不存在: {paddleocr_vl_file}")
  199. return
  200. if not paddle_file.exists():
  201. print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
  202. return
  203. output_dir.mkdir(parents=True, exist_ok=True)
  204. print("\n🔧 单文件处理模式")
  205. print(f"📄 PaddleOCR_VL 文件: {paddleocr_vl_file}")
  206. print(f"📄 PaddleOCR 文件: {paddle_file}")
  207. print(f"📂 输出目录: {output_dir}\n")
  208. merger = PaddleOCRVLMerger(
  209. look_ahead_window=args.window,
  210. similarity_threshold=args.threshold
  211. )
  212. success = merge_single_file(paddleocr_vl_file, paddle_file, output_dir, output_type, merger)
  213. if success:
  214. print("\n✅ 处理完成!")
  215. else:
  216. print("\n❌ 处理失败!")
  217. elif args.paddleocr_vl_dir and args.paddle_dir:
  218. # 批量模式
  219. if not Path(args.paddleocr_vl_dir).exists():
  220. print(f"❌ 错误: PaddleOCR_VL 目录不存在: {args.paddleocr_vl_dir}")
  221. return
  222. if not Path(args.paddle_dir).exists():
  223. print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
  224. return
  225. print("\n🔧 批量处理模式")
  226. merge_paddleocr_vl_batch(
  227. args.paddleocr_vl_dir,
  228. args.paddle_dir,
  229. args.output_dir,
  230. output_type=output_type,
  231. look_ahead_window=args.window,
  232. similarity_threshold=args.threshold
  233. )
  234. else:
  235. parser.print_help()
  236. print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
  237. if __name__ == "__main__":
  238. print("🚀 启动 PaddleOCR_VL + PaddleOCR 合并程序 (统一输出MinerU格式)...")
  239. import sys
  240. if len(sys.argv) == 1:
  241. # 默认配置
  242. # default_config = {
  243. # "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/paddleocr_vl_results/A用户_单元格扫描流水_page_007.json",
  244. # "paddle-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/ppstructurev3_client_results/A用户_单元格扫描流水_page_007.json",
  245. # "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/paddleocr_vl_results_cell_bbox",
  246. # "output-type": "both",
  247. # "window": "15",
  248. # "threshold": "85"
  249. # }
  250. # default_config = {
  251. # "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/paddleocr_vl_results/B用户_扫描流水_page_001.json",
  252. # "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/ppstructurev3_client_results/B用户_扫描流水_page_001.json",
  253. # "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/paddleocr_vl_results_cell_bbox",
  254. # "output-type": "both",
  255. # "window": "15",
  256. # "threshold": "85"
  257. # }
  258. default_config = {
  259. "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results/2023年度报告母公司_page_006.json",
  260. "paddle-file": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/ppstructurev3_client_results/2023年度报告母公司_page_006.json",
  261. "output-dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/paddleocr_vl_results_cell_bbox",
  262. "output-type": "both",
  263. "window": "15",
  264. "threshold": "85"
  265. }
  266. # default_config = {
  267. # "paddleocr-vl-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/paddleocr_vl_results/康强_北京农村商业银行_page_001.json",
  268. # "paddle-file": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/ppstructurev3_client_results/康强_北京农村商业银行_page_001.json",
  269. # "output-dir": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行/paddleocr_vl_results_cell_bbox",
  270. # "output-type": "both",
  271. # "window": "15",
  272. # "threshold": "85"
  273. # }
  274. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  275. print("⚙️ 默认参数:")
  276. for key, value in default_config.items():
  277. print(f" --{key}: {value}")
  278. sys.argv = [sys.argv[0]]
  279. for key, value in default_config.items():
  280. sys.argv.extend([f"--{key}", str(value)])
  281. sys.exit(main())