# merge_mineru_paddle_ocr.py
  1. """
  2. 合并 MinerU 和 PaddleOCR 的结果
  3. 主程序入口
  4. """
  5. import json
  6. import argparse
  7. import sys
  8. from pathlib import Path
# Put the ocr_platform root directory on the Python import path so that the
# package-style import below can be resolved.
# NOTE(review): following the chain in the inline comment, parents[3] is the
# directory *above* ocr_platform, while the import below needs the directory
# that contains `ocr_tools` -- confirm that index 3 is the intended level.
ocr_platform_root = Path(__file__).parents[3] # ocr_merger -> ocr_tools -> ocr_platform -> repository.git
if str(ocr_platform_root) not in sys.path:
    sys.path.insert(0, str(ocr_platform_root))

try:
    from ocr_tools.ocr_merger.merger_core import MinerUPaddleOCRMerger
except ImportError:
    # Fall back to a flat import of merger_core when the package-qualified
    # import cannot be resolved.
    from merger_core import MinerUPaddleOCRMerger
  17. def merge_single_file(mineru_file: Path, paddle_file: Path, output_dir: Path,
  18. output_type: str, merger: MinerUPaddleOCRMerger) -> bool:
  19. """
  20. 合并单个文件
  21. Args:
  22. mineru_file: MinerU JSON 文件路径
  23. paddle_file: PaddleOCR JSON 文件路径
  24. output_dir: 输出目录
  25. merger: 合并器实例
  26. Returns:
  27. 是否成功
  28. """
  29. print(f"📄 处理: {mineru_file.name}")
  30. # 输出文件路径
  31. merged_md_path = output_dir / f"{mineru_file.stem}.md"
  32. merged_json_path = output_dir / f"{mineru_file.stem}.json"
  33. try:
  34. # 合并数据
  35. merged_data = merger.merge_table_with_bbox(
  36. str(mineru_file),
  37. str(paddle_file)
  38. )
  39. # 生成 Markdown
  40. if output_type in ['markdown', 'both']:
  41. merger.generate_enhanced_markdown(merged_data, str(merged_md_path), mineru_file)
  42. # 保存 JSON
  43. if output_type in ['json', 'both']:
  44. with open(merged_json_path, 'w', encoding='utf-8') as f:
  45. json.dump(merged_data, f, ensure_ascii=False, indent=2)
  46. print(f" ✅ 合并完成")
  47. print(f" 📊 共处理了 {len(merged_data)} 个对象")
  48. print(f" 💾 输出文件:")
  49. if output_type in ['markdown', 'both']:
  50. print(f" - {merged_md_path.name}")
  51. if output_type in ['json', 'both']:
  52. print(f" - {merged_json_path.name}")
  53. return True
  54. except Exception as e:
  55. print(f" ❌ 处理失败: {e}")
  56. import traceback
  57. traceback.print_exc()
  58. return False
  59. def merge_mineru_paddle_batch(mineru_dir: str, paddle_dir: str, output_dir: str,
  60. output_type: str = 'both',
  61. look_ahead_window: int = 10,
  62. similarity_threshold: int = 80):
  63. """
  64. 批量合并 MinerU 和 PaddleOCR 的结果
  65. Args:
  66. mineru_dir: MinerU 结果目录
  67. paddle_dir: PaddleOCR 结果目录
  68. output_dir: 输出目录
  69. look_ahead_window: 向前查找窗口大小
  70. similarity_threshold: 相似度阈值
  71. """
  72. mineru_path = Path(mineru_dir)
  73. paddle_path = Path(paddle_dir)
  74. output_path = Path(output_dir)
  75. output_path.mkdir(parents=True, exist_ok=True)
  76. merger = MinerUPaddleOCRMerger(look_ahead_window, similarity_threshold)
  77. # 查找所有 MinerU 的 JSON 文件
  78. mineru_files = list(mineru_path.glob('*_page_*[0-9].json'))
  79. mineru_files.sort()
  80. print(f"\n🔍 找到 {len(mineru_files)} 个 MinerU 文件")
  81. print(f"📂 MinerU 目录: {mineru_dir}")
  82. print(f"📂 PaddleOCR 目录: {paddle_dir}")
  83. print(f"📂 输出目录: {output_dir}")
  84. print(f"⚙️ 查找窗口: {look_ahead_window}")
  85. print(f"⚙️ 相似度阈值: {similarity_threshold}%\n")
  86. success_count = 0
  87. failed_count = 0
  88. for mineru_file in mineru_files:
  89. # 查找对应的 PaddleOCR 文件
  90. paddle_file = paddle_path / mineru_file.name
  91. if not paddle_file.exists():
  92. print(f"⚠️ 跳过: 未找到对应的 PaddleOCR 文件: {paddle_file.name}\n")
  93. failed_count += 1
  94. continue
  95. if merge_single_file(mineru_file, paddle_file, output_path, output_type, merger):
  96. success_count += 1
  97. else:
  98. failed_count += 1
  99. print() # 空行分隔
  100. # 打印统计信息
  101. print("=" * 60)
  102. print(f"✅ 处理完成!")
  103. print(f"📊 统计信息:")
  104. print(f" - 总文件数: {len(mineru_files)}")
  105. print(f" - 成功: {success_count}")
  106. print(f" - 失败: {failed_count}")
  107. print("=" * 60)
  108. def main():
  109. """主函数"""
  110. parser = argparse.ArgumentParser(
  111. description='合并 MinerU 和 PaddleOCR 的识别结果,添加 bbox 坐标信息',
  112. formatter_class=argparse.RawDescriptionHelpFormatter,
  113. epilog="""
  114. 示例用法:
  115. 1. 批量处理整个目录:
  116. python merge_mineru_paddle_ocr.py \\
  117. --mineru-dir /path/to/mineru/results \\
  118. --paddle-dir /path/to/paddle/results \\
  119. --output-dir /path/to/output
  120. 2. 处理单个文件:
  121. python merge_mineru_paddle_ocr.py \\
  122. --mineru-file /path/to/file_page_001.json \\
  123. --paddle-file /path/to/file_page_001.json \\
  124. --output-dir /path/to/output
  125. 3. 自定义参数:
  126. python merge_mineru_paddle_ocr.py \\
  127. --mineru-dir /path/to/mineru \\
  128. --paddle-dir /path/to/paddle \\
  129. --output-dir /path/to/output \\
  130. --window 15 \\
  131. --threshold 85
  132. """
  133. )
  134. # 文件/目录参数
  135. file_group = parser.add_argument_group('文件参数')
  136. file_group.add_argument(
  137. '--mineru-file',
  138. type=str,
  139. help='MinerU 输出的 JSON 文件路径(单文件模式)'
  140. )
  141. file_group.add_argument(
  142. '--paddle-file',
  143. type=str,
  144. help='PaddleOCR 输出的 JSON 文件路径(单文件模式)'
  145. )
  146. dir_group = parser.add_argument_group('目录参数')
  147. dir_group.add_argument(
  148. '--mineru-dir',
  149. type=str,
  150. help='MinerU 结果目录(批量模式)'
  151. )
  152. dir_group.add_argument(
  153. '--paddle-dir',
  154. type=str,
  155. help='PaddleOCR 结果目录(批量模式)'
  156. )
  157. # 输出参数
  158. output_group = parser.add_argument_group('输出参数')
  159. output_group.add_argument(
  160. '-o', '--output-dir',
  161. type=str,
  162. required=True,
  163. help='输出目录(必需)'
  164. )
  165. output_group.add_argument(
  166. '-f', '--output-type',
  167. choices=['json', 'markdown', 'both'],
  168. default='both', help='输出格式'
  169. )
  170. # 算法参数
  171. algo_group = parser.add_argument_group('算法参数')
  172. algo_group.add_argument(
  173. '-w', '--window',
  174. type=int,
  175. default=15,
  176. help='向前查找的窗口大小(默认: 10)'
  177. )
  178. algo_group.add_argument(
  179. '-t', '--threshold',
  180. type=int,
  181. default=80,
  182. help='文本相似度阈值(0-100,默认: 80)'
  183. )
  184. args = parser.parse_args()
  185. output_type = args.output_type.lower()
  186. # 验证参数
  187. if args.mineru_file and args.paddle_file:
  188. # 单文件模式
  189. mineru_file = Path(args.mineru_file)
  190. paddle_file = Path(args.paddle_file)
  191. output_dir = Path(args.output_dir)
  192. if not mineru_file.exists():
  193. print(f"❌ 错误: MinerU 文件不存在: {mineru_file}")
  194. return
  195. if not paddle_file.exists():
  196. print(f"❌ 错误: PaddleOCR 文件不存在: {paddle_file}")
  197. return
  198. output_dir.mkdir(parents=True, exist_ok=True)
  199. print("\n🔧 单文件处理模式")
  200. print(f"📄 MinerU 文件: {mineru_file}")
  201. print(f"📄 PaddleOCR 文件: {paddle_file}")
  202. print(f"📂 输出目录: {output_dir}")
  203. print(f"⚙️ 查找窗口: {args.window}")
  204. print(f"⚙️ 相似度阈值: {args.threshold}%\n")
  205. merger = MinerUPaddleOCRMerger(
  206. look_ahead_window=args.window,
  207. similarity_threshold=args.threshold
  208. )
  209. success = merge_single_file(mineru_file, paddle_file, output_dir, output_type, merger)
  210. if success:
  211. print("\n✅ 处理完成!")
  212. else:
  213. print("\n❌ 处理失败!")
  214. elif args.mineru_dir and args.paddle_dir:
  215. # 批量模式
  216. if not Path(args.mineru_dir).exists():
  217. print(f"❌ 错误: MinerU 目录不存在: {args.mineru_dir}")
  218. return
  219. if not Path(args.paddle_dir).exists():
  220. print(f"❌ 错误: PaddleOCR 目录不存在: {args.paddle_dir}")
  221. return
  222. print("\n🔧 批量处理模式")
  223. merge_mineru_paddle_batch(
  224. args.mineru_dir,
  225. args.paddle_dir,
  226. args.output_dir,
  227. output_type=output_type,
  228. look_ahead_window=args.window,
  229. similarity_threshold=args.threshold
  230. )
  231. else:
  232. parser.print_help()
  233. print("\n❌ 错误: 请指定单文件模式或批量模式的参数")
  234. print(" 单文件模式: --mineru-file 和 --paddle-file")
  235. print(" 批量模式: --mineru-dir 和 --paddle-dir")
  236. if __name__ == "__main__":
  237. print("🚀 启动 MinerU + PaddleOCR 合并程序...")
  238. import sys
  239. if len(sys.argv) == 1:
  240. # 如果没有命令行参数,使用默认配置运行
  241. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  242. # 默认配置
  243. default_config = {
  244. # "mineru-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru_vllm_results/A用户_单元格扫描流水_page_001.json",
  245. # "paddle-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/ppstructurev3_client_results/A用户_单元格扫描流水_page_001.json",
  246. # "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru_vllm_results_cell_bbox",
  247. "mineru-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results/B用户_扫描流水_page_007.json",
  248. "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/ppstructurev3_client_results/B用户_扫描流水_page_007.json",
  249. "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru_vllm_results_cell_bbox",
  250. # "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
  251. # "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
  252. # "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
  253. "output-type": "both",
  254. "window": "15",
  255. "threshold": "85"
  256. }
  257. print("⚙️ 默认参数:")
  258. for key, value in default_config.items():
  259. print(f" --{key}: {value}")
  260. # 构造参数
  261. sys.argv = [sys.argv[0]]
  262. for key, value in default_config.items():
  263. sys.argv.extend([f"--{key}", str(value)])
  264. sys.exit(main())