batch_merge_results.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736
  1. #!/usr/bin/env python3
  2. """
  3. 批量合并 OCR 结果
  4. 自动读取配置文件,对所有 VL 处理器的输出进行 bbox 合并
  5. 支持执行器输出日志重定向
  6. """
  7. import os
  8. import sys
  9. import yaml
  10. import argparse
  11. import subprocess
  12. from pathlib import Path
  13. from datetime import datetime
  14. from typing import Dict, List, Tuple, Optional, Any
  15. from dataclasses import dataclass
  16. import logging
  17. from tqdm import tqdm
# Make the ocr_platform root importable (needed for the ocr_merger package).
ocr_platform_root = Path(__file__).parents[2]  # ocr_batch -> ocr_tools -> ocr_platform
if str(ocr_platform_root) not in sys.path:
    sys.path.insert(0, str(ocr_platform_root))
@dataclass
class MergeTask:
    """A single bbox-merge job: one PDF directory x one VL processor."""
    processor_name: str       # key into config['processors']
    vl_result_dir: Path       # directory holding the VL processor's output
    paddle_result_dir: Path   # directory holding the PaddleOCR output
    output_dir: Path          # where the merged result is written
    merger_script: str        # absolute path of the merge script to run
    description: str          # human-readable processor description
    log_file: str = ""        # per-task log file (child stdout/stderr redirect target)
class BatchMerger:
    """Batch merger: discovers and runs bbox-merge jobs for all VL processors."""

    # Maps a VL processor type to the merge script that handles it.
    MERGER_SCRIPTS = {
        'paddleocr_vl': 'merge_paddleocr_vl_paddleocr.py',
        'mineru': 'merge_mineru_paddle_ocr.py',
        'dotsocr': 'merge_dotsocr_paddleocr.py'
    }
    def __init__(self, config_file: str, base_dir: str = None):
        """
        Args:
            config_file: path to processor_configs.yaml
            base_dir: base PDF directory; overrides the value in the config file
        """
        self.config_file = Path(config_file)
        self.config = self._load_config()
        self.base_dir = Path(base_dir) if base_dir else Path(self.config['global']['base_dir'])
        # Base directory for batch-level (summary) log files.
        self.log_base_dir = self.base_dir / self.config['global'].get('log_dir', 'logs')
        # Console logger for progress output.
        self.logger = self._setup_logger()
        # Directory containing the merge scripts.
        # Computed from this file's location: ocr_batch -> ocr_tools -> ocr_platform
        ocr_platform_root = Path(__file__).parents[2]
        self.merger_dir = ocr_platform_root / 'ocr_tools' / 'ocr_merger'
        # Per-task results, appended by batch_merge().
        self.merge_results: List[Dict[str, Any]] = []
    def _load_config(self) -> Dict:
        """Load and parse the YAML configuration file."""
        with open(self.config_file, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
  63. def _setup_logger(self) -> logging.Logger:
  64. """设置日志"""
  65. logger = logging.getLogger('BatchMerger')
  66. logger.setLevel(logging.INFO)
  67. if not logger.handlers:
  68. console_handler = logging.StreamHandler()
  69. console_handler.setLevel(logging.INFO)
  70. formatter = logging.Formatter(
  71. '%(asctime)s - %(levelname)s - %(message)s',
  72. datefmt='%Y-%m-%d %H:%M:%S'
  73. )
  74. console_handler.setFormatter(formatter)
  75. logger.addHandler(console_handler)
  76. return logger
  77. def _detect_processor_type(self, processor_name: str) -> str:
  78. """
  79. 检测处理器类型
  80. Returns:
  81. 'paddleocr_vl', 'mineru', 'dotsocr', 'ppstructure' 等
  82. """
  83. name_lower = processor_name.lower()
  84. if 'paddleocr_vl' in name_lower or 'paddleocr-vl' in name_lower:
  85. return 'paddleocr_vl'
  86. elif 'mineru' in name_lower:
  87. return 'mineru'
  88. elif 'dotsocr' in name_lower or 'dots' in name_lower:
  89. return 'dotsocr'
  90. elif 'ppstructure' in name_lower or 'pp-structure' in name_lower:
  91. return 'ppstructure'
  92. else:
  93. return 'unknown'
    def _get_merger_script(self, processor_type: str) -> Optional[str]:
        """Resolve the merge script path for a processor type.

        Returns:
            Absolute script path as a string, or None when the type has no
            mapped script or the script file does not exist on disk.
        """
        script_name = self.MERGER_SCRIPTS.get(processor_type)
        if not script_name:
            return None
        script_path = self.merger_dir / script_name
        return str(script_path) if script_path.exists() else None
    def _find_paddle_result_dir(self, pdf_dir: Path) -> Optional[Path]:
        """
        Locate the PaddleOCR result directory for a PDF directory.

        Candidates are checked in priority order:
          1. ppstructurev3_client_results
          2. ppstructurev3_single_process_results

        Returns:
            The first existing candidate, or None when neither exists.
        """
        candidates = [
            pdf_dir / 'ppstructurev3_client_results',
            pdf_dir / 'ppstructurev3_single_process_results',
        ]
        for candidate in candidates:
            if candidate.exists():
                return candidate
        return None
  117. def _get_log_file_path(self, pdf_dir: Path, processor_name: str) -> Path:
  118. """
  119. 🎯 获取合并任务的日志文件路径
  120. 日志结构:
  121. PDF目录/
  122. └── logs/
  123. └── merge_processor_name/
  124. └── PDF名称_merge_YYYYMMDD_HHMMSS.log
  125. """
  126. # 日志目录
  127. log_dir = pdf_dir / 'logs' / f'merge_{processor_name}'
  128. log_dir.mkdir(parents=True, exist_ok=True)
  129. # 日志文件名
  130. timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
  131. log_file = log_dir / f"{pdf_dir.name}_merge_{timestamp}.log"
  132. return log_file
  133. def discover_merge_tasks(
  134. self,
  135. pdf_list: List[str] = None,
  136. processors: List[str] = None
  137. ) -> List[MergeTask]:
  138. """
  139. 自动发现需要合并的任务
  140. Args:
  141. pdf_list: PDF 文件列表(不含扩展名),如 ['德_内蒙古银行照', ...]
  142. processors: 处理器列表,如 ['paddleocr_vl_single_process', ...]
  143. Returns:
  144. MergeTask 列表
  145. """
  146. tasks = []
  147. # 如果没有指定处理器,扫描所有 VL 类型的处理器
  148. if not processors:
  149. processors = []
  150. for proc_name, proc_config in self.config['processors'].items():
  151. proc_type = self._detect_processor_type(proc_name)
  152. if proc_type in ['paddleocr_vl', 'mineru', 'dotsocr']:
  153. processors.append(proc_name)
  154. # 如果没有指定 PDF 列表,扫描基础目录
  155. if not pdf_list:
  156. pdf_list = [d.name for d in self.base_dir.iterdir() if d.is_dir()]
  157. self.logger.info(f"📂 基础目录: {self.base_dir}")
  158. self.logger.info(f"🔍 发现 {len(pdf_list)} 个 PDF 目录")
  159. self.logger.info(f"⚙️ 发现 {len(processors)} 个 VL 处理器")
  160. # 遍历每个 PDF 目录和处理器组合
  161. for pdf_name in pdf_list:
  162. pdf_dir = self.base_dir / pdf_name
  163. if not pdf_dir.exists():
  164. self.logger.warning(f"⚠️ 目录不存在: {pdf_dir}")
  165. continue
  166. # 查找 PaddleOCR 结果目录
  167. paddle_result_dir = self._find_paddle_result_dir(pdf_dir)
  168. if not paddle_result_dir:
  169. self.logger.warning(f"⚠️ 未找到 PaddleOCR 结果: {pdf_name}")
  170. continue
  171. # 遍历每个 VL 处理器
  172. for proc_name in processors:
  173. if proc_name not in self.config['processors']:
  174. self.logger.warning(f"⚠️ 处理器不存在: {proc_name}")
  175. continue
  176. proc_config = self.config['processors'][proc_name]
  177. proc_type = self._detect_processor_type(proc_name)
  178. # 获取合并脚本
  179. merger_script = self._get_merger_script(proc_type)
  180. if not merger_script:
  181. self.logger.warning(f"⚠️ 不支持的处理器类型: {proc_name} ({proc_type})")
  182. continue
  183. # VL 结果目录
  184. vl_output_subdir = proc_config.get('output_subdir', f'{proc_name}_results')
  185. vl_result_dir = pdf_dir / vl_output_subdir
  186. if not vl_result_dir.exists():
  187. self.logger.debug(f"⏭️ VL 结果不存在: {vl_result_dir}")
  188. continue
  189. # 输出目录
  190. output_dir = pdf_dir / f"{vl_output_subdir}_cell_bbox"
  191. # 🎯 日志文件路径
  192. log_file = self._get_log_file_path(pdf_dir, proc_name)
  193. # 创建任务
  194. task = MergeTask(
  195. processor_name=proc_name,
  196. vl_result_dir=vl_result_dir,
  197. paddle_result_dir=paddle_result_dir,
  198. output_dir=output_dir,
  199. merger_script=merger_script,
  200. description=proc_config.get('description', proc_name),
  201. log_file=str(log_file) # 🎯 新增
  202. )
  203. tasks.append(task)
  204. return tasks
  205. def execute_merge_task(
  206. self,
  207. task: MergeTask,
  208. window: int = 15,
  209. threshold: int = 85,
  210. output_type: str = 'both',
  211. dry_run: bool = False
  212. ) -> Dict[str, Any]:
  213. """
  214. 🎯 执行单个合并任务(支持日志重定向)
  215. Args:
  216. task: 合并任务
  217. window: 查找窗口
  218. threshold: 相似度阈值
  219. output_type: 输出格式
  220. dry_run: 模拟运行
  221. Returns:
  222. 执行结果字典
  223. """
  224. self.logger.info(f"\n{'='*80}")
  225. self.logger.info(f"📄 处理: {task.vl_result_dir.parent.name}")
  226. self.logger.info(f"🔧 处理器: {task.description}")
  227. self.logger.info(f"📂 VL 结果: {task.vl_result_dir}")
  228. self.logger.info(f"📂 PaddleOCR 结果: {task.paddle_result_dir}")
  229. self.logger.info(f"📂 输出目录: {task.output_dir}")
  230. self.logger.info(f"📄 日志文件: {task.log_file}")
  231. self.logger.info(f"{'='*80}")
  232. # 构建命令
  233. cmd = [
  234. sys.executable, # 当前 Python 解释器
  235. task.merger_script,
  236. f"--{self._get_vl_arg_name(task.merger_script)}-dir", str(task.vl_result_dir),
  237. '--paddle-dir', str(task.paddle_result_dir),
  238. '--output-dir', str(task.output_dir),
  239. '--output-type', output_type,
  240. '--window', str(window),
  241. '--threshold', str(threshold)
  242. ]
  243. if dry_run:
  244. self.logger.info(f"[DRY RUN] 命令: {' '.join(cmd)}")
  245. return {
  246. 'task': task,
  247. 'success': True,
  248. 'duration': 0,
  249. 'error': '',
  250. 'dry_run': True
  251. }
  252. # 🎯 执行命令并重定向输出到日志文件
  253. import time
  254. start_time = time.time()
  255. try:
  256. with open(task.log_file, 'w', encoding='utf-8') as log_f:
  257. # 写入日志头
  258. log_f.write(f"{'='*80}\n")
  259. log_f.write(f"合并任务日志\n")
  260. log_f.write(f"{'='*80}\n\n")
  261. log_f.write(f"PDF 目录: {task.vl_result_dir.parent}\n")
  262. log_f.write(f"处理器: {task.description}\n")
  263. log_f.write(f"处理器名称: {task.processor_name}\n")
  264. log_f.write(f"VL 结果目录: {task.vl_result_dir}\n")
  265. log_f.write(f"PaddleOCR 结果目录: {task.paddle_result_dir}\n")
  266. log_f.write(f"输出目录: {task.output_dir}\n")
  267. log_f.write(f"合并脚本: {task.merger_script}\n")
  268. log_f.write(f"查找窗口: {window}\n")
  269. log_f.write(f"相似度阈值: {threshold}\n")
  270. log_f.write(f"输出格式: {output_type}\n")
  271. log_f.write(f"开始时间: {datetime.now()}\n")
  272. log_f.write(f"{'='*80}\n\n")
  273. log_f.flush()
  274. # 执行命令
  275. result = subprocess.run(
  276. cmd,
  277. stdout=log_f, # 🎯 重定向 stdout
  278. stderr=subprocess.STDOUT, # 🎯 合并 stderr 到 stdout
  279. text=True,
  280. check=True
  281. )
  282. # 写入日志尾
  283. log_f.write(f"\n{'='*80}\n")
  284. log_f.write(f"结束时间: {datetime.now()}\n")
  285. log_f.write(f"状态: 成功\n")
  286. log_f.write(f"{'='*80}\n")
  287. duration = time.time() - start_time
  288. self.logger.info(f"✅ 合并成功 (耗时: {duration:.2f}秒)")
  289. return {
  290. 'task': task,
  291. 'success': True,
  292. 'duration': duration,
  293. 'error': '',
  294. 'dry_run': False
  295. }
  296. except subprocess.CalledProcessError as e:
  297. duration = time.time() - start_time
  298. error_msg = f"命令执行失败 (退出码: {e.returncode})"
  299. # 🎯 在日志文件中追加错误信息
  300. with open(task.log_file, 'a', encoding='utf-8') as log_f:
  301. log_f.write(f"\n{'='*80}\n")
  302. log_f.write(f"结束时间: {datetime.now()}\n")
  303. log_f.write(f"状态: 失败\n")
  304. log_f.write(f"错误: {error_msg}\n")
  305. log_f.write(f"{'='*80}\n")
  306. self.logger.error(f"❌ 合并失败 (耗时: {duration:.2f}秒)")
  307. self.logger.error(f"错误信息: {error_msg}")
  308. self.logger.error(f"详细日志: {task.log_file}")
  309. return {
  310. 'task': task,
  311. 'success': False,
  312. 'duration': duration,
  313. 'error': error_msg,
  314. 'dry_run': False
  315. }
  316. except Exception as e:
  317. duration = time.time() - start_time
  318. error_msg = str(e)
  319. with open(task.log_file, 'a', encoding='utf-8') as log_f:
  320. log_f.write(f"\n{'='*80}\n")
  321. log_f.write(f"结束时间: {datetime.now()}\n")
  322. log_f.write(f"状态: 异常\n")
  323. log_f.write(f"错误: {error_msg}\n")
  324. log_f.write(f"{'='*80}\n")
  325. self.logger.error(f"❌ 合并异常 (耗时: {duration:.2f}秒)")
  326. self.logger.error(f"错误信息: {error_msg}")
  327. self.logger.error(f"详细日志: {task.log_file}")
  328. return {
  329. 'task': task,
  330. 'success': False,
  331. 'duration': duration,
  332. 'error': error_msg,
  333. 'dry_run': False
  334. }
  335. def _get_vl_arg_name(self, merger_script: str) -> str:
  336. """获取 VL 参数名称"""
  337. script_name = Path(merger_script).stem
  338. if 'paddleocr_vl' in script_name:
  339. return 'paddleocr-vl'
  340. elif 'mineru' in script_name:
  341. return 'mineru'
  342. elif 'dotsocr' in script_name:
  343. return 'dotsocr'
  344. else:
  345. return 'vl'
    def _save_summary_log(self, stats: Dict[str, Any]):
        """Write a batch-level summary log file under log_base_dir."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        summary_log_file = self.log_base_dir / f"merge_batch_summary_{timestamp}.log"
        # Make sure the log directory exists before writing.
        summary_log_file.parent.mkdir(parents=True, exist_ok=True)
        with open(summary_log_file, 'w', encoding='utf-8') as f:
            f.write("OCR 结果批量合并汇总日志\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"配置文件: {self.config_file}\n")
            f.write(f"基础目录: {self.base_dir}\n")
            f.write(f"日志目录: {self.log_base_dir}\n")
            f.write(f"开始时间: {datetime.now()}\n")
            f.write(f"总耗时: {stats['total_duration']:.2f} 秒\n\n")
            f.write("统计信息:\n")
            f.write(f" 总任务数: {stats['total']}\n")
            f.write(f" 成功: {stats['success']}\n")
            f.write(f" 失败: {stats['failed']}\n\n")
            if stats['failed_tasks']:
                f.write("失败的任务:\n")
                for item in stats['failed_tasks']:
                    f.write(f" ✗ {item['pdf_dir']} / {item['processor']}\n")
                    f.write(f" 错误: {item['error']}\n")
                    f.write(f" 日志: {item['log']}\n\n")
            f.write("详细结果:\n")
            # One line per executed task, plus its log path and error (if any).
            for result in self.merge_results:
                task = result['task']
                status = "✓" if result['success'] else "✗"
                f.write(f"{status} {task.vl_result_dir.parent.name} / {task.processor_name} ({result['duration']:.2f}s)\n")
                f.write(f" 日志: {task.log_file}\n")
                if result['error']:
                    f.write(f" 错误: {result['error']}\n")
        self.logger.info(f"汇总日志已保存: {summary_log_file}")
    def batch_merge(
        self,
        pdf_list: List[str] = None,
        processors: List[str] = None,
        window: int = 15,
        threshold: int = 85,
        output_type: str = 'both',
        dry_run: bool = False
    ) -> Dict:
        """
        Discover and run all merge tasks, with an interactive confirmation.

        Returns:
            Stats dict: total, success, failed, total_duration, failed_tasks.
        """
        # Discover tasks from the config and directory layout.
        tasks = self.discover_merge_tasks(pdf_list, processors)
        if not tasks:
            self.logger.warning("⚠️ 没有发现任何合并任务")
            return {
                'total': 0,
                'success': 0,
                'failed': 0,
                'total_duration': 0,
                'failed_tasks': []
            }
        self.logger.info(f"\n🎯 发现 {len(tasks)} 个合并任务\n")
        # Show the task list before asking for confirmation.
        for i, task in enumerate(tasks, 1):
            self.logger.info(f"{i}. {task.vl_result_dir.parent.name} / {task.processor_name}")
        # Interactive confirmation (skipped for dry runs).
        if not dry_run:
            confirm = input(f"\n是否继续执行 {len(tasks)} 个合并任务? [Y/n]: ")
            if confirm.lower() not in ['', 'y', 'yes']:
                self.logger.info("❌ 已取消")
                return {
                    'total': 0,
                    'success': 0,
                    'failed': 0,
                    'total_duration': 0,
                    'failed_tasks': []
                }
        # Run the tasks sequentially with a progress bar.
        import time
        batch_start_time = time.time()
        success_count = 0
        failed_count = 0
        with tqdm(total=len(tasks), desc="合并进度", unit="task") as pbar:
            for task in tasks:
                result = self.execute_merge_task(
                    task,
                    window=window,
                    threshold=threshold,
                    output_type=output_type,
                    dry_run=dry_run
                )
                self.merge_results.append(result)
                if result['success']:
                    success_count += 1
                else:
                    failed_count += 1
                pbar.update(1)
                pbar.set_postfix({
                    'success': success_count,
                    'failed': failed_count
                })
        total_duration = time.time() - batch_start_time
        # Collect the failed tasks for the summary.
        failed_tasks = [
            {
                'pdf_dir': r['task'].vl_result_dir.parent.name,
                'processor': r['task'].processor_name,
                'error': r['error'],
                'log': r['task'].log_file
            }
            for r in self.merge_results if not r['success']
        ]
        stats = {
            'total': len(tasks),
            'success': success_count,
            'failed': failed_count,
            'total_duration': total_duration,
            'failed_tasks': failed_tasks
        }
        # Persist a batch-level summary log.
        self._save_summary_log(stats)
        # Console summary.
        self.logger.info(f"\n{'='*80}")
        self.logger.info("📊 合并完成")
        self.logger.info(f" 总任务数: {stats['total']}")
        self.logger.info(f" ✅ 成功: {stats['success']}")
        self.logger.info(f" ❌ 失败: {stats['failed']}")
        self.logger.info(f" ⏱️ 总耗时: {stats['total_duration']:.2f} 秒")
        self.logger.info(f"{'='*80}")
        if failed_tasks:
            self.logger.info(f"\n失败的任务:")
            for item in failed_tasks:
                self.logger.info(f" ✗ {item['pdf_dir']} / {item['processor']}")
                self.logger.info(f" 错误: {item['error']}")
                self.logger.info(f" 日志: {item['log']}")
        return stats
  480. def create_parser() -> argparse.ArgumentParser:
  481. """创建命令行参数解析器"""
  482. parser = argparse.ArgumentParser(
  483. description='批量合并 OCR 结果(VL + PaddleOCR)',
  484. formatter_class=argparse.RawDescriptionHelpFormatter,
  485. epilog="""
  486. 示例用法:
  487. 1. 合并配置文件中所有 VL 处理器的结果:
  488. python batch_merge_results.py
  489. 2. 合并指定 PDF 的结果:
  490. python batch_merge_results.py -f pdf_list.txt
  491. 3. 合并指定处理器的结果:
  492. python batch_merge_results.py -p paddleocr_vl_single_process -p mineru_vllm
  493. 4. 自定义参数:
  494. python batch_merge_results.py -w 20 -t 90
  495. 5. 模拟运行(不实际执行):
  496. python batch_merge_results.py --dry-run
  497. """
  498. )
  499. # 配置文件
  500. parser.add_argument(
  501. '-c', '--config',
  502. default='processor_configs.yaml',
  503. help='配置文件路径 (默认: processor_configs.yaml)'
  504. )
  505. # PDF 和处理器
  506. parser.add_argument(
  507. '-d', '--base-dir',
  508. help='PDF 基础目录(覆盖配置文件)'
  509. )
  510. parser.add_argument(
  511. '-f', '--file-list',
  512. help='PDF 列表文件(每行一个 PDF 名称,不含扩展名)'
  513. )
  514. parser.add_argument(
  515. '-l', '--pdf-list',
  516. nargs='+',
  517. help='PDF 名称列表(不含扩展名)'
  518. )
  519. parser.add_argument(
  520. '-p', '--processors',
  521. nargs='+',
  522. help='处理器列表(不指定则自动检测所有 VL 处理器)'
  523. )
  524. # 合并参数
  525. parser.add_argument(
  526. '-w', '--window',
  527. type=int,
  528. default=15,
  529. help='查找窗口大小 (默认: 15)'
  530. )
  531. parser.add_argument(
  532. '-t', '--threshold',
  533. type=int,
  534. default=85,
  535. help='相似度阈值 (默认: 85)'
  536. )
  537. parser.add_argument(
  538. '--output-type',
  539. choices=['json', 'markdown', 'both'],
  540. default='both',
  541. help='输出格式 (默认: both)'
  542. )
  543. # 工具选项
  544. parser.add_argument(
  545. '--dry-run',
  546. action='store_true',
  547. help='模拟运行,不实际执行'
  548. )
  549. parser.add_argument(
  550. '-v', '--verbose',
  551. action='store_true',
  552. help='详细输出'
  553. )
  554. return parser
  555. def main():
  556. """主函数"""
  557. parser = create_parser()
  558. args = parser.parse_args()
  559. # 设置日志级别
  560. if args.verbose:
  561. logging.getLogger().setLevel(logging.DEBUG)
  562. # 读取 PDF 列表
  563. pdf_list = None
  564. if args.file_list:
  565. pdf_list = []
  566. with open(args.file_list, 'r', encoding='utf-8') as f:
  567. for line in f:
  568. line = line.strip()
  569. if line and not line.startswith('#'):
  570. # 移除 .pdf 扩展名
  571. pdf_name = line.replace('.pdf', '')
  572. pdf_list.append(pdf_name)
  573. elif args.pdf_list:
  574. pdf_list = [p.replace('.pdf', '') for p in args.pdf_list]
  575. # 创建批量合并器
  576. merger = BatchMerger(
  577. config_file=args.config,
  578. base_dir=args.base_dir
  579. )
  580. # 执行批量合并
  581. stats = merger.batch_merge(
  582. pdf_list=pdf_list,
  583. processors=args.processors,
  584. window=args.window,
  585. threshold=args.threshold,
  586. output_type=args.output_type,
  587. dry_run=args.dry_run
  588. )
  589. return 0 if stats['failed'] == 0 else 1
  590. if __name__ == '__main__':
  591. print("🚀 启动批量OCR bbox 合并程序...")
  592. import sys
  593. if len(sys.argv) == 1:
  594. # 如果没有命令行参数,使用默认配置运行
  595. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  596. # 默认配置
  597. default_config = {
  598. "file-list": "pdf_list.txt",
  599. }
  600. print("⚙️ 默认参数:")
  601. for key, value in default_config.items():
  602. print(f" --{key}: {value}")
  603. # 构造参数
  604. sys.argv = [sys.argv[0]]
  605. for key, value in default_config.items():
  606. sys.argv.extend([f"--{key}", str(value)])
  607. sys.argv.append("--dry-run")
  608. sys.argv.append("--verbose") # 添加详细输出参数
  609. sys.exit(main())