main_v2.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670
  1. #!/usr/bin/env python3
  2. """
  3. 金融文档处理统一入口 v2
  4. 支持完整的处理流程:
  5. 1. PDF分类(扫描件/数字原生PDF)
  6. 2. 页面方向识别
  7. 3. Layout检测
  8. 4. 并行处理:文本OCR + 表格VLM识别
  9. 5. 单元格坐标匹配
  10. 6. 多格式输出(JSON、Markdown、HTML、可视化图片)
  11. 使用方法:
  12. # 处理单个PDF
  13. python main_v2.py -i /path/to/document.pdf -c ./config/bank_statement_mineru_vl.yaml
  14. # 处理图片目录
  15. python main_v2.py -i /path/to/images/ -c ./config/bank_statement_paddle_vl.yaml
  16. # 开启debug模式(输出可视化图片)
  17. python main_v2.py -i /path/to/doc.pdf -c ./config/xxx.yaml --debug
  18. """
  19. import argparse
  20. import json
  21. import sys
  22. import os
  23. from pathlib import Path
  24. from typing import Optional
  25. from loguru import logger
  26. from datetime import datetime
  27. # 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
  28. ocr_platform_root = Path(__file__).parents[2] # universal_doc_parser -> ocr_tools -> ocr_platform -> repository.git
  29. if str(ocr_platform_root) not in sys.path:
  30. sys.path.insert(0, str(ocr_platform_root))
  31. # 添加当前目录到 Python 路径(用于相对导入)
  32. project_root = Path(__file__).parent
  33. if str(project_root) not in sys.path:
  34. sys.path.insert(0, str(project_root))
  35. from dotenv import load_dotenv
  36. load_dotenv(override=True)
  37. from core.pipeline_manager_v2 import EnhancedDocPipeline
  38. from core.pipeline_manager_v2_streaming import StreamingDocPipeline
  39. # 从 ocr_utils 导入工具函数
  40. try:
  41. from ocr_utils import OutputFormatterV2
  42. except ImportError:
  43. # 降级:从 utils 导入(向后兼容)
  44. from utils import OutputFormatterV2
  45. # ==================== Helper Functions ====================
  46. def _print_environment_info():
  47. """打印环境变量信息"""
  48. env_vars = [
  49. 'CUDA_VISIBLE_DEVICES', 'HF_HOME', 'HF_ENDPOINT', 'HF_HUB_OFFLINE',
  50. 'TORCH_HOME', 'MODELSCOPE_CACHE', 'USE_MODELSCOPE_HUB', 'MINERU_MODEL_SOURCE'
  51. ]
  52. for var in env_vars:
  53. print(f"🔧 {var}: {os.environ.get(var, 'Not set')}")
  54. def _validate_arguments(args: argparse.Namespace) -> bool:
  55. """验证命令行参数"""
  56. input_path = Path(args.input)
  57. if not input_path.exists():
  58. logger.error(f"❌ 输入路径不存在: {input_path}")
  59. return False
  60. config_path = Path(args.config)
  61. if not config_path.exists():
  62. logger.error(f"❌ 配置文件不存在: {config_path}")
  63. return False
  64. return True
  65. def _handle_dry_run(args: argparse.Namespace) -> bool:
  66. """处理dry run模式"""
  67. if args.dry_run:
  68. if _validate_arguments(args):
  69. logger.info("✅ 配置验证通过(dry run)")
  70. return True
  71. return False
  72. return False
  73. def _create_pipeline(
  74. streaming: bool,
  75. config_path: str,
  76. output_dir: str,
  77. debug: bool = False,
  78. debug_layout: bool = False,
  79. debug_table: bool = False,
  80. debug_ocr: bool = False
  81. ):
  82. """
  83. 创建并初始化处理流水线(应用 debug 覆盖)
  84. Args:
  85. streaming: 是否使用流式处理模式
  86. config_path: 配置文件路径
  87. output_dir: 输出目录
  88. debug: 全局 debug 开关
  89. debug_layout: 布局检测 debug 开关
  90. debug_table: 表格识别 debug 开关
  91. debug_ocr: OCR 识别 debug 开关
  92. Returns:
  93. 初始化后的 pipeline 实例
  94. """
  95. # 1. 先加载配置
  96. from core.config_manager import ConfigManager
  97. config = ConfigManager.load_config(config_path)
  98. # 2. 应用 debug 覆盖(在创建 pipeline 之前)
  99. if debug or debug_layout or debug_table or debug_ocr:
  100. _apply_debug_overrides_to_config(config, debug, debug_layout, debug_table, debug_ocr)
  101. # 3. 创建 pipeline(adapter 会读取到已修改的 config)
  102. if streaming:
  103. logger.info("🔄 Using streaming processing mode (memory-efficient)")
  104. pipeline = StreamingDocPipeline(config, output_dir, config_is_dict=True)
  105. else:
  106. logger.info("🔄 Using batch processing mode (all pages in memory)")
  107. pipeline = EnhancedDocPipeline(config, config_is_dict=True)
  108. return pipeline
  109. def _get_default_output_config(debug: bool) -> dict:
  110. """获取默认输出配置"""
  111. return {
  112. 'create_subdir': True,
  113. 'save_pdf_images': False,
  114. 'save_json': True,
  115. 'save_markdown': True,
  116. 'save_html': True,
  117. 'save_page_json': True,
  118. 'save_images': True,
  119. 'save_layout_image': debug,
  120. 'save_ocr_image': debug,
  121. 'draw_type_label': True,
  122. 'draw_bbox_number': True,
  123. 'save_enhanced_json': True,
  124. 'normalize_numbers': True,
  125. 'merge_cross_page_tables': True,
  126. }
  127. def _apply_debug_overrides_to_config(
  128. config: dict,
  129. debug: bool,
  130. debug_layout: bool,
  131. debug_table: bool,
  132. debug_ocr: bool
  133. ):
  134. """
  135. 应用命令行 debug 参数覆盖配置文件设置(在创建 pipeline 之前)
  136. 优先级规则:
  137. 1. --debug: 启用所有模块的 debug
  138. 2. --debug-layout/--debug-table/--debug-ocr: 精细控制各模块
  139. 3. 配置文件的 debug_options 只提供默认值
  140. Args:
  141. config: 配置字典(会被直接修改)
  142. debug: 全局 debug 开关
  143. debug_layout: 布局检测 debug 开关
  144. debug_table: 表格识别 debug 开关
  145. debug_ocr: OCR 识别 debug 开关
  146. """
  147. # 确定需要启用哪些模块的 debug
  148. enable_layout_debug = debug or debug_layout
  149. enable_table_debug = debug or debug_table
  150. enable_ocr_debug = debug or debug_ocr
  151. # 1. 布局检测 debug
  152. if enable_layout_debug:
  153. if 'layout_detection' in config:
  154. if 'debug_options' not in config['layout_detection']:
  155. config['layout_detection']['debug_options'] = {}
  156. config['layout_detection']['debug_options']['enabled'] = True
  157. logger.info("✅ 启用布局检测 debug 输出")
  158. # 2. 表格分类 debug
  159. if enable_table_debug:
  160. if 'table_classification' in config:
  161. if 'debug_options' not in config['table_classification']:
  162. config['table_classification']['debug_options'] = {}
  163. config['table_classification']['debug_options']['enabled'] = True
  164. logger.info("✅ 启用表格分类 debug 输出")
  165. # 3. 有线表格识别 debug
  166. if enable_table_debug:
  167. if 'table_recognition_wired' in config:
  168. if 'debug_options' not in config['table_recognition_wired']:
  169. config['table_recognition_wired']['debug_options'] = {}
  170. config['table_recognition_wired']['debug_options']['enabled'] = True
  171. logger.info("✅ 启用有线表格识别 debug 输出")
  172. # 4. OCR 识别 debug(如果有 debug_options)
  173. if enable_ocr_debug:
  174. if 'ocr_recognition' in config:
  175. if 'debug_options' not in config['ocr_recognition']:
  176. config['ocr_recognition']['debug_options'] = {}
  177. config['ocr_recognition']['debug_options']['enabled'] = True
  178. logger.info("✅ 启用 OCR 识别 debug 输出")
  179. # 5. 更新输出配置
  180. if enable_layout_debug or enable_ocr_debug or enable_table_debug:
  181. output_config = config.get('output', {})
  182. output_config['debug_mode'] = True
  183. if enable_layout_debug or enable_ocr_debug:
  184. output_config.setdefault('save_layout_image', True)
  185. output_config.setdefault('save_ocr_image', True)
  186. # 输出当前 debug 状态
  187. if debug:
  188. logger.info("🐛 全局 Debug 模式已启用(所有模块)")
  189. else:
  190. debug_modules = []
  191. if debug_layout:
  192. debug_modules.append("布局检测")
  193. if debug_table:
  194. debug_modules.append("表格识别")
  195. if debug_ocr:
  196. debug_modules.append("OCR识别")
  197. if debug_modules:
  198. logger.info(f"🐛 Debug 模式已启用: {', '.join(debug_modules)}")
  199. def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None):
  200. """设置日志"""
  201. logger.remove()
  202. # 控制台输出
  203. logger.add(
  204. sys.stdout,
  205. level=log_level,
  206. format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
  207. )
  208. # 文件输出
  209. if log_file:
  210. logger.add(
  211. log_file,
  212. level="DEBUG",
  213. format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
  214. rotation="10 MB"
  215. )
  216. def process_single_input(
  217. input_path: Path,
  218. config_path: Path,
  219. output_dir: Path,
  220. debug: bool = False,
  221. debug_layout: bool = False,
  222. debug_table: bool = False,
  223. debug_ocr: bool = False,
  224. scene: Optional[str] = None,
  225. page_range: Optional[str] = None,
  226. streaming: bool = False
  227. ) -> dict:
  228. """
  229. 处理单个输入(文件或目录)
  230. Args:
  231. input_path: 输入路径
  232. config_path: 配置文件路径
  233. output_dir: 输出目录
  234. debug: 全局debug开关(启用所有模块debug)
  235. debug_layout: 仅启用布局检测debug
  236. debug_table: 仅启用表格识别debug
  237. debug_ocr: 仅启用OCR识别debug
  238. scene: 场景类型覆盖
  239. page_range: 页面范围(如 "1-5,7,9-12")
  240. streaming: 是否使用流式处理模式(按页处理,立即保存,节省内存)
  241. Returns:
  242. 处理结果和输出路径
  243. """
  244. try:
  245. # 创建流水线(debug 覆盖已在 _create_pipeline 中应用)
  246. pipeline = _create_pipeline(
  247. streaming,
  248. str(config_path),
  249. str(output_dir),
  250. debug=debug,
  251. debug_layout=debug_layout,
  252. debug_table=debug_table,
  253. debug_ocr=debug_ocr
  254. )
  255. output_config = pipeline.config.get('output', {}) or _get_default_output_config(debug)
  256. use_context = not streaming and hasattr(pipeline, '__enter__')
  257. if use_context:
  258. pipeline = pipeline.__enter__()
  259. try:
  260. if scene:
  261. pipeline.scene_name = scene
  262. if hasattr(pipeline, 'set_scene_name'):
  263. pipeline.set_scene_name(scene)
  264. logger.info(f"🔄 Scene overridden to: {scene}")
  265. logger.info(f"🚀 开始处理: {input_path}")
  266. logger.info(f"📋 场景配置: {pipeline.scene_name}")
  267. logger.info(f"📁 输出目录: {output_dir}")
  268. if page_range:
  269. logger.info(f"📄 页面范围: {page_range}")
  270. start_time = datetime.now()
  271. if streaming:
  272. # 流式处理模式
  273. results = pipeline.process_document_streaming(
  274. str(input_path),
  275. page_range=page_range,
  276. output_config=output_config
  277. )
  278. process_time = (datetime.now() - start_time).total_seconds()
  279. _print_summary_streaming(results, process_time)
  280. return {
  281. 'success': True,
  282. 'results': results,
  283. 'output_paths': results.get('output_paths', {}),
  284. 'process_time': process_time
  285. }
  286. else:
  287. # 批量处理模式
  288. results = pipeline.process_document(
  289. str(input_path),
  290. page_range=page_range,
  291. output_dir=str(output_dir)
  292. )
  293. process_time = (datetime.now() - start_time).total_seconds()
  294. logger.info(f"⏱️ 处理耗时: {process_time:.2f}秒")
  295. logger.info("💾 保存结果...")
  296. formatter = OutputFormatterV2(str(output_dir))
  297. output_paths = formatter.save_results(results, output_config)
  298. _print_summary(results, output_paths, process_time)
  299. return {
  300. 'success': True,
  301. 'results': results,
  302. 'output_paths': output_paths,
  303. 'process_time': process_time
  304. }
  305. finally:
  306. if use_context:
  307. pipeline.__exit__(None, None, None)
  308. except Exception as e:
  309. logger.error(f"❌ 处理失败: {e}")
  310. import traceback
  311. traceback.print_exc()
  312. return {
  313. 'success': False,
  314. 'error': str(e)
  315. }
  316. def _print_summary(results: dict, output_paths: dict, process_time: float):
  317. """打印处理结果摘要"""
  318. total_pages = len(results.get('pages', []))
  319. total_tables = 0
  320. total_text_blocks = 0
  321. total_cells = 0
  322. for page in results.get('pages', []):
  323. for element in page.get('elements', []):
  324. elem_type = element.get('type', '')
  325. if elem_type in ['table', 'table_body']:
  326. total_tables += 1
  327. cells = element.get('content', {}).get('cells', [])
  328. total_cells += len(cells)
  329. elif elem_type in ['text', 'title', 'ocr_text', 'ref_text']:
  330. total_text_blocks += 1
  331. print(f"\n{'='*60}")
  332. print(f"📊 处理摘要")
  333. print(f"{'='*60}")
  334. print(f" 📄 文档: {results.get('document_path', 'N/A')}")
  335. print(f" 🎯 场景: {results.get('scene', 'N/A')}")
  336. print(f" 📋 PDF类型: {results.get('metadata', {}).get('pdf_type', 'N/A')}")
  337. print(f" 📖 页面数: {total_pages}")
  338. print(f" 📋 表格数: {total_tables}")
  339. print(f" 📝 文本块: {total_text_blocks}")
  340. print(f" 🔢 单元格: {total_cells} (带坐标)")
  341. print(f" ⏱️ 耗时: {process_time:.2f}秒")
  342. print(f"{'='*60}")
  343. print(f"📁 输出文件:")
  344. for key, path in output_paths.items():
  345. if isinstance(path, list):
  346. for p in path:
  347. print(f" - {p}")
  348. else:
  349. print(f" - {path}")
  350. print(f"{'='*60}\n")
  351. def _print_summary_streaming(results_summary: dict, process_time: float):
  352. """打印流式处理结果摘要"""
  353. print(f"\n{'='*60}")
  354. print(f"📊 处理摘要(流式模式)")
  355. print(f"{'='*60}")
  356. print(f" 📄 文档: {results_summary.get('document_path', 'N/A')}")
  357. print(f" 🎯 场景: {results_summary.get('scene', 'N/A')}")
  358. print(f" 📋 PDF类型: {results_summary.get('metadata', {}).get('pdf_type', 'N/A')}")
  359. print(f" 📖 页面数: {results_summary.get('total_pages', 0)}")
  360. print(f" ⏱️ 耗时: {process_time:.2f}秒")
  361. print(f"{'='*60}")
  362. print(f"📁 输出文件:")
  363. output_paths = results_summary.get('output_paths', {})
  364. if output_paths.get('middle_json'):
  365. print(f" - {output_paths['middle_json']}")
  366. if output_paths.get('json_pages'):
  367. print(f" - {len(output_paths['json_pages'])} 个页面JSON文件")
  368. if output_paths.get('images'):
  369. print(f" - {len(output_paths['images'])} 个图片文件")
  370. print(f"{'='*60}\n")
  371. def main():
  372. parser = argparse.ArgumentParser(
  373. description="金融文档处理工具 v2",
  374. formatter_class=argparse.RawDescriptionHelpFormatter,
  375. epilog="""
  376. 示例:
  377. # 处理单个PDF文件
  378. python main_v2.py -i document.pdf -c config/bank_statement_mineru_vl.yaml
  379. # 处理图片目录
  380. python main_v2.py -i ./images/ -c config/bank_statement_paddle_vl.yaml
  381. # 开启全局debug模式(所有模块输出可视化图片)
  382. python main_v2.py -i doc.pdf -c config.yaml --debug
  383. # 开启特定模块的debug(精细控制)
  384. python main_v2.py -i doc.pdf -c config.yaml --debug-layout # 仅布局debug
  385. python main_v2.py -i doc.pdf -c config.yaml --debug-table # 仅表格debug
  386. python main_v2.py -i doc.pdf -c config.yaml --debug-layout --debug-table # 组合
  387. # 指定输出目录
  388. python main_v2.py -i doc.pdf -c config.yaml -o ./my_output/
  389. # 指定页面范围(PDF按页码,图片目录按排序位置)
  390. python main_v2.py -i doc.pdf -c config.yaml -p 1-5 # 处理第1-5页
  391. python main_v2.py -i doc.pdf -c config.yaml -p 3,7,10 # 处理第3、7、10页
  392. python main_v2.py -i doc.pdf -c config.yaml -p 1-5,8-10 # 处理第1-5、8-10页
  393. python main_v2.py -i doc.pdf -c config.yaml -p 5- # 从第5页到最后
  394. # 使用流式处理模式(节省内存,适合大文档)
  395. python main_v2.py -i large_doc.pdf -c config.yaml --streaming
  396. """
  397. )
  398. parser.add_argument(
  399. "--input", "-i",
  400. required=True,
  401. help="输入路径(PDF文件、图片文件或图片目录)"
  402. )
  403. parser.add_argument(
  404. "--config", "-c",
  405. required=True,
  406. help="配置文件路径"
  407. )
  408. parser.add_argument(
  409. "--output_dir", "-o",
  410. default="./output",
  411. help="输出目录(默认: ./output)"
  412. )
  413. parser.add_argument(
  414. "--scene", "-s",
  415. required=True,
  416. choices=["bank_statement", "financial_report"],
  417. help="场景类型(覆盖配置文件设置)"
  418. )
  419. parser.add_argument(
  420. "--debug",
  421. action="store_true",
  422. help="开启全局debug模式(启用所有模块的调试输出)"
  423. )
  424. parser.add_argument(
  425. "--debug-layout",
  426. action="store_true",
  427. help="仅开启布局检测的debug输出"
  428. )
  429. parser.add_argument(
  430. "--debug-table",
  431. action="store_true",
  432. help="仅开启表格识别的debug输出"
  433. )
  434. parser.add_argument(
  435. "--debug-ocr",
  436. action="store_true",
  437. help="仅开启OCR识别的debug输出"
  438. )
  439. parser.add_argument(
  440. "--log_level",
  441. default="INFO",
  442. choices=["DEBUG", "INFO", "WARNING", "ERROR"],
  443. help="日志级别(默认: INFO)"
  444. )
  445. parser.add_argument(
  446. "--log_file",
  447. help="日志文件路径"
  448. )
  449. parser.add_argument(
  450. "--dry_run",
  451. action="store_true",
  452. help="仅验证配置,不执行处理"
  453. )
  454. parser.add_argument(
  455. "--pages", "-p",
  456. help="页面范围(PDF按页码,图片目录按排序位置),如: 1-5,7,9-12"
  457. )
  458. parser.add_argument(
  459. "--streaming",
  460. action="store_true",
  461. help="使用流式处理模式(按页处理,立即保存,节省内存,适合大文档)"
  462. )
  463. args = parser.parse_args()
  464. setup_logging(args.log_level, args.log_file)
  465. if _handle_dry_run(args):
  466. return 0
  467. if not _validate_arguments(args):
  468. return 1
  469. result = process_single_input(
  470. input_path=Path(args.input),
  471. config_path=Path(args.config),
  472. output_dir=Path(args.output_dir),
  473. debug=args.debug,
  474. debug_layout=args.debug_layout,
  475. debug_table=args.debug_table,
  476. debug_ocr=args.debug_ocr,
  477. scene=args.scene,
  478. page_range=args.pages,
  479. streaming=args.streaming
  480. )
  481. return 0 if result.get('success') else 1
  482. if __name__ == "__main__":
  483. _print_environment_info()
  484. if len(sys.argv) == 1:
  485. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  486. # 默认配置(用于开发测试)
  487. default_config = {
  488. # 测试输入
  489. # "input": "/Users/zhch158/workspace/data/流水分析/湛_平安银行图.pdf",
  490. # "output_dir": "./output/湛_平安银行图/bank_statement_yusys_v3",
  491. "input": "/Users/zhch158/workspace/data/流水分析/张_微信图.pdf",
  492. "output_dir": "./output/张_微信图/bank_statement_yusys_v4",
  493. # "input": "/Users/zhch158/workspace/data/流水分析/许_民生银行图.pdf",
  494. # "output_dir": "./output/许_民生银行图/bank_statement_yusys_v3",
  495. # "input": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行.pdf",
  496. # "output_dir": "./output/康强_北京农村商业银行/bank_statement_mineru_vl",
  497. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/A用户_单元格扫描流水_page_002.png",
  498. # "output_dir": "./output/A用户_单元格扫描流水_bank_statement_yusys_v3",
  499. # "input": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水.pdf",
  500. # "output_dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v2",
  501. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_005.png",
  502. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
  503. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003.png",
  504. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_003_270_skew(-0.4).png",
  505. # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
  506. # "output_dir": "./output/2023年度报告母公司/bank_statement_yusys_v3",
  507. # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v3",
  508. # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_glm_vl",
  509. # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
  510. # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v2",
  511. # # "input": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水.pdf",
  512. # "output_dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/bank_statement_yusys_v3",
  513. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/600916_中国黄金_2022年报_page_096.png",
  514. # "output_dir": "./output/600916_中国黄金_2022年报/bank_statement_yusys_v3",
  515. # "input": "/Users/zhch158/workspace/data/流水分析/600916_中国黄金_2022年报.pdf",
  516. # "output_dir": "./output/600916_中国黄金_2022年报/bank_statement_yusys_v3",
  517. # "input": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照.pdf",
  518. # "output_dir": "./output/德_内蒙古银行照/bank_statement_yusys_v3",
  519. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/提取自赤峰黄金2023年报.pdf",
  520. # "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v3",
  521. # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
  522. # "output_dir": "./output/提取自赤峰黄金2023年报/bank_statement_yusys_v4",
  523. # "output_dir": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报/bank_statement_yusys_v4",
  524. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.pdf",
  525. # "output_dir": "/Users/zhch158/workspace/data/流水分析/施博深/bank_statement_yusys_v3",
  526. # "output_dir": "./output/施博深/bank_statement_smart_router",
  527. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.wiredtable/施博深_page_020.png",
  528. # "output_dir": "./output/施博深/bank_statement_yusys_v3",
  529. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.wiredtable",
  530. # "output_dir": "/Users/zhch158/workspace/data/流水分析/施博深/bank_statement_yusys_v3",
  531. # "input": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司.pdf",
  532. # "output_dir": "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司/bank_statement_yusys_v3",
  533. # 配置文件
  534. "config": "./config/bank_statement_yusys_v4.yaml",
  535. # "config": "./config/bank_statement_yusys_v3.yaml",
  536. # "config": "./config/bank_statement_smart_router.yaml",
  537. # "config": "./config/bank_statement_mineru_vl.yaml",
  538. # "config": "./config/bank_statement_yusys_v2.yaml",
  539. # "config": "./config/bank_statement_paddle_vl.yaml",
  540. # 场景
  541. "scene": "bank_statement",
  542. # "scene": "financial_report",
  543. # 页面范围(可选)
  544. "pages": "1", # 只处理前1页
  545. # "pages": "1-3,5,7-10", # 处理指定页面
  546. # "pages": "83-109", # 处理指定页面
  547. "streaming": True,
  548. # Debug模式
  549. "debug": True,
  550. # 日志级别
  551. "log_level": "DEBUG",
  552. # 日志文件
  553. "log_file": "./output/logs/bank_statement_yusys_v4/process.log",
  554. }
  555. # 构造参数
  556. sys.argv = [sys.argv[0]]
  557. for key, value in default_config.items():
  558. if isinstance(value, bool):
  559. if value:
  560. sys.argv.append(f"--{key}")
  561. else:
  562. sys.argv.extend([f"--{key}", str(value)])
  563. sys.exit(main())