main_v2.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. #!/usr/bin/env python3
  2. """
  3. 金融文档处理统一入口 v2
  4. 支持完整的处理流程:
  5. 1. PDF分类(扫描件/数字原生PDF)
  6. 2. 页面方向识别
  7. 3. Layout检测
  8. 4. 并行处理:文本OCR + 表格VLM识别
  9. 5. 单元格坐标匹配
  10. 6. 多格式输出(JSON、Markdown、HTML、可视化图片)
  11. 使用方法:
  12. # 处理单个PDF
  13. python main_v2.py -i /path/to/document.pdf -c ./config/bank_statement_mineru_vl.yaml
  14. # 处理图片目录
  15. python main_v2.py -i /path/to/images/ -c ./config/bank_statement_paddle_vl.yaml
  16. # 开启debug模式(输出可视化图片)
  17. python main_v2.py -i /path/to/doc.pdf -c ./config/xxx.yaml --debug
  18. """
  19. import argparse
  20. import json
  21. import sys
  22. import os
  23. from pathlib import Path
  24. from typing import Optional
  25. from loguru import logger
  26. from datetime import datetime
  27. # 添加 ocr_platform 根目录到 Python 路径(用于导入 ocr_utils)
  28. ocr_platform_root = Path(__file__).parents[2] # universal_doc_parser -> ocr_tools -> ocr_platform -> repository.git
  29. if str(ocr_platform_root) not in sys.path:
  30. sys.path.insert(0, str(ocr_platform_root))
  31. # 添加当前目录到 Python 路径(用于相对导入)
  32. project_root = Path(__file__).parent
  33. if str(project_root) not in sys.path:
  34. sys.path.insert(0, str(project_root))
  35. from dotenv import load_dotenv
  36. load_dotenv(override=True)
  37. from core.pipeline_manager_v2 import EnhancedDocPipeline
  38. from core.pipeline_manager_v2_streaming import StreamingDocPipeline
  39. # 从 ocr_utils 导入工具函数
  40. try:
  41. from ocr_utils import OutputFormatterV2
  42. except ImportError:
  43. # 降级:从 utils 导入(向后兼容)
  44. from utils import OutputFormatterV2
  45. def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None):
  46. """设置日志"""
  47. logger.remove()
  48. # 控制台输出
  49. logger.add(
  50. sys.stdout,
  51. level=log_level,
  52. format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
  53. )
  54. # 文件输出
  55. if log_file:
  56. logger.add(
  57. log_file,
  58. level="DEBUG",
  59. format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
  60. rotation="10 MB"
  61. )
  62. def process_single_input(
  63. input_path: Path,
  64. config_path: Path,
  65. output_dir: Path,
  66. debug: bool = False,
  67. scene: Optional[str] = None,
  68. page_range: Optional[str] = None,
  69. streaming: bool = False
  70. ) -> dict:
  71. """
  72. 处理单个输入(文件或目录)
  73. Args:
  74. input_path: 输入路径
  75. config_path: 配置文件路径
  76. output_dir: 输出目录
  77. debug: 是否开启debug模式
  78. scene: 场景类型覆盖
  79. page_range: 页面范围(如 "1-5,7,9-12")
  80. streaming: 是否使用流式处理模式(按页处理,立即保存,节省内存)
  81. Returns:
  82. 处理结果和输出路径
  83. """
  84. try:
  85. # 选择处理模式
  86. if streaming:
  87. logger.info("🔄 Using streaming processing mode (memory-efficient)")
  88. pipeline_streaming = StreamingDocPipeline(str(config_path), str(output_dir))
  89. use_context = False # StreamingDocPipeline 不使用 context manager
  90. else:
  91. logger.info("🔄 Using batch processing mode (all pages in memory)")
  92. pipeline_batch = EnhancedDocPipeline(str(config_path))
  93. use_context = hasattr(pipeline_batch, '__enter__')
  94. if use_context:
  95. pipeline_batch = pipeline_batch.__enter__()
  96. try:
  97. # 覆盖场景设置
  98. if streaming:
  99. pipeline = pipeline_streaming
  100. else:
  101. pipeline = pipeline_batch
  102. if scene:
  103. pipeline.scene_name = scene
  104. logger.info(f"🔄 Scene overridden to: {scene}")
  105. logger.info(f"🚀 开始处理: {input_path}")
  106. logger.info(f"📋 场景配置: {pipeline.scene_name}")
  107. logger.info(f"📁 输出目录: {output_dir}")
  108. if page_range:
  109. logger.info(f"📄 页面范围: {page_range}")
  110. # 构建输出配置
  111. output_config = {
  112. 'save_json': True,
  113. 'save_markdown': True,
  114. 'save_html': True,
  115. 'save_page_json': True,
  116. 'save_images': True,
  117. 'save_layout_image': debug,
  118. 'save_ocr_image': debug,
  119. 'normalize_numbers': True,
  120. 'merge_cross_page_tables': True,
  121. 'cleanup_temp_files': True,
  122. }
  123. # 处理文档
  124. start_time = datetime.now()
  125. if streaming:
  126. # 流式处理模式
  127. results = pipeline.process_document_streaming( # type: ignore
  128. str(input_path),
  129. page_range=page_range,
  130. output_config=output_config
  131. )
  132. process_time = (datetime.now() - start_time).total_seconds()
  133. # 流式模式已经保存了所有结果,只需要返回摘要
  134. output_paths = results.get('output_paths', {})
  135. # 打印摘要
  136. _print_summary_streaming(results, process_time)
  137. return {
  138. 'success': True,
  139. 'results': results,
  140. 'output_paths': output_paths,
  141. 'process_time': process_time
  142. }
  143. else:
  144. # 批量处理模式(原有逻辑)
  145. results = pipeline.process_document(str(input_path), page_range=page_range)
  146. process_time = (datetime.now() - start_time).total_seconds()
  147. logger.info(f"⏱️ 处理耗时: {process_time:.2f}秒")
  148. # 格式化输出
  149. logger.info("💾 保存结果...")
  150. formatter = OutputFormatterV2(str(output_dir))
  151. output_paths = formatter.save_results(results, output_config)
  152. # 打印摘要
  153. _print_summary(results, output_paths, process_time)
  154. return {
  155. 'success': True,
  156. 'results': results,
  157. 'output_paths': output_paths,
  158. 'process_time': process_time
  159. }
  160. finally:
  161. # 关闭context manager
  162. if not streaming and use_context:
  163. pipeline_batch.__exit__(None, None, None)
  164. except Exception as e:
  165. logger.error(f"❌ 处理失败: {e}")
  166. import traceback
  167. traceback.print_exc()
  168. return {
  169. 'success': False,
  170. 'error': str(e)
  171. }
  172. def _print_summary(results: dict, output_paths: dict, process_time: float):
  173. """打印处理结果摘要"""
  174. total_pages = len(results.get('pages', []))
  175. total_tables = 0
  176. total_text_blocks = 0
  177. total_cells = 0
  178. for page in results.get('pages', []):
  179. for element in page.get('elements', []):
  180. elem_type = element.get('type', '')
  181. if elem_type in ['table', 'table_body']:
  182. total_tables += 1
  183. cells = element.get('content', {}).get('cells', [])
  184. total_cells += len(cells)
  185. elif elem_type in ['text', 'title', 'ocr_text', 'ref_text']:
  186. total_text_blocks += 1
  187. print(f"\n{'='*60}")
  188. print(f"📊 处理摘要")
  189. print(f"{'='*60}")
  190. print(f" 📄 文档: {results.get('document_path', 'N/A')}")
  191. print(f" 🎯 场景: {results.get('scene', 'N/A')}")
  192. print(f" 📋 PDF类型: {results.get('metadata', {}).get('pdf_type', 'N/A')}")
  193. print(f" 📖 页面数: {total_pages}")
  194. print(f" 📋 表格数: {total_tables}")
  195. print(f" 📝 文本块: {total_text_blocks}")
  196. print(f" 🔢 单元格: {total_cells} (带坐标)")
  197. print(f" ⏱️ 耗时: {process_time:.2f}秒")
  198. print(f"{'='*60}")
  199. print(f"📁 输出文件:")
  200. for key, path in output_paths.items():
  201. if isinstance(path, list):
  202. for p in path:
  203. print(f" - {p}")
  204. else:
  205. print(f" - {path}")
  206. print(f"{'='*60}\n")
  207. def _print_summary_streaming(results_summary: dict, process_time: float):
  208. """打印流式处理结果摘要"""
  209. print(f"\n{'='*60}")
  210. print(f"📊 处理摘要(流式模式)")
  211. print(f"{'='*60}")
  212. print(f" 📄 文档: {results_summary.get('document_path', 'N/A')}")
  213. print(f" 🎯 场景: {results_summary.get('scene', 'N/A')}")
  214. print(f" 📋 PDF类型: {results_summary.get('metadata', {}).get('pdf_type', 'N/A')}")
  215. print(f" 📖 页面数: {results_summary.get('total_pages', 0)}")
  216. print(f" ⏱️ 耗时: {process_time:.2f}秒")
  217. print(f"{'='*60}")
  218. print(f"📁 输出文件:")
  219. output_paths = results_summary.get('output_paths', {})
  220. if output_paths.get('middle_json'):
  221. print(f" - {output_paths['middle_json']}")
  222. if output_paths.get('json_pages'):
  223. print(f" - {len(output_paths['json_pages'])} 个页面JSON文件")
  224. if output_paths.get('images'):
  225. print(f" - {len(output_paths['images'])} 个图片文件")
  226. print(f"{'='*60}\n")
  227. def main():
  228. parser = argparse.ArgumentParser(
  229. description="金融文档处理工具 v2",
  230. formatter_class=argparse.RawDescriptionHelpFormatter,
  231. epilog="""
  232. 示例:
  233. # 处理单个PDF文件
  234. python main_v2.py -i document.pdf -c config/bank_statement_mineru_vl.yaml
  235. # 处理图片目录
  236. python main_v2.py -i ./images/ -c config/bank_statement_paddle_vl.yaml
  237. # 开启debug模式(输出可视化图片)
  238. python main_v2.py -i doc.pdf -c config.yaml --debug
  239. # 指定输出目录
  240. python main_v2.py -i doc.pdf -c config.yaml -o ./my_output/
  241. # 指定页面范围(PDF按页码,图片目录按排序位置)
  242. python main_v2.py -i doc.pdf -c config.yaml -p 1-5 # 处理第1-5页
  243. python main_v2.py -i doc.pdf -c config.yaml -p 3,7,10 # 处理第3、7、10页
  244. python main_v2.py -i doc.pdf -c config.yaml -p 1-5,8-10 # 处理第1-5、8-10页
  245. python main_v2.py -i doc.pdf -c config.yaml -p 5- # 从第5页到最后
  246. # 使用流式处理模式(节省内存,适合大文档)
  247. python main_v2.py -i large_doc.pdf -c config.yaml --streaming
  248. """
  249. )
  250. parser.add_argument(
  251. "--input", "-i",
  252. required=True,
  253. help="输入路径(PDF文件、图片文件或图片目录)"
  254. )
  255. parser.add_argument(
  256. "--config", "-c",
  257. required=True,
  258. help="配置文件路径"
  259. )
  260. parser.add_argument(
  261. "--output_dir", "-o",
  262. default="./output",
  263. help="输出目录(默认: ./output)"
  264. )
  265. parser.add_argument(
  266. "--scene", "-s",
  267. choices=["bank_statement", "financial_report"],
  268. help="场景类型(覆盖配置文件设置)"
  269. )
  270. parser.add_argument(
  271. "--debug",
  272. action="store_true",
  273. help="开启debug模式(输出layout和OCR可视化图片)"
  274. )
  275. parser.add_argument(
  276. "--log_level",
  277. default="INFO",
  278. choices=["DEBUG", "INFO", "WARNING", "ERROR"],
  279. help="日志级别(默认: INFO)"
  280. )
  281. parser.add_argument(
  282. "--log_file",
  283. help="日志文件路径"
  284. )
  285. parser.add_argument(
  286. "--dry_run",
  287. action="store_true",
  288. help="仅验证配置,不执行处理"
  289. )
  290. parser.add_argument(
  291. "--pages", "-p",
  292. help="页面范围(PDF按页码,图片目录按排序位置),如: 1-5,7,9-12"
  293. )
  294. parser.add_argument(
  295. "--streaming",
  296. action="store_true",
  297. help="使用流式处理模式(按页处理,立即保存,节省内存,适合大文档)"
  298. )
  299. args = parser.parse_args()
  300. # 设置日志
  301. setup_logging(args.log_level, args.log_file)
  302. # 验证输入
  303. input_path = Path(args.input)
  304. if not input_path.exists():
  305. logger.error(f"❌ 输入路径不存在: {input_path}")
  306. return 1
  307. # 验证配置文件
  308. config_path = Path(args.config)
  309. if not config_path.exists():
  310. logger.error(f"❌ 配置文件不存在: {config_path}")
  311. return 1
  312. # 仅验证模式
  313. if args.dry_run:
  314. logger.info("✅ 配置验证通过(dry run)")
  315. return 0
  316. # 处理文档
  317. result = process_single_input(
  318. input_path=input_path,
  319. config_path=config_path,
  320. output_dir=Path(args.output_dir),
  321. debug=args.debug,
  322. scene=args.scene,
  323. page_range=args.pages,
  324. streaming=args.streaming
  325. )
  326. return 0 if result.get('success') else 1
  327. if __name__ == "__main__":
  328. # 打印环境变量
  329. print(f"🔧 CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")
  330. print(f"🔧 HF_HOME: {os.environ.get('HF_HOME', 'Not set')}")
  331. print(f"🔧 HF_ENDPOINT: {os.environ.get('HF_ENDPOINT', 'Not set')}")
  332. print(f"🔧 HF_HUB_OFFLINE: {os.environ.get('HF_HUB_OFFLINE', 'Not set')}")
  333. print(f"🔧 TORCH_HOME: {os.environ.get('TORCH_HOME', 'Not set')}")
  334. print(f"🔧 MODELSCOPE_CACHE: {os.environ.get('MODELSCOPE_CACHE', 'Not set')}")
  335. print(f"🔧 USE_MODELSCOPE_HUB: {os.environ.get('USE_MODELSCOPE_HUB', 'Not set')}")
  336. print(f"🔧 MINERU_MODEL_SOURCE: {os.environ.get('MINERU_MODEL_SOURCE', 'Not set')}")
  337. if len(sys.argv) == 1:
  338. # 没有命令行参数时,使用默认配置运行
  339. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  340. # 默认配置
  341. default_config = {
  342. # 测试输入
  343. # "input": "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行.pdf",
  344. # "output_dir": "./output/康强_北京农村商业银行_bank_statement_v2",
  345. # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/mineru_vllm_results/2023年度报告母公司/2023年度报告母公司_page_003.png",
  346. # "output_dir": "./output/2023年度报告母公司_bank_statement_v2",
  347. # "input": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水.pdf",
  348. # "output_dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/bank_statement_yusys_v2",
  349. # "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/2023年度报告母公司_page_006_270.png",
  350. # "output_dir": "./output/2023年度报告母公司/bank_statement_wired_unet",
  351. # "input": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司.pdf",
  352. # "output_dir": "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司/bank_statement_yusys_v2",
  353. "input": "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/tests/600916_中国黄金_2022年报_page_096.png",
  354. "output_dir": "./output/600916_中国黄金_2022年报/bank_statement_wired_unet",
  355. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.pdf",
  356. # "output_dir": "/Users/zhch158/workspace/data/流水分析/施博深/bank_statement_yusys_v2",
  357. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.wiredtable/施博深_page_001.png",
  358. # "output_dir": "./output/施博深_page_001_bank_statement_wired_unet",
  359. # "input": "/Users/zhch158/workspace/data/流水分析/施博深.wiredtable",
  360. # "output_dir": "/Users/zhch158/workspace/data/流水分析/施博深/bank_statement_wired_unet",
  361. # 配置文件
  362. "config": "./config/bank_statement_wired_unet.yaml",
  363. # "config": "./config/bank_statement_yusys_v2.yaml",
  364. # "config": "./config/bank_statement_paddle_vl.yaml",
  365. # 场景
  366. "scene": "bank_statement",
  367. # 页面范围(可选)
  368. # "pages": "6", # 只处理前1页
  369. # "pages": "1-3,5,7-10", # 处理指定页面
  370. "streaming": True,
  371. # Debug模式
  372. "debug": True,
  373. # 日志级别
  374. "log_level": "DEBUG",
  375. }
  376. # 构造参数
  377. sys.argv = [sys.argv[0]]
  378. for key, value in default_config.items():
  379. if isinstance(value, bool):
  380. if value:
  381. sys.argv.append(f"--{key}")
  382. else:
  383. sys.argv.extend([f"--{key}", str(value)])
  384. sys.exit(main())