#!/usr/bin/env python3
"""
Unified entry point for financial document processing.

Supports two scenes: bank statements and financial reports.
"""
import argparse
import json
import sys
from collections import Counter
from pathlib import Path

from loguru import logger

# Put the project root on sys.path before importing project-local packages.
project_root = Path(__file__).parents[1]
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from core.pipeline_manager import FinancialDocPipeline
from universal_doc_parser.utils import OutputFormatter
from dotenv import load_dotenv

load_dotenv(override=True)  # Load environment variables

# Element types that are reported together as "text blocks" in the summary.
_TEXT_ELEMENT_TYPES = ('text', 'title', 'ocr_text')


def setup_logging(log_level: str = "INFO") -> None:
    """Send loguru output to stdout at the requested level.

    Args:
        log_level: Level name handed to ``logger.add``.
    """
    logger.remove()  # Remove the default handler
    logger.add(
        sys.stdout,
        level=log_level,
        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
    )


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(description="金融文档处理工具")
    parser.add_argument("--input", "-i", required=True, help="输入文档路径")
    parser.add_argument("--config", "-c", required=True, help="配置文件路径")
    parser.add_argument("--output_dir", "-o", default="./output", help="输出目录")
    parser.add_argument(
        "--scene", "-s",
        choices=["bank_statement", "financial_report"],
        help="场景类型(会覆盖配置文件中的设置)",
    )
    parser.add_argument(
        "--log_level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="日志级别",
    )
    parser.add_argument("--dry_run", action="store_true", help="仅验证配置,不执行处理")
    return parser


def main() -> int:
    """Parse CLI arguments, run the pipeline on one document, save the results.

    Returns:
        0 on success (including a completed dry run); 1 when the input or
        config path does not exist, the run is interrupted with Ctrl-C, or
        any other exception is raised while processing.
    """
    args = _build_arg_parser().parse_args()

    # Configure logging
    setup_logging(args.log_level)

    # Validate the input path
    input_path = Path(args.input)
    if not input_path.exists():
        logger.error(f"❌ Input file not found: {input_path}")
        return 1

    # Validate the config path
    config_path = Path(args.config)
    if not config_path.exists():
        logger.error(f"❌ Config file not found: {config_path}")
        return 1

    try:
        # Initialise the processing pipeline
        with FinancialDocPipeline(str(config_path)) as pipeline:
            # A scene given on the command line overrides the config file.
            # NOTE(review): only the attribute is reassigned here; whether the
            # pipeline re-reads scene-specific settings afterwards is not
            # visible from this file - confirm.
            if args.scene:
                pipeline.scene_name = args.scene
                logger.info(f"🔄 Scene overridden to: {args.scene}")

            logger.info(f"🚀 开始处理文档: {input_path}")
            logger.info(f"📋 使用场景配置: {pipeline.scene_name}")
            logger.info(f"📁 输出目录: {args.output_dir}")

            # Validation-only mode: the pipeline was constructed, nothing is processed.
            if args.dry_run:
                logger.info("✅ Dry run completed - configuration is valid")
                return 0

            # Process the document
            results = pipeline.process_document(str(input_path))

            # Format and persist the output
            logger.info("💾 Saving results...")
            formatter = OutputFormatter(args.output_dir)
            output_paths = formatter.save_results(results, pipeline.config['output'])

            logger.info(f"✅ 处理完成,结果保存至: {output_paths}")

            # Print key statistics
            _print_summary(results)

            return 0

    except KeyboardInterrupt:
        logger.warning("⚠️ Process interrupted by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"❌ Processing failed: {e}")
        if args.log_level == "DEBUG":
            logger.exception("Full traceback:")
        return 1


def _print_summary(results: dict) -> None:
    """Print a short summary of the processing results to stdout.

    Element types are tallied in a single pass over all pages. Missing keys
    are tolerated, because this runs after the results have already been
    saved: an incomplete ``results`` dict should not make ``main`` report the
    whole run as failed.

    Args:
        results: Result mapping. The code reads ``'pages'`` (an iterable of
            dicts, each with an ``'elements'`` list of dicts carrying a
            ``'type'`` key), ``'document_path'`` and ``'scene'``.
    """
    pages = results.get('pages') or []
    type_counts = Counter(
        element.get('type')
        for page in pages
        for element in (page.get('elements') or [])
    )

    total_pages = len(pages)
    total_tables = type_counts['table']
    total_text_blocks = sum(type_counts[kind] for kind in _TEXT_ELEMENT_TYPES)
    total_formulas = type_counts['formula']

    print("\n📊 处理摘要:")
    print(f" 📄 文档: {results.get('document_path', 'unknown')}")
    print(f" 🎯 场景类型: {results.get('scene', 'unknown')}")
    print(f" 📖 页面数量: {total_pages}")
    print(f" 📋 表格数量: {total_tables}")
    print(f" 📝 文本块数量: {total_text_blocks}")
    print(f" 🧮 公式数量: {total_formulas}")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No command-line arguments: fall back to a built-in default run.
        print("ℹ️ No command line arguments provided. Running with default configuration...")

        # Default arguments. The input path is an absolute path on a
        # developer machine, so this branch only works where that file exists.
        default_config = {
            "input": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/PaddleOCR_VL_Results/B用户_扫描流水/B用户_扫描流水_page_022.png",
            "output_dir": "./output/bank_statement_enhanced",
            "config": "./config/bank_statement_enhanced.yaml",
            "scene": "bank_statement",
            "log_level": "DEBUG",
        }

        # Rebuild sys.argv so argparse sees the defaults as real options.
        sys.argv = [sys.argv[0]]
        for key, value in default_config.items():
            sys.argv.extend([f"--{key}", str(value)])

    sys.exit(main())