4 tháng trước cách đây · 10872b84e9
--- a/ocr_tools/ocr_batch/batch_process_pdf.py
+++ b/ocr_tools/ocr_batch/batch_process_pdf.py
@@ -14,7 +14,7 @@ import json
 
				 import yaml
			
 
				 from pathlib import Path
			
 
				 from datetime import datetime
			
 
				-from typing import List, Dict, Optional, Any
			
 
				+from typing import List, Dict, Optional, Any, Tuple
			
 
				 from dataclasses import dataclass, field
			
 
				 import logging
			
 
				 from tqdm import tqdm
			
@@ -34,11 +34,19 @@ class ProcessorConfig:
 
				     extra_args: List[str] = field(default_factory=list)
			
 
				     output_subdir: str = "results"
			
 
				     log_subdir: str = "logs"  # 🎯 新增：日志子目录
			
 
				+    scene_arg: Optional[str] = None  # 场景参数名（如 --scene）
			
 
				     venv: Optional[str] = None
			
 
				     description: str = ""
			
 
				 
			
 
				 
			
 
				 @dataclass
			
 
				+class PDFTask:
			
 
				+    """PDF 处理任务"""
			
 
				+    path: Path
			
 
				+    scene: Optional[str] = None
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				 class ProcessResult:
			
 
				     """处理结果"""
			
 
				     pdf_file: str
			
@@ -165,6 +173,7 @@ class ConfigManager:
 
				             extra_args=proc_config.get('extra_args', []),
			
 
				             output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
			
 
				             log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'),  # 🎯 新增
			
 
				+            scene_arg=proc_config.get('scene_arg'),
			
 
				             venv=proc_config.get('venv'),
			
 
				             description=proc_config.get('description', '')
			
 
				         )
			
@@ -188,9 +197,9 @@ class PDFFileFinder:
 
				     def __init__(self, base_dir: str):
			
 
				         self.base_dir = Path(base_dir)
			
 
				     
			
 
				-    def from_file_list(self, list_file: str) -> List[Path]:
			
 
				+    def from_file_list(self, list_file: str) -> List[PDFTask]:
			
 
				         """从文件列表读取"""
			
 
				-        pdf_files = []
			
 
				+        pdf_files: List[PDFTask] = []
			
 
				         
			
 
				         with open(list_file, 'r', encoding='utf-8') as f:
			
 
				             for line in f:
			
@@ -199,27 +208,39 @@ class PDFFileFinder:
 
				                 if not line or line.startswith('#'):
			
 
				                     continue
			
 
				                 
			
 
				+                file_part, scene = self._parse_list_line(line)
			
 
				                 # 构建完整路径
			
 
				-                pdf_path = self._resolve_path(line)
			
 
				+                pdf_path = self._resolve_path(file_part)
			
 
				                 if pdf_path:
			
 
				-                    pdf_files.append(pdf_path)
			
 
				+                    pdf_files.append(PDFTask(path=pdf_path, scene=scene))
			
 
				         
			
 
				         return pdf_files
			
 
				     
			
 
    def from_list(self, pdf_list: List[str]) -> List[PDFTask]:
        """Build tasks from an in-memory list of entries.

        Each entry is parsed by ``_parse_list_line`` into a file part and an
        optional scene, and the file part is resolved by ``_resolve_path``.

        Args:
            pdf_list: Raw entries; surrounding whitespace is ignored.

        Returns:
            One ``PDFTask`` per entry that resolved to a path, in input
            order. Blank entries and entries for which ``_resolve_path``
            returns nothing are dropped.
        """
        pdf_files: List[PDFTask] = []

        for pdf in pdf_list:
            entry = pdf.strip()
            if not entry:
                # Same rule as from_file_list: a blank entry names no file,
                # so it must not reach _resolve_path as an empty path.
                continue

            file_part, scene = self._parse_list_line(entry)
            pdf_path = self._resolve_path(file_part)
            if pdf_path:
                pdf_files.append(PDFTask(path=pdf_path, scene=scene))

        return pdf_files
			
 
				     
			
 
    def find_all(self) -> List[PDFTask]:
        """Collect every ``*.pdf`` below the base directory, recursively.

        Returns:
            One ``PDFTask`` per match, in sorted path order, with no scene
            set on any of them.
        """
        tasks: List[PDFTask] = []
        for pdf_path in sorted(self.base_dir.rglob('*.pdf')):
            tasks.append(PDFTask(path=pdf_path))
        return tasks
			
 
				+
			
 
				+    def _parse_list_line(self, line: str) -> Tuple[str, Optional[str]]:
			
 
				+        """解析列表行（支持  文件<TAB>场景  或  文件,场景）"""
			
 
				+        for sep in ["\t", ","]:
			
 
				+            if sep in line:
			
 
				+                file_part, scene_part = line.split(sep, 1)
			
 
				+                file_part = file_part.strip()
			
 
				+                scene_part = scene_part.strip()
			
 
				+                return file_part, scene_part or None
			
 
				+        return line.strip(), None
			
 
				     
			
 
				     def _resolve_path(self, path_str: str) -> Optional[Path]:
			
 
				         """解析路径"""
			
@@ -263,13 +284,15 @@ class PDFBatchProcessor:
 
				         processor_config: ProcessorConfig,
			
 
				         output_subdir: Optional[str] = None,
			
 
				         log_base_dir: Optional[str] = None,  # 🎯 新增：日志基础目录
			
 
				-        dry_run: bool = False
			
 
				+        dry_run: bool = False,
			
 
				+        default_scene: Optional[str] = None
			
 
				     ):
			
 
				         self.processor_config = processor_config
			
 
				         # 如果指定了output_subdir，使用指定的；否则使用处理器配置中的
			
 
				         self.output_subdir = output_subdir or processor_config.output_subdir
			
 
				         self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs')  # 🎯 新增
			
 
				         self.dry_run = dry_run
			
 
				+        self.default_scene = default_scene
			
 
				         
			
 
				         # 设置日志
			
 
				         self.logger = self._setup_logger()
			
@@ -320,7 +343,7 @@ class PDFBatchProcessor:
 
				         
			
 
				         return log_file
			
 
				     
			
 
				-    def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
			
 
				+    def process_files(self, pdf_files: List[PDFTask]) -> Dict[str, Any]:
			
 
				         """批量处理文件"""
			
 
				         self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
			
 
				         self.logger.info(f"处理器: {self.processor_config.description}")
			
@@ -335,8 +358,8 @@ class PDFBatchProcessor:
 
				         
			
 
				         # 使用进度条
			
 
				         with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
			
 
				-            for pdf_file in pdf_files:
			
 
				-                result = self._process_single_file(pdf_file)
			
 
				+            for task in pdf_files:
			
 
				+                result = self._process_single_file(task)
			
 
				                 self.results.append(result)
			
 
				                 pbar.update(1)
			
 
				                 
			
@@ -355,9 +378,12 @@ class PDFBatchProcessor:
 
				         
			
 
				         return stats
			
 
				     
			
 
				-    def _process_single_file(self, pdf_file: Path) -> ProcessResult:
			
 
				+    def _process_single_file(self, task: PDFTask) -> ProcessResult:
			
 
				         """🎯 处理单个文件（支持日志重定向）"""
			
 
				-        self.logger.info(f"处理: {pdf_file}")
			
 
				+        pdf_file = task.path
			
 
				+        scene = task.scene or self.default_scene
			
 
				+        scene_info = f" (scene: {scene})" if scene else ""
			
 
				+        self.logger.info(f"处理: {pdf_file}{scene_info}")
			
 
				         
			
 
				         # 检查文件是否存在
			
 
				         if not pdf_file.exists():
			
@@ -376,7 +402,7 @@ class PDFBatchProcessor:
 
				         log_file = self._get_log_file_path(pdf_file)
			
 
				         
			
 
				         # 构建命令
			
 
				-        cmd = self._build_command(pdf_file, output_dir)
			
 
				+        cmd = self._build_command(pdf_file, output_dir, scene)
			
 
				         
			
 
				         self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
			
 
				         self.logger.info(f"日志输出: {log_file}")
			
@@ -398,7 +424,7 @@ class PDFBatchProcessor:
 
				                 # 写入日志头
			
 
				                 log_f.write(f"{'='*80}\n")
			
 
				                 log_f.write(f"处理器: {self.processor_config.description}\n")
			
 
				-                log_f.write(f"PDF 文件: {pdf_file}\n")
			
 
				+                log_f.write(f"PDF 文件: {pdf_file}{scene_info}\n")
			
 
				                 log_f.write(f"输出目录: {output_dir}\n")
			
 
				                 log_f.write(f"开始时间: {datetime.now()}\n")
			
 
				                 log_f.write(f"{'='*80}\n\n")
			
@@ -486,7 +512,7 @@ class PDFBatchProcessor:
 
				                 log_file=str(log_file)
			
 
				             )
			
 
				     
			
 
				-    def _build_command(self, pdf_file: Path, output_dir: Path):
			
 
				+    def _build_command(self, pdf_file: Path, output_dir: Path, scene: Optional[str]):
			
 
				         """构建执行命令
			
 
				         
			
 
				         Returns:
			
@@ -503,6 +529,13 @@ class PDFBatchProcessor:
 
				         
			
 
				         # 添加额外参数
			
 
				         base_cmd.extend(self.processor_config.extra_args)
			
 
				+
			
 
				+        # 添加场景参数（如果配置了scene_arg）
			
 
				+        if scene:
			
 
				+            if self.processor_config.scene_arg:
			
 
				+                base_cmd.extend([self.processor_config.scene_arg, scene])
			
 
				+            else:
			
 
				+                self.logger.warning("⚠️ 场景已提供但未配置scene_arg，已忽略场景参数")
			
 
				         
			
 
				         # 如果配置了虚拟环境，构建 shell 命令
			
 
				         if self.processor_config.venv:
			
@@ -690,6 +723,17 @@ def create_parser() -> argparse.ArgumentParser:
 
				         nargs='+',
			
 
				         help='PDF 文件列表 (空格分隔)'
			
 
				     )
			
 
				+
			
 
				+    # 场景参数
			
 
				+    parser.add_argument(
			
 
				+        '--scene',
			
 
				+        help='默认场景名称（文件列表未提供场景时使用）'
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        '--scene-arg',
			
 
				+        default='--scene',
			
 
				+        help='场景参数名称 (默认: --scene)'
			
 
				+    )
			
 
				     
			
 
				     # 额外参数
			
 
				     parser.add_argument(
			
@@ -772,8 +816,12 @@ def main():
 
				             script=args.script,
			
 
				             extra_args=args.extra_args.split() if args.extra_args else [],
			
 
				             output_subdir=args.output_subdir or 'manual_results',
			
 
				+            scene_arg=args.scene_arg,
			
 
				             venv=args.venv
			
 
				         )
			
 
				+        # 指定了 --scene 但 --scene-arg 被设为空值时给出警告：此时场景参数不会传递给脚本
			
 
				+        if args.scene and not processor_config.scene_arg:
			
 
				+            print("⚠️  已指定场景但未配置scene_arg，忽略场景参数")
			
 
				     else:
			
 
				         parser.error("必须指定 -p 或 -s 参数")
			
 
				     
			
@@ -806,18 +854,20 @@ def main():
 
				         return 1
			
 
				     
			
 
				     # 显示找到的文件
			
 
				-    valid_file_paths = [f.as_posix() for f in pdf_files if f.exists()]
			
 
				+    valid_file_paths = [f"{t.path.as_posix()}\t{t.scene}" if t.scene else t.path.as_posix()
			
 
				+                        for t in pdf_files if t.path.exists()]
			
 
				     if valid_file_paths:
			
 
				         print("\n".join(valid_file_paths))    
			
 
				 
			
 
				     # 验证文件
			
 
				-    valid_files = [f for f in pdf_files if f.exists()]
			
 
				-    invalid_files = [f for f in pdf_files if not f.exists()]
			
 
				+    valid_files = [t for t in pdf_files if t.path.exists()]
			
 
				+    invalid_files = [t for t in pdf_files if not t.path.exists()]
			
 
				     
			
 
				     if invalid_files:
			
 
				         print(f"\n⚠️  警告: {len(invalid_files)} 个文件不存在:")
			
 
				-        for f in invalid_files[:5]:
			
 
				-            print(f"  - {f}")
			
 
				+        for t in invalid_files[:5]:
			
 
				+            scene_suffix = f" (scene: {t.scene})" if t.scene else ""
			
 
				+            print(f"  - {t.path}{scene_suffix}")
			
 
				         if len(invalid_files) > 5:
			
 
				             print(f"  ... 还有 {len(invalid_files) - 5} 个")
			
 
				     
			
@@ -834,7 +884,8 @@ def main():
 
				         processor_config=processor_config,
			
 
				         output_subdir=args.output_subdir,
			
 
				         log_base_dir=log_base_dir,  # 🎯 传递日志目录
			
 
				-        dry_run=args.dry_run
			
 
				+        dry_run=args.dry_run,
			
 
				+        default_scene=args.scene
			
 
				     )
			
 
				     
			
 
				     stats = processor.process_files(valid_files)
			
--- a/ocr_tools/ocr_batch/pdf_list.txt
+++ b/ocr_tools/ocr_batch/pdf_list.txt
@@ -1,17 +1,18 @@
 
				-德_内蒙古银行照.pdf
			
 
				-对公_招商银行图.pdf
			
 
				-A用户_单元格扫描流水.pdf
			
 
				-B用户_扫描流水.pdf
			
 
				-康强_北京农村商业银行.pdf
			
 
				-施博深.pdf
			
 
				-山西云集科技有限公司.pdf
			
 
				-2023年度报告母公司.pdf
			
 
				-提取自赤峰黄金2023年报.pdf
			
 
				-许_民生银行图.pdf
			
 
				-方_广发银行图.pdf
			
 
				-付_工商银行943825图.pdf
			
 
				-乔_建设银行图.pdf
			
 
				-湛_平安银行图.pdf
			
 
				-张_微信图.pdf
			
 
				-朱_中信银行图.pdf
			
 
				+# 格式：文件名<TAB>场景 或 文件名,场景（场景：bank_statement / financial_report）
			
 
				+德_内蒙古银行照.pdf,bank_statement
			
 
				+对公_招商银行图.pdf,bank_statement
			
 
				+A用户_单元格扫描流水.pdf,bank_statement
			
 
				+B用户_扫描流水.pdf,bank_statement
			
 
				+康强_北京农村商业银行.pdf,bank_statement
			
 
				+施博深.pdf,bank_statement
			
 
				+山西云集科技有限公司.pdf,bank_statement
			
 
				+2023年度报告母公司.pdf,financial_report
			
 
				+提取自赤峰黄金2023年报.pdf,financial_report
			
 
				+许_民生银行图.pdf,bank_statement
			
 
				+方_广发银行图.pdf,bank_statement
			
 
				+付_工商银行943825图.pdf,bank_statement
			
 
				+乔_建设银行图.pdf,bank_statement
			
 
				+湛_平安银行图.pdf,bank_statement
			
 
				+张_微信图.pdf,bank_statement
			
 
				+朱_中信银行图.pdf,bank_statement
			
 
				 
			
--- a/ocr_tools/ocr_batch/processor_configs.yaml
+++ b/ocr_tools/ocr_batch/processor_configs.yaml
@@ -12,6 +12,7 @@ processors:
 
				     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
			
 
				     input_arg: "--input"
			
 
				     output_arg: "--output_dir"
			
 
				+    scene_arg: "--scene"
			
 
				     extra_args:
			
 
				       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml"
			
 
				       - "--pages=1-35"
			
@@ -27,6 +28,7 @@ processors:
 
				     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
			
 
				     input_arg: "--input"
			
 
				     output_arg: "--output_dir"
			
 
				+    scene_arg: "--scene"
			
 
				     extra_args:
			
 
				       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v3.yaml"
			
 
				       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"
			
@@ -45,6 +47,7 @@ processors:
 
				     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
			
 
				     input_arg: "--input"
			
 
				     output_arg: "--output_dir"
			
 
				+    scene_arg: "--scene"
			
 
				     extra_args:
			
 
				       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl.yaml"
			
 
				       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"