Browse Source

feat(pdf_processing): 添加场景参数支持,优化 PDF 处理任务配置

zhch158_admin 1 tuần trước cách đây
mục cha
commit
10872b84e9

+ 77 - 26
ocr_tools/ocr_batch/batch_process_pdf.py

@@ -14,7 +14,7 @@ import json
 import yaml
 from pathlib import Path
 from datetime import datetime
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Tuple
 from dataclasses import dataclass, field
 import logging
 from tqdm import tqdm
@@ -34,11 +34,19 @@ class ProcessorConfig:
     extra_args: List[str] = field(default_factory=list)
     output_subdir: str = "results"
     log_subdir: str = "logs"  # 🎯 新增:日志子目录
+    scene_arg: Optional[str] = None  # 场景参数名(如 --scene)
     venv: Optional[str] = None
     description: str = ""
 
 
 @dataclass
+class PDFTask:
+    """PDF 处理任务"""
+    path: Path
+    scene: Optional[str] = None
+
+
+@dataclass
 class ProcessResult:
     """处理结果"""
     pdf_file: str
@@ -165,6 +173,7 @@ class ConfigManager:
             extra_args=proc_config.get('extra_args', []),
             output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
             log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'),  # 🎯 新增
+            scene_arg=proc_config.get('scene_arg'),
             venv=proc_config.get('venv'),
             description=proc_config.get('description', '')
         )
@@ -188,9 +197,9 @@ class PDFFileFinder:
     def __init__(self, base_dir: str):
         self.base_dir = Path(base_dir)
     
-    def from_file_list(self, list_file: str) -> List[Path]:
+    def from_file_list(self, list_file: str) -> List[PDFTask]:
         """从文件列表读取"""
-        pdf_files = []
+        pdf_files: List[PDFTask] = []
         
         with open(list_file, 'r', encoding='utf-8') as f:
             for line in f:
@@ -199,27 +208,39 @@ class PDFFileFinder:
                 if not line or line.startswith('#'):
                     continue
                 
+                file_part, scene = self._parse_list_line(line)
                 # 构建完整路径
-                pdf_path = self._resolve_path(line)
+                pdf_path = self._resolve_path(file_part)
                 if pdf_path:
-                    pdf_files.append(pdf_path)
+                    pdf_files.append(PDFTask(path=pdf_path, scene=scene))
         
         return pdf_files
     
-    def from_list(self, pdf_list: List[str]) -> List[Path]:
+    def from_list(self, pdf_list: List[str]) -> List[PDFTask]:
         """从列表读取"""
-        pdf_files = []
+        pdf_files: List[PDFTask] = []
         
         for pdf in pdf_list:
-            pdf_path = self._resolve_path(pdf.strip())
+            file_part, scene = self._parse_list_line(pdf.strip())
+            pdf_path = self._resolve_path(file_part)
             if pdf_path:
-                pdf_files.append(pdf_path)
+                pdf_files.append(PDFTask(path=pdf_path, scene=scene))
         
         return pdf_files
     
-    def find_all(self) -> List[Path]:
+    def find_all(self) -> List[PDFTask]:
         """查找基础目录下所有 PDF"""
-        return sorted(self.base_dir.rglob('*.pdf'))
+        return [PDFTask(path=path) for path in sorted(self.base_dir.rglob('*.pdf'))]
+
+    def _parse_list_line(self, line: str) -> Tuple[str, Optional[str]]:
+        """解析列表行(支持  文件<TAB>场景  或  文件,场景)"""
+        for sep in ["\t", ","]:
+            if sep in line:
+                file_part, scene_part = line.split(sep, 1)
+                file_part = file_part.strip()
+                scene_part = scene_part.strip()
+                return file_part, scene_part or None
+        return line.strip(), None
     
     def _resolve_path(self, path_str: str) -> Optional[Path]:
         """解析路径"""
@@ -263,13 +284,15 @@ class PDFBatchProcessor:
         processor_config: ProcessorConfig,
         output_subdir: Optional[str] = None,
         log_base_dir: Optional[str] = None,  # 🎯 新增:日志基础目录
-        dry_run: bool = False
+        dry_run: bool = False,
+        default_scene: Optional[str] = None
     ):
         self.processor_config = processor_config
         # 如果指定了output_subdir,使用指定的;否则使用处理器配置中的
         self.output_subdir = output_subdir or processor_config.output_subdir
         self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs')  # 🎯 新增
         self.dry_run = dry_run
+        self.default_scene = default_scene
         
         # 设置日志
         self.logger = self._setup_logger()
@@ -320,7 +343,7 @@ class PDFBatchProcessor:
         
         return log_file
     
-    def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
+    def process_files(self, pdf_files: List[PDFTask]) -> Dict[str, Any]:
         """批量处理文件"""
         self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
         self.logger.info(f"处理器: {self.processor_config.description}")
@@ -335,8 +358,8 @@ class PDFBatchProcessor:
         
         # 使用进度条
         with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
-            for pdf_file in pdf_files:
-                result = self._process_single_file(pdf_file)
+            for task in pdf_files:
+                result = self._process_single_file(task)
                 self.results.append(result)
                 pbar.update(1)
                 
@@ -355,9 +378,12 @@ class PDFBatchProcessor:
         
         return stats
     
-    def _process_single_file(self, pdf_file: Path) -> ProcessResult:
+    def _process_single_file(self, task: PDFTask) -> ProcessResult:
         """🎯 处理单个文件(支持日志重定向)"""
-        self.logger.info(f"处理: {pdf_file}")
+        pdf_file = task.path
+        scene = task.scene or self.default_scene
+        scene_info = f" (scene: {scene})" if scene else ""
+        self.logger.info(f"处理: {pdf_file}{scene_info}")
         
         # 检查文件是否存在
         if not pdf_file.exists():
@@ -376,7 +402,7 @@ class PDFBatchProcessor:
         log_file = self._get_log_file_path(pdf_file)
         
         # 构建命令
-        cmd = self._build_command(pdf_file, output_dir)
+        cmd = self._build_command(pdf_file, output_dir, scene)
         
         self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
         self.logger.info(f"日志输出: {log_file}")
@@ -398,7 +424,7 @@ class PDFBatchProcessor:
                 # 写入日志头
                 log_f.write(f"{'='*80}\n")
                 log_f.write(f"处理器: {self.processor_config.description}\n")
-                log_f.write(f"PDF 文件: {pdf_file}\n")
+                log_f.write(f"PDF 文件: {pdf_file}{scene_info}\n")
                 log_f.write(f"输出目录: {output_dir}\n")
                 log_f.write(f"开始时间: {datetime.now()}\n")
                 log_f.write(f"{'='*80}\n\n")
@@ -486,7 +512,7 @@ class PDFBatchProcessor:
                 log_file=str(log_file)
             )
     
-    def _build_command(self, pdf_file: Path, output_dir: Path):
+    def _build_command(self, pdf_file: Path, output_dir: Path, scene: Optional[str]):
         """构建执行命令
         
         Returns:
@@ -503,6 +529,13 @@ class PDFBatchProcessor:
         
         # 添加额外参数
         base_cmd.extend(self.processor_config.extra_args)
+
+        # 添加场景参数(如果配置了scene_arg)
+        if scene:
+            if self.processor_config.scene_arg:
+                base_cmd.extend([self.processor_config.scene_arg, scene])
+            else:
+                self.logger.warning("⚠️ 场景已提供但未配置scene_arg,已忽略场景参数")
         
         # 如果配置了虚拟环境,构建 shell 命令
         if self.processor_config.venv:
@@ -690,6 +723,17 @@ def create_parser() -> argparse.ArgumentParser:
         nargs='+',
         help='PDF 文件列表 (空格分隔)'
     )
+
+    # 场景参数
+    parser.add_argument(
+        '--scene',
+        help='默认场景名称(文件列表未提供场景时使用)'
+    )
+    parser.add_argument(
+        '--scene-arg',
+        default='--scene',
+        help='场景参数名称 (默认: --scene)'
+    )
     
     # 额外参数
     parser.add_argument(
@@ -772,8 +816,12 @@ def main():
             script=args.script,
             extra_args=args.extra_args.split() if args.extra_args else [],
             output_subdir=args.output_subdir or 'manual_results',
+            scene_arg=args.scene_arg,
             venv=args.venv
         )
+        # Warn when a scene was given but scene_arg is empty (e.g. --scene-arg ''): the scene cannot be passed on and is ignored
+        if args.scene and not processor_config.scene_arg:
+            print("⚠️  已指定场景但未配置scene_arg,忽略场景参数")
     else:
         parser.error("必须指定 -p 或 -s 参数")
     
@@ -806,18 +854,20 @@ def main():
         return 1
     
     # 显示找到的文件
-    valid_file_paths = [f.as_posix() for f in pdf_files if f.exists()]
+    valid_file_paths = [f"{t.path.as_posix()}\t{t.scene}" if t.scene else t.path.as_posix()
+                        for t in pdf_files if t.path.exists()]
     if valid_file_paths:
         print("\n".join(valid_file_paths))    
 
     # 验证文件
-    valid_files = [f for f in pdf_files if f.exists()]
-    invalid_files = [f for f in pdf_files if not f.exists()]
+    valid_files = [t for t in pdf_files if t.path.exists()]
+    invalid_files = [t for t in pdf_files if not t.path.exists()]
     
     if invalid_files:
         print(f"\n⚠️  警告: {len(invalid_files)} 个文件不存在:")
-        for f in invalid_files[:5]:
-            print(f"  - {f}")
+        for t in invalid_files[:5]:
+            scene_suffix = f" (scene: {t.scene})" if t.scene else ""
+            print(f"  - {t.path}{scene_suffix}")
         if len(invalid_files) > 5:
             print(f"  ... 还有 {len(invalid_files) - 5} 个")
     
@@ -834,7 +884,8 @@ def main():
         processor_config=processor_config,
         output_subdir=args.output_subdir,
         log_base_dir=log_base_dir,  # 🎯 传递日志目录
-        dry_run=args.dry_run
+        dry_run=args.dry_run,
+        default_scene=args.scene
     )
     
     stats = processor.process_files(valid_files)

+ 17 - 16
ocr_tools/ocr_batch/pdf_list.txt

@@ -1,17 +1,18 @@
-德_内蒙古银行照.pdf
-对公_招商银行图.pdf
-A用户_单元格扫描流水.pdf
-B用户_扫描流水.pdf
-康强_北京农村商业银行.pdf
-施博深.pdf
-山西云集科技有限公司.pdf
-2023年度报告母公司.pdf
-提取自赤峰黄金2023年报.pdf
-许_民生银行图.pdf
-方_广发银行图.pdf
-付_工商银行943825图.pdf
-乔_建设银行图.pdf
-湛_平安银行图.pdf
-张_微信图.pdf
-朱_中信银行图.pdf
+# Format: filename<TAB>scene  or  filename,scene  (scene: bank_statement / financial_report)
+德_内蒙古银行照.pdf,bank_statement
+对公_招商银行图.pdf,bank_statement
+A用户_单元格扫描流水.pdf,bank_statement
+B用户_扫描流水.pdf,bank_statement
+康强_北京农村商业银行.pdf,bank_statement
+施博深.pdf,bank_statement
+山西云集科技有限公司.pdf,bank_statement
+2023年度报告母公司.pdf,financial_report
+提取自赤峰黄金2023年报.pdf,financial_report
+许_民生银行图.pdf,bank_statement
+方_广发银行图.pdf,bank_statement
+付_工商银行943825图.pdf,bank_statement
+乔_建设银行图.pdf,bank_statement
+湛_平安银行图.pdf,bank_statement
+张_微信图.pdf,bank_statement
+朱_中信银行图.pdf,bank_statement
 

+ 3 - 0
ocr_tools/ocr_batch/processor_configs.yaml

@@ -12,6 +12,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml"
       - "--pages=1-35"
@@ -27,6 +28,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v3.yaml"
       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"
@@ -45,6 +47,7 @@ processors:
     script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
     input_arg: "--input"
     output_arg: "--output_dir"
+    scene_arg: "--scene"
     extra_args:
       - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl.yaml"
       # - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"