Explorar o código

feat: Add batch processing tools for PDF and OCR results

- Introduced `batch_merge_results.py` for merging OCR results from various processors, supporting logging and configuration management.
- Added `batch_process_pdf.py` for batch processing of PDF files with multiple processors, including virtual environment management and logging.
- Created `processor_configs.yaml` for defining processor configurations and their parameters.
- Included example PDF list and comprehensive README documentation for usage instructions and features.
- Enhanced logging capabilities for better tracking of processing results and errors.
zhch158_admin hai 1 semana
pai
achega
f961184f0b

+ 78 - 0
ocr_tools/ocr_batch/README.md

@@ -0,0 +1,78 @@
+# batch_process_pdf.py 使用说明
+
+**位置**: `ocr_platform/ocr_tools/ocr_batch/`
+
+**处理器配置文件**: `processor_configs.yaml`
+
+## 功能概述
+### ✅ 日志输出
+- 显示使用的虚拟环境
+- 在 dry-run 模式下显示完整命令
+
+## 使用示例
+
+```bash
+# 1. 使用 DotsOCR(自动切换到 py312 环境)
+python batch_process_pdf.py -p dotsocr_vllm -f pdf_list.txt
+
+# 2. 使用 MinerU(自动切换到 mineru2 环境)
+python batch_process_pdf.py -p mineru_vllm -f pdf_list.txt
+
+# 3. 使用 PaddleOCR(自动切换到 paddle_env)
+python batch_process_pdf.py -p paddleocr_vl_single_process -f pdf_list.txt
+
+# 4. 模拟运行查看完整命令
+python batch_process_pdf.py -p dotsocr_vllm -f pdf_list.txt --dry-run -v
+
+# 5. 覆盖虚拟环境
+python batch_process_pdf.py -p dotsocr_vllm -f pdf_list.txt \
+    --venv "conda activate custom_env"
+
+# 6. 手动指定脚本和虚拟环境
+python batch_process_pdf.py \
+    -s /path/to/script.py \
+    --venv "conda activate myenv" \
+    -f pdf_list.txt
+```
+
+## 实际执行的命令示例
+
+### DotsOCR:
+```bash
+conda activate py312 && python /path/to/dotsocr_vllm_multthreads.py \
+    --input_file /path/to/file.pdf \
+    --output_dir /path/to/output \
+    --ip=10.192.72.11 --port=8101 --dpi=200
+```
+
+### MinerU:
+```bash
+conda activate mineru2 && python /path/to/mineru2_vllm_multthreads.py \
+    --input_file /path/to/file.pdf \
+    --output_dir /path/to/output \
+    --server_url=http://10.192.72.11:8121
+```
+
+### PaddleOCR-VL:
+```bash
+source /path/to/paddle_env/bin/activate && python /path/to/ocr_platform/ocr_tools/paddle_vl_tool/main.py \
+    --input /path/to/file.pdf \
+    --output_dir /path/to/output \
+    --pipeline=/path/to/config.yaml
+```
+
+### PP-StructureV3:
+```bash
+source /path/to/paddle_env/bin/activate && python /path/to/ocr_platform/ocr_tools/ppstructure_tool/main.py \
+    --input /path/to/file.pdf \
+    --output_dir /path/to/output \
+    --pipeline=/path/to/config.yaml
+```
+
+### PP-StructureV3 API Client:
+```bash
+source /path/to/paddle_env/bin/activate && python /path/to/ocr_platform/ocr_tools/ppstructure_tool/api_client.py \
+    --input /path/to/file.pdf \
+    --output_dir /path/to/output \
+    --api_url=http://10.192.72.11:8111/layout-parsing
+```

+ 736 - 0
ocr_tools/ocr_batch/batch_merge_results.py

@@ -0,0 +1,736 @@
+#!/usr/bin/env python3
+"""
+批量合并 OCR 结果
+自动读取配置文件,对所有 VL 处理器的输出进行 bbox 合并
+支持执行器输出日志重定向
+"""
+
+import os
+import sys
+import yaml
+import argparse
+import subprocess
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Tuple, Optional, Any
+from dataclasses import dataclass
+import logging
+from tqdm import tqdm
+
# Make the ocr_platform root importable (the ocr_merger package lives there).
# Directory layout: ocr_batch -> ocr_tools -> ocr_platform.
ocr_platform_root = Path(__file__).parents[2]
_root = str(ocr_platform_root)
if _root not in sys.path:
    sys.path.insert(0, _root)
+
+
@dataclass
class MergeTask:
    """One merge job: combine a VL processor's output with PaddleOCR output."""
    processor_name: str        # processor key from the config file
    vl_result_dir: Path        # VL OCR result directory
    paddle_result_dir: Path    # PaddleOCR (PP-Structure) result directory
    output_dir: Path           # destination for merged results
    merger_script: str         # path of the merge script to execute
    description: str           # human-readable processor description
    log_file: str = ""         # per-task log file path (filled at discovery time)
+
+
+class BatchMerger:
+    """批量合并器"""
+    
+    # VL 处理器类型映射到合并脚本
+    MERGER_SCRIPTS = {
+        'paddleocr_vl': 'merge_paddleocr_vl_paddleocr.py',
+        'mineru': 'merge_mineru_paddle_ocr.py',
+        'dotsocr': 'merge_dotsocr_paddleocr.py'
+    }
+    
+    def __init__(self, config_file: str, base_dir: str = None):
+        """
+        Args:
+            config_file: processor_configs.yaml 路径
+            base_dir: PDF 基础目录,覆盖配置文件中的设置
+        """
+        self.config_file = Path(config_file)
+        self.config = self._load_config()
+        self.base_dir = Path(base_dir) if base_dir else Path(self.config['global']['base_dir'])
+        
+        # 🎯 日志基础目录
+        self.log_base_dir = self.base_dir / self.config['global'].get('log_dir', 'logs')
+        
+        # 设置日志
+        self.logger = self._setup_logger()
+        
+        # merger 脚本目录
+        # 从 ocr_batch 目录计算路径:ocr_batch -> ocr_tools -> ocr_platform
+        ocr_platform_root = Path(__file__).parents[2]  # ocr_batch -> ocr_tools -> ocr_platform
+        self.merger_dir = ocr_platform_root / 'ocr_tools' / 'ocr_merger'
+        
+        # 🎯 统计信息
+        self.merge_results: List[Dict[str, Any]] = []
+    
+    def _load_config(self) -> Dict:
+        """加载配置文件"""
+        with open(self.config_file, 'r', encoding='utf-8') as f:
+            return yaml.safe_load(f)
+    
+    def _setup_logger(self) -> logging.Logger:
+        """设置日志"""
+        logger = logging.getLogger('BatchMerger')
+        logger.setLevel(logging.INFO)
+        
+        if not logger.handlers:
+            console_handler = logging.StreamHandler()
+            console_handler.setLevel(logging.INFO)
+            formatter = logging.Formatter(
+                '%(asctime)s - %(levelname)s - %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S'
+            )
+            console_handler.setFormatter(formatter)
+            logger.addHandler(console_handler)
+        
+        return logger
+    
+    def _detect_processor_type(self, processor_name: str) -> str:
+        """
+        检测处理器类型
+        
+        Returns:
+            'paddleocr_vl', 'mineru', 'dotsocr', 'ppstructure' 等
+        """
+        name_lower = processor_name.lower()
+        
+        if 'paddleocr_vl' in name_lower or 'paddleocr-vl' in name_lower:
+            return 'paddleocr_vl'
+        elif 'mineru' in name_lower:
+            return 'mineru'
+        elif 'dotsocr' in name_lower or 'dots' in name_lower:
+            return 'dotsocr'
+        elif 'ppstructure' in name_lower or 'pp-structure' in name_lower:
+            return 'ppstructure'
+        else:
+            return 'unknown'
+    
+    def _get_merger_script(self, processor_type: str) -> str:
+        """获取合并脚本路径"""
+        script_name = self.MERGER_SCRIPTS.get(processor_type)
+        if not script_name:
+            return None
+        
+        script_path = self.merger_dir / script_name
+        return str(script_path) if script_path.exists() else None
+    
+    def _find_paddle_result_dir(self, pdf_dir: Path) -> Path:
+        """
+        查找对应的 PaddleOCR 结果目录
+        
+        优先级:
+        1. ppstructurev3_cpu_results (本地 CPU)
+        2. ppstructurev3_results (默认)
+        3. data_PPStructureV3_Results (旧格式)
+        """
+        candidates = [
+            pdf_dir / 'ppstructurev3_client_results',
+            pdf_dir / 'ppstructurev3_single_process_results',
+        ]
+        
+        for candidate in candidates:
+            if candidate.exists():
+                return candidate
+        
+        return None
+    
+    def _get_log_file_path(self, pdf_dir: Path, processor_name: str) -> Path:
+        """
+        🎯 获取合并任务的日志文件路径
+        
+        日志结构:
+        PDF目录/
+        └── logs/
+            └── merge_processor_name/
+                └── PDF名称_merge_YYYYMMDD_HHMMSS.log
+        """
+        # 日志目录
+        log_dir = pdf_dir / 'logs' / f'merge_{processor_name}'
+        log_dir.mkdir(parents=True, exist_ok=True)
+        
+        # 日志文件名
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        log_file = log_dir / f"{pdf_dir.name}_merge_{timestamp}.log"
+        
+        return log_file
+    
+    def discover_merge_tasks(
+        self, 
+        pdf_list: List[str] = None,
+        processors: List[str] = None
+    ) -> List[MergeTask]:
+        """
+        自动发现需要合并的任务
+        
+        Args:
+            pdf_list: PDF 文件列表(不含扩展名),如 ['德_内蒙古银行照', ...]
+            processors: 处理器列表,如 ['paddleocr_vl_single_process', ...]
+        
+        Returns:
+            MergeTask 列表
+        """
+        tasks = []
+        
+        # 如果没有指定处理器,扫描所有 VL 类型的处理器
+        if not processors:
+            processors = []
+            for proc_name, proc_config in self.config['processors'].items():
+                proc_type = self._detect_processor_type(proc_name)
+                if proc_type in ['paddleocr_vl', 'mineru', 'dotsocr']:
+                    processors.append(proc_name)
+        
+        # 如果没有指定 PDF 列表,扫描基础目录
+        if not pdf_list:
+            pdf_list = [d.name for d in self.base_dir.iterdir() if d.is_dir()]
+        
+        self.logger.info(f"📂 基础目录: {self.base_dir}")
+        self.logger.info(f"🔍 发现 {len(pdf_list)} 个 PDF 目录")
+        self.logger.info(f"⚙️  发现 {len(processors)} 个 VL 处理器")
+        
+        # 遍历每个 PDF 目录和处理器组合
+        for pdf_name in pdf_list:
+            pdf_dir = self.base_dir / pdf_name
+            
+            if not pdf_dir.exists():
+                self.logger.warning(f"⚠️  目录不存在: {pdf_dir}")
+                continue
+            
+            # 查找 PaddleOCR 结果目录
+            paddle_result_dir = self._find_paddle_result_dir(pdf_dir)
+            
+            if not paddle_result_dir:
+                self.logger.warning(f"⚠️  未找到 PaddleOCR 结果: {pdf_name}")
+                continue
+            
+            # 遍历每个 VL 处理器
+            for proc_name in processors:
+                if proc_name not in self.config['processors']:
+                    self.logger.warning(f"⚠️  处理器不存在: {proc_name}")
+                    continue
+                
+                proc_config = self.config['processors'][proc_name]
+                proc_type = self._detect_processor_type(proc_name)
+                
+                # 获取合并脚本
+                merger_script = self._get_merger_script(proc_type)
+                if not merger_script:
+                    self.logger.warning(f"⚠️  不支持的处理器类型: {proc_name} ({proc_type})")
+                    continue
+                
+                # VL 结果目录
+                vl_output_subdir = proc_config.get('output_subdir', f'{proc_name}_results')
+                vl_result_dir = pdf_dir / vl_output_subdir
+                
+                if not vl_result_dir.exists():
+                    self.logger.debug(f"⏭️  VL 结果不存在: {vl_result_dir}")
+                    continue
+                
+                # 输出目录
+                output_dir = pdf_dir / f"{vl_output_subdir}_cell_bbox"
+                
+                # 🎯 日志文件路径
+                log_file = self._get_log_file_path(pdf_dir, proc_name)
+                
+                # 创建任务
+                task = MergeTask(
+                    processor_name=proc_name,
+                    vl_result_dir=vl_result_dir,
+                    paddle_result_dir=paddle_result_dir,
+                    output_dir=output_dir,
+                    merger_script=merger_script,
+                    description=proc_config.get('description', proc_name),
+                    log_file=str(log_file)  # 🎯 新增
+                )
+                
+                tasks.append(task)
+        
+        return tasks
+    
+    def execute_merge_task(
+        self, 
+        task: MergeTask,
+        window: int = 15,
+        threshold: int = 85,
+        output_type: str = 'both',
+        dry_run: bool = False
+    ) -> Dict[str, Any]:
+        """
+        🎯 执行单个合并任务(支持日志重定向)
+        
+        Args:
+            task: 合并任务
+            window: 查找窗口
+            threshold: 相似度阈值
+            output_type: 输出格式
+            dry_run: 模拟运行
+        
+        Returns:
+            执行结果字典
+        """
+        self.logger.info(f"\n{'='*80}")
+        self.logger.info(f"📄 处理: {task.vl_result_dir.parent.name}")
+        self.logger.info(f"🔧 处理器: {task.description}")
+        self.logger.info(f"📂 VL 结果: {task.vl_result_dir}")
+        self.logger.info(f"📂 PaddleOCR 结果: {task.paddle_result_dir}")
+        self.logger.info(f"📂 输出目录: {task.output_dir}")
+        self.logger.info(f"📄 日志文件: {task.log_file}")
+        self.logger.info(f"{'='*80}")
+        
+        # 构建命令
+        cmd = [
+            sys.executable,  # 当前 Python 解释器
+            task.merger_script,
+            f"--{self._get_vl_arg_name(task.merger_script)}-dir", str(task.vl_result_dir),
+            '--paddle-dir', str(task.paddle_result_dir),
+            '--output-dir', str(task.output_dir),
+            '--output-type', output_type,
+            '--window', str(window),
+            '--threshold', str(threshold)
+        ]
+        
+        if dry_run:
+            self.logger.info(f"[DRY RUN] 命令: {' '.join(cmd)}")
+            return {
+                'task': task,
+                'success': True,
+                'duration': 0,
+                'error': '',
+                'dry_run': True
+            }
+        
+        # 🎯 执行命令并重定向输出到日志文件
+        import time
+        start_time = time.time()
+        
+        try:
+            with open(task.log_file, 'w', encoding='utf-8') as log_f:
+                # 写入日志头
+                log_f.write(f"{'='*80}\n")
+                log_f.write(f"合并任务日志\n")
+                log_f.write(f"{'='*80}\n\n")
+                log_f.write(f"PDF 目录: {task.vl_result_dir.parent}\n")
+                log_f.write(f"处理器: {task.description}\n")
+                log_f.write(f"处理器名称: {task.processor_name}\n")
+                log_f.write(f"VL 结果目录: {task.vl_result_dir}\n")
+                log_f.write(f"PaddleOCR 结果目录: {task.paddle_result_dir}\n")
+                log_f.write(f"输出目录: {task.output_dir}\n")
+                log_f.write(f"合并脚本: {task.merger_script}\n")
+                log_f.write(f"查找窗口: {window}\n")
+                log_f.write(f"相似度阈值: {threshold}\n")
+                log_f.write(f"输出格式: {output_type}\n")
+                log_f.write(f"开始时间: {datetime.now()}\n")
+                log_f.write(f"{'='*80}\n\n")
+                log_f.flush()
+                
+                # 执行命令
+                result = subprocess.run(
+                    cmd,
+                    stdout=log_f,  # 🎯 重定向 stdout
+                    stderr=subprocess.STDOUT,  # 🎯 合并 stderr 到 stdout
+                    text=True,
+                    check=True
+                )
+                
+                # 写入日志尾
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 成功\n")
+                log_f.write(f"{'='*80}\n")
+            
+            duration = time.time() - start_time
+            self.logger.info(f"✅ 合并成功 (耗时: {duration:.2f}秒)")
+            
+            return {
+                'task': task,
+                'success': True,
+                'duration': duration,
+                'error': '',
+                'dry_run': False
+            }
+            
+        except subprocess.CalledProcessError as e:
+            duration = time.time() - start_time
+            error_msg = f"命令执行失败 (退出码: {e.returncode})"
+            
+            # 🎯 在日志文件中追加错误信息
+            with open(task.log_file, 'a', encoding='utf-8') as log_f:
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 失败\n")
+                log_f.write(f"错误: {error_msg}\n")
+                log_f.write(f"{'='*80}\n")
+            
+            self.logger.error(f"❌ 合并失败 (耗时: {duration:.2f}秒)")
+            self.logger.error(f"错误信息: {error_msg}")
+            self.logger.error(f"详细日志: {task.log_file}")
+            
+            return {
+                'task': task,
+                'success': False,
+                'duration': duration,
+                'error': error_msg,
+                'dry_run': False
+            }
+            
+        except Exception as e:
+            duration = time.time() - start_time
+            error_msg = str(e)
+            
+            with open(task.log_file, 'a', encoding='utf-8') as log_f:
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 异常\n")
+                log_f.write(f"错误: {error_msg}\n")
+                log_f.write(f"{'='*80}\n")
+            
+            self.logger.error(f"❌ 合并异常 (耗时: {duration:.2f}秒)")
+            self.logger.error(f"错误信息: {error_msg}")
+            self.logger.error(f"详细日志: {task.log_file}")
+            
+            return {
+                'task': task,
+                'success': False,
+                'duration': duration,
+                'error': error_msg,
+                'dry_run': False
+            }
+    
+    def _get_vl_arg_name(self, merger_script: str) -> str:
+        """获取 VL 参数名称"""
+        script_name = Path(merger_script).stem
+        
+        if 'paddleocr_vl' in script_name:
+            return 'paddleocr-vl'
+        elif 'mineru' in script_name:
+            return 'mineru'
+        elif 'dotsocr' in script_name:
+            return 'dotsocr'
+        else:
+            return 'vl'
+    
+    def _save_summary_log(self, stats: Dict[str, Any]):
+        """🎯 保存汇总日志"""
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        summary_log_file = self.log_base_dir / f"merge_batch_summary_{timestamp}.log"
+        
+        # 确保目录存在
+        summary_log_file.parent.mkdir(parents=True, exist_ok=True)
+        
+        with open(summary_log_file, 'w', encoding='utf-8') as f:
+            f.write("OCR 结果批量合并汇总日志\n")
+            f.write("=" * 80 + "\n\n")
+            
+            f.write(f"配置文件: {self.config_file}\n")
+            f.write(f"基础目录: {self.base_dir}\n")
+            f.write(f"日志目录: {self.log_base_dir}\n")
+            f.write(f"开始时间: {datetime.now()}\n")
+            f.write(f"总耗时: {stats['total_duration']:.2f} 秒\n\n")
+            
+            f.write("统计信息:\n")
+            f.write(f"  总任务数: {stats['total']}\n")
+            f.write(f"  成功: {stats['success']}\n")
+            f.write(f"  失败: {stats['failed']}\n\n")
+            
+            if stats['failed_tasks']:
+                f.write("失败的任务:\n")
+                for item in stats['failed_tasks']:
+                    f.write(f"  ✗ {item['pdf_dir']} / {item['processor']}\n")
+                    f.write(f"    错误: {item['error']}\n")
+                    f.write(f"    日志: {item['log']}\n\n")
+            
+            f.write("详细结果:\n")
+            for result in self.merge_results:
+                task = result['task']
+                status = "✓" if result['success'] else "✗"
+                f.write(f"{status} {task.vl_result_dir.parent.name} / {task.processor_name} ({result['duration']:.2f}s)\n")
+                f.write(f"   日志: {task.log_file}\n")
+                if result['error']:
+                    f.write(f"   错误: {result['error']}\n")
+        
+        self.logger.info(f"汇总日志已保存: {summary_log_file}")
+    
+    def batch_merge(
+        self,
+        pdf_list: List[str] = None,
+        processors: List[str] = None,
+        window: int = 15,
+        threshold: int = 85,
+        output_type: str = 'both',
+        dry_run: bool = False
+    ) -> Dict:
+        """
+        批量合并
+        
+        Returns:
+            统计信息字典
+        """
+        # 发现任务
+        tasks = self.discover_merge_tasks(pdf_list, processors)
+        
+        if not tasks:
+            self.logger.warning("⚠️  没有发现任何合并任务")
+            return {
+                'total': 0,
+                'success': 0,
+                'failed': 0,
+                'total_duration': 0,
+                'failed_tasks': []
+            }
+        
+        self.logger.info(f"\n🎯 发现 {len(tasks)} 个合并任务\n")
+        
+        # 显示任务列表
+        for i, task in enumerate(tasks, 1):
+            self.logger.info(f"{i}. {task.vl_result_dir.parent.name} / {task.processor_name}")
+        
+        # 确认执行
+        if not dry_run:
+            confirm = input(f"\n是否继续执行 {len(tasks)} 个合并任务? [Y/n]: ")
+            if confirm.lower() not in ['', 'y', 'yes']:
+                self.logger.info("❌ 已取消")
+                return {
+                    'total': 0,
+                    'success': 0,
+                    'failed': 0,
+                    'total_duration': 0,
+                    'failed_tasks': []
+                }
+        
+        # 执行任务
+        import time
+        batch_start_time = time.time()
+        success_count = 0
+        failed_count = 0
+        
+        with tqdm(total=len(tasks), desc="合并进度", unit="task") as pbar:
+            for task in tasks:
+                result = self.execute_merge_task(
+                    task,
+                    window=window,
+                    threshold=threshold,
+                    output_type=output_type,
+                    dry_run=dry_run
+                )
+                
+                self.merge_results.append(result)
+                
+                if result['success']:
+                    success_count += 1
+                else:
+                    failed_count += 1
+                
+                pbar.update(1)
+                pbar.set_postfix({
+                    'success': success_count,
+                    'failed': failed_count
+                })
+        
+        total_duration = time.time() - batch_start_time
+        
+        # 统计失败任务
+        failed_tasks = [
+            {
+                'pdf_dir': r['task'].vl_result_dir.parent.name,
+                'processor': r['task'].processor_name,
+                'error': r['error'],
+                'log': r['task'].log_file
+            }
+            for r in self.merge_results if not r['success']
+        ]
+        
+        # 统计信息
+        stats = {
+            'total': len(tasks),
+            'success': success_count,
+            'failed': failed_count,
+            'total_duration': total_duration,
+            'failed_tasks': failed_tasks
+        }
+        
+        # 🎯 保存汇总日志
+        self._save_summary_log(stats)
+        
+        # 打印总结
+        self.logger.info(f"\n{'='*80}")
+        self.logger.info("📊 合并完成")
+        self.logger.info(f"  总任务数: {stats['total']}")
+        self.logger.info(f"  ✅ 成功: {stats['success']}")
+        self.logger.info(f"  ❌ 失败: {stats['failed']}")
+        self.logger.info(f"  ⏱️  总耗时: {stats['total_duration']:.2f} 秒")
+        self.logger.info(f"{'='*80}")
+        
+        if failed_tasks:
+            self.logger.info(f"\n失败的任务:")
+            for item in failed_tasks:
+                self.logger.info(f"  ✗ {item['pdf_dir']} / {item['processor']}")
+                self.logger.info(f"    错误: {item['error']}")
+                self.logger.info(f"    日志: {item['log']}")
+        
+        return stats
+
+
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the batch-merge CLI."""
    usage_examples = """
示例用法:

  1. 合并配置文件中所有 VL 处理器的结果:
     python batch_merge_results.py

  2. 合并指定 PDF 的结果:
     python batch_merge_results.py -f pdf_list.txt

  3. 合并指定处理器的结果:
     python batch_merge_results.py -p paddleocr_vl_single_process -p mineru_vllm

  4. 自定义参数:
     python batch_merge_results.py -w 20 -t 90

  5. 模拟运行(不实际执行):
     python batch_merge_results.py --dry-run
        """
    parser = argparse.ArgumentParser(
        description='批量合并 OCR 结果(VL + PaddleOCR)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Configuration file location.
    parser.add_argument('-c', '--config',
                        default='processor_configs.yaml',
                        help='配置文件路径 (默认: processor_configs.yaml)')

    # PDF selection and processor selection.
    parser.add_argument('-d', '--base-dir',
                        help='PDF 基础目录(覆盖配置文件)')
    parser.add_argument('-f', '--file-list',
                        help='PDF 列表文件(每行一个 PDF 名称,不含扩展名)')
    parser.add_argument('-l', '--pdf-list', nargs='+',
                        help='PDF 名称列表(不含扩展名)')
    parser.add_argument('-p', '--processors', nargs='+',
                        help='处理器列表(不指定则自动检测所有 VL 处理器)')

    # Merge tuning parameters.
    parser.add_argument('-w', '--window', type=int, default=15,
                        help='查找窗口大小 (默认: 15)')
    parser.add_argument('-t', '--threshold', type=int, default=85,
                        help='相似度阈值 (默认: 85)')
    parser.add_argument('--output-type',
                        choices=['json', 'markdown', 'both'], default='both',
                        help='输出格式 (默认: both)')

    # Utility flags.
    parser.add_argument('--dry-run', action='store_true',
                        help='模拟运行,不实际执行')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='详细输出')

    return parser
+
+
+def main():
+    """主函数"""
+    parser = create_parser()
+    args = parser.parse_args()
+    
+    # 设置日志级别
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
+    # 读取 PDF 列表
+    pdf_list = None
+    if args.file_list:
+        pdf_list = []
+        with open(args.file_list, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    # 移除 .pdf 扩展名
+                    pdf_name = line.replace('.pdf', '')
+                    pdf_list.append(pdf_name)
+    elif args.pdf_list:
+        pdf_list = [p.replace('.pdf', '') for p in args.pdf_list]
+    
+    # 创建批量合并器
+    merger = BatchMerger(
+        config_file=args.config,
+        base_dir=args.base_dir
+    )
+    
+    # 执行批量合并
+    stats = merger.batch_merge(
+        pdf_list=pdf_list,
+        processors=args.processors,
+        window=args.window,
+        threshold=args.threshold,
+        output_type=args.output_type,
+        dry_run=args.dry_run
+    )
+    
+    return 0 if stats['failed'] == 0 else 1
+
+
if __name__ == '__main__':
    print("🚀 启动批量OCR bbox 合并程序...")

    # NOTE: `sys` is already imported at module level; the redundant
    # in-block `import sys` was removed.
    if len(sys.argv) == 1:
        # No CLI arguments: run with a default dry-run configuration.
        print("ℹ️  未提供命令行参数,使用默认配置运行...")

        default_config = {
            "file-list": "pdf_list.txt",
        }

        print("⚙️  默认参数:")
        for key, value in default_config.items():
            print(f"  --{key}: {value}")
        # Rebuild sys.argv so argparse picks up the defaults.
        sys.argv = [sys.argv[0]]
        for key, value in default_config.items():
            sys.argv.extend([f"--{key}", str(value)])
        sys.argv.append("--dry-run")
        sys.argv.append("--verbose")  # enable verbose output
    sys.exit(main())

+ 890 - 0
ocr_tools/ocr_batch/batch_process_pdf.py

@@ -0,0 +1,890 @@
+#!/usr/bin/env python3
+"""
+PDF 批量处理脚本
+支持多种处理器,配置文件驱动
+支持自动切换虚拟环境
+支持执行器输出日志重定向
+"""
+
+import os
+import sys
+import argparse
+import subprocess
+import json
+import yaml
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Optional, Any
+from dataclasses import dataclass, field
+import logging
+from tqdm import tqdm
+import time
+
+# ============================================================================
+# 数据类定义
+# ============================================================================
+
+@dataclass
+class ProcessorConfig:
+    """Static description of one OCR processor: which script to run and how."""
+    name: str  # processor key (as used in the config file)
+    script: str  # absolute path of the Python script to execute
+    input_arg: str = "--input_file"  # CLI flag used to pass the input PDF
+    output_arg: str = "--output_dir"  # CLI flag used to pass the output directory
+    extra_args: List[str] = field(default_factory=list)  # extra CLI flags, appended verbatim
+    output_subdir: str = "results"  # per-PDF subdirectory for result files
+    log_subdir: str = "logs"  # per-PDF subdirectory for execution logs
+    venv: Optional[str] = None  # shell command that activates the virtualenv (conda/source)
+    description: str = ""  # human-readable description
+
+
+@dataclass
+class ProcessResult:
+    """Outcome of processing a single PDF file."""
+    pdf_file: str  # path of the processed PDF
+    success: bool  # True when the processor script exited cleanly
+    duration: float  # wall-clock processing time in seconds
+    error_message: str = ""  # short error description on failure
+    log_file: str = ""  # path of the per-file execution log
+
+
+# ============================================================================
+# 配置管理
+# ============================================================================
+
+class ConfigManager:
+    """Configuration manager.
+
+    Loads processor definitions from a YAML/JSON file when one is given,
+    otherwise falls back to the built-in DEFAULT_CONFIG below.
+    """
+    
+    # Built-in defaults used when no config file is supplied or found.
+    DEFAULT_CONFIG = {
+        'processors': {
+            'paddleocr_vl_single_process': {
+                'script': '/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_vl_tool/main.py',
+                'input_arg': '--input',
+                'output_arg': '--output_dir',
+                'extra_args': [
+                    '--pipeline=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_common/config/PaddleOCR-VL-Client.yaml',
+                    '--no-adapter'
+                ],
+                'output_subdir': 'paddleocr_vl_results',
+                'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
+                'description': 'PaddleOCR-VL 处理器',
+                'log_subdir': 'logs/paddleocr_vl_single_process'  # per-PDF log subdirectory
+            },
+            'ppstructurev3_single_process': {
+                'script': '/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/ppstructure_tool/main.py',
+                'input_arg': '--input',
+                'output_arg': '--output_dir',
+                'extra_args': [
+                    '--pipeline=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_common/config/PP-StructureV3.yaml'
+                ],
+                'output_subdir': 'ppstructurev3_results',
+                'venv': 'conda activate paddle',
+                'description': 'PP-StructureV3 处理器',
+                'log_subdir': 'logs/ppstructurev3_single_process'  # per-PDF log subdirectory
+            },
+            'ppstructurev3_single_client': {
+                'script': '/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/ppstructure_tool/api_client.py',
+                'input_arg': '--input',
+                'output_arg': '--output_dir',
+                'extra_args': [
+                    '--api_url=http://10.192.72.11:8111/layout-parsing',
+                    '--timeout=300'
+                ],
+                'output_subdir': 'ppstructurev3_client_results',
+                'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
+                'description': 'PP-StructureV3 HTTP API 客户端',
+                'log_subdir': 'logs/ppstructurev3_single_client'  # per-PDF log subdirectory
+            },
+            'mineru_vllm': {
+                'script': '/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py',
+                'input_arg': '--input_file',
+                'output_arg': '--output_dir',
+                'extra_args': [
+                    '--server_url=http://10.192.72.11:8121',
+                    '--timeout=300',
+                    '--batch_size=1'
+                ],
+                'output_subdir': 'mineru_vllm_results',
+                'venv': 'conda activate mineru2',
+                'description': 'MinerU vLLM 处理器',
+                'log_subdir': 'logs/mineru_vllm'  # per-PDF log subdirectory
+            },
+            'dotsocr_vllm': {
+                'script': '/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py',
+                'input_arg': '--input_file',
+                'output_arg': '--output_dir',
+                'extra_args': [
+                    '--ip=10.192.72.11',
+                    '--port=8101',
+                    '--model_name=DotsOCR',
+                    '--prompt_mode=prompt_layout_all_en',
+                    '--batch_size=1',
+                    '--max_workers=1',
+                    '--dpi=200'
+                ],
+                'output_subdir': 'dotsocr_vllm_results',
+                'venv': 'conda activate py312',
+                'description': 'DotsOCR vLLM 处理器 - 支持PDF和图片',
+                'log_subdir': 'logs/dotsocr_vllm'  # per-PDF log subdirectory
+            }
+        },
+        'global': {
+            'base_dir': '/Users/zhch158/workspace/data/流水分析',
+            'output_subdir': 'results',
+            'log_dir': 'logs',
+            'log_retention_days': 30,
+            'log_level': 'INFO'
+        }
+    }
+    
+    def __init__(self, config_file: Optional[str] = None):
+        self.config_file = config_file
+        self.config = self._load_config()
+    
+    def _load_config(self) -> Dict:
+        """Load the config file (YAML or JSON by extension), or the defaults.
+
+        NOTE(review): yaml.safe_load returns None for an empty file, which
+        would break later dict lookups — confirm config files are non-empty.
+        """
+        if self.config_file and Path(self.config_file).exists():
+            with open(self.config_file, 'r', encoding='utf-8') as f:
+                if self.config_file.endswith('.yaml') or self.config_file.endswith('.yml'):
+                    return yaml.safe_load(f)
+                else:
+                    return json.load(f)
+        return self.DEFAULT_CONFIG.copy()
+    
+    def get_processor_config(self, processor_name: str) -> ProcessorConfig:
+        """Return the ProcessorConfig for *processor_name*.
+
+        Raises:
+            ValueError: when the processor is not defined in the config.
+        """
+        if processor_name not in self.config['processors']:
+            raise ValueError(f"处理器 '{processor_name}' 不存在")
+        
+        proc_config = self.config['processors'][processor_name]
+        return ProcessorConfig(
+            name=processor_name,
+            script=proc_config['script'],
+            input_arg=proc_config.get('input_arg', '--input_file'),
+            output_arg=proc_config.get('output_arg', '--output_dir'),
+            extra_args=proc_config.get('extra_args', []),
+            output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
+            log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'),  # default log subdirectory
+            venv=proc_config.get('venv'),
+            description=proc_config.get('description', '')
+        )
+    
+    def get_global_config(self, key: str, default=None):
+        """Return a value from the 'global' section, or *default* if absent."""
+        return self.config.get('global', {}).get(key, default)
+    
+    def list_processors(self) -> List[str]:
+        """Return the names of all configured processors."""
+        return list(self.config['processors'].keys())
+
+
+# ============================================================================
+# PDF 文件查找器
+# ============================================================================
+
+class PDFFileFinder:
+    """PDF 文件查找器"""
+    
+    def __init__(self, base_dir: str):
+        self.base_dir = Path(base_dir)
+    
+    def from_file_list(self, list_file: str) -> List[Path]:
+        """从文件列表读取"""
+        pdf_files = []
+        
+        with open(list_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                # 跳过空行和注释
+                line = line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                
+                # 构建完整路径
+                pdf_path = self._resolve_path(line)
+                if pdf_path:
+                    pdf_files.append(pdf_path)
+        
+        return pdf_files
+    
+    def from_list(self, pdf_list: List[str]) -> List[Path]:
+        """从列表读取"""
+        pdf_files = []
+        
+        for pdf in pdf_list:
+            pdf_path = self._resolve_path(pdf.strip())
+            if pdf_path:
+                pdf_files.append(pdf_path)
+        
+        return pdf_files
+    
+    def find_all(self) -> List[Path]:
+        """查找基础目录下所有 PDF"""
+        return sorted(self.base_dir.rglob('*.pdf'))
+    
+    def _resolve_path(self, path_str: str) -> Optional[Path]:
+        """解析路径"""
+        path = Path(path_str)
+        
+        # 绝对路径
+        if path.is_absolute():
+            return path if path.exists() else path  # 返回路径,即使不存在
+        
+        # 相对路径
+        # 1. 尝试完整相对路径
+        candidate1 = self.base_dir / path
+        if candidate1.exists():
+            return candidate1
+        
+        # 2. 尝试在同名子目录下查找
+        if '/' not in path_str:
+            pdf_name = path.stem
+            candidate2 = self.base_dir / pdf_name / path.name
+            if candidate2.exists():
+                return candidate2
+        
+        # 3. 使用 glob 搜索
+        matches = list(self.base_dir.rglob(path.name))
+        if matches:
+            return matches[0]
+        
+        # 返回候选路径(即使不存在)
+        return candidate1
+
+
+# ============================================================================
+# PDF 批处理器
+# ============================================================================
+
+class PDFBatchProcessor:
+    """PDF batch processor.
+
+    Runs the configured processor script once per PDF — optionally inside a
+    conda/virtualenv environment — redirecting each run's output into a
+    per-file log and collecting ProcessResult records for a summary report.
+    """
+    
+    def __init__(
+        self,
+        processor_config: ProcessorConfig,
+        output_subdir: Optional[str] = None,
+        log_base_dir: Optional[str] = None,  # base directory for the batch summary log
+        dry_run: bool = False
+    ):
+        self.processor_config = processor_config
+        # Use the explicit output_subdir when given, else the processor's default
+        self.output_subdir = output_subdir or processor_config.output_subdir
+        self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs')  # summary-log location
+        self.dry_run = dry_run
+        
+        # Console logger for progress messages
+        self.logger = self._setup_logger()
+        
+        # Per-file results, appended as processing proceeds
+        self.results: List[ProcessResult] = []
+    
+    def _setup_logger(self) -> logging.Logger:
+        """Create (once) a console logger for this processor."""
+        logger = logging.getLogger('PDFBatchProcessor')
+        logger.setLevel(logging.INFO)
+        
+        # Avoid adding duplicate handlers when instantiated more than once
+        if not logger.handlers:
+            # Console output
+            console_handler = logging.StreamHandler()
+            console_handler.setLevel(logging.INFO)
+            console_format = logging.Formatter(
+                '%(asctime)s - %(levelname)s - %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S'
+            )
+            console_handler.setFormatter(console_format)
+            logger.addHandler(console_handler)
+        
+        return logger
+    
+    def _get_log_file_path(self, pdf_file: Path) -> Path:
+        """
+        Build — and create the directory for — the per-file log path.
+        
+        Log layout:
+        base_dir/
+        └── <pdf name>/
+            └── logs/
+                └── processor_name/
+                    └── <pdf name>_YYYYMMDD_HHMMSS.log
+        """
+        # Per-PDF directory (sibling of the PDF, named after its stem)
+        pdf_dir = pdf_file.parent / pdf_file.stem
+        
+        # Log directory: pdf_dir / <processor log_subdir>
+        log_dir = pdf_dir / self.processor_config.log_subdir
+        log_dir.mkdir(parents=True, exist_ok=True)
+        
+        # Log file name: <pdf stem>_<timestamp>.log
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        log_file = log_dir / f"{pdf_file.stem}_{timestamp}.log"
+        
+        return log_file
+    
+    def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
+        """Process every PDF in order and return aggregate statistics."""
+        self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
+        self.logger.info(f"处理器: {self.processor_config.description}")
+        self.logger.info(f"脚本: {self.processor_config.script}")
+        self.logger.info(f"输出目录: {self.output_subdir}")
+        self.logger.info(f"日志目录: {self.processor_config.log_subdir}")
+        
+        if self.processor_config.venv:
+            self.logger.info(f"虚拟环境: {self.processor_config.venv}")
+        
+        start_time = time.time()
+        
+        # Drive processing with a progress bar
+        with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
+            for pdf_file in pdf_files:
+                result = self._process_single_file(pdf_file)
+                self.results.append(result)
+                pbar.update(1)
+                
+                # Show running success/failure counts in the bar postfix
+                success_count = sum(1 for r in self.results if r.success)
+                pbar.set_postfix({
+                    'success': success_count,
+                    'failed': len(self.results) - success_count
+                })
+        
+        total_duration = time.time() - start_time
+        
+        # Build the statistics dict and persist the summary log
+        stats = self._generate_stats(total_duration)
+        self._save_summary_log(stats)
+        
+        return stats
+    
+    def _process_single_file(self, pdf_file: Path) -> ProcessResult:
+        """Process one PDF, redirecting the child process output to a log file."""
+        self.logger.info(f"处理: {pdf_file}")
+        
+        # Skip files that do not exist
+        if not pdf_file.exists():
+            self.logger.warning(f"跳过: 文件不存在 - {pdf_file}")
+            return ProcessResult(
+                pdf_file=str(pdf_file),
+                success=False,
+                duration=0,
+                error_message="文件不存在"
+            )
+        
+        # Output directory: <pdf dir>/<pdf stem>/<output_subdir>
+        output_dir = pdf_file.parent / pdf_file.stem / self.output_subdir
+        
+        # Per-file log path (directory is created here, even in dry-run)
+        log_file = self._get_log_file_path(pdf_file)
+        
+        # Build the command (string when a venv is configured, list otherwise)
+        cmd = self._build_command(pdf_file, output_dir)
+        
+        self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
+        self.logger.info(f"日志输出: {log_file}")
+        
+        if self.dry_run:
+            self.logger.info(f"[DRY RUN] 将执行: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
+            return ProcessResult(
+                pdf_file=str(pdf_file),
+                success=True,
+                duration=0,
+                error_message="",
+                log_file=str(log_file)
+            )
+        
+        # Run the command with stdout/stderr redirected into the log file
+        start_time = time.time()
+        try:
+            with open(log_file, 'w', encoding='utf-8') as log_f:
+                # Write the log header
+                log_f.write(f"{'='*80}\n")
+                log_f.write(f"处理器: {self.processor_config.description}\n")
+                log_f.write(f"PDF 文件: {pdf_file}\n")
+                log_f.write(f"输出目录: {output_dir}\n")
+                log_f.write(f"开始时间: {datetime.now()}\n")
+                log_f.write(f"{'='*80}\n\n")
+                log_f.flush()
+                
+                # Execute; check=True raises CalledProcessError on非零退出
+                if isinstance(cmd, str):
+                    result = subprocess.run(
+                        cmd,
+                        shell=True,
+                        executable='/bin/bash',
+                        stdout=log_f,  # redirect stdout to the log file
+                        stderr=subprocess.STDOUT,  # merge stderr into stdout
+                        text=True,
+                        check=True
+                    )
+                else:
+                    result = subprocess.run(
+                        cmd,
+                        stdout=log_f,  # redirect stdout to the log file
+                        stderr=subprocess.STDOUT,  # merge stderr into stdout
+                        text=True,
+                        check=True
+                    )
+                
+                # Write the log footer
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 成功\n")
+                log_f.write(f"{'='*80}\n")
+            
+            duration = time.time() - start_time
+            self.logger.info(f"✓ 成功 (耗时: {duration:.2f}秒)")
+            
+            return ProcessResult(
+                pdf_file=str(pdf_file),
+                success=True,
+                duration=duration,
+                error_message="",
+                log_file=str(log_file)
+            )
+            
+        except subprocess.CalledProcessError as e:
+            duration = time.time() - start_time
+            error_msg = f"命令执行失败 (退出码: {e.returncode})"
+            
+            # Append the failure footer to the log file
+            with open(log_file, 'a', encoding='utf-8') as log_f:
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 失败\n")
+                log_f.write(f"错误: {error_msg}\n")
+                log_f.write(f"{'='*80}\n")
+            
+            self.logger.error(f"✗ 失败 (耗时: {duration:.2f}秒)")
+            self.logger.error(f"错误信息: {error_msg}")
+            self.logger.error(f"详细日志: {log_file}")
+            
+            return ProcessResult(
+                pdf_file=str(pdf_file),
+                success=False,
+                duration=duration,
+                error_message=error_msg,
+                log_file=str(log_file)
+            )
+        except Exception as e:
+            # Any other failure (e.g. OSError opening the log) is recorded too
+            duration = time.time() - start_time
+            error_msg = str(e)
+            
+            with open(log_file, 'a', encoding='utf-8') as log_f:
+                log_f.write(f"\n{'='*80}\n")
+                log_f.write(f"结束时间: {datetime.now()}\n")
+                log_f.write(f"状态: 异常\n")
+                log_f.write(f"错误: {error_msg}\n")
+                log_f.write(f"{'='*80}\n")
+            
+            self.logger.error(f"✗ 异常 (耗时: {duration:.2f}秒)")
+            self.logger.error(f"错误信息: {error_msg}")
+            
+            return ProcessResult(
+                pdf_file=str(pdf_file),
+                success=False,
+                duration=duration,
+                error_message=error_msg,
+                log_file=str(log_file)
+            )
+    
+    def _build_command(self, pdf_file: Path, output_dir: Path):
+        """Build the command to execute.
+        
+        Returns:
+            A bash command string when a venv is configured (so activation
+            can run in the same shell); otherwise an argv list.
+        """
+        # Base Python invocation
+        base_cmd = [
+            'python',  # resolved inside the activated virtualenv
+            self.processor_config.script,
+            self.processor_config.input_arg, str(pdf_file),
+            self.processor_config.output_arg, str(output_dir)
+        ]
+        
+        # Append processor-specific extra arguments
+        base_cmd.extend(self.processor_config.extra_args)
+        
+        # With a venv configured the whole thing becomes one shell command
+        if self.processor_config.venv:
+            # Quote arguments containing shell-significant characters
+            escaped_cmd = []
+            for arg in base_cmd:
+                if ' ' in arg or '"' in arg or "'" in arg:
+                    # Wrap in single quotes; escape embedded single quotes
+                    arg = arg.replace("'", "'\\''")
+                    escaped_cmd.append(f"'{arg}'")
+                else:
+                    escaped_cmd.append(arg)
+            
+            python_cmd = ' '.join(escaped_cmd)
+            
+            # conda needs its shell hook evaluated before `conda activate`
+            if 'conda activate' in self.processor_config.venv:
+                # Initialize conda for this non-interactive bash shell
+                conda_init = """
+eval "$(conda shell.bash hook)"
+                """.strip()
+                
+                shell_cmd = f"{conda_init} && {self.processor_config.venv} && {python_cmd}"
+            else:
+                # `source`-style virtualenv activation
+                shell_cmd = f"{self.processor_config.venv} && {python_cmd}"
+            
+            return shell_cmd
+        else:
+            # No venv: run with the current interpreter as an argv list
+            base_cmd[0] = sys.executable
+            return base_cmd
+    
+    def _generate_stats(self, total_duration: float) -> Dict[str, Any]:
+        """Aggregate the per-file results into a statistics dict."""
+        success_count = sum(1 for r in self.results if r.success)
+        failed_count = len(self.results) - success_count
+        
+        failed_files = [
+            {
+                'file': r.pdf_file,
+                'error': r.error_message,
+                'log': r.log_file
+            }
+            for r in self.results if not r.success
+        ]
+        
+        stats = {
+            'total': len(self.results),
+            'success': success_count,
+            'failed': failed_count,
+            'total_duration': total_duration,
+            'failed_files': failed_files,
+            'results': [
+                {
+                    'file': r.pdf_file,
+                    'success': r.success,
+                    'duration': r.duration,
+                    'error': r.error_message,
+                    'log': r.log_file
+                }
+                for r in self.results
+            ]
+        }
+        
+        return stats
+    
+    def _save_summary_log(self, stats: Dict[str, Any]):
+        """Write the batch summary log under log_base_dir.
+
+        NOTE(review): the "开始时间" line below is written at summary time,
+        so it records the end of the batch, not its start — confirm intent.
+        """
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        summary_log_file = self.log_base_dir / f"batch_summary_{self.processor_config.name}_{timestamp}.log"
+        
+        # Ensure the summary-log directory exists
+        summary_log_file.parent.mkdir(parents=True, exist_ok=True)
+        
+        with open(summary_log_file, 'w', encoding='utf-8') as f:
+            f.write("PDF 批量处理汇总日志\n")
+            f.write("=" * 80 + "\n\n")
+            
+            f.write(f"处理器: {self.processor_config.description}\n")
+            f.write(f"处理器名称: {self.processor_config.name}\n")
+            f.write(f"脚本: {self.processor_config.script}\n")
+            f.write(f"输出目录: {self.output_subdir}\n")
+            f.write(f"日志目录: {self.processor_config.log_subdir}\n")
+            
+            if self.processor_config.venv:
+                f.write(f"虚拟环境: {self.processor_config.venv}\n")
+            
+            f.write(f"开始时间: {datetime.now()}\n")
+            f.write(f"总耗时: {stats['total_duration']:.2f} 秒\n\n")
+            
+            f.write("统计信息:\n")
+            f.write(f"  总文件数: {stats['total']}\n")
+            f.write(f"  成功: {stats['success']}\n")
+            f.write(f"  失败: {stats['failed']}\n\n")
+            
+            if stats['failed_files']:
+                f.write("失败的文件:\n")
+                for item in stats['failed_files']:
+                    f.write(f"  ✗ {item['file']}\n")
+                    f.write(f"    错误: {item['error']}\n")
+                    f.write(f"    日志: {item['log']}\n\n")
+            
+            f.write("详细结果:\n")
+            for result in stats['results']:
+                status = "✓" if result['success'] else "✗"
+                f.write(f"{status} {result['file']} ({result['duration']:.2f}s)\n")
+                f.write(f"   日志: {result['log']}\n")
+                if result['error']:
+                    f.write(f"   错误: {result['error']}\n")
+        
+        self.logger.info(f"汇总日志已保存: {summary_log_file}")
+
+
+# ============================================================================
+# 命令行接口
+# ============================================================================
+
+def create_parser() -> argparse.ArgumentParser:
+    """Build the argument parser for the batch-processing CLI."""
+    parser = argparse.ArgumentParser(
+        description='PDF 批量处理工具 (支持虚拟环境自动切换)',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例用法:
+
+  1. 使用配置文件中的处理器 (自动切换虚拟环境):
+     python batch_process_pdf.py -p paddleocr_vl_single_process -f pdf_list.txt
+
+  2. 使用 DotsOCR 处理器 (自动切换到 py312 环境):
+     python batch_process_pdf.py -p dotsocr_vllm -f pdf_list.txt
+
+  3. 使用 MinerU 处理器 (自动切换到 mineru2 环境):
+     python batch_process_pdf.py -p mineru_vllm -f pdf_list.txt
+
+  4. 处理指定目录下所有 PDF:
+     python batch_process_pdf.py -p ppstructurev3_single_client -d /path/to/pdfs
+
+  5. 列出所有可用的处理器:
+     python batch_process_pdf.py --list-processors
+
+  6. 手动指定虚拟环境:
+     python batch_process_pdf.py -p paddleocr_vl_single_process -f pdf_list.txt --venv "conda activate paddle"
+        """
+    )
+    
+    # Processor selection
+    parser.add_argument(
+        '-p', '--processor',
+        help='处理器名称'
+    )
+    
+    # Configuration file
+    parser.add_argument(
+        '-c', '--config',
+        default='processor_configs.yaml',
+        help='配置文件路径 (默认: processor_configs.yaml)'
+    )
+    
+    # Manually specified script (bypasses the processor catalog)
+    parser.add_argument(
+        '-s', '--script',
+        help='Python 脚本路径 (覆盖配置文件)'
+    )
+    
+    # Directories and file lists
+    parser.add_argument(
+        '-d', '--base-dir',
+        help='PDF 文件基础目录'
+    )
+    
+    parser.add_argument(
+        '-o', '--output-subdir',
+        help='输出子目录名称 (覆盖处理器默认配置)'
+    )
+    
+    parser.add_argument(
+        '-f', '--file-list',
+        help='PDF 文件列表文件路径'
+    )
+    
+    parser.add_argument(
+        '-l', '--pdf-list',
+        nargs='+',
+        help='PDF 文件列表 (空格分隔)'
+    )
+    
+    # Extra arguments forwarded to the processor script
+    parser.add_argument(
+        '-e', '--extra-args',
+        help='额外参数 (覆盖配置文件)'
+    )
+    
+    # Virtualenv activation override
+    parser.add_argument(
+        '--venv',
+        help='虚拟环境激活命令 (覆盖配置文件)'
+    )
+    
+    # Utility options
+    parser.add_argument(
+        '--list-processors',
+        action='store_true',
+        help='列出所有可用的处理器'
+    )
+    
+    parser.add_argument(
+        '--show-config',
+        action='store_true',
+        help='显示配置文件内容'
+    )
+    
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='模拟运行,不实际执行'
+    )
+    
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='详细输出'
+    )
+    
+    return parser
+
+
+def main():
+    """主函数"""
+    parser = create_parser()
+    args = parser.parse_args()
+    
+    # 设置日志级别
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+    
+    # 加载配置
+    config_manager = ConfigManager(args.config if Path(args.config).exists() else None)
+    
+    # 列出处理器
+    if args.list_processors:
+        print("可用的处理器:")
+        for name in config_manager.list_processors():
+            proc_config = config_manager.get_processor_config(name)
+            print(f"  • {name}")
+            print(f"    描述: {proc_config.description}")
+            print(f"    脚本: {proc_config.script}")
+            print(f"    输出目录: {proc_config.output_subdir}")
+            if proc_config.venv:
+                print(f"    虚拟环境: {proc_config.venv}")
+            print()
+        return 0
+    
+    # 显示配置
+    if args.show_config:
+        print(yaml.dump(config_manager.config, allow_unicode=True))
+        return 0
+    
+    # 获取处理器配置
+    if args.processor:
+        processor_config = config_manager.get_processor_config(args.processor)
+    elif args.script:
+        # 手动指定脚本
+        processor_config = ProcessorConfig(
+            name='manual',
+            script=args.script,
+            extra_args=args.extra_args.split() if args.extra_args else [],
+            output_subdir=args.output_subdir or 'manual_results',
+            venv=args.venv
+        )
+    else:
+        parser.error("必须指定 -p 或 -s 参数")
+    
+    # 覆盖额外参数
+    if args.extra_args and args.processor:
+        processor_config.extra_args = args.extra_args.split()
+    
+    # 覆盖虚拟环境
+    if args.venv:
+        processor_config.venv = args.venv
+    
+    # 获取基础目录
+    base_dir = args.base_dir or config_manager.get_global_config('base_dir')
+    if not base_dir:
+        parser.error("必须指定 -d 参数或在配置文件中设置 base_dir")
+    log_base_dir = base_dir + '/' + config_manager.get_global_config('log_dir', 'logs')
+    
+    # 查找 PDF 文件
+    finder = PDFFileFinder(base_dir)
+    
+    if args.file_list:
+        pdf_files = finder.from_file_list(args.file_list)
+    elif args.pdf_list:
+        pdf_files = finder.from_list(args.pdf_list)
+    else:
+        pdf_files = finder.find_all()
+    
+    if not pdf_files:
+        print("❌ 未找到任何 PDF 文件")
+        return 1
+    
+    # 显示找到的文件
+    valid_file_paths = [f.as_posix() for f in pdf_files if f.exists()]
+    if valid_file_paths:
+        print("\n".join(valid_file_paths))    
+
+    # 验证文件
+    valid_files = [f for f in pdf_files if f.exists()]
+    invalid_files = [f for f in pdf_files if not f.exists()]
+    
+    if invalid_files:
+        print(f"\n⚠️  警告: {len(invalid_files)} 个文件不存在:")
+        for f in invalid_files[:5]:
+            print(f"  - {f}")
+        if len(invalid_files) > 5:
+            print(f"  ... 还有 {len(invalid_files) - 5} 个")
+    
+    # 确认执行
+    if not args.dry_run and valid_files:
+        venv_info = f" (虚拟环境: {processor_config.venv})" if processor_config.venv else ""
+        confirm = input(f"\n是否继续处理 {len(valid_files)} 个文件{venv_info}? [Y/n]: ")
+        if confirm.lower() not in ['', 'y', 'yes']:
+            print("已取消")
+            return 0
+    
+    # 批量处理
+    processor = PDFBatchProcessor(
+        processor_config=processor_config,
+        output_subdir=args.output_subdir,
+        log_base_dir=log_base_dir,  # 🎯 传递日志目录
+        dry_run=args.dry_run
+    )
+    
+    stats = processor.process_files(valid_files)
+    
+    # 显示统计信息
+    print("\n" + "=" * 80)
+    print("处理完成")
+    print("=" * 80)
+    print(f"\n📊 统计信息:")
+    print(f"  处理器: {processor_config.description}")
+    print(f"  输出目录: {processor.output_subdir}")
+    print(f"  日志目录: {processor.processor_config.log_subdir}")
+    print(f"  总文件数: {stats['total']}")
+    print(f"  ✓ 成功: {stats['success']}")
+    print(f"  ✗ 失败: {stats['failed']}")
+    print(f"  ⏱️  总耗时: {stats['total_duration']:.2f} 秒")
+    
+    if stats['failed_files']:
+        print(f"\n失败的文件:")
+        for item in stats['failed_files']:
+            print(f"  ✗ {item['file']}")
+            print(f"    错误: {item['error']}")
+            print(f"    日志: {item['log']}")
+    
+    return 0 if stats['failed'] == 0 else 1
+
+
+if __name__ == '__main__':
+    print("🚀 启动批量OCR程序...")
+    
+    import sys
+    
+    if len(sys.argv) == 1:
+        # No CLI arguments given: run with a built-in default configuration
+        print("ℹ️  未提供命令行参数,使用默认配置运行...")
+        
+        # Default configuration (keys are long option names without '--')
+        default_config = {
+            "processor": "mineru_vllm",
+            "file-list": "pdf_list.txt",
+        }
+        
+        print("⚙️  默认参数:")
+        for key, value in default_config.items():
+            print(f"  --{key}: {value}")
+        # Rebuild sys.argv so argparse in main() sees the defaults
+        sys.argv = [sys.argv[0]]
+        for key, value in default_config.items():
+            sys.argv.extend([f"--{key}", str(value)])
+        sys.argv.append("--dry-run")
+        sys.argv.append("--verbose")  # also enable verbose output
+
+    sys.exit(main())

+ 8 - 0
ocr_tools/ocr_batch/pdf_list.txt

@@ -0,0 +1,8 @@
+德_内蒙古银行照.pdf
+对公_招商银行图.pdf
+A用户_单元格扫描流水.pdf
+B用户_扫描流水.pdf
+康强_北京农村商业银行.pdf
+施博深.pdf
+山西云集科技有限公司.pdf
+2023年度报告母公司.pdf

+ 151 - 0
ocr_tools/ocr_batch/processor_configs.yaml

@@ -0,0 +1,151 @@
+# ============================================================================
+# PDF 批量处理器配置文件
+# ============================================================================
+
+# 处理器定义
+processors:
+  # -------------------------------------------------------------------------
+  # YUSYS 统一OCR 处理器
+  # 基于 universal_doc_parser 统一框架的批量处理(银行流水解析)
+  # -------------------------------------------------------------------------
+  yusys_ocr:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/main_v2.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--config=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v2.yaml"
+      - "--pages=1-35"
+      - "--streaming"
+      - "--debug"
+      - "--log_level=DEBUG"
+    output_subdir: "bank_statement_yusys_v2"
+    log_subdir: "logs/bank_statement_yusys_v2"
+    venv: "conda activate mineru2"
+    description: "YUSYS统一OCR框架"
+
+  # -------------------------------------------------------------------------
+  # PaddleOCR-VL 处理器
+  # -------------------------------------------------------------------------
+  paddleocr_vl_single_process:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_vl_tool/main.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--pipeline=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_common/config/PaddleOCR-VL-Client.yaml"
+      - "--device=cpu"
+      # - "--no-adapter"
+    output_subdir: "paddleocr_vl_results"
+    log_subdir: "logs/paddleocr_vl"  # 🎯 新增:日志子目录
+    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
+    description: "PaddleOCR-VL 处理器 - 视觉语言模型OCR"
+
+  # -------------------------------------------------------------------------
+  # PP-StructureV3 本地处理器
+  # -------------------------------------------------------------------------
+  ppstructurev3_single_process:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/ppstructure_tool/main.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--pipeline=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_common/config/PP-StructureV3-zhch.yaml"
+      - "--device=cpu"
+    output_subdir: "ppstructurev3_results"
+    log_subdir: "logs/ppstructurev3"
+    venv: "conda activate paddle"
+    description: "PP-StructureV3 处理器 - 本地处理"
+
+  ppstructurev3_gpu:
+    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
+    input_arg: "--input_file"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
+    output_subdir: "ppstructurev3_gpu_results"
+    log_subdir: "logs/ppstructurev3_gpu"
+    venv: "conda activate paddle"
+    description: "PP-StructureV3 处理器 - GPU加速"
+
+  # -------------------------------------------------------------------------
+  # PP-StructureV3 CPU 处理器
+  # 明确使用 CPU 处理
+  # -------------------------------------------------------------------------
+  ppstructurev3_cpu:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/ppstructure_tool/main.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--pipeline=/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/paddle_common/config/PP-StructureV3-zhch.yaml"
+      - "--device=cpu"
+    output_subdir: "ppstructurev3_cpu_results"
+    log_subdir: "logs/ppstructurev3_cpu"
+    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
+    description: "PP-StructureV3 处理器 - CPU处理"
+
+  # -------------------------------------------------------------------------
+  # PP-StructureV3 API 客户端 (默认)
+  # 通过 HTTP API 调用远程服务
+  # -------------------------------------------------------------------------
+  ppstructurev3_single_client:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/ppstructure_tool/api_client.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--api_url=http://10.192.72.11:20026/layout-parsing"
+      - "--timeout=300"
+    output_subdir: "ppstructurev3_client_results"
+    log_subdir: "logs/ppstructurev3_client"
+    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
+    description: "PP-StructureV3 HTTP API 客户端 - 远程服务"
+
+  # -------------------------------------------------------------------------
+  # MinerU vLLM 处理器
+  # 基于 MinerU 的多线程批量处理(支持 PDF 和图片)
+  # -------------------------------------------------------------------------
+  mineru_vllm:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/mineru_vl_tool/main.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--server_url=http://10.192.72.11:20006"
+      - "--timeout=300"
+      - "--batch_size=1"
+    output_subdir: "mineru_vllm_results"
+    log_subdir: "logs/mineru_vllm"
+    venv: "conda activate mineru2"
+    description: "MinerU vLLM 处理器 - 支持PDF和图片"
+
+  # -------------------------------------------------------------------------
+  # DotsOCR vLLM 处理器
+  # 基于 DotsOCR 的批量处理(支持 PDF 和图片)
+  # -------------------------------------------------------------------------
+  dotsocr_vllm:
+    script: "/Users/zhch158/workspace/repository.git/ocr_platform/ocr_tools/dotsocr_vl_tool/main.py"
+    input_arg: "--input"
+    output_arg: "--output_dir"
+    extra_args:
+      - "--ip=10.192.72.11"
+      - "--port=8101"
+      - "--model_name=DotsOCR"
+      - "--prompt_mode=prompt_layout_all_en"
+      - "--batch_size=1"
+      - "--max_workers=1"
+      - "--dpi=200"
+    output_subdir: "dotsocr_vllm_results"
+    log_subdir: "logs/dotsocr_vllm"
+    venv: "conda activate py312"
+    description: "DotsOCR vLLM 处理器 - 支持PDF和图片"
+
+# ============================================================================
+# 全局配置
+# ============================================================================
+global:
+  # PDF 文件基础目录
+  base_dir: "/Users/zhch158/workspace/data/流水分析"
+  
+  # 默认输出子目录名称(如果处理器未指定)
+  output_subdir: "results"
+  
+  # 🎯 新增:全局日志配置
+  log_dir: "logs"  # 全局日志目录(相对于 base_dir)
+  log_retention_days: 30  # 日志保留天数
+  log_level: "INFO"  # 日志级别: DEBUG, INFO, WARNING, ERROR