| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890 |
- #!/usr/bin/env python3
- """
- PDF 批量处理脚本
- 支持多种处理器,配置文件驱动
- 支持自动切换虚拟环境
- 支持执行器输出日志重定向
- """
- import os
- import sys
- import argparse
- import subprocess
- import json
- import yaml
- from pathlib import Path
- from datetime import datetime
- from typing import List, Dict, Optional, Any
- from dataclasses import dataclass, field
- import logging
- from tqdm import tqdm
- import time
- # ============================================================================
- # 数据类定义
- # ============================================================================
@dataclass
class ProcessorConfig:
    """Describe how one external processor script is invoked."""

    # Identifier of the processor.
    name: str
    # Path of the Python script that does the actual processing.
    script: str
    # Command-line flag that precedes the input file path.
    input_arg: str = "--input_file"
    # Command-line flag that precedes the output directory path.
    output_arg: str = "--output_dir"
    # Additional arguments appended verbatim to the command line.
    extra_args: List[str] = field(default_factory=list)
    # Name of the directory that receives the processor output.
    output_subdir: str = "results"
    # Name of the directory that receives the per-file execution logs.
    log_subdir: str = "logs"
    # Optional shell command that activates a virtual environment.
    venv: Optional[str] = None
    # Human-readable description of the processor.
    description: str = ""
@dataclass
class ProcessResult:
    """Outcome of running the processor on a single PDF file."""

    # Path of the processed PDF, stored as a string.
    pdf_file: str
    # True when the file was handled without error.
    success: bool
    # Elapsed time in seconds.
    duration: float
    # Failure description; empty when there was no error.
    error_message: str = ""
    # Path of the log file holding the processor output; empty when none.
    log_file: str = ""
- # ============================================================================
- # 配置管理
- # ============================================================================
class ConfigManager:
    """Load processor definitions and global settings.

    The configuration comes from a YAML or JSON file when one is given and
    exists; otherwise a private copy of ``DEFAULT_CONFIG`` is used.
    """

    DEFAULT_CONFIG = {
        'processors': {
            'paddleocr_vl_single_process': {
                'script': '/Users/zhch158/workspace/repository.git/PaddleX/zhch/paddleocr_vl_single_process.py',
                'input_arg': '--input_file',
                'output_arg': '--output_dir',
                'extra_args': [
                    '--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml',
                    '--no-adapter'
                ],
                'output_subdir': 'paddleocr_vl_results',
                'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
                'description': 'PaddleOCR-VL 处理器',
                'log_subdir': 'logs/paddleocr_vl_single_process'
            },
            'ppstructurev3_single_process': {
                'script': '/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_process.py',
                'input_arg': '--input_file',
                'output_arg': '--output_dir',
                'extra_args': [
                    '--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PP-StructureV3.yaml'
                ],
                'output_subdir': 'ppstructurev3_results',
                'venv': 'conda activate paddle',
                'description': 'PP-StructureV3 处理器',
                'log_subdir': 'logs/ppstructurev3_single_process'
            },
            'ppstructurev3_single_client': {
                'script': '/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_client.py',
                'input_arg': '--input_file',
                'output_arg': '--output_dir',
                'extra_args': [
                    '--api_url=http://10.192.72.11:8111/layout-parsing',
                    '--timeout=300'
                ],
                'output_subdir': 'ppstructurev3_client_results',
                'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
                'description': 'PP-StructureV3 HTTP API 客户端',
                'log_subdir': 'logs/ppstructurev3_single_client'
            },
            'mineru_vllm': {
                'script': '/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py',
                'input_arg': '--input_file',
                'output_arg': '--output_dir',
                'extra_args': [
                    '--server_url=http://10.192.72.11:8121',
                    '--timeout=300',
                    '--batch_size=1'
                ],
                'output_subdir': 'mineru_vllm_results',
                'venv': 'conda activate mineru2',
                'description': 'MinerU vLLM 处理器',
                'log_subdir': 'logs/mineru_vllm'
            },
            'dotsocr_vllm': {
                'script': '/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py',
                'input_arg': '--input_file',
                'output_arg': '--output_dir',
                'extra_args': [
                    '--ip=10.192.72.11',
                    '--port=8101',
                    '--model_name=DotsOCR',
                    '--prompt_mode=prompt_layout_all_en',
                    '--batch_size=1',
                    '--max_workers=1',
                    '--dpi=200'
                ],
                'output_subdir': 'dotsocr_vllm_results',
                'venv': 'conda activate py312',
                'description': 'DotsOCR vLLM 处理器 - 支持PDF和图片',
                'log_subdir': 'logs/dotsocr_vllm'
            }
        },
        'global': {
            'base_dir': '/Users/zhch158/workspace/data/流水分析',
            'output_subdir': 'results',
            'log_dir': 'logs',
            'log_retention_days': 30,
            'log_level': 'INFO'
        }
    }

    def __init__(self, config_file: Optional[str] = None):
        """Remember *config_file* and load the configuration immediately."""
        self.config_file = config_file
        self.config = self._load_config()

    def _load_config(self) -> Dict:
        """Return the configuration mapping.

        A file ending in ``.yaml``/``.yml`` (any letter case) is parsed as
        YAML, anything else as JSON. When no file is given or it does not
        exist, a deep copy of ``DEFAULT_CONFIG`` is returned.

        Raises:
            ValueError: if the file parses to something other than a mapping
                (for example an empty YAML file, which parses to ``None``).
        """
        import copy  # local import: only this method needs it

        if self.config_file and Path(self.config_file).exists():
            suffix = Path(self.config_file).suffix.lower()
            with open(self.config_file, 'r', encoding='utf-8') as f:
                if suffix in ('.yaml', '.yml'):
                    loaded = yaml.safe_load(f)
                else:
                    loaded = json.load(f)
            # Fail here with a clear message instead of an opaque TypeError
            # at the first lookup.
            if not isinstance(loaded, dict):
                raise ValueError(
                    f"配置文件内容无效 (顶层必须是字典): {self.config_file}"
                )
            return loaded

        # Deep copy: a shallow copy would share the nested dicts and lists,
        # so edits to the loaded config would leak into the class defaults.
        return copy.deepcopy(self.DEFAULT_CONFIG)

    def _processors(self) -> Dict:
        """Return the ``processors`` section, or an empty dict if missing/null."""
        return self.config.get('processors') or {}

    def get_processor_config(self, processor_name: str) -> "ProcessorConfig":
        """Build the ``ProcessorConfig`` for *processor_name*.

        Optional keys fall back to defaults derived from the processor name.

        Raises:
            ValueError: if the processor is not defined in the configuration.
            KeyError: if the processor entry has no ``script`` key.
        """
        processors = self._processors()
        if processor_name not in processors:
            raise ValueError(f"处理器 '{processor_name}' 不存在")

        proc_config = processors[processor_name]
        return ProcessorConfig(
            name=processor_name,
            script=proc_config['script'],
            input_arg=proc_config.get('input_arg', '--input_file'),
            output_arg=proc_config.get('output_arg', '--output_dir'),
            # Copy the list so later edits to the ProcessorConfig never
            # touch the loaded configuration; tolerate an explicit null.
            extra_args=list(proc_config.get('extra_args') or []),
            output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
            log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'),
            venv=proc_config.get('venv'),
            description=proc_config.get('description', '')
        )

    def get_global_config(self, key: str, default=None):
        """Return ``global[key]`` from the configuration, or *default*."""
        return (self.config.get('global') or {}).get(key, default)

    def list_processors(self) -> List[str]:
        """Return the names of all configured processors."""
        return list(self._processors())
- # ============================================================================
- # PDF 文件查找器
- # ============================================================================
class PDFFileFinder:
    """Locate PDF files relative to a base directory."""

    def __init__(self, base_dir: str):
        """Store *base_dir* as the root used to resolve relative paths."""
        self.base_dir = Path(base_dir)

    def from_file_list(self, list_file: str) -> List[Path]:
        """Resolve every entry of a text file listing one PDF per line.

        Blank lines and lines starting with ``#`` are skipped. The file is
        read as UTF-8 and a leading byte-order mark is tolerated.
        """
        # utf-8-sig strips a BOM that would otherwise stick to the first entry.
        with open(list_file, 'r', encoding='utf-8-sig') as f:
            entries = [line.strip() for line in f]
        return self.from_list(
            [entry for entry in entries if not entry.startswith('#')]
        )

    def from_list(self, pdf_list: List[str]) -> List[Path]:
        """Resolve every entry of *pdf_list*, ignoring blank entries."""
        pdf_files = []
        for entry in pdf_list:
            entry = entry.strip()
            # An empty entry would resolve to base_dir itself or make
            # rglob('') raise, so drop it up front.
            if not entry:
                continue
            pdf_path = self._resolve_path(entry)
            if pdf_path:
                pdf_files.append(pdf_path)
        return pdf_files

    def find_all(self) -> List[Path]:
        """Return every PDF file under the base directory, sorted.

        The ``.pdf`` extension is matched case-insensitively and directories
        are excluded.
        """
        return sorted(
            candidate
            for candidate in self.base_dir.rglob('*')
            if candidate.suffix.lower() == '.pdf' and candidate.is_file()
        )

    def _resolve_path(self, path_str: str) -> Optional[Path]:
        """Map *path_str* to a path, preferring locations that exist.

        Absolute paths are returned unchanged. Relative paths are tried, in
        order, as ``base_dir/path``, as ``base_dir/<stem>/<name>`` (bare file
        names only), and finally by a recursive search for the file name
        below ``base_dir``. If nothing exists, ``base_dir/path`` is returned
        so that the caller can report the missing file.
        """
        import glob  # local import: only needed to escape the search pattern

        path = Path(path_str)

        # Absolute paths are returned as given, whether or not they exist.
        if path.is_absolute():
            return path

        # 1. The path taken relative to the base directory.
        candidate1 = self.base_dir / path
        if candidate1.exists():
            return candidate1

        # 2. A bare file name inside a sub-directory named after its stem.
        if '/' not in path_str:
            candidate2 = self.base_dir / path.stem / path.name
            if candidate2.exists():
                return candidate2

        # 3. Recursive search by file name. The name is escaped so that
        #    characters such as '[' or '*' are matched literally, and only
        #    the first hit is pulled from the generator.
        first_match = next(self.base_dir.rglob(glob.escape(path.name)), None)
        if first_match is not None:
            return first_match

        # Nothing found: return the primary candidate even though it is missing.
        return candidate1
- # ============================================================================
- # PDF 批处理器
- # ============================================================================
class PDFBatchProcessor:
    """Run one configured processor script over a batch of PDF files.

    Each PDF is handled by a separate subprocess whose stdout and stderr are
    redirected to a per-file log. After the batch, a summary log is written
    below ``log_base_dir``.
    """

    def __init__(
        self,
        processor_config: "ProcessorConfig",
        output_subdir: Optional[str] = None,
        log_base_dir: Optional[str] = None,
        dry_run: bool = False
    ):
        """Create a batch processor.

        Args:
            processor_config: Description of the script to run.
            output_subdir: Output directory name; falls back to the value in
                *processor_config* when empty.
            log_base_dir: Directory for the batch summary log; defaults to
                ``logs`` relative to the working directory.
            dry_run: When true, commands are only logged, never executed.
        """
        self.processor_config = processor_config
        self.output_subdir = output_subdir or processor_config.output_subdir
        self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs')
        self.dry_run = dry_run

        self.logger = self._setup_logger()

        # Results of the most recent process_files() call.
        self.results: List["ProcessResult"] = []
        # Wall-clock start of the most recent batch, for the summary log.
        self._batch_started_at: Optional[datetime] = None

    def _setup_logger(self) -> logging.Logger:
        """Return the shared console logger, creating its handler once.

        The logger runs at INFO unless the root logger has been set to DEBUG
        (which is what ``--verbose`` does in ``main``), in which case DEBUG
        messages such as the executed command are emitted too.
        """
        logger = logging.getLogger('PDFBatchProcessor')
        verbose = logging.getLogger().level == logging.DEBUG
        level = logging.DEBUG if verbose else logging.INFO
        logger.setLevel(level)

        # Add the handler only once; the logger is shared between instances.
        if not logger.handlers:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            ))
            logger.addHandler(console_handler)

        for handler in logger.handlers:
            handler.setLevel(level)

        return logger

    def _get_log_file_path(self, pdf_file: Path) -> Path:
        """Return the per-file log path for *pdf_file*.

        Layout::

            <pdf parent>/<pdf stem>/<log_subdir>/<pdf stem>_YYYYMMDD_HHMMSS.log

        The log directory is created unless this is a dry run, which must
        not touch the file system.
        """
        log_dir = pdf_file.parent / pdf_file.stem / self.processor_config.log_subdir
        if not self.dry_run:
            log_dir.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return log_dir / f"{pdf_file.stem}_{timestamp}.log"

    def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
        """Process *pdf_files* sequentially and return the batch statistics.

        Results from a previous call are discarded so that the returned
        counts and the total duration describe the same batch.
        """
        self.results = []
        self._batch_started_at = datetime.now()

        self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
        self.logger.info(f"处理器: {self.processor_config.description}")
        self.logger.info(f"脚本: {self.processor_config.script}")
        self.logger.info(f"输出目录: {self.output_subdir}")
        self.logger.info(f"日志目录: {self.processor_config.log_subdir}")

        if self.processor_config.venv:
            self.logger.info(f"虚拟环境: {self.processor_config.venv}")

        start_time = time.time()
        success_count = 0

        with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
            for pdf_file in pdf_files:
                result = self._process_single_file(pdf_file)
                self.results.append(result)
                if result.success:
                    success_count += 1
                pbar.update(1)
                pbar.set_postfix({
                    'success': success_count,
                    'failed': len(self.results) - success_count
                })

        total_duration = time.time() - start_time

        stats = self._generate_stats(total_duration)
        self._save_summary_log(stats)

        return stats

    def _append_log_footer(self, log_file: Path, status: str, error_msg: str) -> None:
        """Append a closing status section to *log_file* (best effort).

        An unwritable log file is reported as a warning instead of raising,
        so one broken log cannot abort the whole batch.
        """
        try:
            with open(log_file, 'a', encoding='utf-8') as log_f:
                log_f.write(f"\n{'='*80}\n")
                log_f.write(f"结束时间: {datetime.now()}\n")
                log_f.write(f"状态: {status}\n")
                log_f.write(f"错误: {error_msg}\n")
                log_f.write(f"{'='*80}\n")
        except OSError as exc:
            self.logger.warning(f"无法写入日志文件 {log_file}: {exc}")

    def _process_single_file(self, pdf_file: Path) -> "ProcessResult":
        """Run the processor on one PDF, redirecting its output to a log file.

        Returns a ``ProcessResult``; failures are reported through the result
        rather than raised. In dry-run mode the command is only logged.
        """
        self.logger.info(f"处理: {pdf_file}")

        if not pdf_file.exists():
            self.logger.warning(f"跳过: 文件不存在 - {pdf_file}")
            return ProcessResult(
                pdf_file=str(pdf_file),
                success=False,
                duration=0,
                error_message="文件不存在"
            )

        output_dir = pdf_file.parent / pdf_file.stem / self.output_subdir
        log_file = self._get_log_file_path(pdf_file)
        cmd = self._build_command(pdf_file, output_dir)
        cmd_text = cmd if isinstance(cmd, str) else ' '.join(cmd)

        self.logger.debug(f"执行命令: {cmd_text}")
        self.logger.info(f"日志输出: {log_file}")

        if self.dry_run:
            self.logger.info(f"[DRY RUN] 将执行: {cmd_text}")
            return ProcessResult(
                pdf_file=str(pdf_file),
                success=True,
                duration=0,
                error_message="",
                log_file=str(log_file)
            )

        start_time = time.time()
        try:
            with open(log_file, 'w', encoding='utf-8') as log_f:
                log_f.write(f"{'='*80}\n")
                log_f.write(f"处理器: {self.processor_config.description}\n")
                log_f.write(f"PDF 文件: {pdf_file}\n")
                log_f.write(f"输出目录: {output_dir}\n")
                log_f.write(f"开始时间: {datetime.now()}\n")
                log_f.write(f"{'='*80}\n\n")
                # Flush so the header lands before anything the child writes
                # through the shared file descriptor.
                log_f.flush()

                run_kwargs: Dict[str, Any] = {
                    'stdout': log_f,
                    'stderr': subprocess.STDOUT,  # merge stderr into the log
                    'check': True,
                }
                if isinstance(cmd, str):
                    # A string command carries the environment activation
                    # and must be interpreted by bash.
                    run_kwargs.update(shell=True, executable='/bin/bash')
                subprocess.run(cmd, **run_kwargs)

                log_f.write(f"\n{'='*80}\n")
                log_f.write(f"结束时间: {datetime.now()}\n")
                log_f.write("状态: 成功\n")
                log_f.write(f"{'='*80}\n")

        except subprocess.CalledProcessError as e:
            duration = time.time() - start_time
            error_msg = f"命令执行失败 (退出码: {e.returncode})"
            self._append_log_footer(log_file, "失败", error_msg)

            self.logger.error(f"✗ 失败 (耗时: {duration:.2f}秒)")
            self.logger.error(f"错误信息: {error_msg}")
            self.logger.error(f"详细日志: {log_file}")

            return ProcessResult(
                pdf_file=str(pdf_file),
                success=False,
                duration=duration,
                error_message=error_msg,
                log_file=str(log_file)
            )
        except Exception as e:
            # Boundary handler: any other failure (e.g. the log file cannot
            # be opened) is recorded and the batch moves on.
            duration = time.time() - start_time
            error_msg = str(e)
            self._append_log_footer(log_file, "异常", error_msg)

            self.logger.error(f"✗ 异常 (耗时: {duration:.2f}秒)")
            self.logger.error(f"错误信息: {error_msg}")

            return ProcessResult(
                pdf_file=str(pdf_file),
                success=False,
                duration=duration,
                error_message=error_msg,
                log_file=str(log_file)
            )

        duration = time.time() - start_time
        self.logger.info(f"✓ 成功 (耗时: {duration:.2f}秒)")

        return ProcessResult(
            pdf_file=str(pdf_file),
            success=True,
            duration=duration,
            error_message="",
            log_file=str(log_file)
        )

    def _build_command(self, pdf_file: Path, output_dir: Path):
        """Build the command that processes *pdf_file*.

        Returns:
            A bash command string when a virtual environment is configured
            (activation command, then the quoted Python command); otherwise
            an argument list starting with the current interpreter.
        """
        import shlex  # local import: only needed for shell quoting

        base_cmd = [
            'python',  # resolved inside the activated environment
            self.processor_config.script,
            self.processor_config.input_arg, str(pdf_file),
            self.processor_config.output_arg, str(output_dir)
        ]
        base_cmd.extend(self.processor_config.extra_args)

        venv = self.processor_config.venv
        if not venv:
            # No environment to activate: run with the current interpreter
            # and skip the shell entirely.
            base_cmd[0] = sys.executable
            return base_cmd

        # Quote every argument so that spaces, parentheses, '&', ';', '$'
        # and similar characters in paths or URLs reach the script verbatim.
        python_cmd = ' '.join(shlex.quote(arg) for arg in base_cmd)

        if 'conda activate' in venv:
            # `conda activate` only works after the conda shell hook has
            # been evaluated in the same shell.
            conda_init = 'eval "$(conda shell.bash hook)"'
            return f"{conda_init} && {venv} && {python_cmd}"

        # Environments activated with `source .../activate`.
        return f"{venv} && {python_cmd}"

    def _generate_stats(self, total_duration: float) -> Dict[str, Any]:
        """Summarise ``self.results`` into a statistics dictionary.

        Keys: ``total``, ``success``, ``failed``, ``total_duration``,
        ``failed_files`` (file/error/log per failure) and ``results``
        (file/success/duration/error/log per processed file).
        """
        success_count = sum(1 for r in self.results if r.success)

        failed_files = [
            {
                'file': r.pdf_file,
                'error': r.error_message,
                'log': r.log_file
            }
            for r in self.results if not r.success
        ]

        return {
            'total': len(self.results),
            'success': success_count,
            'failed': len(self.results) - success_count,
            'total_duration': total_duration,
            'failed_files': failed_files,
            'results': [
                {
                    'file': r.pdf_file,
                    'success': r.success,
                    'duration': r.duration,
                    'error': r.error_message,
                    'log': r.log_file
                }
                for r in self.results
            ]
        }

    def _save_summary_log(self, stats: Dict[str, Any]):
        """Write the batch summary to a timestamped file under ``log_base_dir``."""
        finished_at = datetime.now()
        timestamp = finished_at.strftime('%Y%m%d_%H%M%S')
        summary_log_file = self.log_base_dir / f"batch_summary_{self.processor_config.name}_{timestamp}.log"

        summary_log_file.parent.mkdir(parents=True, exist_ok=True)

        # Recorded by process_files(); absent if this is called on its own.
        started_at = getattr(self, '_batch_started_at', None)

        with open(summary_log_file, 'w', encoding='utf-8') as f:
            f.write("PDF 批量处理汇总日志\n")
            f.write("=" * 80 + "\n\n")

            f.write(f"处理器: {self.processor_config.description}\n")
            f.write(f"处理器名称: {self.processor_config.name}\n")
            f.write(f"脚本: {self.processor_config.script}\n")
            f.write(f"输出目录: {self.output_subdir}\n")
            f.write(f"日志目录: {self.processor_config.log_subdir}\n")

            if self.processor_config.venv:
                f.write(f"虚拟环境: {self.processor_config.venv}\n")

            # Report the real batch start; the time of writing is the end.
            if started_at is not None:
                f.write(f"开始时间: {started_at}\n")
            f.write(f"结束时间: {finished_at}\n")
            f.write(f"总耗时: {stats['total_duration']:.2f} 秒\n\n")

            f.write("统计信息:\n")
            f.write(f" 总文件数: {stats['total']}\n")
            f.write(f" 成功: {stats['success']}\n")
            f.write(f" 失败: {stats['failed']}\n\n")

            if stats['failed_files']:
                f.write("失败的文件:\n")
                for item in stats['failed_files']:
                    f.write(f" ✗ {item['file']}\n")
                    f.write(f" 错误: {item['error']}\n")
                    f.write(f" 日志: {item['log']}\n\n")

            f.write("详细结果:\n")
            for result in stats['results']:
                status = "✓" if result['success'] else "✗"
                f.write(f"{status} {result['file']} ({result['duration']:.2f}s)\n")
                f.write(f" 日志: {result['log']}\n")
                if result['error']:
                    f.write(f" 错误: {result['error']}\n")

        self.logger.info(f"汇总日志已保存: {summary_log_file}")
- # ============================================================================
- # 命令行接口
- # ============================================================================
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the batch tool."""
    usage_examples = """
示例用法:
1. 使用配置文件中的处理器 (自动切换虚拟环境):
python batch_process_pdf.py -p paddleocr_vl_single_process -f pdf_list.txt
2. 使用 DotsOCR 处理器 (自动切换到 py312 环境):
python batch_process_pdf.py -p dotsocr_vllm -f pdf_list.txt
3. 使用 MinerU 处理器 (自动切换到 mineru2 环境):
python batch_process_pdf.py -p mineru_vllm -f pdf_list.txt
4. 处理指定目录下所有 PDF:
python batch_process_pdf.py -p ppstructurev3_single_client -d /path/to/pdfs
5. 列出所有可用的处理器:
python batch_process_pdf.py --list-processors
6. 手动指定虚拟环境:
python batch_process_pdf.py -p paddleocr_vl_single_process -f pdf_list.txt --venv "conda activate paddle"
"""
    cli = argparse.ArgumentParser(
        description='PDF 批量处理工具 (支持虚拟环境自动切换)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword arguments) pairs, registered in display order.
    option_specs = [
        # Processor selection
        (('-p', '--processor'), {'help': '处理器名称'}),
        # Configuration file
        (('-c', '--config'), {
            'default': 'processor_configs.yaml',
            'help': '配置文件路径 (默认: processor_configs.yaml)',
        }),
        # Manually specified script
        (('-s', '--script'), {'help': 'Python 脚本路径 (覆盖配置文件)'}),
        # Directories and files
        (('-d', '--base-dir'), {'help': 'PDF 文件基础目录'}),
        (('-o', '--output-subdir'), {'help': '输出子目录名称 (覆盖处理器默认配置)'}),
        (('-f', '--file-list'), {'help': 'PDF 文件列表文件路径'}),
        (('-l', '--pdf-list'), {'nargs': '+', 'help': 'PDF 文件列表 (空格分隔)'}),
        # Extra arguments
        (('-e', '--extra-args'), {'help': '额外参数 (覆盖配置文件)'}),
        # Virtual environment
        (('--venv',), {'help': '虚拟环境激活命令 (覆盖配置文件)'}),
        # Utility switches
        (('--list-processors',), {'action': 'store_true', 'help': '列出所有可用的处理器'}),
        (('--show-config',), {'action': 'store_true', 'help': '显示配置文件内容'}),
        (('--dry-run',), {'action': 'store_true', 'help': '模拟运行,不实际执行'}),
        (('-v', '--verbose'), {'action': 'store_true', 'help': '详细输出'}),
    ]
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)

    return cli
def main():
    """Command-line entry point.

    Parses the arguments, resolves the processor configuration and the list
    of PDF files, asks for confirmation (unless ``--dry-run``), runs the
    batch and prints the statistics.

    Returns:
        0 when every file was processed successfully (or for informational
        commands and user cancellation), 1 when no usable file was found,
        when confirmation could not be read, or when any file failed.
    """
    import shlex  # local import: only needed to parse --extra-args

    parser = create_parser()
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Use the config file only if it exists; tell the user when a file
    # they explicitly asked for is being ignored.
    config_path = args.config if Path(args.config).exists() else None
    if config_path is None and args.config != parser.get_default('config'):
        print(f"⚠️ 警告: 配置文件不存在,使用内置默认配置: {args.config}")
    config_manager = ConfigManager(config_path)

    # List the processors and exit.
    if args.list_processors:
        print("可用的处理器:")
        for name in config_manager.list_processors():
            proc_config = config_manager.get_processor_config(name)
            print(f" • {name}")
            print(f" 描述: {proc_config.description}")
            print(f" 脚本: {proc_config.script}")
            print(f" 输出目录: {proc_config.output_subdir}")
            if proc_config.venv:
                print(f" 虚拟环境: {proc_config.venv}")
            print()
        return 0

    # Dump the configuration and exit.
    if args.show_config:
        print(yaml.dump(config_manager.config, allow_unicode=True))
        return 0

    # shlex keeps quoted values such as --title "a b" together, where a
    # plain whitespace split would tear them apart.
    try:
        cli_extra_args = shlex.split(args.extra_args) if args.extra_args else []
    except ValueError as exc:
        parser.error(f"无法解析 --extra-args: {exc}")

    # Resolve the processor configuration.
    if args.processor:
        try:
            processor_config = config_manager.get_processor_config(args.processor)
        except ValueError as exc:
            # Unknown processor: report it as a usage error, not a traceback.
            parser.error(str(exc))
        if cli_extra_args:
            processor_config.extra_args = cli_extra_args
    elif args.script:
        processor_config = ProcessorConfig(
            name='manual',
            script=args.script,
            extra_args=cli_extra_args,
            output_subdir=args.output_subdir or 'manual_results',
            venv=args.venv
        )
    else:
        parser.error("必须指定 -p 或 -s 参数")

    # A virtual environment given on the command line always wins.
    if args.venv:
        processor_config.venv = args.venv

    base_dir = args.base_dir or config_manager.get_global_config('base_dir')
    if not base_dir:
        parser.error("必须指定 -d 参数或在配置文件中设置 base_dir")
    log_dir_name = config_manager.get_global_config('log_dir', 'logs') or 'logs'
    log_base_dir = str(Path(base_dir) / log_dir_name)

    # Collect the PDF files.
    finder = PDFFileFinder(base_dir)

    if args.file_list:
        pdf_files = finder.from_file_list(args.file_list)
    elif args.pdf_list:
        pdf_files = finder.from_list(args.pdf_list)
    else:
        pdf_files = finder.find_all()

    if not pdf_files:
        print("❌ 未找到任何 PDF 文件")
        return 1

    # Split into existing and missing files in a single pass.
    valid_files = []
    invalid_files = []
    for candidate in pdf_files:
        (valid_files if candidate.exists() else invalid_files).append(candidate)

    if valid_files:
        print("\n".join(f.as_posix() for f in valid_files))

    if invalid_files:
        print(f"\n⚠️ 警告: {len(invalid_files)} 个文件不存在:")
        for f in invalid_files[:5]:
            print(f" - {f}")
        if len(invalid_files) > 5:
            print(f" ... 还有 {len(invalid_files) - 5} 个")

    # Nothing usable: fail instead of reporting an empty, successful batch.
    if not valid_files:
        print("❌ 没有可处理的有效文件")
        return 1

    # Ask for confirmation before doing real work.
    if not args.dry_run:
        venv_info = f" (虚拟环境: {processor_config.venv})" if processor_config.venv else ""
        try:
            confirm = input(f"\n是否继续处理 {len(valid_files)} 个文件{venv_info}? [Y/n]: ")
        except EOFError:
            # stdin is closed (non-interactive run): do not proceed unconfirmed.
            print("\n无法读取确认输入 (非交互式环境),已取消")
            return 1
        if confirm.strip().lower() not in ('', 'y', 'yes'):
            print("已取消")
            return 0

    processor = PDFBatchProcessor(
        processor_config=processor_config,
        output_subdir=args.output_subdir,
        log_base_dir=log_base_dir,
        dry_run=args.dry_run
    )

    stats = processor.process_files(valid_files)

    # Print the statistics.
    print("\n" + "=" * 80)
    print("处理完成")
    print("=" * 80)
    print("\n📊 统计信息:")
    print(f" 处理器: {processor_config.description}")
    print(f" 输出目录: {processor.output_subdir}")
    print(f" 日志目录: {processor.processor_config.log_subdir}")
    print(f" 总文件数: {stats['total']}")
    print(f" ✓ 成功: {stats['success']}")
    print(f" ✗ 失败: {stats['failed']}")
    print(f" ⏱️ 总耗时: {stats['total_duration']:.2f} 秒")

    if stats['failed_files']:
        print("\n失败的文件:")
        for item in stats['failed_files']:
            print(f" ✗ {item['file']}")
            print(f" 错误: {item['error']}")
            print(f" 日志: {item['log']}")

    return 0 if stats['failed'] == 0 else 1
if __name__ == '__main__':
    print("🚀 启动批量OCR程序...")

    if len(sys.argv) == 1:
        # No arguments given: fall back to a built-in default invocation.
        print("ℹ️ 未提供命令行参数,使用默认配置运行...")

        fallback_options = {
            "processor": "mineru_vllm",
            "file-list": "pdf_list.txt",
        }

        print("⚙️ 默认参数:")
        for option, setting in fallback_options.items():
            print(f" --{option}: {setting}")

        # Rebuild argv as: program name, the default options, then a
        # dry run with verbose output.
        option_args = [
            token
            for option, setting in fallback_options.items()
            for token in (f"--{option}", str(setting))
        ]
        sys.argv = [sys.argv[0], *option_args, "--dry-run", "--verbose"]

    sys.exit(main())
|