|
|
@@ -14,7 +14,7 @@ import json
|
|
|
import yaml
|
|
|
from pathlib import Path
|
|
|
from datetime import datetime
|
|
|
-from typing import List, Dict, Optional, Any
|
|
|
+from typing import List, Dict, Optional, Any, Tuple
|
|
|
from dataclasses import dataclass, field
|
|
|
import logging
|
|
|
from tqdm import tqdm
|
|
|
@@ -34,11 +34,19 @@ class ProcessorConfig:
|
|
|
extra_args: List[str] = field(default_factory=list)
|
|
|
output_subdir: str = "results"
|
|
|
log_subdir: str = "logs" # 🎯 新增:日志子目录
|
|
|
+ scene_arg: Optional[str] = None # 场景参数名(如 --scene)
|
|
|
venv: Optional[str] = None
|
|
|
description: str = ""
|
|
|
|
|
|
|
|
|
@dataclass
class PDFTask:
    """A single PDF processing task.

    Couples the PDF file to process with the optional scene name that
    was supplied for that file.
    """

    # Location of the PDF file to process.
    path: Path
    # Scene name given for this file; None when no scene was supplied.
    scene: Optional[str] = None
|
|
|
+
|
|
|
+
|
|
|
+@dataclass
|
|
|
class ProcessResult:
|
|
|
"""处理结果"""
|
|
|
pdf_file: str
|
|
|
@@ -165,6 +173,7 @@ class ConfigManager:
|
|
|
extra_args=proc_config.get('extra_args', []),
|
|
|
output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
|
|
|
log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'), # 🎯 新增
|
|
|
+ scene_arg=proc_config.get('scene_arg'),
|
|
|
venv=proc_config.get('venv'),
|
|
|
description=proc_config.get('description', '')
|
|
|
)
|
|
|
@@ -188,9 +197,9 @@ class PDFFileFinder:
|
|
|
def __init__(self, base_dir: str):
|
|
|
self.base_dir = Path(base_dir)
|
|
|
|
|
|
- def from_file_list(self, list_file: str) -> List[Path]:
|
|
|
+ def from_file_list(self, list_file: str) -> List[PDFTask]:
|
|
|
"""从文件列表读取"""
|
|
|
- pdf_files = []
|
|
|
+ pdf_files: List[PDFTask] = []
|
|
|
|
|
|
with open(list_file, 'r', encoding='utf-8') as f:
|
|
|
for line in f:
|
|
|
@@ -199,27 +208,39 @@ class PDFFileFinder:
|
|
|
if not line or line.startswith('#'):
|
|
|
continue
|
|
|
|
|
|
+ file_part, scene = self._parse_list_line(line)
|
|
|
# 构建完整路径
|
|
|
- pdf_path = self._resolve_path(line)
|
|
|
+ pdf_path = self._resolve_path(file_part)
|
|
|
if pdf_path:
|
|
|
- pdf_files.append(pdf_path)
|
|
|
+ pdf_files.append(PDFTask(path=pdf_path, scene=scene))
|
|
|
|
|
|
return pdf_files
|
|
|
|
|
|
def from_list(self, pdf_list: List[str]) -> List[PDFTask]:
    """Build processing tasks from an in-memory list of entries.

    Each entry is stripped and parsed with ``_parse_list_line`` into a
    file part and an optional scene, then the file part is resolved
    with ``_resolve_path``.

    Args:
        pdf_list: Entries of the form ``file``, ``file<TAB>scene`` or
            ``file,scene``.

    Returns:
        One ``PDFTask`` per entry whose path could be resolved, in
        input order. Blank entries and entries that do not resolve are
        skipped.
    """
    pdf_files: List[PDFTask] = []

    for pdf in pdf_list:
        file_part, scene = self._parse_list_line(pdf.strip())
        if not file_part:
            # A blank entry cannot name a PDF; do not hand an empty
            # string to the path resolver.
            continue

        pdf_path = self._resolve_path(file_part)
        if pdf_path:
            pdf_files.append(PDFTask(path=pdf_path, scene=scene))

    return pdf_files
|
|
|
|
|
|
def find_all(self) -> List[PDFTask]:
    """Collect every PDF file below the base directory.

    Returns:
        One ``PDFTask`` (with no scene) per ``*.pdf`` file found
        recursively under ``self.base_dir``, sorted by path.
    """
    return [
        PDFTask(path=path)
        for path in sorted(self.base_dir.rglob('*.pdf'))
        # rglob matches on name only, so a directory called "x.pdf"
        # would otherwise be returned as a task.
        if path.is_file()
    ]
|
|
|
+
|
|
|
+ def _parse_list_line(self, line: str) -> Tuple[str, Optional[str]]:
|
|
|
+ """解析列表行(支持 文件<TAB>场景 或 文件,场景)"""
|
|
|
+ for sep in ["\t", ","]:
|
|
|
+ if sep in line:
|
|
|
+ file_part, scene_part = line.split(sep, 1)
|
|
|
+ file_part = file_part.strip()
|
|
|
+ scene_part = scene_part.strip()
|
|
|
+ return file_part, scene_part or None
|
|
|
+ return line.strip(), None
|
|
|
|
|
|
def _resolve_path(self, path_str: str) -> Optional[Path]:
|
|
|
"""解析路径"""
|
|
|
@@ -263,13 +284,15 @@ class PDFBatchProcessor:
|
|
|
processor_config: ProcessorConfig,
|
|
|
output_subdir: Optional[str] = None,
|
|
|
log_base_dir: Optional[str] = None, # 🎯 新增:日志基础目录
|
|
|
- dry_run: bool = False
|
|
|
+ dry_run: bool = False,
|
|
|
+ default_scene: Optional[str] = None
|
|
|
):
|
|
|
self.processor_config = processor_config
|
|
|
# 如果指定了output_subdir,使用指定的;否则使用处理器配置中的
|
|
|
self.output_subdir = output_subdir or processor_config.output_subdir
|
|
|
self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs') # 🎯 新增
|
|
|
self.dry_run = dry_run
|
|
|
+ self.default_scene = default_scene
|
|
|
|
|
|
# 设置日志
|
|
|
self.logger = self._setup_logger()
|
|
|
@@ -320,7 +343,7 @@ class PDFBatchProcessor:
|
|
|
|
|
|
return log_file
|
|
|
|
|
|
- def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
|
|
|
+ def process_files(self, pdf_files: List[PDFTask]) -> Dict[str, Any]:
|
|
|
"""批量处理文件"""
|
|
|
self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
|
|
|
self.logger.info(f"处理器: {self.processor_config.description}")
|
|
|
@@ -335,8 +358,8 @@ class PDFBatchProcessor:
|
|
|
|
|
|
# 使用进度条
|
|
|
with tqdm(total=len(pdf_files), desc="处理进度", unit="file") as pbar:
|
|
|
- for pdf_file in pdf_files:
|
|
|
- result = self._process_single_file(pdf_file)
|
|
|
+ for task in pdf_files:
|
|
|
+ result = self._process_single_file(task)
|
|
|
self.results.append(result)
|
|
|
pbar.update(1)
|
|
|
|
|
|
@@ -355,9 +378,12 @@ class PDFBatchProcessor:
|
|
|
|
|
|
return stats
|
|
|
|
|
|
- def _process_single_file(self, pdf_file: Path) -> ProcessResult:
|
|
|
+ def _process_single_file(self, task: PDFTask) -> ProcessResult:
|
|
|
"""🎯 处理单个文件(支持日志重定向)"""
|
|
|
- self.logger.info(f"处理: {pdf_file}")
|
|
|
+ pdf_file = task.path
|
|
|
+ scene = task.scene or self.default_scene
|
|
|
+ scene_info = f" (scene: {scene})" if scene else ""
|
|
|
+ self.logger.info(f"处理: {pdf_file}{scene_info}")
|
|
|
|
|
|
# 检查文件是否存在
|
|
|
if not pdf_file.exists():
|
|
|
@@ -376,7 +402,7 @@ class PDFBatchProcessor:
|
|
|
log_file = self._get_log_file_path(pdf_file)
|
|
|
|
|
|
# 构建命令
|
|
|
- cmd = self._build_command(pdf_file, output_dir)
|
|
|
+ cmd = self._build_command(pdf_file, output_dir, scene)
|
|
|
|
|
|
self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
|
|
|
self.logger.info(f"日志输出: {log_file}")
|
|
|
@@ -398,7 +424,7 @@ class PDFBatchProcessor:
|
|
|
# 写入日志头
|
|
|
log_f.write(f"{'='*80}\n")
|
|
|
log_f.write(f"处理器: {self.processor_config.description}\n")
|
|
|
- log_f.write(f"PDF 文件: {pdf_file}\n")
|
|
|
+ log_f.write(f"PDF 文件: {pdf_file}{scene_info}\n")
|
|
|
log_f.write(f"输出目录: {output_dir}\n")
|
|
|
log_f.write(f"开始时间: {datetime.now()}\n")
|
|
|
log_f.write(f"{'='*80}\n\n")
|
|
|
@@ -486,7 +512,7 @@ class PDFBatchProcessor:
|
|
|
log_file=str(log_file)
|
|
|
)
|
|
|
|
|
|
- def _build_command(self, pdf_file: Path, output_dir: Path):
|
|
|
+ def _build_command(self, pdf_file: Path, output_dir: Path, scene: Optional[str]):
|
|
|
"""构建执行命令
|
|
|
|
|
|
Returns:
|
|
|
@@ -503,6 +529,13 @@ class PDFBatchProcessor:
|
|
|
|
|
|
# 添加额外参数
|
|
|
base_cmd.extend(self.processor_config.extra_args)
|
|
|
+
|
|
|
+ # 添加场景参数(如果配置了scene_arg)
|
|
|
+ if scene:
|
|
|
+ if self.processor_config.scene_arg:
|
|
|
+ base_cmd.extend([self.processor_config.scene_arg, scene])
|
|
|
+ else:
|
|
|
+ self.logger.warning("⚠️ 场景已提供但未配置scene_arg,已忽略场景参数")
|
|
|
|
|
|
# 如果配置了虚拟环境,构建 shell 命令
|
|
|
if self.processor_config.venv:
|
|
|
@@ -690,6 +723,17 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
nargs='+',
|
|
|
help='PDF 文件列表 (空格分隔)'
|
|
|
)
|
|
|
+
|
|
|
+ # 场景参数
|
|
|
+ parser.add_argument(
|
|
|
+ '--scene',
|
|
|
+ help='默认场景名称(文件列表未提供场景时使用)'
|
|
|
+ )
|
|
|
+ parser.add_argument(
|
|
|
+ '--scene-arg',
|
|
|
+ default='--scene',
|
|
|
+ help='场景参数名称 (默认: --scene)'
|
|
|
+ )
|
|
|
|
|
|
# 额外参数
|
|
|
parser.add_argument(
|
|
|
@@ -772,8 +816,12 @@ def main():
|
|
|
script=args.script,
|
|
|
extra_args=args.extra_args.split() if args.extra_args else [],
|
|
|
output_subdir=args.output_subdir or 'manual_results',
|
|
|
+ scene_arg=args.scene_arg,
|
|
|
venv=args.venv
|
|
|
)
|
|
|
+ # A scene was requested but scene_arg is empty: warn that the scene will be ignored (no default is substituted here)
|
|
|
+ if args.scene and not processor_config.scene_arg:
|
|
|
+ print("⚠️ 已指定场景但未配置scene_arg,忽略场景参数")
|
|
|
else:
|
|
|
parser.error("必须指定 -p 或 -s 参数")
|
|
|
|
|
|
@@ -806,18 +854,20 @@ def main():
|
|
|
return 1
|
|
|
|
|
|
# 显示找到的文件
|
|
|
- valid_file_paths = [f.as_posix() for f in pdf_files if f.exists()]
|
|
|
+ valid_file_paths = [f"{t.path.as_posix()}\t{t.scene}" if t.scene else t.path.as_posix()
|
|
|
+ for t in pdf_files if t.path.exists()]
|
|
|
if valid_file_paths:
|
|
|
print("\n".join(valid_file_paths))
|
|
|
|
|
|
# 验证文件
|
|
|
- valid_files = [f for f in pdf_files if f.exists()]
|
|
|
- invalid_files = [f for f in pdf_files if not f.exists()]
|
|
|
+ valid_files = [t for t in pdf_files if t.path.exists()]
|
|
|
+ invalid_files = [t for t in pdf_files if not t.path.exists()]
|
|
|
|
|
|
if invalid_files:
|
|
|
print(f"\n⚠️ 警告: {len(invalid_files)} 个文件不存在:")
|
|
|
- for f in invalid_files[:5]:
|
|
|
- print(f" - {f}")
|
|
|
+ for t in invalid_files[:5]:
|
|
|
+ scene_suffix = f" (scene: {t.scene})" if t.scene else ""
|
|
|
+ print(f" - {t.path}{scene_suffix}")
|
|
|
if len(invalid_files) > 5:
|
|
|
print(f" ... 还有 {len(invalid_files) - 5} 个")
|
|
|
|
|
|
@@ -834,7 +884,8 @@ def main():
|
|
|
processor_config=processor_config,
|
|
|
output_subdir=args.output_subdir,
|
|
|
log_base_dir=log_base_dir, # 🎯 传递日志目录
|
|
|
- dry_run=args.dry_run
|
|
|
+ dry_run=args.dry_run,
|
|
|
+ default_scene=args.scene
|
|
|
)
|
|
|
|
|
|
stats = processor.process_files(valid_files)
|