|
|
@@ -3,6 +3,7 @@
|
|
|
PDF 批量处理脚本
|
|
|
支持多种处理器,配置文件驱动
|
|
|
支持自动切换虚拟环境
|
|
|
+支持执行器输出日志重定向
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
@@ -32,7 +33,8 @@ class ProcessorConfig:
|
|
|
output_arg: str = "--output_dir"
|
|
|
extra_args: List[str] = field(default_factory=list)
|
|
|
output_subdir: str = "results"
|
|
|
- venv: Optional[str] = None # 虚拟环境激活命令
|
|
|
+ log_subdir: str = "logs" # 🎯 新增:日志子目录
|
|
|
+ venv: Optional[str] = None
|
|
|
description: str = ""
|
|
|
|
|
|
|
|
|
@@ -43,6 +45,7 @@ class ProcessResult:
|
|
|
success: bool
|
|
|
duration: float
|
|
|
error_message: str = ""
|
|
|
+ log_file: str = "" # 🎯 新增:日志文件路径
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
@@ -64,7 +67,8 @@ class ConfigManager:
|
|
|
],
|
|
|
'output_subdir': 'paddleocr_vl_results',
|
|
|
'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
|
|
|
- 'description': 'PaddleOCR-VL 处理器'
|
|
|
+ 'description': 'PaddleOCR-VL 处理器',
|
|
|
+ 'log_subdir': 'logs/paddleocr_vl_single_process' # 🎯 新增
|
|
|
},
|
|
|
'ppstructurev3_single_process': {
|
|
|
'script': '/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_process.py',
|
|
|
@@ -75,7 +79,8 @@ class ConfigManager:
|
|
|
],
|
|
|
'output_subdir': 'ppstructurev3_results',
|
|
|
'venv': 'conda activate paddle',
|
|
|
- 'description': 'PP-StructureV3 处理器'
|
|
|
+ 'description': 'PP-StructureV3 处理器',
|
|
|
+ 'log_subdir': 'logs/ppstructurev3_single_process' # 🎯 新增
|
|
|
},
|
|
|
'ppstructurev3_single_client': {
|
|
|
'script': '/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_client.py',
|
|
|
@@ -87,7 +92,8 @@ class ConfigManager:
|
|
|
],
|
|
|
'output_subdir': 'ppstructurev3_client_results',
|
|
|
'venv': 'source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate',
|
|
|
- 'description': 'PP-StructureV3 HTTP API 客户端'
|
|
|
+ 'description': 'PP-StructureV3 HTTP API 客户端',
|
|
|
+ 'log_subdir': 'logs/ppstructurev3_single_client' # 🎯 新增
|
|
|
},
|
|
|
'mineru_vllm': {
|
|
|
'script': '/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py',
|
|
|
@@ -100,7 +106,8 @@ class ConfigManager:
|
|
|
],
|
|
|
'output_subdir': 'mineru_vllm_results',
|
|
|
'venv': 'conda activate mineru2',
|
|
|
- 'description': 'MinerU vLLM 处理器'
|
|
|
+ 'description': 'MinerU vLLM 处理器',
|
|
|
+ 'log_subdir': 'logs/mineru_vllm' # 🎯 新增
|
|
|
},
|
|
|
'dotsocr_vllm': {
|
|
|
'script': '/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py',
|
|
|
@@ -117,12 +124,16 @@ class ConfigManager:
|
|
|
],
|
|
|
'output_subdir': 'dotsocr_vllm_results',
|
|
|
'venv': 'conda activate py312',
|
|
|
- 'description': 'DotsOCR vLLM 处理器 - 支持PDF和图片'
|
|
|
+ 'description': 'DotsOCR vLLM 处理器 - 支持PDF和图片',
|
|
|
+ 'log_subdir': 'logs/dotsocr_vllm' # 🎯 新增
|
|
|
}
|
|
|
},
|
|
|
'global': {
|
|
|
'base_dir': '/Users/zhch158/workspace/data/流水分析',
|
|
|
- 'output_subdir': 'results'
|
|
|
+ 'output_subdir': 'results',
|
|
|
+ 'log_dir': 'logs',
|
|
|
+ 'log_retention_days': 30,
|
|
|
+ 'log_level': 'INFO'
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -153,6 +164,7 @@ class ConfigManager:
|
|
|
output_arg=proc_config.get('output_arg', '--output_dir'),
|
|
|
extra_args=proc_config.get('extra_args', []),
|
|
|
output_subdir=proc_config.get('output_subdir', processor_name + '_results'),
|
|
|
+ log_subdir=proc_config.get('log_subdir', f'logs/{processor_name}'), # 🎯 新增
|
|
|
venv=proc_config.get('venv'),
|
|
|
description=proc_config.get('description', '')
|
|
|
)
|
|
|
@@ -250,11 +262,13 @@ class PDFBatchProcessor:
|
|
|
self,
|
|
|
processor_config: ProcessorConfig,
|
|
|
output_subdir: Optional[str] = None,
|
|
|
+ log_base_dir: Optional[str] = None, # 🎯 新增:日志基础目录
|
|
|
dry_run: bool = False
|
|
|
):
|
|
|
self.processor_config = processor_config
|
|
|
# 如果指定了output_subdir,使用指定的;否则使用处理器配置中的
|
|
|
self.output_subdir = output_subdir or processor_config.output_subdir
|
|
|
+ self.log_base_dir = Path(log_base_dir) if log_base_dir else Path('logs') # 🎯 新增
|
|
|
self.dry_run = dry_run
|
|
|
|
|
|
# 设置日志
|
|
|
@@ -282,12 +296,37 @@ class PDFBatchProcessor:
|
|
|
|
|
|
return logger
|
|
|
|
|
|
+ def _get_log_file_path(self, pdf_file: Path) -> Path:
|
|
|
+ """
|
|
|
+ 🎯 获取日志文件路径
|
|
|
+
|
|
|
+ 日志结构:
|
|
|
+ base_dir/
|
|
|
+ └── PDF名称/
|
|
|
+ └── logs/
|
|
|
+ └── processor_name/
|
|
|
+ └── PDF名称_YYYYMMDD_HHMMSS.log
|
|
|
+ """
|
|
|
+ # PDF 目录
|
|
|
+ pdf_dir = pdf_file.parent / pdf_file.stem
|
|
|
+
|
|
|
+ # 日志目录: pdf_dir / logs / processor_name
|
|
|
+ log_dir = pdf_dir / self.processor_config.log_subdir
|
|
|
+ log_dir.mkdir(parents=True, exist_ok=True)
|
|
|
+
|
|
|
+ # 日志文件名: PDF名称_时间戳.log
|
|
|
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
|
+ log_file = log_dir / f"{pdf_file.stem}_{timestamp}.log"
|
|
|
+
|
|
|
+ return log_file
|
|
|
+
|
|
|
def process_files(self, pdf_files: List[Path]) -> Dict[str, Any]:
|
|
|
"""批量处理文件"""
|
|
|
self.logger.info(f"开始处理 {len(pdf_files)} 个文件")
|
|
|
self.logger.info(f"处理器: {self.processor_config.description}")
|
|
|
self.logger.info(f"脚本: {self.processor_config.script}")
|
|
|
self.logger.info(f"输出目录: {self.output_subdir}")
|
|
|
+ self.logger.info(f"日志目录: {self.processor_config.log_subdir}")
|
|
|
|
|
|
if self.processor_config.venv:
|
|
|
self.logger.info(f"虚拟环境: {self.processor_config.venv}")
|
|
|
@@ -312,14 +351,12 @@ class PDFBatchProcessor:
|
|
|
|
|
|
# 生成统计信息
|
|
|
stats = self._generate_stats(total_duration)
|
|
|
-
|
|
|
- # 保存日志
|
|
|
- self._save_log(stats)
|
|
|
+ self._save_summary_log(stats)
|
|
|
|
|
|
return stats
|
|
|
|
|
|
def _process_single_file(self, pdf_file: Path) -> ProcessResult:
|
|
|
- """处理单个文件"""
|
|
|
+ """🎯 处理单个文件(支持日志重定向)"""
|
|
|
self.logger.info(f"处理: {pdf_file}")
|
|
|
|
|
|
# 检查文件是否存在
|
|
|
@@ -335,10 +372,14 @@ class PDFBatchProcessor:
|
|
|
# 确定输出目录
|
|
|
output_dir = pdf_file.parent / pdf_file.stem / self.output_subdir
|
|
|
|
|
|
+ # 🎯 获取日志文件路径
|
|
|
+ log_file = self._get_log_file_path(pdf_file)
|
|
|
+
|
|
|
# 构建命令
|
|
|
cmd = self._build_command(pdf_file, output_dir)
|
|
|
|
|
|
self.logger.debug(f"执行命令: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
|
|
|
+ self.logger.info(f"日志输出: {log_file}")
|
|
|
|
|
|
if self.dry_run:
|
|
|
self.logger.info(f"[DRY RUN] 将执行: {cmd if isinstance(cmd, str) else ' '.join(cmd)}")
|
|
|
@@ -346,53 +387,103 @@ class PDFBatchProcessor:
|
|
|
pdf_file=str(pdf_file),
|
|
|
success=True,
|
|
|
duration=0,
|
|
|
- error_message=""
|
|
|
+ error_message="",
|
|
|
+ log_file=str(log_file)
|
|
|
)
|
|
|
|
|
|
- # 执行命令
|
|
|
+ # 🎯 执行命令并重定向输出到日志文件
|
|
|
start_time = time.time()
|
|
|
try:
|
|
|
- # 如果是 shell 命令(包含 venv),使用 shell=True
|
|
|
- if isinstance(cmd, str):
|
|
|
- result = subprocess.run(
|
|
|
- cmd,
|
|
|
- shell=True,
|
|
|
- executable='/bin/bash', # 使用 bash
|
|
|
- capture_output=True,
|
|
|
- text=True,
|
|
|
- check=True
|
|
|
- )
|
|
|
- else:
|
|
|
- result = subprocess.run(
|
|
|
- cmd,
|
|
|
- capture_output=True,
|
|
|
- text=True,
|
|
|
- check=True
|
|
|
- )
|
|
|
+ with open(log_file, 'w', encoding='utf-8') as log_f:
|
|
|
+ # 写入日志头
|
|
|
+ log_f.write(f"{'='*80}\n")
|
|
|
+ log_f.write(f"处理器: {self.processor_config.description}\n")
|
|
|
+ log_f.write(f"PDF 文件: {pdf_file}\n")
|
|
|
+ log_f.write(f"输出目录: {output_dir}\n")
|
|
|
+ log_f.write(f"开始时间: {datetime.now()}\n")
|
|
|
+ log_f.write(f"{'='*80}\n\n")
|
|
|
+ log_f.flush()
|
|
|
+
|
|
|
+ # 执行命令
|
|
|
+ if isinstance(cmd, str):
|
|
|
+ result = subprocess.run(
|
|
|
+ cmd,
|
|
|
+ shell=True,
|
|
|
+ executable='/bin/bash',
|
|
|
+ stdout=log_f, # 🎯 重定向 stdout
|
|
|
+ stderr=subprocess.STDOUT, # 🎯 合并 stderr 到 stdout
|
|
|
+ text=True,
|
|
|
+ check=True
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ result = subprocess.run(
|
|
|
+ cmd,
|
|
|
+ stdout=log_f, # 🎯 重定向 stdout
|
|
|
+ stderr=subprocess.STDOUT, # 🎯 合并 stderr
|
|
|
+ text=True,
|
|
|
+ check=True
|
|
|
+ )
|
|
|
+
|
|
|
+ # 写入日志尾
|
|
|
+ log_f.write(f"\n{'='*80}\n")
|
|
|
+ log_f.write(f"结束时间: {datetime.now()}\n")
|
|
|
+ log_f.write(f"状态: 成功\n")
|
|
|
+ log_f.write(f"{'='*80}\n")
|
|
|
|
|
|
duration = time.time() - start_time
|
|
|
-
|
|
|
self.logger.info(f"✓ 成功 (耗时: {duration:.2f}秒)")
|
|
|
|
|
|
return ProcessResult(
|
|
|
pdf_file=str(pdf_file),
|
|
|
success=True,
|
|
|
duration=duration,
|
|
|
- error_message=""
|
|
|
+ error_message="",
|
|
|
+ log_file=str(log_file)
|
|
|
)
|
|
|
|
|
|
except subprocess.CalledProcessError as e:
|
|
|
duration = time.time() - start_time
|
|
|
- error_msg = e.stderr if e.stderr else str(e)
|
|
|
+ error_msg = f"命令执行失败 (退出码: {e.returncode})"
|
|
|
+
|
|
|
+ # 🎯 在日志文件中追加错误信息
|
|
|
+ with open(log_file, 'a', encoding='utf-8') as log_f:
|
|
|
+ log_f.write(f"\n{'='*80}\n")
|
|
|
+ log_f.write(f"结束时间: {datetime.now()}\n")
|
|
|
+ log_f.write(f"状态: 失败\n")
|
|
|
+ log_f.write(f"错误: {error_msg}\n")
|
|
|
+ log_f.write(f"{'='*80}\n")
|
|
|
|
|
|
self.logger.error(f"✗ 失败 (耗时: {duration:.2f}秒)")
|
|
|
self.logger.error(f"错误信息: {error_msg}")
|
|
|
+ self.logger.error(f"详细日志: {log_file}")
|
|
|
+
|
|
|
+ return ProcessResult(
|
|
|
+ pdf_file=str(pdf_file),
|
|
|
+ success=False,
|
|
|
+ duration=duration,
|
|
|
+ error_message=error_msg,
|
|
|
+ log_file=str(log_file)
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ duration = time.time() - start_time
|
|
|
+ error_msg = str(e)
|
|
|
+
|
|
|
+ with open(log_file, 'a', encoding='utf-8') as log_f:
|
|
|
+ log_f.write(f"\n{'='*80}\n")
|
|
|
+ log_f.write(f"结束时间: {datetime.now()}\n")
|
|
|
+ log_f.write(f"状态: 异常\n")
|
|
|
+ log_f.write(f"错误: {error_msg}\n")
|
|
|
+ log_f.write(f"{'='*80}\n")
|
|
|
+
|
|
|
+ self.logger.error(f"✗ 异常 (耗时: {duration:.2f}秒)")
|
|
|
+ self.logger.error(f"错误信息: {error_msg}")
|
|
|
|
|
|
return ProcessResult(
|
|
|
pdf_file=str(pdf_file),
|
|
|
success=False,
|
|
|
duration=duration,
|
|
|
- error_message=error_msg
|
|
|
+ error_message=error_msg,
|
|
|
+ log_file=str(log_file)
|
|
|
)
|
|
|
|
|
|
def _build_command(self, pdf_file: Path, output_dir: Path):
|
|
|
@@ -451,7 +542,14 @@ eval "$(conda shell.bash hook)"
|
|
|
success_count = sum(1 for r in self.results if r.success)
|
|
|
failed_count = len(self.results) - success_count
|
|
|
|
|
|
- failed_files = [r.pdf_file for r in self.results if not r.success]
|
|
|
+ failed_files = [
|
|
|
+ {
|
|
|
+ 'file': r.pdf_file,
|
|
|
+ 'error': r.error_message,
|
|
|
+ 'log': r.log_file
|
|
|
+ }
|
|
|
+ for r in self.results if not r.success
|
|
|
+ ]
|
|
|
|
|
|
stats = {
|
|
|
'total': len(self.results),
|
|
|
@@ -464,7 +562,8 @@ eval "$(conda shell.bash hook)"
|
|
|
'file': r.pdf_file,
|
|
|
'success': r.success,
|
|
|
'duration': r.duration,
|
|
|
- 'error': r.error_message
|
|
|
+ 'error': r.error_message,
|
|
|
+ 'log': r.log_file
|
|
|
}
|
|
|
for r in self.results
|
|
|
]
|
|
|
@@ -472,19 +571,23 @@ eval "$(conda shell.bash hook)"
|
|
|
|
|
|
return stats
|
|
|
|
|
|
- def _save_log(self, stats: Dict[str, Any]):
|
|
|
- """保存日志"""
|
|
|
+ def _save_summary_log(self, stats: Dict[str, Any]):
|
|
|
+ """🎯 保存汇总日志"""
|
|
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
|
- log_file = f"batch_process_{self.processor_config.name}_{timestamp}.log"
|
|
|
+ summary_log_file = self.log_base_dir / f"batch_summary_{self.processor_config.name}_{timestamp}.log"
|
|
|
+
|
|
|
+ # 确保目录存在
|
|
|
+ summary_log_file.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
- with open(log_file, 'w', encoding='utf-8') as f:
|
|
|
- f.write("PDF 批量处理日志\n")
|
|
|
+ with open(summary_log_file, 'w', encoding='utf-8') as f:
|
|
|
+ f.write("PDF 批量处理汇总日志\n")
|
|
|
f.write("=" * 80 + "\n\n")
|
|
|
|
|
|
f.write(f"处理器: {self.processor_config.description}\n")
|
|
|
f.write(f"处理器名称: {self.processor_config.name}\n")
|
|
|
f.write(f"脚本: {self.processor_config.script}\n")
|
|
|
f.write(f"输出目录: {self.output_subdir}\n")
|
|
|
+ f.write(f"日志目录: {self.processor_config.log_subdir}\n")
|
|
|
|
|
|
if self.processor_config.venv:
|
|
|
f.write(f"虚拟环境: {self.processor_config.venv}\n")
|
|
|
@@ -499,18 +602,20 @@ eval "$(conda shell.bash hook)"
|
|
|
|
|
|
if stats['failed_files']:
|
|
|
f.write("失败的文件:\n")
|
|
|
- for file in stats['failed_files']:
|
|
|
- f.write(f" - {file}\n")
|
|
|
- f.write("\n")
|
|
|
+ for item in stats['failed_files']:
|
|
|
+ f.write(f" ✗ {item['file']}\n")
|
|
|
+ f.write(f" 错误: {item['error']}\n")
|
|
|
+ f.write(f" 日志: {item['log']}\n\n")
|
|
|
|
|
|
f.write("详细结果:\n")
|
|
|
for result in stats['results']:
|
|
|
status = "✓" if result['success'] else "✗"
|
|
|
f.write(f"{status} {result['file']} ({result['duration']:.2f}s)\n")
|
|
|
+ f.write(f" 日志: {result['log']}\n")
|
|
|
if result['error']:
|
|
|
f.write(f" 错误: {result['error']}\n")
|
|
|
|
|
|
- self.logger.info(f"日志已保存: {log_file}")
|
|
|
+ self.logger.info(f"汇总日志已保存: {summary_log_file}")
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
@@ -684,6 +789,7 @@ def main():
|
|
|
base_dir = args.base_dir or config_manager.get_global_config('base_dir')
|
|
|
if not base_dir:
|
|
|
parser.error("必须指定 -d 参数或在配置文件中设置 base_dir")
|
|
|
+ log_base_dir = base_dir + '/' + config_manager.get_global_config('log_dir', 'logs')
|
|
|
|
|
|
# 查找 PDF 文件
|
|
|
finder = PDFFileFinder(base_dir)
|
|
|
@@ -727,6 +833,7 @@ def main():
|
|
|
processor = PDFBatchProcessor(
|
|
|
processor_config=processor_config,
|
|
|
output_subdir=args.output_subdir,
|
|
|
+ log_base_dir=log_base_dir, # 🎯 传递日志目录
|
|
|
dry_run=args.dry_run
|
|
|
)
|
|
|
|
|
|
@@ -739,8 +846,7 @@ def main():
|
|
|
print(f"\n📊 统计信息:")
|
|
|
print(f" 处理器: {processor_config.description}")
|
|
|
print(f" 输出目录: {processor.output_subdir}")
|
|
|
- if stats.get('venv'):
|
|
|
- print(f" 虚拟环境: {stats['venv']}")
|
|
|
+ print(f" 日志目录: {processor.processor_config.log_subdir}")
|
|
|
print(f" 总文件数: {stats['total']}")
|
|
|
print(f" ✓ 成功: {stats['success']}")
|
|
|
print(f" ✗ 失败: {stats['failed']}")
|
|
|
@@ -748,11 +854,37 @@ def main():
|
|
|
|
|
|
if stats['failed_files']:
|
|
|
print(f"\n失败的文件:")
|
|
|
- for file in stats['failed_files']:
|
|
|
- print(f" ✗ {file}")
|
|
|
+ for item in stats['failed_files']:
|
|
|
+ print(f" ✗ {item['file']}")
|
|
|
+ print(f" 错误: {item['error']}")
|
|
|
+ print(f" 日志: {item['log']}")
|
|
|
|
|
|
return 0 if stats['failed'] == 0 else 1
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
+ print("🚀 启动批量OCR程序...")
|
|
|
+
|
|
|
+ import sys
|
|
|
+
|
|
|
+ if len(sys.argv) == 1:
|
|
|
+ # 如果没有命令行参数,使用默认配置运行
|
|
|
+ print("ℹ️ 未提供命令行参数,使用默认配置运行...")
|
|
|
+
|
|
|
+ # 默认配置
|
|
|
+ default_config = {
|
|
|
+ "processor": "mineru_vllm",
|
|
|
+ "file-list": "pdf_list.txt",
|
|
|
+ }
|
|
|
+
|
|
|
+ print("⚙️ 默认参数:")
|
|
|
+ for key, value in default_config.items():
|
|
|
+ print(f" --{key}: {value}")
|
|
|
+ # 构造参数
|
|
|
+ sys.argv = [sys.argv[0]]
|
|
|
+ for key, value in default_config.items():
|
|
|
+ sys.argv.extend([f"--{key}", str(value)])
|
|
|
+ sys.argv.append("--dry-run")
|
|
|
+ sys.argv.append("--verbose") # 添加详细输出参数
|
|
|
+
|
|
|
sys.exit(main())
|