---
# =============================================================================
# PDF batch processor configuration
#
# Top-level keys:
#   processors - named processor definitions; each entry gives a script path,
#                the argument names for the input file and output directory,
#                extra command-line arguments, output/log subdirectories, an
#                environment-activation command (venv) and a description.
#   global     - settings shared by all processors (base directory, default
#                output subdirectory, logging options).
# =============================================================================

# Processor definitions
processors:
  # ---------------------------------------------------------------------------
  # PaddleOCR-VL processor
  # ---------------------------------------------------------------------------
  paddleocr_vl_single_process:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/paddleocr_vl_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml"
      - "--device=cpu"
      # - "--no-adapter"
    output_subdir: "paddleocr_vl_results"
    log_subdir: "logs/paddleocr_vl"  # Added: log subdirectory
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PaddleOCR-VL 处理器 - 视觉语言模型OCR"

  # ---------------------------------------------------------------------------
  # PP-StructureV3 local processor
  # NOTE(review): this entry and ppstructurev3_gpu use /home/ubuntu paths,
  # while the other processors use /Users/zhch158 paths - confirm which host
  # each entry is meant to run on.
  # ---------------------------------------------------------------------------
  ppstructurev3_single_process:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_results"
    log_subdir: "logs/ppstructurev3"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - 本地处理"

  # Same script and pipeline file as ppstructurev3_single_process, but passes
  # --device=gpu. Each key is declared exactly once: duplicate mapping keys are
  # invalid YAML and are either rejected or silently resolved as last-wins.
  ppstructurev3_gpu:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
      - "--device=gpu"
    output_subdir: "ppstructurev3_gpu_results"
    log_subdir: "logs/ppstructurev3_gpu"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - GPU加速"

  # ---------------------------------------------------------------------------
  # PP-StructureV3 CPU processor
  # Explicitly runs on the CPU
  # ---------------------------------------------------------------------------
  ppstructurev3_cpu:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PP-StructureV3-zhch.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_cpu_results"
    log_subdir: "logs/ppstructurev3_cpu"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 处理器 - CPU处理"

  # ---------------------------------------------------------------------------
  # PP-StructureV3 API client (default)
  # Calls a remote service through its HTTP API
  # NOTE(review): "default" is stated only in this comment; no key in this
  # file marks a default processor - confirm where the default is selected.
  # ---------------------------------------------------------------------------
  ppstructurev3_single_client:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_client.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--api_url=http://10.192.72.11:8111/layout-parsing"
      - "--timeout=300"
    output_subdir: "ppstructurev3_client_results"
    log_subdir: "logs/ppstructurev3_client"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 HTTP API 客户端 - 远程服务"

  # ---------------------------------------------------------------------------
  # MinerU vLLM processor
  # MinerU-based multi-threaded batch processing (supports PDFs and images)
  # ---------------------------------------------------------------------------
  mineru_vllm:
    script: "/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--server_url=http://10.192.72.11:8121"
      - "--timeout=300"
      - "--batch_size=1"
    output_subdir: "mineru_vllm_results"
    log_subdir: "logs/mineru_vllm"
    venv: "conda activate mineru2"
    description: "MinerU vLLM 处理器 - 支持PDF和图片"

  # ---------------------------------------------------------------------------
  # DotsOCR vLLM processor
  # DotsOCR-based batch processing (supports PDFs and images)
  # ---------------------------------------------------------------------------
  dotsocr_vllm:
    script: "/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--ip=10.192.72.11"
      - "--port=8101"
      - "--model_name=DotsOCR"
      - "--prompt_mode=prompt_layout_all_en"
      - "--batch_size=1"
      - "--max_workers=1"
      - "--dpi=200"
    output_subdir: "dotsocr_vllm_results"
    log_subdir: "logs/dotsocr_vllm"
    venv: "conda activate py312"
    description: "DotsOCR vLLM 处理器 - 支持PDF和图片"

# =============================================================================
# Global configuration
# =============================================================================
global:
  # Base directory for PDF files
  base_dir: "/Users/zhch158/workspace/data/流水分析"

  # Default output subdirectory name (used when a processor does not set one)
  output_subdir: "results"

  # Added: global logging configuration
  log_dir: "logs"  # Global log directory (relative to base_dir)
  log_retention_days: 30  # Number of days to keep logs
  log_level: "INFO"  # Log level: DEBUG, INFO, WARNING, ERROR