---
# ============================================================================
# PDF batch processor configuration file
# ============================================================================

# Processor definitions.
# Each key under `processors` names one processor; its mapping holds:
#   script        - path of the Python script for this processor
#   input_arg     - command-line flag name for the input file
#   output_arg    - command-line flag name for the output directory
#   extra_args    - list of additional command-line arguments
#   output_subdir - output subdirectory name for this processor
#   log_subdir    - log subdirectory for this processor
#   venv          - shell command that activates the Python environment
#   description   - human-readable description
# NOTE(review): script paths mix /Users/zhch158/... and /home/ubuntu/... roots;
# presumably the entries target different machines - confirm before use.
processors:
  # -------------------------------------------------------------------------
  # PaddleOCR-VL processor
  # -------------------------------------------------------------------------
  paddleocr_vl_single_process:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/paddleocr_vl_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml"
      - "--device=cpu"
      # - "--no-adapter"
    output_subdir: "paddleocr_vl_results"
    log_subdir: "logs/paddleocr_vl"  # Added: log subdirectory
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PaddleOCR-VL 处理器 - 视觉语言模型OCR"

  # -------------------------------------------------------------------------
  # PP-StructureV3 local processor
  # -------------------------------------------------------------------------
  ppstructurev3_single_process:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_results"
    log_subdir: "logs/ppstructurev3"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - 本地处理"

  # NOTE(review): unlike the CPU entries, no "--device" flag is passed here;
  # presumably the script's default device is the GPU - confirm.
  ppstructurev3_gpu:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
    output_subdir: "ppstructurev3_gpu_results"
    log_subdir: "logs/ppstructurev3_gpu"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - GPU加速"

  # -------------------------------------------------------------------------
  # PP-StructureV3 CPU processor
  # Explicitly runs on the CPU
  # -------------------------------------------------------------------------
  ppstructurev3_cpu:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PP-StructureV3-zhch.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_cpu_results"
    log_subdir: "logs/ppstructurev3_cpu"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 处理器 - CPU处理"

  # -------------------------------------------------------------------------
  # PP-StructureV3 API client (default)
  # Calls a remote service over an HTTP API
  # -------------------------------------------------------------------------
  ppstructurev3_single_client:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_client.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--api_url=http://10.192.72.11:8111/layout-parsing"
      - "--timeout=300"
    output_subdir: "ppstructurev3_client_results"
    log_subdir: "logs/ppstructurev3_client"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 HTTP API 客户端 - 远程服务"

  # -------------------------------------------------------------------------
  # MinerU vLLM processor
  # Multi-threaded batch processing built on MinerU (supports PDF and images)
  # -------------------------------------------------------------------------
  mineru_vllm:
    script: "/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--server_url=http://10.192.72.11:8121"
      - "--timeout=300"
      - "--batch_size=1"
    output_subdir: "mineru_vllm_results"
    log_subdir: "logs/mineru_vllm"
    venv: "conda activate mineru2"
    description: "MinerU vLLM 处理器 - 支持PDF和图片"

  # -------------------------------------------------------------------------
  # DotsOCR vLLM processor
  # Batch processing built on DotsOCR (supports PDF and images)
  # -------------------------------------------------------------------------
  dotsocr_vllm:
    script: "/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--ip=10.192.72.11"
      - "--port=8101"
      - "--model_name=DotsOCR"
      - "--prompt_mode=prompt_layout_all_en"
      - "--batch_size=1"
      - "--max_workers=1"
      - "--dpi=200"
    output_subdir: "dotsocr_vllm_results"
    log_subdir: "logs/dotsocr_vllm"
    venv: "conda activate py312"
    description: "DotsOCR vLLM 处理器 - 支持PDF和图片"

# ============================================================================
# Global settings
# ============================================================================
global:
  # Base directory containing the PDF files
  base_dir: "/Users/zhch158/workspace/data/流水分析"

  # Default output subdirectory name (used when a processor does not set one)
  output_subdir: "results"

  # Added: global logging settings
  log_dir: "logs"  # global log directory (relative to base_dir)
  log_retention_days: 30  # number of days to keep logs
  log_level: "INFO"  # log level: DEBUG, INFO, WARNING, ERROR