---
# ============================================================================
# processor_configs.yaml - PDF batch processor configuration
# ============================================================================
# Each entry under `processors` describes one processor through these keys:
# a script path, the flag names for input/output (`input_arg`, `output_arg`),
# a list of extra command-line arguments, output and log subdirectories, an
# environment activation command (`venv`) and a human-readable description.
#
# NOTE(review): ppstructurev3_single_process and ppstructurev3_gpu point at
# /home/ubuntu/... paths while every other processor points at
# /Users/zhch158/... paths - confirm each entry matches the host it runs on.

# Processor definitions
processors:
  # -------------------------------------------------------------------------
  # PaddleOCR-VL processor
  # -------------------------------------------------------------------------
  paddleocr_vl_single_process:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/paddleocr_vl_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml"
      - "--device=cpu"
      # - "--no-adapter"
    output_subdir: "paddleocr_vl_results"
    log_subdir: "logs/paddleocr_vl"  # New: log subdirectory
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PaddleOCR-VL 处理器 - 视觉语言模型OCR"

  # -------------------------------------------------------------------------
  # PP-StructureV3 local processor
  # -------------------------------------------------------------------------
  ppstructurev3_single_process:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_results"
    log_subdir: "logs/ppstructurev3"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - 本地处理"

  # -------------------------------------------------------------------------
  # PP-StructureV3 GPU processor
  # Same script and pipeline file as ppstructurev3_single_process, but passes
  # --device=gpu.
  # -------------------------------------------------------------------------
  ppstructurev3_gpu:
    script: "/home/ubuntu/zhch/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
      - "--device=gpu"
    output_subdir: "ppstructurev3_gpu_results"
    log_subdir: "logs/ppstructurev3_gpu"
    venv: "conda activate paddle"
    description: "PP-StructureV3 处理器 - GPU加速"

  # -------------------------------------------------------------------------
  # PP-StructureV3 CPU processor
  # Explicitly uses CPU processing
  # -------------------------------------------------------------------------
  ppstructurev3_cpu:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_process.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--pipeline=/Users/zhch158/workspace/repository.git/PaddleX/zhch/my_config/PP-StructureV3-zhch.yaml"
      - "--device=cpu"
    output_subdir: "ppstructurev3_cpu_results"
    log_subdir: "logs/ppstructurev3_cpu"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 处理器 - CPU处理"

  # -------------------------------------------------------------------------
  # PP-StructureV3 API client (default)
  # Calls a remote service through its HTTP API
  # -------------------------------------------------------------------------
  ppstructurev3_single_client:
    script: "/Users/zhch158/workspace/repository.git/PaddleX/zhch/ppstructurev3_single_client.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--api_url=http://10.192.72.11:8111/layout-parsing"
      - "--timeout=300"
    output_subdir: "ppstructurev3_client_results"
    log_subdir: "logs/ppstructurev3_client"
    venv: "source /Users/zhch158/workspace/repository.git/PaddleX/paddle_env/bin/activate"
    description: "PP-StructureV3 HTTP API 客户端 - 远程服务"

  # -------------------------------------------------------------------------
  # MinerU vLLM processor
  # MinerU-based multi-threaded batch processing (supports PDFs and images)
  # -------------------------------------------------------------------------
  mineru_vllm:
    script: "/Users/zhch158/workspace/repository.git/MinerU/zhch/mineru2_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--server_url=http://10.192.72.11:8121"
      - "--timeout=300"
      - "--batch_size=1"
    output_subdir: "mineru_vllm_results"
    log_subdir: "logs/mineru_vllm"
    venv: "conda activate mineru2"
    description: "MinerU vLLM 处理器 - 支持PDF和图片"

  # -------------------------------------------------------------------------
  # DotsOCR vLLM processor
  # DotsOCR-based batch processing (supports PDFs and images)
  # -------------------------------------------------------------------------
  dotsocr_vllm:
    script: "/Users/zhch158/workspace/repository.git/dots.ocr/zhch/dotsocr_vllm_multthreads.py"
    input_arg: "--input_file"
    output_arg: "--output_dir"
    extra_args:
      - "--ip=10.192.72.11"
      - "--port=8101"
      - "--model_name=DotsOCR"
      - "--prompt_mode=prompt_layout_all_en"
      - "--batch_size=1"
      - "--max_workers=1"
      - "--dpi=200"
    output_subdir: "dotsocr_vllm_results"
    log_subdir: "logs/dotsocr_vllm"
    venv: "conda activate py312"
    description: "DotsOCR vLLM 处理器 - 支持PDF和图片"

# ============================================================================
# Global configuration
# ============================================================================
global:
  # Base directory for PDF files
  base_dir: "/Users/zhch158/workspace/data/流水分析"

  # Default output subdirectory name (used if a processor does not specify one)
  output_subdir: "results"

  # New: global logging configuration
  log_dir: "logs"  # Global log directory (relative to base_dir)
  log_retention_days: 30  # Number of days to retain logs
  log_level: "INFO"  # Log level: DEBUG, INFO, WARNING, ERROR