소스 검색

feat: 添加 PDF 批量处理器配置文件,支持多种处理器和输出设置

zhch158_admin 2 주 전
부모
커밋
e4447cabec
1개의 변경된 파일, 85개의 추가 및 0개의 삭제
  1. 85 0
      zhch/processor_configs.yaml

+ 85 - 0
zhch/processor_configs.yaml

@@ -0,0 +1,85 @@
+# ============================================================================
+# PDF 批量处理器配置文件
+# ============================================================================
+
+# 处理器定义
+processors:
+  # PaddleOCR-VL processor: OCR backed by a vision-language model.
+  #   script        - script file name for this processor
+  #   input_arg     - flag name for the input file
+  #   output_arg    - flag name for the output directory
+  #   extra_args    - additional flags (presumably appended to the script
+  #                   invocation; confirm against the batch runner)
+  #   output_subdir - sub-directory name for this processor's results
+  #   description   - human-readable label
+  paddleocr_vl_single_process:
+    script: 'paddleocr_vl_single_process.py'
+    input_arg: '--input_file'
+    output_arg: '--output_dir'
+    extra_args:
+      - '--pipeline=./my_config/PaddleOCR-VL-Client-RT-DETR-H_layout_17cls.yaml'
+      - '--no-adapter'
+    output_subdir: 'paddleocr_vl_results'
+    description: 'PaddleOCR-VL 处理器 - 视觉语言模型OCR'
+
+  # PP-StructureV3 local processor: document structure analysis run
+  # locally (GPU or CPU). No --device flag is set in extra_args here, so
+  # the device choice is left to the script (presumably its default;
+  # confirm against ppstructurev3_single_process.py).
+  ppstructurev3_single_process:
+    script: 'ppstructurev3_single_process.py'
+    input_arg: '--input_file'
+    output_arg: '--output_dir'
+    extra_args:
+      - '--pipeline=./my_config/PP-StructureV3.yaml'
+    output_subdir: 'ppstructurev3_results'
+    description: 'PP-StructureV3 处理器 - 本地处理'
+
+  # PP-StructureV3 GPU processor: same script and pipeline file as
+  # ppstructurev3_single_process, with --device=gpu passed explicitly and
+  # results written to a separate sub-directory.
+  ppstructurev3_gpu:
+    script: 'ppstructurev3_single_process.py'
+    input_arg: '--input_file'
+    output_arg: '--output_dir'
+    extra_args:
+      - '--pipeline=./my_config/PP-StructureV3.yaml'
+      - '--device=gpu'
+    output_subdir: 'ppstructurev3_gpu_results'
+    description: 'PP-StructureV3 处理器 - GPU加速'
+
+  # PP-StructureV3 CPU processor: same script and pipeline file as
+  # ppstructurev3_single_process, with --device=cpu passed explicitly and
+  # results written to a separate sub-directory.
+  ppstructurev3_cpu:
+    script: 'ppstructurev3_single_process.py'
+    input_arg: '--input_file'
+    output_arg: '--output_dir'
+    extra_args:
+      - '--pipeline=./my_config/PP-StructureV3.yaml'
+      - '--device=cpu'
+    output_subdir: 'ppstructurev3_cpu_results'
+    description: 'PP-StructureV3 处理器 - CPU处理'
+
+  # PP-StructureV3 API client (labelled "default" by the author): calls a
+  # remote service over HTTP instead of processing locally.
+  # NOTE(review): nothing in this file marks this entry as the default;
+  # presumably the batch runner selects it. Confirm against the caller.
+  # NOTE(review): --api_url is a hard-coded private address; confirm it
+  # is reachable from every environment that uses this config.
+  # NOTE(review): --timeout unit is presumably seconds; confirm in the
+  # client script.
+  ppstructurev3_single_client:
+    script: 'ppstructurev3_single_client.py'
+    input_arg: '--input_file'
+    output_arg: '--output_dir'
+    extra_args:
+      - '--api_url=http://10.192.72.11:8111/layout-parsing'
+      - '--timeout=300'
+    output_subdir: 'ppstructurev3_client_results'
+    description: 'PP-StructureV3 HTTP API 客户端 - 远程服务'
+
+# ============================================================================
+# Global settings
+# ============================================================================
+global:
+  # Base directory that holds the PDF files.
+  # NOTE(review): absolute, user-specific path; confirm it is meant to be
+  # committed rather than supplied per environment.
+  base_dir: '/Users/zhch158/workspace/data/流水分析'
+
+  # Default output sub-directory name, used when a processor does not
+  # specify its own output_subdir.
+  output_subdir: 'results'
+