2 commit-uri 62190e9d59 ... 88905951a0

Autor SHA1 Mesaj Data
  zhch158_admin 88905951a0 feat: 添加康强_北京农村商业银行的OCR配置文件及数据源 1 săptămână în urmă
  zhch158_admin c7cd1d7fe4 feat: 更新PDF列表并移除PP-StructureV3处理器的冗余配置 1 săptămână în urmă

+ 1 - 0
batch_ocr/pdf_list.txt

@@ -2,4 +2,5 @@
 对公_招商银行图.pdf
 A用户_单元格扫描流水.pdf
 B用户_扫描流水.pdf
+康强_北京农村商业银行.pdf
 2023年度报告母公司.pdf

+ 0 - 5
batch_ocr/processor_configs.yaml

@@ -41,11 +41,6 @@ processors:
     output_arg: "--output_dir"
     extra_args:
       - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
-    input_arg: "--input_file"
-    output_arg: "--output_dir"
-    extra_args:
-      - "--pipeline=/home/ubuntu/zhch/PaddleX/zhch/my_config/PP-StructureV3.yaml"
-      - "--device=gpu"
     output_subdir: "ppstructurev3_gpu_results"
     log_subdir: "logs/ppstructurev3_gpu"
     venv: "conda activate paddle"

+ 1 - 0
config/global.yaml

@@ -149,5 +149,6 @@ data_sources:
   - 对公_招商银行图.yaml
   - A用户_单元格扫描流水.yaml
   - B用户_扫描流水.yaml
+  - 康强_北京农村商业银行.yaml
   - 至远彩色_2023年报.yaml
 

+ 55 - 0
config/康强_北京农村商业银行.yaml

@@ -0,0 +1,55 @@
+# 文档: 康强_北京农村商业银行
+document:
+  name: "康强_北京农村商业银行"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行"
+  
+  # 🎯 关键改进:定义该文档使用的 OCR 工具及其结果目录
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"
+      image_dir: "ppstructurev3_client_results/{{name}}"
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (带 cell bbox)
+    - tool: "mineru"  # 格式同 MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true
+  
+    # DotsOCR (带 cell bbox)
+    - tool: "mineru"
+      result_dir: "dotsocr_vllm_results_cell_bbox"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR + PaddleOCR 坐标"
+      enabled: true