Browse Source

feat: 添加多个OCR工具配置文件,支持不同文档的OCR结果管理

zhch158_admin 1 tuần trước cách đây
mục cha
commit
21757ecf65

+ 48 - 0
config/A用户_单元格扫描流水.yaml

@@ -0,0 +1,48 @@
+document:  # Per-document settings: display name, data root, and the OCR result sets listed for it
+  name: "A用户_单元格扫描流水"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水"  # NOTE(review): absolute, machine-specific path
+
+  # 🎯 Key improvement: declare the OCR tools used for this document and their result directories
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"  # presumably relative to base_dir - TODO confirm
+      image_dir: "ppstructurev3_client_results/{{name}}"  # "{{name}}" looks like a placeholder, presumably the document name - TODO confirm
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (with cell bbox)
+    - tool: "mineru"  # results use the same format as MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"  # points at paddleocr_vl_results, not the _cell_bbox directory
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (with cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"  # points at mineru_vllm_results, not the _cell_bbox directory
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true
+  

+ 47 - 0
config/B用户_扫描流水.yaml

@@ -0,0 +1,47 @@
+document:  # Per-document settings: display name, data root, and the OCR result sets listed for it
+  name: "B用户_扫描流水"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水"  # NOTE(review): absolute, machine-specific path
+  
+  # 🎯 Key improvement: declare the OCR tools used for this document and their result directories
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"  # presumably relative to base_dir - TODO confirm
+      image_dir: "ppstructurev3_client_results/{{name}}"  # "{{name}}" looks like a placeholder, presumably the document name - TODO confirm
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (with cell bbox)
+    - tool: "mineru"  # results use the same format as MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"  # points at paddleocr_vl_results, not the _cell_bbox directory
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (with cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"  # points at mineru_vllm_results, not the _cell_bbox directory
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true

+ 153 - 0
config/global.yaml

@@ -0,0 +1,153 @@
+# Configuration file for the OCR validation tool
+
+# Style settings
+styles:
+  font_size: 8
+  
+  colors:
+    primary: "#0288d1"
+    secondary: "#ff9800"  # same value as `warning` below
+    success: "#4caf50"
+    error: "#f44336"
+    warning: "#ff9800"
+    background: "#fafafa"
+    text: "#333333"
+  
+  layout:
+    default_zoom: 1.0
+    default_height: 800  # unit not stated here; presumably pixels - TODO confirm
+    sidebar_width: 1  # NOTE(review): scale differs from content_width (integer vs fraction) - confirm units
+    content_width: 0.65
+
+# UI settings (the keys below look like Streamlit page-config options - TODO confirm against the consumer)
+ui:
+  page_title: "OCR可视化校验工具"
+  page_icon: "🔍"
+  layout: "wide"
+  sidebar_state: "expanded"
+  
+# OCR data settings
+ocr:
+  min_text_length: 2
+  default_confidence: 1.0
+  exclude_texts: ["Picture", ""]
+  
+  # Image orientation detection settings
+  orientation_detection:
+    enabled: true
+    confidence_threshold: 0.3  # confidence threshold
+    methods: ["opencv_analysis"]  # detection methods
+    cache_results: true  # cache detection results
+  
+  # OCR tool type settings. NOTE(review): the document configs in this commit also use tool "paddleocr_vl", which has no entry here - confirm how it is resolved
+  tools:
+    dots_ocr:
+      name: "Dots OCR"
+      description: "专业VLM OCR"
+      json_structure: "array"  # JSON is an array
+      text_field: "text"
+      bbox_field: "bbox"
+      category_field: "category"
+      confidence_field: "confidence"
+      # Rotation handling
+      rotation:
+        coordinates_are_pre_rotated: false  # coordinates are not pre-rotated
+        
+    ppstructv3:
+      name: "PPStructV3"
+      description: "PaddleOCR PP-StructureV3"
+      json_structure: "object"  # JSON is an object
+      parsing_results_field: "parsing_res_list"
+      text_field: "block_content"
+      bbox_field: "block_bbox"
+      rec_texts_field: "overall_ocr_res.rec_texts" # for text blocks inside tables
+      rec_boxes_field: "overall_ocr_res.rec_boxes" # for text blocks inside tables
+      category_field: "block_label"
+      confidence_field: "confidence"
+      # Rotation handling
+      rotation:
+        coordinates_are_pre_rotated: true  # coordinates are already pre-rotated
+      
+    table_recognition_v2:
+      name: "TableRecognitionV2"
+      description: "PaddleOCR Table Recognition V2"
+      json_structure: "object"
+      parsing_results_field: "table_res_list"
+      text_field: "pred_html"
+      bbox_field: "cell_box_list"            # the earlier "cell_box_listox" was a typo
+      rec_texts_field: "table_ocr_pred.rec_texts" # for text blocks inside tables
+      rec_boxes_field: "table_ocr_pred.rec_boxes" # for text blocks inside tables
+      category_field: "type"
+      confidence_field: "confidence"
+      rotation:
+        coordinates_are_pre_rotated: true
+    
+    mineru:
+      name: "MinerU"
+      description: "MinerU OCR"
+      json_structure: "array"  # JSON is an array
+      text_field: "text"
+      bbox_field: "bbox"
+      category_field: "type"
+      confidence_field: "confidence"
+      # Table-related fields
+      table_body_field: "table_body"
+      table_cells_field: "table_cells"
+      img_path_field: "img_path"
+      # Rotation handling
+      rotation:
+        coordinates_are_pre_rotated: false
+  
+  # Rules for auto-detecting the tool type (listed from highest to lowest priority). NOTE(review): the priority values below (4, 2, 1, 3) do not follow the labelled order - confirm which is intended
+  auto_detection:
+    enabled: true
+    rules:
+      # Table Recognition V2 - highest priority
+      - tool_type: "table_recognition_v2"
+        conditions:
+          - type: "field_exists"
+            field: "table_res_list"
+          - type: "field_not_exists"
+            field: "parsing_res_list"
+        priority: 4
+      
+      # PPStructV3 - second priority
+      - tool_type: "ppstructv3"
+        conditions:
+          - type: "field_exists"
+            field: "parsing_res_list"
+          - type: "field_exists"
+            field: "doc_preprocessor_res"
+        priority: 2
+      
+      # MinerU - third priority
+      - tool_type: "mineru"
+        conditions:
+          - type: "field_exists"
+            field: "page_idx"
+          - type: "field_exists"
+            field: "type"
+          - type: "json_structure"
+            structure: "array"
+        priority: 1
+      
+      # Dots OCR - lowest priority (default)
+      - tool_type: "dots_ocr"
+        conditions:
+          - type: "json_structure"
+            structure: "array"
+          - type: "field_exists"
+            field: "category"
+        priority: 3  # NOTE(review): labelled lowest, yet 3 is numerically greater than ppstructv3 (2) and mineru (1)
+
+# Output path for pre-validation result files
+pre_validation:
+  out_dir: "./output/pre_validation/"  # relative path; the base it is resolved against is not shown here - TODO confirm
+
+data_sources:  # file names of the per-document configs added in this commit; presumably resolved relative to this config directory - TODO confirm
+  - 德_内蒙古银行照.yaml
+  - 对公_招商银行图.yaml
+  - A用户_单元格扫描流水.yaml
+  - B用户_扫描流水.yaml
+  - 至远彩色_2023年报.yaml
+

+ 47 - 0
config/对公_招商银行图.yaml

@@ -0,0 +1,47 @@
+document:  # Per-document settings: display name, data root, and the OCR result sets listed for it
+  name: "对公_招商银行图"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/对公_招商银行图"  # NOTE(review): absolute, machine-specific path
+  
+  # 🎯 Key improvement: declare the OCR tools used for this document and their result directories
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"  # presumably relative to base_dir - TODO confirm
+      image_dir: "ppstructurev3_client_results/{{name}}"  # "{{name}}" looks like a placeholder, presumably the document name - TODO confirm
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (with cell bbox)
+    - tool: "mineru"  # results use the same format as MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"  # points at paddleocr_vl_results, not the _cell_bbox directory
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (with cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"  # points at mineru_vllm_results, not the _cell_bbox directory
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true

+ 48 - 0
config/德_内蒙古银行照.yaml

@@ -0,0 +1,48 @@
+# Document: 德_内蒙古银行照
+document:  # Per-document settings: display name, data root, and the OCR result sets listed for it
+  name: "德_内蒙古银行照"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照"  # NOTE(review): absolute, machine-specific path
+  
+  # 🎯 Key improvement: declare the OCR tools used for this document and their result directories
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"  # presumably relative to base_dir - TODO confirm
+      image_dir: "ppstructurev3_client_results/{{name}}"  # "{{name}}" looks like a placeholder, presumably the document name - TODO confirm
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (with cell bbox)
+    - tool: "mineru"  # results use the same format as MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"  # points at paddleocr_vl_results, not the _cell_bbox directory
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (with cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"  # points at mineru_vllm_results, not the _cell_bbox directory
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true

+ 47 - 0
config/至远彩色_2023年报.yaml

@@ -0,0 +1,47 @@
+document:  # Per-document settings: display name, data root, and the OCR result sets listed for it
+  name: "至远彩色_2023年报"
+  base_dir: "/Users/zhch158/workspace/data/流水分析/至远彩色_2023年报"  # NOTE(review): absolute, machine-specific path
+  
+  # 🎯 Key improvement: declare the OCR tools used for this document and their result directories
+  ocr_results:
+    # PPStructV3
+    - tool: "ppstructv3"
+      result_dir: "ppstructurev3_client_results"  # presumably relative to base_dir - TODO confirm
+      image_dir: "ppstructurev3_client_results/{{name}}"  # "{{name}}" looks like a placeholder, presumably the document name - TODO confirm
+      description: "PPStructV3 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL
+    - tool: "paddleocr_vl"
+      result_dir: "paddleocr_vl_results"
+      image_dir: "paddleocr_vl_results/{{name}}"
+      description: "PaddleOCR VLM 图片合成结果"
+      enabled: true
+    
+    # PaddleOCR-VL (with cell bbox)
+    - tool: "mineru"  # results use the same format as MinerU
+      result_dir: "paddleocr_vl_results_cell_bbox"
+      image_dir: "paddleocr_vl_results/{{name}}"  # points at paddleocr_vl_results, not the _cell_bbox directory
+      description: "PaddleOCR VLM + PaddleOCR 坐标"
+      enabled: true
+    
+    # MinerU
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results"
+      image_dir: "mineru_vllm_results/{{name}}"
+      description: "MinerU 图片合成结果"
+      enabled: true
+    
+    # MinerU (with cell bbox)
+    - tool: "mineru"
+      result_dir: "mineru_vllm_results_cell_bbox"
+      image_dir: "mineru_vllm_results/{{name}}"  # points at mineru_vllm_results, not the _cell_bbox directory
+      description: "MinerU + PaddleOCR 坐标"
+      enabled: true
+    
+    # DotsOCR
+    - tool: "dots_ocr"
+      result_dir: "dotsocr_vllm_results"
+      image_dir: "dotsocr_vllm_results/{{name}}"
+      description: "Dots OCR 图片合成结果"
+      enabled: true