---
# OCR validation tool configuration.
#
# NOTE(review): this file was recovered from a copy in which the indentation
# had been flattened and every line carried a list marker. The nesting below
# is reconstructed from the key names and section comments; confirm it against
# the code that loads this file. In particular, verify that `layout` belongs
# under `styles`, and that `orientation_detection`, `tools` and
# `auto_detection` belong under `ocr` rather than at the top level.

# Style settings
styles:
  font_size: 8

  # Hex colors must stay quoted: an unquoted `#` would start a YAML comment.
  colors:
    primary: "#0288d1"
    secondary: "#ff9800"
    success: "#4caf50"
    error: "#f44336"
    warning: "#ff9800"
    background: "#fafafa"
    text: "#333333"

  layout:
    default_zoom: 1.0
    default_height: 800
    sidebar_width: 1
    content_width: 0.65

# UI settings
ui:
  page_title: "OCR可视化校验工具"
  page_icon: "🔍"
  layout: "wide"
  sidebar_state: "expanded"

# File path settings
# NOTE(review): the absolute paths below point into one user's home directory
# and will not resolve on other machines. Also note that `ocr_out_dir` targets
# the DotsOCR results while `src_img_dir` targets a PPStructureV3 results
# sub-directory; confirm that this pairing is intentional.
paths:
  ocr_out_dir: "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_DotsOCR_Results"
  # ocr_out_dir: "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results"
  src_img_dir: "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司"
  supported_image_formats: [".png", ".jpg", ".jpeg"]
  # Directory for pre-validation result files
  pre_validation_dir: "./output/pre_validation/"

# OCR data settings
ocr:
  min_text_length: 2
  default_confidence: 1.0
  exclude_texts: ["Picture", ""]

  # Image orientation detection settings
  orientation_detection:
    enabled: true
    confidence_threshold: 0.3  # confidence threshold
    methods: ["opencv_analysis"]  # detection methods
    cache_results: true  # cache detection results

  # Per-tool OCR output format settings
  tools:
    dots_ocr:
      name: "Dots OCR"
      description: "专业VLM OCR"
      json_structure: "array"  # the JSON output is an array
      text_field: "text"
      bbox_field: "bbox"
      category_field: "category"
      confidence_field: "confidence"
      # Rotation handling
      rotation:
        coordinates_are_pre_rotated: false  # coordinates are not pre-rotated

    ppstructv3:
      name: "PPStructV3"
      description: "PaddleOCR PP-StructureV3"
      json_structure: "object"  # the JSON output is an object
      parsing_results_field: "parsing_res_list"
      text_field: "block_content"
      bbox_field: "block_bbox"
      category_field: "block_label"
      confidence_field: "confidence"
      # Rotation handling
      rotation:
        coordinates_are_pre_rotated: true  # coordinates are already pre-rotated

  # Rules for auto-detecting the tool type
  auto_detection:
    enabled: true
    rules:
      # If this field exists, classify the file as ppstructv3.
      - field_exists: "parsing_res_list"
        tool_type: "ppstructv3"
      # If the JSON is an array, classify the file as dots_ocr.
      - json_is_array: true
        tool_type: "dots_ocr"