|
|
@@ -36,12 +36,42 @@ ui:
|
|
|
|
|
|
# 文件路径配置
|
|
|
paths:
|
|
|
- output_dir: "output"
|
|
|
- sample_data_dir: "./sample_data"
|
|
|
+ ocr_out_dir: "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results"
|
|
|
+ src_img_dir: "/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司"
|
|
|
supported_image_formats: [".png", ".jpg", ".jpeg"]
|
|
|
|
|
|
# OCR数据配置
|
|
|
ocr:
|
|
|
min_text_length: 2
|
|
|
default_confidence: 1.0
|
|
|
- exclude_texts: ["Picture", ""]
|
|
|
+ exclude_texts: ["Picture", ""]
|
|
|
+
|
|
|
+ # OCR工具类型配置
|
|
|
+ tools:
|
|
|
+ dots_ocr:
|
|
|
+ name: "Dots OCR"
|
|
|
+ description: "专业VLM OCR"
|
|
|
+ json_structure: "array" # JSON为数组格式
|
|
|
+ text_field: "text"
|
|
|
+ bbox_field: "bbox"
|
|
|
+ category_field: "category"
|
|
|
+ confidence_field: "confidence"
|
|
|
+
|
|
|
+ ppstructv3:
|
|
|
+ name: "PPStructV3"
|
|
|
+ description: "PaddleOCR PP-StructureV3"
|
|
|
+ json_structure: "object" # JSON为对象格式
|
|
|
+ parsing_results_field: "parsing_res_list"
|
|
|
+ text_field: "block_content"
|
|
|
+ bbox_field: "block_bbox"
|
|
|
+ category_field: "block_label"
|
|
|
+ confidence_field: "confidence"
|
|
|
+
|
|
|
+ # 自动检测工具类型的规则
|
|
|
+ auto_detection:
|
|
|
+ enabled: true
|
|
|
+ rules:
|
|
|
+ - field_exists: "parsing_res_list" # 如果存在该字段，判断为ppstructv3
|
|
|
+ tool_type: "ppstructv3"
|
|
|
+ - json_is_array: true # 如果JSON是数组，判断为dots_ocr
|
|
|
+ tool_type: "dots_ocr"
|