|
|
@@ -0,0 +1,86 @@
|
|
|
+# Bank transaction statement scene configuration (enhanced version)
|
|
|
+scene_name: "bank_statement"  # name of this scene
|
|
|
+description: "银行交易流水、对账单等场景"  # i.e. "bank transaction statements, account statements, and similar scenes"
|
|
|
+
|
|
|
+input:  # input file settings
|
|
|
+ supported_formats: [".pdf", ".png", ".jpg"]  # file extensions listed as supported
|
|
|
+ dpi: 200  # presumably the resolution used when rendering PDF pages — TODO confirm
|
|
|
+
|
|
|
+preprocessor:  # preprocessing settings; NOTE(review): indentation is flattened in this view, so nesting below is inferred from key order
|
|
|
+ module: "mineru"
|
|
|
+ orientation_classifier:  # orientation classification step
|
|
|
+ enabled: true
|
|
|
+ model_name: "paddle_orientation_classification"
|
|
|
+ model_dir: null # use the default path
|
|
|
+ unwarping:  # unwarping step; the `enabled: false` that follows presumably belongs here — confirm
|
|
|
+ enabled: false
|
|
|
+
|
|
|
+layout_detection:  # layout detection settings; the active module is "mineru", the "paddle" keys below are commented out
|
|
|
+ # module: "paddle"
|
|
|
+ # model_name: "RT-DETR-H_layout_17cls"
|
|
|
+ # model_dir: /Users/zhch158/workspace/repository.git/PaddleX/zhch/unified_pytorch_models/Layout/RT-DETR-H_layout_17cls.onnx # use the default path, or specify: "./Layout/RT-DETR-H_layout_17cls.onnx"
|
|
|
+ module: "mineru"
|
|
|
+ model_name: "layout"
|
|
|
+ model_dir: null # use the default path
|
|
|
+ device: "cpu"
|
|
|
+ # batch_size: 4
|
|
|
+ # conf: 0.1
|
|
|
+ # iou: 0.45
|
|
|
+
|
|
|
+vl_recognition:  # VL recognition settings; uses the "http-client" backend against server_url
|
|
|
+ module: "paddle"
|
|
|
+ backend: "http-client"
|
|
|
+ model_name: "PaddleOCR-VL-0.9B"
|
|
|
+ server_url: "http://10.192.72.11:8110"  # NOTE(review): hard-coded private-network address — consider making it environment-specific
|
|
|
+ max_image_size: 4096 # added: maximum image size
|
|
|
+ resize_mode: 'max' # added: resize mode ('max' keeps the aspect ratio, 'fixed' uses a fixed size)
|
|
|
+ device: "cpu"
|
|
|
+ batch_size: 1
|
|
|
+ model_params:  # the two keys that follow presumably nest under this one — confirm
|
|
|
+ max_concurrency: 10
|
|
|
+ http_timeout: 600  # units are not shown here (presumably seconds) — TODO confirm
|
|
|
+
|
|
|
+ # scene-specific settings
|
|
|
+ table_recognition:
|
|
|
+ return_cells_coordinate: true
|
|
|
+ bank_statement_mode: true
|
|
|
+
|
|
|
+ocr_recognition:  # OCR settings; the module is "mineru"
|
|
|
+ module: "mineru"
|
|
|
+ language: "ch"
|
|
|
+ det_threshold: 0.3  # presumably the text-detection score threshold — TODO confirm
|
|
|
+ unclip_ratio: 1.8
|
|
|
+ batch_size: 8
|
|
|
+ device: "cpu"
|
|
|
+
|
|
|
+output:  # output toggles; every option listed below is set to true
|
|
|
+ save_json: true
|
|
|
+ save_markdown: true
|
|
|
+ save_html: true
|
|
|
+ save_layout_image: true
|
|
|
+ save_ocr_image: true
|
|
|
+ draw_type_label: true
|
|
|
+ draw_bbox_number: true
|
|
|
+
|
|
|
+# Scene-specific configuration
|
|
|
+scene_config:
|
|
|
+ bank_statement:
|
|
|
+ table_structure: "single_column_list"
|
|
|
+ merged_cells: false
|
|
|
+ expected_columns: ["日期", "摘要", "收入", "支出", "余额"]  # i.e. date, summary, income, expense, balance
|
|
|
+ amount_validation: true
|
|
|
+ date_validation: true
|
|
|
+
|
|
|
+ processing_rules:  # NOTE(review): indentation is flattened in this view; the nesting level of this key is not visible — confirm
|
|
|
+ # table processing rules
|
|
|
+ table_rules:  # written as a list of single-key mappings
|
|
|
+ - detect_table_type: ["wired", "wireless"]
|
|
|
+ - extract_header_automatically: true
|
|
|
+ - validate_amount_format: true
|
|
|
+ - merge_continuation_rows: true
|
|
|
+
|
|
|
+ # OCR post-processing rules
|
|
|
+ ocr_rules:  # written as a list of single-key mappings
|
|
|
+ - filter_low_confidence: 0.7  # presumably a minimum confidence cutoff — TODO confirm
|
|
|
+ - merge_adjacent_text: true
|
|
|
+ - number_format_normalization: true
|