---
# Bank transaction statement scene configuration, v2.
# Supports the full processing pipeline:
# PDF classification → orientation detection → layout detection →
# parallel OCR/VLM processing → coordinate matching.
scene_name: "bank_statement"
description: "银行交易流水、对账单等场景 - 增强版"
# ============================================================
# Input configuration
# ============================================================
input:
  supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
  dpi: 200  # DPI used when rendering PDF pages to images
# ============================================================
# Preprocessing configuration (orientation detection)
# ============================================================
preprocessor:
  module: "mineru"
  orientation_classifier:
    # Auto-enabled for scanned documents, auto-skipped for digital PDFs.
    enabled: true
    model_name: "paddle_orientation_classification"
    model_dir: null  # use the default path
  unwarping:
    enabled: false  # image rectification (optional)
# ============================================================
# Layout detection configuration
# ============================================================
layout_detection:
  module: "mineru"
  model_name: "layout"
  # null = use the default path; the weights file
  # doclayout_yolo_docstructbench_imgsz1280_2501.pt is downloaded
  # automatically.
  model_dir: null
  device: "cpu"  # options: "cpu", "cuda", "mps"
  # Optional tunables, left commented out:
  # batch_size: 4
  # conf: 0.25
  # iou: 0.45
# ============================================================
# VL recognition configuration (tables, formulas)
# ============================================================
vl_recognition:
  # Options: "mineru" (MinerU VLM) or "paddle" (PaddleOCR-VL)
  module: "mineru"

  # Backend configuration
  backend: "http-client"  # options: "http-client", "vllm-engine", "transformers"
  server_url: "http://10.192.72.11:8121"  # MinerU VLM service address

  # Image size limit (avoids exceeding the sequence length)
  max_image_size: 4096
  resize_mode: 'max'  # 'max' keeps the aspect ratio, 'fixed' uses a fixed size

  device: "cpu"
  batch_size: 1

  model_params:
    max_concurrency: 10
    http_timeout: 600

  # Table-recognition specific configuration
  # NOTE(review): nesting under vl_recognition is reconstructed from the
  # section layout — confirm against the config consumer.
  table_recognition:
    return_cells_coordinate: true  # return cell coordinates
    bank_statement_mode: true  # bank-statement optimised mode
# ============================================================
# OCR recognition configuration (text detection + recognition)
# ============================================================
ocr_recognition:
  module: "mineru"
  language: "ch"  # language: ch, ch_lite, en, japan, etc.
  det_threshold: 0.3  # detection threshold
  unclip_ratio: 1.8  # text-box expansion ratio
  batch_size: 8
  device: "cpu"
# ============================================================
# Output configuration
# ============================================================
output:
  # Basic output
  save_json: true  # save middle.json (MinerU standard format)
  save_markdown: true  # save the Markdown file
  save_html: true  # save table HTML files

  # Debug output (enabled with the --debug command-line flag)
  save_layout_image: false  # save the layout visualisation image
  save_ocr_image: false  # save the OCR visualisation image
  draw_type_label: true  # draw the type label on visualisation images
  draw_bbox_number: true  # draw the box index on visualisation images

  # Enhanced output
  save_enhanced_json: true  # save enhanced JSON (includes cell coordinates)
  coordinate_precision: 2  # coordinate precision (decimal places)
# ============================================================
# Scene-specific configuration
# ============================================================
scene_config:
  bank_statement:
    # Table structure characteristics
    table_structure: "single_column_list"  # single-column list layout
    merged_cells: false  # no merged cells

    # Expected column names (used for validation)
    expected_columns: ["日期", "摘要", "收入", "支出", "余额"]

    # Validation rules
    amount_validation: true  # amount format validation
    date_validation: true  # date format validation
    balance_validation: true  # balance consistency validation

  # NOTE(review): the original indentation was lost; processing_rules is
  # placed as a sibling of bank_statement. It may instead belong inside
  # bank_statement — confirm against the config consumer.
  processing_rules:
    # Table processing rules
    table_rules:
      - detect_table_type: ["wired", "wireless"]  # detect wired/wireless tables
      - extract_header_automatically: true  # extract the header automatically
      - validate_amount_format: true  # validate the amount format
      - merge_continuation_rows: true  # merge continuation rows

    # OCR post-processing rules
    ocr_rules:
      - filter_low_confidence: 0.7  # filter out low-confidence results
      - merge_adjacent_text: true  # merge adjacent text
      - number_format_normalization: true  # normalise number formats
# ============================================================
# Cross-page table merge configuration
# ============================================================
cross_page_merge:
  enabled: true
  # Conditions used to decide whether a table continues across pages
  conditions:
    - table_at_page_bottom: true  # table sits at the bottom of the page
    - table_at_page_top: true  # next page's table sits at the top
    - similar_column_count: true  # similar column counts
    # Header match (tables continued across pages usually have no repeated
    # header).
    - header_match: false