# bank_statement_v2.yaml
  1. # 银行交易流水场景配置 v2
  2. # 支持完整的处理流程:PDF分类 → 方向识别 → Layout检测 → OCR/VLM并行处理 → 坐标匹配
  3. scene_name: "bank_statement"
  4. description: "银行交易流水、对账单等场景 - 增强版"
  5. # ============================================================
  6. # 输入配置
  7. # ============================================================
  8. input:
  9. supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
  10. dpi: 200 # PDF转图片的DPI
  11. # ============================================================
  12. # 预处理配置(方向识别)
  13. # ============================================================
  14. preprocessor:
  15. module: "mineru"
  16. orientation_classifier:
  17. enabled: true # 扫描件自动开启,数字PDF自动跳过
  18. model_name: "paddle_orientation_classification"
  19. model_dir: null # 使用默认路径
  20. unwarping:
  21. enabled: false # 图像矫正(可选)
  22. # ============================================================
  23. # 版式检测配置
  24. # ============================================================
  25. layout_detection:
  26. module: "mineru"
  27. model_name: "layout"
  28. model_dir: null # 使用默认路径,自动下载 doclayout_yolo_docstructbench_imgsz1280_2501.pt
  29. device: "cpu" # 可选: "cpu", "cuda", "mps"
  30. # batch_size: 4
  31. # conf: 0.25
  32. # iou: 0.45
  33. # ============================================================
  34. # VL识别配置(表格、公式)
  35. # ============================================================
  36. vl_recognition:
  37. # 可选: "mineru" (MinerU VLM) 或 "paddle" (PaddleOCR-VL)
  38. module: "mineru"
  39. # 后端配置
  40. backend: "http-client" # 可选: "http-client", "vllm-engine", "transformers"
  41. server_url: "http://10.192.72.11:8121" # MinerU VLM 服务地址
  42. # 图片尺寸限制(避免序列长度超限)
  43. max_image_size: 4096
  44. resize_mode: 'max' # 'max' 保持宽高比, 'fixed' 固定尺寸
  45. device: "cpu"
  46. batch_size: 1
  47. model_params:
  48. max_concurrency: 10
  49. http_timeout: 600
  50. # 表格识别特定配置
  51. table_recognition:
  52. return_cells_coordinate: true # 返回单元格坐标
  53. bank_statement_mode: true # 银行流水优化模式
  54. # ============================================================
  55. # OCR识别配置(文本检测+识别)
  56. # ============================================================
  57. ocr_recognition:
  58. module: "mineru"
  59. language: "ch" # 语言: ch, ch_lite, en, japan 等
  60. det_threshold: 0.3 # 检测阈值
  61. unclip_ratio: 1.8 # 文本框扩展比例
  62. batch_size: 8
  63. device: "cpu"
  64. # ============================================================
  65. # 输出配置
  66. # ============================================================
  67. output:
  68. # 基础输出
  69. save_json: true # 保存 middle.json(MinerU标准格式)
  70. save_markdown: true # 保存 Markdown 文件
  71. save_html: true # 保存表格 HTML 文件
  72. # Debug 输出(通过命令行 --debug 开启)
  73. save_layout_image: false # 保存 layout 可视化图片
  74. save_ocr_image: false # 保存 OCR 可视化图片
  75. draw_type_label: true # 在可视化图片上标注类型
  76. draw_bbox_number: true # 在可视化图片上标注序号
  77. # 增强输出
  78. save_enhanced_json: true # 保存增强版 JSON(包含单元格坐标)
  79. coordinate_precision: 2 # 坐标精度(小数位数)
  80. # ============================================================
  81. # 场景特定配置
  82. # ============================================================
  83. scene_config:
  84. bank_statement:
  85. # 表格结构特征
  86. table_structure: "single_column_list" # 单栏列表形式
  87. merged_cells: false # 无合并单元格
  88. # 预期列名(用于验证)
  89. expected_columns: ["日期", "摘要", "收入", "支出", "余额"]
  90. # 验证规则
  91. amount_validation: true # 金额格式验证
  92. date_validation: true # 日期格式验证
  93. balance_validation: true # 余额一致性验证
  94. processing_rules:
  95. # 表格处理规则
  96. table_rules:
  97. - detect_table_type: ["wired", "wireless"] # 检测有线/无线表格
  98. - extract_header_automatically: true # 自动提取表头
  99. - validate_amount_format: true # 验证金额格式
  100. - merge_continuation_rows: true # 合并续行
  101. # OCR后处理规则
  102. ocr_rules:
  103. - filter_low_confidence: 0.7 # 过滤低置信度结果
  104. - merge_adjacent_text: true # 合并相邻文本
  105. - number_format_normalization: true # 数字格式标准化
  106. # ============================================================
  107. # 跨页表格合并配置
  108. # ============================================================
  109. cross_page_merge:
  110. enabled: true
  111. # 判断表格是否跨页的条件
  112. conditions:
  113. - table_at_page_bottom: true # 表格位于页面底部
  114. - table_at_page_top: true # 下一页表格位于顶部
  115. - similar_column_count: true # 列数相似
  116. - header_match: false # 表头匹配(跨页表格通常没有重复表头)