# bank_statement_enhanced.yaml (2.1 KB)
  1. # 银行交易流水场景配置(增强版)
  2. scene_name: "bank_statement"
  3. description: "银行交易流水、对账单等场景"
  4. input:
  5. supported_formats: [".pdf", ".png", ".jpg"]
  6. dpi: 200
  7. preprocessor:
  8. module: "mineru"
  9. orientation_classifier:
  10. enabled: true
  11. model_name: "paddle_orientation_classification"
  12. model_dir: null # 使用默认路径
  13. unwarping:
  14. enabled: false
  15. layout_detection:
  16. module: "paddle"
  17. model_name: "RT-DETR-H_layout_17cls"
  18. model_dir: /Users/zhch158/workspace/repository.git/PaddleX/zhch/unified_pytorch_models/Layout/RT-DETR-H_layout_17cls.onnx # 使用默认路径,或指定: "./Layout/RT-DETR-H_layout_17cls.onnx"
  19. device: "cpu"
  20. # batch_size: 4
  21. # conf: 0.1
  22. # iou: 0.45
  23. vl_recognition:
  24. module: "mineru"
  25. backend: "http-client"
  26. server_url: "http://10.192.72.11:8121"
  27. max_image_size: 4096 # 🔧 添加:最大图片尺寸
  28. resize_mode: 'max' # 🔧 添加:缩放模式 ('max' 保持宽高比, 'fixed' 固定尺寸)
  29. device: "cpu"
  30. batch_size: 1
  31. model_params:
  32. max_concurrency: 10
  33. http_timeout: 600
  34. # 场景特定配置
  35. table_recognition:
  36. return_cells_coordinate: true
  37. bank_statement_mode: true
  38. ocr_recognition:
  39. module: "mineru"
  40. language: "ch"
  41. det_threshold: 0.3
  42. unclip_ratio: 1.8
  43. batch_size: 8
  44. device: "cpu"
  45. output:
  46. save_json: true
  47. save_markdown: true
  48. save_html: true
  49. save_layout_image: true
  50. save_ocr_image: true
  51. draw_type_label: true
  52. draw_bbox_number: true
  53. # 场景特定配置
  54. scene_config:
  55. bank_statement:
  56. table_structure: "single_column_list"
  57. merged_cells: false
  58. expected_columns: ["日期", "摘要", "收入", "支出", "余额"]
  59. amount_validation: true
  60. date_validation: true
  61. processing_rules:
  62. # 表格处理规则
  63. table_rules:
  64. - detect_table_type: ["wired", "wireless"]
  65. - extract_header_automatically: true
  66. - validate_amount_format: true
  67. - merge_continuation_rows: true
  68. # OCR后处理规则
  69. ocr_rules:
  70. - filter_low_confidence: 0.7
  71. - merge_adjacent_text: true
  72. - number_format_normalization: true