4 months ago · 9292deaf3d
--- a/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_yusys_v4.yaml
@@ -0,0 +1,178 @@
 
				+# 银行交易流水场景配置 - V4版本
			
 
				+# Pipeline V3逻辑: 有线表格使用MinerU UNet, 无线表格/seal使用GLM-OCR VLM
			
 
				+scene_name: "bank_statement_yusys_v4"
			
 
				+
			
 
				+description: "银行流水V4: PP-DocLayoutV3 layout + PaddleOCR + MinerU UNet（有线表格）+ GLM-OCR VLM（无线表格/seal）"
			
 
				+
			
 
				+input:
			
 
				+  supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
			
 
				+  dpi: 200
			
 
				+
			
 
				+preprocessor:
			
 
				+  module: "mineru"
			
 
				+  orientation_classifier:
			
 
				+    enabled: true
			
 
				+    model_name: "paddle_orientation_classification"
			
 
				+    model_dir: null  # 使用默认路径
			
 
				+  unwarping:
			
 
				+    enabled: false
			
 
				+
			
 
				+# ============================================================
			
 
				+# Layout 检测配置 - 使用 PP-DocLayoutV3
			
 
				+# ============================================================
			
 
				+layout_detection:
			
 
				+  module: "paddle"
			
 
				+  model_name: "PP-DocLayoutV3"
			
 
				+  model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
			
 
				+  device: "cpu"
			
 
				+  conf: 0.3
			
 
				+  num_threads: 4
			
 
				+  batch_size: 1
			
 
				+  
			
 
				+  # 后处理配置
			
 
				+  post_process:
			
 
				+    # 将大面积文本块转换为表格（后处理）
			
 
				+    convert_large_text_to_table: true  # 是否启用
			
 
				+    min_text_area_ratio: 0.25         # 最小面积占比（25%）
			
 
				+    min_text_width_ratio: 0.4         # 最小宽度占比（40%）
			
 
				+    min_text_height_ratio: 0.3        # 最小高度占比（30%）
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: true               # 是否开启调试可视化输出
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码）
			
 
				+
			
 
				+# ============================================================
			
 
				+# OCR 识别配置
			
 
				+# ============================================================
			
 
				+ocr_recognition:
			
 
				+  module: "mineru"
			
 
				+  language: "ch"
			
 
				+  det_threshold: 0.5
			
 
				+  unclip_ratio: 1.5
			
 
				+  enable_merge_det_boxes: false
			
 
				+  batch_size: 8
			
 
				+  device: "cpu"
			
 
				+
			
 
				+# ============================================================
			
 
				+# 表格分类配置（自动区分有线/无线表格）
			
 
				+# ============================================================
			
 
				+table_classification:
			
 
				+  enabled: true               # 启用自动表格分类
			
 
				+  module: "paddle"            # 分类模型：paddle（MinerU PaddleTableClsModel）
			
 
				+  confidence_threshold: 0.5   # 分类置信度阈值
			
 
				+  batch_size: 16              # 批处理大小
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: true               # 是否开启调试可视化输出
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    save_table_lines: true       # 保存表格线可视化（unet横线/竖线叠加）
			
 
				+    image_format: "png"          # 可视化图片格式：png/jpg
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码/表格序号）
			
 
				+
			
 
				+# ============================================================
			
 
				+# 有线表格识别专用配置（MinerU UNet）
			
 
				+# ============================================================
			
 
				+table_recognition_wired:
			
 
				+  use_wired_unet: true
			
 
				+  upscale_ratio: 3.333
			
 
				+  need_ocr: true
			
 
				+  row_threshold: 10
			
 
				+  col_threshold: 15
			
 
				+  ocr_conf_threshold: 0.9       # 单元格 OCR 置信度阈值
			
 
				+  cell_crop_margin: 2
			
 
				+  use_custom_postprocess: true  # 是否使用自定义后处理（默认启用）
			
 
				+
			
 
				+  # 是否启用倾斜矫正
			
 
				+  enable_deskew: true
			
 
				+
			
 
				+  # 🆕 启用多源单元格融合
			
 
				+  use_cell_fusion: true
			
 
				+  
			
 
				+  # 融合引擎配置
			
 
				+  cell_fusion:
			
 
				+    # RT-DETR 模型路径（必需）
			
 
				+    rtdetr_model_path: "/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx"
			
 
				+    
			
 
				+    # 融合权重
			
 
				+    unet_weight: 0.6        # UNet 权重（结构性强）
			
 
				+    rtdetr_weight: 0.4      # RT-DETR 权重（鲁棒性强）
			
 
				+    
			
 
				+    # 阈值配置
			
 
				+    iou_merge_threshold: 0.7    # 高IoU合并阈值（>0.7则加权平均）
			
 
				+    iou_nms_threshold: 0.5      # NMS去重阈值
			
 
				+    rtdetr_conf_threshold: 0.5  # RT-DETR置信度阈值
			
 
				+    
			
 
				+    # 功能开关
			
 
				+    enable_ocr_compensation: true      # 启用OCR边缘补偿
			
 
				+
			
 
				+  # Debug 可视化配置
			
 
				+  debug_options:
			
 
				+    enabled: true               # 是否开启调试可视化输出
			
 
				+    output_dir: null             # 调试输出目录；null不输出
			
 
				+    save_table_lines: true       # 保存表格线可视化（unet横线/竖线叠加）
			
 
				+    save_connected_components: true  # 保存连通域提取的单元格图
			
 
				+    save_grid_structure: true    # 保存逻辑网格结构（row/col/rowspan/colspan）
			
 
				+    save_text_overlay: true      # 保存文本填充覆盖图
			
 
				+    image_format: "png"          # 可视化图片格式：png/jpg
			
 
				+    prefix: ""                  # 保存文件名前缀（如设置为页码/表格序号）
			
 
				+
			
 
				+# ============================================================
			
 
				+# VL识别配置 - 使用 GLM-OCR（无线表格 + seal识别）
			
 
				+# ============================================================
			
 
				+vl_recognition:
			
 
				+  module: "glmocr"
			
 
				+  api_url: "http://10.192.72.11:20036/v1/chat/completions"
			
 
				+  api_key: null  # 可选，如需要可填写
			
 
				+  model: "glm-ocr"
			
 
				+  max_image_size: 3500  # GLM-OCR 推荐的最大图片尺寸
			
 
				+  resize_mode: 'max'    # 缩放模式: 'max' 保持宽高比, 'fixed' 固定尺寸
			
 
				+  verify_ssl: false
			
 
				+  
			
 
				+  # Task prompt mapping - 针对不同任务使用不同提示词
			
 
				+  task_prompt_mapping:
			
 
				+    text: "Text Recognition:"
			
 
				+    table: "Table Recognition:"
			
 
				+    formula: "Formula Recognition:"
			
 
				+    seal: "Seal Recognition:"  # 印章识别的专用提示词
			
 
				+  
			
 
				+  # 模型参数
			
 
				+  model_params:
			
 
				+    connection_pool_size: 128  # HTTP 连接池大小（应 >= max_workers）
			
 
				+    http_timeout: 300          # HTTP 请求超时时间（秒）
			
 
				+    connect_timeout: 30        # 连接超时时间（秒）
			
 
				+    retry_max_attempts: 2      # 最大重试次数
			
 
				+    retry_backoff_base_seconds: 0.5
			
 
				+    retry_backoff_max_seconds: 8.0
			
 
				+    retry_jitter_ratio: 0.2
			
 
				+    retry_status_codes: [429, 500, 502, 503, 504]
			
 
				+    max_tokens: 4096
			
 
				+    temperature: 0.8
			
 
				+    top_p: 0.9
			
 
				+    top_k: 50
			
 
				+    repetition_penalty: 1.1
			
 
				+  
			
 
				+  # 场景特定配置
			
 
				+  table_recognition:
			
 
				+    return_cells_coordinate: false  # GLM-OCR 不直接返回单元格坐标
			
 
				+
			
 
				+# ============================================================
			
 
				+# 输出配置
			
 
				+# ============================================================
			
 
				+output:
			
 
				+  create_subdir: false
			
 
				+  save_pdf_images: true
			
 
				+  save_json: true
			
 
				+  save_page_json: true
			
 
				+  save_markdown: true
			
 
				+  save_page_markdown: true
			
 
				+  save_html: true
			
 
				+  save_layout_image: true
			
 
				+  save_ocr_image: true
			
 
				+  draw_type_label: true
			
 
				+  draw_bbox_number: true
			
 
				+  save_enhanced_json: true
			
 
				+  normalize_numbers: true
			
 
				+  debug_mode: true