1 miesiąc temu · 0cb48eed12
--- a/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl_local.yaml
+++ b/ocr_tools/universal_doc_parser/config/bank_statement_mineru_vl_local.yaml
@@ -0,0 +1,264 @@
 
															+# Scene configuration for bank transaction statements (enhanced edition).
															+scene_name: 'bank_statement_mineru_vl_local'
															+description: '银行交易流水、对账单等场景'
														
 
															+
														
 
															+input:
															+  supported_formats:
															+    - '.pdf'
															+    - '.png'
															+    - '.jpg'
															+    - '.jpeg'
															+    - '.bmp'
															+    - '.tiff'
															+  # DPI used when converting PDF pages to images.
															+  dpi: 200
															+  txt_pdf_watermark_removal:
															+    # Remove watermark XObjects from text-based PDFs before rendering
															+    # (keeps the text searchable).
															+    enabled: true
															+    # Quick pre-check: scan only the first N pages.
															+    sample_pages: 3
														
 
															+
														
 
															+preprocessor:
															+  module: 'mineru'
															+  # Page-level preprocessing order:
															+  #   orient_first    = fix orientation first, then remove the watermark
															+  #                     (recommended for diagonal bank watermarks)
															+  #   watermark_first = legacy-compatible behavior
															+  order: 'orient_first'
															+  orientation_classifier:
															+    enabled: true
															+    model_name: 'paddle_orientation_classification'
															+    # null = use the default path.
															+    model_dir: null
															+  unwarping:
															+    enabled: false
															+  # Page-level watermark removal (for details see
															+  # ocr_utils/watermark/presets.py, PAGE_WATERMARK_PRESETS).
															+  watermark_removal:
															+    enabled: false
															+    detect_before_remove: true
															+    # One of: threshold | masked | masked_adaptive
															+    method: 'threshold'
															+    threshold: 175
															+    contrast_enhancement:
															+      enabled: false
															+      method: 'text_restore'
															+      text_black_target: 85
															+    debug_options:
															+      enabled: false
															+      output_dir: null
															+      prefix: ''
															+      subdir: 'watermark_removal'
															+      save_compare: true
															+      image_format: 'png'
														
 
															+
														
 
															+# ============================================================
															+# Layout detection - smart router (model is picked directly per scene)
															+# ============================================================
															+layout_detection:
															+  module: 'smart_router'
															+  # Pick the model directly by scene; do not go through ocr_eval.
															+  strategy: 'scene'
															+
															+  # Scene strategy: the layout model used directly for each listed scene.
															+  scene_strategy:
															+    bank_statement:
															+      model: 'docling'
															+    financial_report:
															+      model: 'paddle_ppdoclayoutv3'
															+  default_model: 'docling'
															+
															+  # Definitions of the available models.
															+  models:
															+    docling:
															+      module: 'docling'
															+      model_name: 'docling-layout-old'
															+      model_dir: 'ds4sd/docling-layout-old'
															+      device: 'cpu'
															+      conf: 0.3
															+      num_threads: 4
															+
															+    paddle_ppdoclayoutv3:
															+      module: 'paddle'
															+      model_name: 'PP-DocLayoutV3'
															+      model_dir: 'PaddlePaddle/PP-DocLayoutV3_safetensors'
															+      device: 'cpu'
															+      conf: 0.3
															+      num_threads: 4
															+      batch_size: 1
															+
															+  # Post-processing settings.
															+  post_process:
															+    # Convert large text blocks into tables (post-processing step).
															+    convert_large_text_to_table: true
															+    # Minimum area ratio (25%).
															+    min_text_area_ratio: 0.25
															+    # Minimum width ratio (40%).
															+    min_text_width_ratio: 0.4
															+    # Minimum height ratio (30%).
															+    min_text_height_ratio: 0.3
															+
															+  # Seal supplement detection: use PP-DocLayoutV3 to add the seal (stamp)
															+  # regions that docling cannot recognize.
															+  seal_supplement:
															+    # Turn seal supplement detection on.
															+    enabled: true
															+    # false = incremental merge; true = fully replace seals already
															+    # present in the primary result.
															+    replace_existing: false
															+    # When a seal has a high IoU with an image_body/image (etc.) box,
															+    # replace that box with the seal instead of discarding the seal.
															+    replace_overlapping_image: true
															+    # Minimum IoU that triggers the replacement.
															+    replace_iou_threshold: 0.7
															+    # When no replacement happened, a seal whose IoU with any box exceeds
															+    # this value is treated as a duplicate seal.
															+    duplicate_iou_threshold: 0.3
															+    # Model settings used by seal_detector; by default they mirror the
															+    # paddle_ppdoclayoutv3 entry under `models`.
															+    model_config:
															+      module: 'paddle'
															+      model_name: 'PP-DocLayoutV3'
															+      model_dir: 'PaddlePaddle/PP-DocLayoutV3_safetensors'
															+      device: 'cpu'
															+      conf: 0.3
															+      num_threads: 4
															+
															+  # Debug visualization (the base image is inference_image, i.e. the same
															+  # image that is fed to layout detection).
															+  debug_options:
															+    # Controlled from the command line via --debug / --debug-layout.
															+    enabled: false
															+    # When null, the pipeline injects a per-page directory.
															+    output_dir: null
															+    prefix: ''
															+    # Output goes to debug/layout_detection/.
															+    subdir: 'layout_detection'
															+    # Save the result before post-processing.
															+    save_raw: true
															+    # Save the result after post-processing.
															+    save_post_processed: true
															+    save_json: true
															+    image_format: 'png'
														
 
															+
														
 
															+# ============================================================
															+# OCR recognition
															+# ============================================================
															+ocr_recognition:
															+  module: 'mineru'
															+  language: 'ch'
															+  det_threshold: 0.5
															+  unclip_ratio: 1.5
															+  enable_merge_det_boxes: false
															+  batch_size: 8
															+  device: 'cpu'
															+
															+  # Debug visualization (the base image is inference_image, i.e. the same
															+  # image that is fed to full-page OCR).
															+  debug_options:
															+    # Controlled from the command line via --debug / --debug-ocr.
															+    enabled: false
															+    output_dir: null
															+    prefix: ''
															+    # Output goes to debug/ocr_recognition/.
															+    subdir: 'ocr_recognition'
															+    save_json: true
															+    image_format: 'png'
														
 
															+
														
 
															+# ============================================================
															+# Table classification (automatically tells wired tables from wireless ones)
															+# ============================================================
															+table_classification:
															+  # Turn automatic table classification on.
															+  enabled: true
															+  # Classifier model: paddle (MinerU PaddleTableClsModel).
															+  module: 'paddle'
															+  # Classification confidence threshold.
															+  confidence_threshold: 0.5
															+  # Batch size.
															+  batch_size: 16
															+
															+  # Debug visualization settings.
															+  debug_options:
															+    # Controlled centrally from the command line via --debug / --debug-table.
															+    enabled: false
															+    # When null, the pipeline injects a per-page directory.
															+    output_dir: null
															+    prefix: ''
															+    # Output goes to debug/table_classification/.
															+    subdir: 'table_classification'
															+    # Overlay image of the paddle line detection.
															+    save_table_lines: true
															+    image_format: 'png'
														
 
															+
														
 
															+# ============================================================
															+# Dedicated settings for wired-table recognition (MinerU UNet)
															+# ============================================================
															+table_recognition_wired:
															+  # NOTE(review): this is false while cell_fusion below still carries a
															+  # unet_weight - confirm that the combination is intended.
															+  use_wired_unet: false
															+  upscale_ratio: 3.333
															+  need_ocr: true
															+  row_threshold: 10
															+  col_threshold: 15
															+  # Per-cell OCR confidence threshold.
															+  ocr_conf_threshold: 0.9
															+  cell_crop_margin: 2
															+  # Whether to use the custom post-processing (enabled by default).
															+  use_custom_postprocess: true
															+
															+  # Whether to enable deskewing.
															+  enable_deskew: true
															+
															+  # Enable multi-source cell fusion.
															+  use_cell_fusion: true
															+
															+  # Fusion engine settings.
															+  cell_fusion:
															+    # Path to the RT-DETR model (required).
															+    # NOTE(review): absolute path inside one developer's home directory;
															+    # it has to be adapted on every other machine.
															+    rtdetr_model_path: '/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx'
															+
															+    # Fusion weights.
															+    # UNet weight (strong on structure).
															+    unet_weight: 0.6
															+    # RT-DETR weight (strong on robustness).
															+    rtdetr_weight: 0.4
															+
															+    # Thresholds.
															+    # High-IoU merge threshold (boxes above 0.7 are weight-averaged).
															+    iou_merge_threshold: 0.7
															+    # NMS de-duplication threshold.
															+    iou_nms_threshold: 0.5
															+    # RT-DETR confidence threshold.
															+    rtdetr_conf_threshold: 0.5
															+
															+    # Feature switches.
															+    # Enable OCR edge compensation.
															+    enable_ocr_compensation: true
															+
															+  # Second-pass cell OCR (parameters aligned with the cell_sweep lab run
															+  # threshold_t150_cl_1.0_8_ob_u128 / Pass2 tile=4).
															+  second_pass_ocr:
															+    reocr_mode: 'bank_statement'
															+    line_min_score: 0.8
															+    cell_preprocess:
															+      watermark:
															+        enabled: true
															+        method: 'threshold'
															+        threshold: 150
															+      # Pass 1: CLAHE after watermark removal.
															+      contrast:
															+        enabled: true
															+        method: 'clahe'
															+        clip_limit: 1.0
															+        tile_grid_size: 8
															+      # Pass 1: shortest side to upscale to for the regular second-pass OCR.
															+      upscale_min_side: 96
															+      # Pass 2: retry for low-score / hard cells (upscale and contrast can
															+      # be configured separately here).
															+      enhance_retry:
															+        enabled: true
															+        # Pass 2 shortest side after upscaling; falls back to the Pass 1
															+        # value when not configured.
															+        upscale_min_side: 128
															+        contrast:
															+          enabled: true
															+          method: 'clahe'
															+          clip_limit: 1.0
															+          tile_grid_size: 4
															+
															+  # Debug visualization settings.
															+  # Second-pass cell OCR crops go to
															+  # debug/table_recognition_wired/tablecell_ocr/.
															+  debug_options:
															+    # Controlled centrally from the command line via --debug / --debug-table.
															+    enabled: false
															+    # When null, the pipeline injects a per-page directory.
															+    output_dir: null
															+    prefix: ''
															+    # Output goes to debug/table_recognition_wired/.
															+    subdir: 'table_recognition_wired'
															+    save_table_lines: true
															+    save_connected_components: true
															+    save_grid_structure: true
															+    save_text_overlay: true
															+    image_format: 'png'
														
 
															+
														
 
															+# ============================================================
															+# VL recognition - MinerU VL model served over HTTP
															+# (wireless tables + seal recognition)
															+# ============================================================
															+vl_recognition:
															+  module: "mineru"
															+  backend: "http-client"
															+  # Must match MODEL_NAME in mineru_local_daemon.sh.
															+  model_name: "MinerU2.5-Pro-2604-1.2B"
															+  server_url: "http://localhost:8103"
															+  # Maximum image size.
															+  max_image_size: 4096
															+  # Resize mode: "max" keeps the aspect ratio, "fixed" forces a fixed size.
															+  resize_mode: "max"
															+  device: "cpu"
															+  batch_size: 1
															+  model_params:
															+    max_concurrency: 10
															+    http_timeout: 600
															+
															+  # Task prompt mapping - a different prompt is used for each task.
															+  task_prompt_mapping:
															+    text: "Text Recognition:"
															+    table: "Table Recognition:"
															+    formula: "Formula Recognition:"
															+    # Dedicated prompt for seal recognition.
															+    seal: "Seal Recognition:"
															+
															+  # Scene-specific settings. Written as an explicit empty mapping: a bare
															+  # `table_recognition:` parses as null, which breaks chained lookups that
															+  # expect a mapping here.
															+  table_recognition: {}
														
 
															+
														
 
															+# ============================================================
															+# Output settings
															+# ============================================================
															+output:
															+  create_subdir: false
															+  save_pdf_images: true
															+
															+  # JSON / Markdown / HTML artifacts (grouping follows the key names only).
															+  save_json: true
															+  save_page_json: true
															+  save_markdown: true
															+  save_page_markdown: true
															+  save_html: true
															+
															+  # Annotated images and what is drawn on them.
															+  save_layout_image: true
															+  save_ocr_image: true
															+  draw_type_label: true
															+  draw_bbox_number: true
															+
															+  save_enhanced_json: true
															+  normalize_numbers: true
															+  debug_mode: false