|
@@ -0,0 +1,264 @@
|
|
|
|
|
+# Bank transaction-statement scene configuration (enhanced version)
|
|
|
|
|
+scene_name: "bank_statement_mineru_vl_local"
|
|
|
|
|
+description: "银行交易流水、对账单等场景"
|
|
|
|
|
+
|
|
|
|
|
+# Input settings: accepted file formats, PDF rasterization DPI, and
+# watermark removal for text-based PDFs.
+input:
|
|
|
|
|
+ supported_formats: [".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"]
|
|
|
|
|
+ dpi: 200 # DPI used when converting PDF pages to images
|
|
|
|
|
+ txt_pdf_watermark_removal:
|
|
|
|
|
+ enabled: true # For text-based PDFs, remove watermark XObjects before rendering (keeps the text searchable)
|
|
|
|
|
+ sample_pages: 3 # Quick pre-check that scans the first N pages
|
|
|
|
|
+
|
|
|
|
|
+# Page-level preprocessing: orientation classification, unwarping,
+# watermark removal, contrast enhancement and debug output.
+preprocessor:
|
|
|
|
|
+ module: "mineru"
|
|
|
|
|
+ # Page-level preprocessing order: orient_first = fix orientation first, then remove the watermark (recommended for diagonal-stripe bank watermarks); watermark_first = legacy-compatible behavior
|
|
|
|
|
+ order: orient_first
|
|
|
|
|
+ orientation_classifier:
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ model_name: "paddle_orientation_classification"
|
|
|
|
|
+ model_dir: null # Use the default path
|
|
|
|
|
+ unwarping:
|
|
|
|
|
+ enabled: false
|
|
|
|
|
+ # Page-level watermark removal (for details see PAGE_WATERMARK_PRESETS in ocr_utils/watermark/presets.py)
|
|
|
|
|
+ watermark_removal:
|
|
|
|
|
+ enabled: false
|
|
|
|
|
+ detect_before_remove: true
|
|
|
|
|
+ method: threshold # threshold | masked | masked_adaptive
|
|
|
|
|
+ threshold: 175
|
|
|
|
|
+ contrast_enhancement:
|
|
|
|
|
+ enabled: false
|
|
|
|
|
+ method: text_restore
|
|
|
|
|
+ text_black_target: 85
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false
|
|
|
|
|
+ output_dir: null
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: watermark_removal
|
|
|
|
|
+ save_compare: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# Layout detection configuration - smart router (selects the model directly by scene)
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+layout_detection:
|
|
|
|
|
+ module: "smart_router"
|
|
|
|
|
+ strategy: "scene" # Select the model directly by scene; do not go through ocr_eval
|
|
|
|
|
+
|
|
|
|
|
+ # Scene strategy: the layout model to use directly for each named scene
|
|
|
|
|
+ scene_strategy:
|
|
|
|
|
+ bank_statement:
|
|
|
|
|
+ model: "docling"
|
|
|
|
|
+ financial_report:
|
|
|
|
|
+ model: "paddle_ppdoclayoutv3"
|
|
|
|
|
+ default_model: "docling"
|
|
|
|
|
+
|
|
|
|
|
+ # Definitions of the available models
|
|
|
|
|
+ models:
|
|
|
|
|
+ docling:
|
|
|
|
|
+ module: "docling"
|
|
|
|
|
+ model_name: "docling-layout-old"
|
|
|
|
|
+ model_dir: "ds4sd/docling-layout-old"
|
|
|
|
|
+ device: "cpu"
|
|
|
|
|
+ conf: 0.3
|
|
|
|
|
+ num_threads: 4
|
|
|
|
|
+
|
|
|
|
|
+ paddle_ppdoclayoutv3:
|
|
|
|
|
+ module: "paddle"
|
|
|
|
|
+ model_name: "PP-DocLayoutV3"
|
|
|
|
|
+ model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
|
|
|
|
|
+ device: "cpu"
|
|
|
|
|
+ conf: 0.3
|
|
|
|
|
+ num_threads: 4
|
|
|
|
|
+ batch_size: 1
|
|
|
|
|
+
|
|
|
|
|
+ # Post-processing configuration
|
|
|
|
|
+ post_process:
|
|
|
|
|
+ # Convert large-area text blocks into tables (post-processing)
|
|
|
|
|
+ convert_large_text_to_table: true # Whether to enable this conversion
|
|
|
|
|
+ min_text_area_ratio: 0.25 # Minimum area ratio (25%)
|
|
|
|
|
+ min_text_width_ratio: 0.4 # Minimum width ratio (40%)
|
|
|
|
|
+ min_text_height_ratio: 0.3 # Minimum height ratio (30%)
|
|
|
|
|
+
|
|
|
|
|
+ # Seal supplement detection: use PP-DocLayoutV3 to add seal (stamp) regions that docling fails to detect
|
|
|
|
|
+ seal_supplement:
|
|
|
|
|
+ enabled: true # Enable seal supplement detection
|
|
|
|
|
+ replace_existing: false # false = merge incrementally; true = fully replace seals already present in the primary result
|
|
|
|
|
+ replace_overlapping_image: true # When a seal has high IoU with an image_body/image (etc.) box, replace that box with the seal instead of discarding the seal
|
|
|
|
|
+ replace_iou_threshold: 0.7 # Minimum IoU that triggers replacement
|
|
|
|
|
+ duplicate_iou_threshold: 0.3 # When not replaced, a seal whose IoU with any box exceeds this value is treated as a duplicate
|
|
|
|
|
+ # Model configuration used by seal_detector; by default it reuses the paddle_ppdoclayoutv3 configuration
|
|
|
|
|
+ model_config:
|
|
|
|
|
+ module: "paddle"
|
|
|
|
|
+ model_name: "PP-DocLayoutV3"
|
|
|
|
|
+ model_dir: "PaddlePaddle/PP-DocLayoutV3_safetensors"
|
|
|
|
|
+ device: "cpu"
|
|
|
|
|
+ conf: 0.3
|
|
|
|
|
+ num_threads: 4
|
|
|
|
|
+
|
|
|
|
|
+ # Debug visualization (base image is inference_image, the same image fed to layout detection)
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # Controlled by the command-line flags --debug / --debug-layout
|
|
|
|
|
+ output_dir: null # When null, injected per page by the pipeline
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: layout_detection # Output goes to debug/layout_detection/
|
|
|
|
|
+ save_raw: true # Before post-processing
|
|
|
|
|
+ save_post_processed: true # After post-processing
|
|
|
|
|
+ save_json: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# OCR recognition configuration
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+ocr_recognition:
|
|
|
|
|
+ module: "mineru"
|
|
|
|
|
+ language: "ch"
|
|
|
|
|
+ det_threshold: 0.5
|
|
|
|
|
+ unclip_ratio: 1.5
|
|
|
|
|
+ enable_merge_det_boxes: false
|
|
|
|
|
+ batch_size: 8
|
|
|
|
|
+ device: "cpu"
|
|
|
|
|
+
|
|
|
|
|
+ # Debug visualization (base image is inference_image, the same image fed to full-page OCR)
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # Controlled by the command-line flags --debug / --debug-ocr
|
|
|
|
|
+ output_dir: null
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: ocr_recognition # Output goes to debug/ocr_recognition/
|
|
|
|
|
+ save_json: true
|
|
|
|
|
+ image_format: png
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# Table classification configuration (automatically distinguishes wired / wireless tables)
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+table_classification:
|
|
|
|
|
+ enabled: true # Enable automatic table classification
|
|
|
|
|
+ module: "paddle" # Classification model: paddle (MinerU PaddleTableClsModel)
|
|
|
|
|
+ confidence_threshold: 0.5 # Classification confidence threshold
|
|
|
|
|
+ batch_size: 16 # Batch size
|
|
|
|
|
+
|
|
|
|
|
+ # Debug visualization configuration
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # Controlled uniformly by the command-line flags --debug / --debug-table
|
|
|
|
|
+ output_dir: null # When null, injected per page by the pipeline
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: table_classification # Output goes to debug/table_classification/
|
|
|
|
|
+ save_table_lines: true # Overlay image of paddle line detection
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# Dedicated configuration for wired-table recognition (MinerU UNet)
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+table_recognition_wired:
|
|
|
|
|
+ # NOTE(review): use_wired_unet is false although this section's header names MinerU UNet and cell_fusion below assigns unet_weight - confirm this is intended.
+ use_wired_unet: false
|
|
|
|
|
+ upscale_ratio: 3.333
|
|
|
|
|
+ need_ocr: true
|
|
|
|
|
+ row_threshold: 10
|
|
|
|
|
+ col_threshold: 15
|
|
|
|
|
+ ocr_conf_threshold: 0.9 # Per-cell OCR confidence threshold
|
|
|
|
|
+ cell_crop_margin: 2
|
|
|
|
|
+ use_custom_postprocess: true # Whether to use the custom post-processing (enabled by default)
|
|
|
|
|
+
|
|
|
|
|
+ # Whether to enable deskewing
|
|
|
|
|
+ enable_deskew: true
|
|
|
|
|
+
|
|
|
|
|
+ # Enable multi-source cell fusion
|
|
|
|
|
+ use_cell_fusion: true
|
|
|
|
|
+
|
|
|
|
|
+ # Fusion engine configuration
|
|
|
|
|
+ cell_fusion:
|
|
|
|
|
+ # RT-DETR model path (required)
+ # NOTE(review): this is an absolute path inside one user's home directory (/Users/zhch158); it will not resolve on other machines unless overridden - confirm how deployments supply it.
|
|
|
|
|
+ rtdetr_model_path: "/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx"
|
|
|
|
|
+
|
|
|
|
|
+ # Fusion weights
|
|
|
|
|
+ unet_weight: 0.6 # UNet weight (strong on structure)
|
|
|
|
|
+ rtdetr_weight: 0.4 # RT-DETR weight (strong on robustness)
|
|
|
|
|
+
|
|
|
|
|
+ # Threshold configuration
|
|
|
|
|
+ iou_merge_threshold: 0.7 # High-IoU merge threshold (weighted average when IoU > 0.7)
|
|
|
|
|
+ iou_nms_threshold: 0.5 # NMS de-duplication threshold
|
|
|
|
|
+ rtdetr_conf_threshold: 0.5 # RT-DETR confidence threshold
|
|
|
|
|
+
|
|
|
|
|
+ # Feature switches
|
|
|
|
|
+ enable_ocr_compensation: true # Enable OCR edge compensation
|
|
|
|
|
+
|
|
|
|
|
+ # Second-pass cell OCR (parameters aligned with cell_sweep lab: threshold_t150_cl_1.0_8_ob_u128 / Pass2 tile=4)
|
|
|
|
|
+ second_pass_ocr:
|
|
|
|
|
+ reocr_mode: bank_statement
|
|
|
|
|
+ line_min_score: 0.8
|
|
|
|
|
+ cell_preprocess:
|
|
|
|
|
+ watermark:
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ method: threshold
|
|
|
|
|
+ threshold: 150
|
|
|
|
|
+ contrast: # Pass1: CLAHE after watermark removal
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ method: clahe
|
|
|
|
|
+ clip_limit: 1.0
|
|
|
|
|
+ tile_grid_size: 8
|
|
|
|
|
+ upscale_min_side: 96 # Pass1: shortest side to upscale to for the regular second-pass OCR
|
|
|
|
|
+ enhance_retry: # Pass2: retry for low-score / hard cases (upscale and contrast can be configured separately)
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ upscale_min_side: 128 # Pass2 shortest side to upscale to; falls back to the Pass1 value when not configured
|
|
|
|
|
+ contrast:
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ method: clahe
|
|
|
|
|
+ clip_limit: 1.0
|
|
|
|
|
+ tile_grid_size: 4
|
|
|
|
|
+
|
|
|
|
|
+ # Debug visualization configuration
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # Controlled uniformly by the command-line flags --debug / --debug-table
|
|
|
|
|
+ output_dir: null # When null, injected per page by the pipeline
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: table_recognition_wired # Output goes to debug/table_recognition_wired/
|
|
|
|
|
+ save_table_lines: true
|
|
|
|
|
+ save_connected_components: true
|
|
|
|
|
+ save_grid_structure: true
|
|
|
|
|
+ save_text_overlay: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+ # Cropped images from second-pass cell OCR: debug/table_recognition_wired/tablecell_ocr/
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# VL recognition configuration - uses PaddleOcr-VL (wireless tables + seal recognition)
+# NOTE(review): the header names PaddleOcr-VL, but this block sets module "mineru" and a MinerU model_name - confirm which backend is actually meant.
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+vl_recognition:
|
|
|
|
|
+ module: "mineru"
|
|
|
|
|
+ backend: "http-client"
|
|
|
|
|
+ model_name: "MinerU2.5-Pro-2604-1.2B" # Must match MODEL_NAME in mineru_local_daemon.sh
|
|
|
|
|
+ server_url: "http://localhost:8103"
|
|
|
|
|
+ max_image_size: 4096 # Maximum image size
|
|
|
|
|
+ resize_mode: 'max' # Resize mode ('max' keeps the aspect ratio, 'fixed' uses a fixed size)
|
|
|
|
|
+ device: "cpu"
|
|
|
|
|
+ batch_size: 1
|
|
|
|
|
+ model_params:
|
|
|
|
|
+ max_concurrency: 10
|
|
|
|
|
+ http_timeout: 600
|
|
|
|
|
+
|
|
|
|
|
+ # Task prompt mapping - a different prompt is used for each task type
|
|
|
|
|
+ task_prompt_mapping:
|
|
|
|
|
+ text: "Text Recognition:"
|
|
|
|
|
+ table: "Table Recognition:"
|
|
|
|
|
+ formula: "Formula Recognition:"
|
|
|
|
|
+ seal: "Seal Recognition:" # Dedicated prompt for seal recognition
|
|
|
|
|
+
|
|
|
|
|
+ # Scene-specific configuration
+ # NOTE(review): the key below has no value, so it parses as null rather than an empty mapping - confirm whether {} or explicit sub-keys were intended.
|
|
|
|
|
+ table_recognition:
|
|
|
|
|
+
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+# Output configuration
|
|
|
|
|
+# ============================================================
|
|
|
|
|
+output:
|
|
|
|
|
+ create_subdir: false
|
|
|
|
|
+ save_pdf_images: true
|
|
|
|
|
+ save_json: true
|
|
|
|
|
+ save_page_json: true
|
|
|
|
|
+ save_markdown: true
|
|
|
|
|
+ save_page_markdown: true
|
|
|
|
|
+ save_html: true
|
|
|
|
|
+ save_layout_image: true
|
|
|
|
|
+ save_ocr_image: true
|
|
|
|
|
+ draw_type_label: true
|
|
|
|
|
+ draw_bbox_number: true
|
|
|
|
|
+ save_enhanced_json: true
|
|
|
|
|
+ normalize_numbers: true
|
|
|
|
|
+ debug_mode: false
|