|
|
@@ -22,79 +22,24 @@ preprocessor:
|
|
|
model_dir: null # 使用默认路径
|
|
|
unwarping:
|
|
|
enabled: false
|
|
|
- # -------------------------------------------------------
|
|
|
- # 水印去除配置(适用于银行流水浅色斜向文字水印)
|
|
|
- # -------------------------------------------------------
|
|
|
+ # 页级水印(详细参数参见 ocr_utils/watermark/presets.py 中的 PAGE_WATERMARK_PRESETS)
|
|
|
watermark_removal:
|
|
|
- enabled: false # 是否启用水印去除
|
|
|
- method: masked_adaptive # threshold | masked | masked_adaptive
|
|
|
- threshold: 175 # 全局阈值或掩膜失败时的回退阈值(140-180)
|
|
|
- morph_close_kernel: 0 # 去水印后灰度图闭运算,0 跳过
|
|
|
- mask:
|
|
|
- mask_mode: light_on_white # light_on_white | diagonal_midtone
|
|
|
- text_protect_gray_max: 130 # gray<=130 正文硬保护,永不置白
|
|
|
- light_gray_low: 236 # 浅色候选(geom_candidate 用)
|
|
|
- light_gray_high: 253
|
|
|
- whiten_gray_low: 200 # 几何带内置白灰度下限(方案 E,低于 candidate)
|
|
|
- direction_filter: hough # hough=方案C斜向线段 | block=旧分块梯度
|
|
|
- morph_close_kernel: 0
|
|
|
- morph_dilate_kernel: 0
|
|
|
- min_component_area: 200
|
|
|
- debug_block_maps: true # 输出 diag/hv 热力图
|
|
|
- debug_block_size: 48
|
|
|
- hough_midtone_low: 200 # Canny 仅在中间调带
|
|
|
- hough_midtone_high: 254
|
|
|
- hough_canny_low: 30
|
|
|
- hough_canny_high: 100
|
|
|
- hough_threshold: 25
|
|
|
- hough_min_line_length: 35
|
|
|
- hough_max_line_gap: 18
|
|
|
- hough_line_thickness: 12
|
|
|
- hough_band_dilate_radius: 16
|
|
|
- hough_use_angle_statistics: true # 角度直方图统计主峰
|
|
|
- hough_angle_tolerance: 5.0 # 与主峰角度差≤该值(度)
|
|
|
- hough_secondary_peak_ratio: 0.35 # 次峰相对主峰权重
|
|
|
- hough_min_length_percentile: 25.0 # 过滤短线段
|
|
|
- midtone_low: 95
|
|
|
- midtone_high: 235 # diagonal_midtone 模式用
|
|
|
- remove_horizontal_vertical: true
|
|
|
- diagonal_enhance: true
|
|
|
- diagonal_kernel_length: 25
|
|
|
- horizontal_kernel_length: 35
|
|
|
- vertical_kernel_length: 35
|
|
|
- morph_open_kernel: 2
|
|
|
- dmorph_close_kernel: 3
|
|
|
- text_protect_percentile: 10.0
|
|
|
- background_threshold: 248
|
|
|
- seal_protect: true
|
|
|
- adaptive:
|
|
|
- whiten_mode: mask_fill # mask_fill=掩膜内一律置白 | threshold_in_mask
|
|
|
- text_percentile: 10.0
|
|
|
- watermark_percentile: 70.0 # threshold_in_mask 时生效
|
|
|
- background_percentile: 95.0
|
|
|
- background_threshold: 248
|
|
|
- wm_margin: 12
|
|
|
- text_protect_max: 120
|
|
|
- # 去水印后对比度增强(text_restore 将笔画拉深,比全局 gamma 更接近原图)
|
|
|
+ enabled: false
|
|
|
+ detect_before_remove: true
|
|
|
+ method: masked_adaptive # threshold | masked | masked_adaptive
|
|
|
+ threshold: 175
|
|
|
+ morph_close_kernel: 0
|
|
|
contrast_enhancement:
|
|
|
enabled: true
|
|
|
- method: text_restore # text_restore | clahe | gamma | linear
|
|
|
- text_black_target: 85 # 略提高,减轻去水印后笔画被拉花(原 75 过深)
|
|
|
- background_threshold: 248
|
|
|
- text_lo_percentile: 1.0
|
|
|
- text_hi_percentile: 99.0
|
|
|
- gamma: 0.75 # method=gamma 时生效
|
|
|
- clip_limit: 2.0 # method=clahe
|
|
|
- tile_grid_size: 8
|
|
|
- black_percentile: 2.0 # method=linear
|
|
|
- white_percentile: 98.0
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 85
|
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug / --debug-layout 统一控制
|
|
|
- output_dir: null # null 时使用 pipeline 输出目录
|
|
|
- prefix: "" # 文件名前缀(运行时注入 page_name)
|
|
|
- subdir: watermark_removal # 输出至 debug/watermark_removal/
|
|
|
- save_compare: true # 保存左右对比图 *_watermark_compare.*
|
|
|
- image_format: "png" # jpg / png
|
|
|
+ enabled: false
|
|
|
+ output_dir: null
|
|
|
+ prefix: ""
|
|
|
+ subdir: watermark_removal
|
|
|
+ save_compare: true
|
|
|
+ image_format: "png"
|
|
|
|
|
|
# ============================================================
|
|
|
# Layout 检测配置 - 智能路由器(按场景直接选择模型)
|
|
|
@@ -224,13 +169,42 @@ table_recognition_wired:
|
|
|
# 功能开关
|
|
|
enable_ocr_compensation: true # 启用OCR边缘补偿
|
|
|
|
|
|
- # 单元格二次 OCR(det 分行 + 整格兜底 + 低分块过滤)
|
|
|
+ # 单元格二次 OCR(det 分行 + 整格/条带兜底 + 低分笔画增强重试)
|
|
|
second_pass_ocr:
|
|
|
+ reocr_mode: bank_statement # 表体空单元必跑 + 同行多数非空则空格也跑
|
|
|
+ header_row: 0 # 表头行号(0=首行)
|
|
|
+ row_peer_min_nonempty: 5 # 同行至少有 N 个非空单元格时,本格为空也触发二次 OCR
|
|
|
line_min_score: 0.8 # 低于此分的分行从文本与计分中丢弃
|
|
|
drop_low_score_blocks: true
|
|
|
- whole_cell_fallback: true # 整格 det+rec + 条带扫描,与分行择优
|
|
|
+ whole_cell_fallback: true # 整格 det=False 兜底 + 条带扫描
|
|
|
prefer_whole_on_tie: true
|
|
|
+ whole_longer_min_extra_chars: 2 # 整格/条带文本比分行文本至少多 N 个字符时优先采用
|
|
|
strip_fallback_aspect_ratio: 1.8 # 高/宽>=该值且仅检出<=1行时滑动条带分行
|
|
|
+ cell_preprocess:
|
|
|
+ watermark:
|
|
|
+ enabled: true
|
|
|
+ method: masked_adaptive
|
|
|
+ denoise:
|
|
|
+ enabled: false # 小格 median 易糊笔画;lab 用 --denoise 对比
|
|
|
+ method: median
|
|
|
+ contrast:
|
|
|
+ enabled: false
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 88
|
|
|
+ light:
|
|
|
+ upscale_min_side: 64
|
|
|
+ enhance_retry:
|
|
|
+ enabled: false
|
|
|
+ score_below: 0.90
|
|
|
+ min_chars: 4
|
|
|
+ short_text_in_tall_cell: true
|
|
|
+ contrast:
|
|
|
+ enabled: true
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 75
|
|
|
+ sharpen:
|
|
|
+ enabled: false
|
|
|
+ amount: 0.3
|
|
|
|
|
|
# Debug 可视化配置
|
|
|
debug_options:
|