|
@@ -14,6 +14,8 @@ input:
|
|
|
|
|
|
|
|
preprocessor:
|
|
preprocessor:
|
|
|
module: "mineru"
|
|
module: "mineru"
|
|
|
|
|
+ # 页级预处理顺序:orient_first=先扶正再去水印(银行斜纹水印推荐);watermark_first=兼容旧行为
|
|
|
|
|
+ order: orient_first
|
|
|
orientation_classifier:
|
|
orientation_classifier:
|
|
|
enabled: true
|
|
enabled: true
|
|
|
model_name: "paddle_orientation_classification"
|
|
model_name: "paddle_orientation_classification"
|
|
@@ -24,10 +26,30 @@ preprocessor:
|
|
|
# 水印去除配置(适用于银行流水浅色斜向文字水印)
|
|
# 水印去除配置(适用于银行流水浅色斜向文字水印)
|
|
|
# -------------------------------------------------------
|
|
# -------------------------------------------------------
|
|
|
watermark_removal:
|
|
watermark_removal:
|
|
|
- enabled: true # 是否启用水印去除
|
|
|
|
|
- threshold: 160 # 灰度阈值(140-180):高于此值视为水印变白
|
|
|
|
|
- # 值越大保守(残留水印),值越小激进(损失浅色正文)
|
|
|
|
|
- morph_close_kernel: 0 # 形态学闭运算核大小(像素),默认的 morph_kernel 改为 0(非二值图像时形态学闭运算会适得其反)
|
|
|
|
|
|
|
+ enabled: false # 是否启用水印去除
|
|
|
|
|
+ method: threshold # threshold | masked | masked_adaptive
|
|
|
|
|
+ threshold: 175 # 全局阈值或掩膜失败时的回退阈值(140-180)
|
|
|
|
|
+ morph_close_kernel: 0 # 去水印后灰度图闭运算,0 跳过
|
|
|
|
|
+ # 去水印后对比度增强(text_restore 将笔画拉深,比全局 gamma 更接近原图)
|
|
|
|
|
+ contrast_enhancement:
|
|
|
|
|
+ enabled: true
|
|
|
|
|
+ method: text_restore # text_restore | clahe | gamma | linear
|
|
|
|
|
+ text_black_target: 85 # 略提高,减轻去水印后笔画被拉花(原 75 过深)
|
|
|
|
|
+ background_threshold: 248
|
|
|
|
|
+ text_lo_percentile: 1.0
|
|
|
|
|
+ text_hi_percentile: 99.0
|
|
|
|
|
+ gamma: 0.75 # method=gamma 时生效
|
|
|
|
|
+ clip_limit: 2.0 # method=clahe 时生效
|
|
|
|
|
+ tile_grid_size: 8
|
|
|
|
|
+ black_percentile: 2.0 # method=linear 时生效
|
|
|
|
|
+ white_percentile: 98.0
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # 由命令行 --debug / --debug-layout 统一控制
|
|
|
|
|
+ output_dir: null # null 时使用 pipeline 输出目录
|
|
|
|
|
+ prefix: "" # 文件名前缀(运行时注入 page_name)
|
|
|
|
|
+ subdir: watermark_removal # 输出至 debug/watermark_removal/
|
|
|
|
|
+ save_compare: true # 保存左右对比图 *_watermark_compare.*
|
|
|
|
|
+ image_format: "png" # jpg / png
|
|
|
|
|
|
|
|
# ============================================================
|
|
# ============================================================
|
|
|
# Layout 检测配置 - 智能路由器(按场景直接选择模型)
|
|
# Layout 检测配置 - 智能路由器(按场景直接选择模型)
|
|
@@ -71,11 +93,16 @@ layout_detection:
|
|
|
min_text_width_ratio: 0.4 # 最小宽度占比(40%)
|
|
min_text_width_ratio: 0.4 # 最小宽度占比(40%)
|
|
|
min_text_height_ratio: 0.3 # 最小高度占比(30%)
|
|
min_text_height_ratio: 0.3 # 最小高度占比(30%)
|
|
|
|
|
|
|
|
- # Debug 可视化配置
|
|
|
|
|
|
|
+ # Debug 可视化(底图为 inference_image,与 Layout 检测输入一致)
|
|
|
debug_options:
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码)
|
|
|
|
|
|
|
+ enabled: false # 由命令行 --debug / --debug-layout 控制
|
|
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: layout_detection # 输出至 debug/layout_detection/
|
|
|
|
|
+ save_raw: true # 后处理前
|
|
|
|
|
+ save_post_processed: true # 后处理后
|
|
|
|
|
+ save_json: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
|
|
|
# ============================================================
|
|
# ============================================================
|
|
|
# OCR 识别配置
|
|
# OCR 识别配置
|
|
@@ -89,6 +116,16 @@ ocr_recognition:
|
|
|
batch_size: 8
|
|
batch_size: 8
|
|
|
device: "cpu"
|
|
device: "cpu"
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # Debug 可视化(底图为 inference_image,与整页 OCR 输入一致)
|
|
|
|
|
+ debug_options:
|
|
|
|
|
+ enabled: false # 由命令行 --debug / --debug-ocr 控制
|
|
|
|
|
+ output_dir: null
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: ocr_recognition # 输出至 debug/ocr_recognition/
|
|
|
|
|
+ save_json: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+
|
|
|
# ============================================================
|
|
# ============================================================
|
|
|
# 表格分类配置(自动区分有线/无线表格)
|
|
# 表格分类配置(自动区分有线/无线表格)
|
|
|
# ============================================================
|
|
# ============================================================
|
|
@@ -100,11 +137,12 @@ table_classification:
|
|
|
|
|
|
|
|
# Debug 可视化配置
|
|
# Debug 可视化配置
|
|
|
debug_options:
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
|
|
- save_table_lines: true # 保存表格线可视化(unet横线/竖线叠加)
|
|
|
|
|
- image_format: "png" # 可视化图片格式:png/jpg
|
|
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码/表格序号)
|
|
|
|
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: table_classification # 输出至 debug/table_classification/
|
|
|
|
|
+ save_table_lines: true # paddle 线条检测叠加图
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
|
|
|
# ============================================================
|
|
# ============================================================
|
|
|
# 有线表格识别专用配置(MinerU UNet)
|
|
# 有线表格识别专用配置(MinerU UNet)
|
|
@@ -144,14 +182,16 @@ table_recognition_wired:
|
|
|
|
|
|
|
|
# Debug 可视化配置
|
|
# Debug 可视化配置
|
|
|
debug_options:
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
|
|
- save_table_lines: true # 保存表格线可视化(unet横线/竖线叠加)
|
|
|
|
|
- save_connected_components: true # 保存连通域提取的单元格图
|
|
|
|
|
- save_grid_structure: true # 保存逻辑网格结构(row/col/rowspan/colspan)
|
|
|
|
|
- save_text_overlay: true # 保存文本填充覆盖图
|
|
|
|
|
- image_format: "png" # 可视化图片格式:png/jpg
|
|
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码/表格序号)
|
|
|
|
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
|
|
+ prefix: ""
|
|
|
|
|
+ subdir: table_recognition_wired # 输出至 debug/table_recognition_wired/
|
|
|
|
|
+ save_table_lines: true
|
|
|
|
|
+ save_connected_components: true
|
|
|
|
|
+ save_grid_structure: true
|
|
|
|
|
+ save_text_overlay: true
|
|
|
|
|
+ image_format: "png"
|
|
|
|
|
+ # 单元格二次 OCR 裁剪图输出至 debug/table_recognition_wired/tablecell_ocr/
|
|
|
|
|
|
|
|
# ============================================================
|
|
# ============================================================
|
|
|
# VL识别配置 - 使用 PaddleOcr-VL(无线表格 + seal识别)
|
|
# VL识别配置 - 使用 PaddleOcr-VL(无线表格 + seal识别)
|