|
|
@@ -18,11 +18,25 @@ preprocessor:
|
|
|
# 水印去除配置(适用于银行流水浅色斜向文字水印)
|
|
|
# -------------------------------------------------------
|
|
|
watermark_removal:
|
|
|
- enabled: true # 是否启用水印去除
|
|
|
- threshold: 175 # 灰度阈值(140-180):高于此值视为水印变白
|
|
|
- # 值越大保守(残留水印),值越小激进(损失浅色正文)
|
|
|
- morph_close_kernel: 0 # 形态学闭运算核大小(像素),默认的 morph_kernel 改为 0(非二值图像时形态学闭运算会适得其反)
|
|
|
-
|
|
|
+ enabled: false
|
|
|
+ detect_before_remove: true
|
|
|
+ method: threshold # threshold | masked | masked_adaptive
|
|
|
+ threshold: 175  # grayscale threshold (140-180): pixels brighter than this are treated as watermark and whitened
|
|
|
+ contrast_enhancement:
|
|
|
+ enabled: false
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 85
|
|
|
+ debug_options:
|
|
|
+ enabled: false
|
|
|
+ output_dir: null
|
|
|
+ prefix: ""
|
|
|
+ subdir: watermark_removal
|
|
|
+ save_compare: true
|
|
|
+ image_format: "png"
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# Layout 检测配置 - 智能路由器(按场景直接选择模型)
|
|
|
+# ============================================================
|
|
|
layout_detection:
|
|
|
module: "docling"
|
|
|
model_name: "docling-layout-old"
|
|
|
@@ -39,13 +53,20 @@ layout_detection:
|
|
|
min_text_width_ratio: 0.4 # 最小宽度占比(40%)
|
|
|
min_text_height_ratio: 0.3 # 最小高度占比(30%)
|
|
|
|
|
|
- # Debug 可视化配置(与 MinerUWiredTableRecognizer.DebugOptions 对齐)
|
|
|
- # 默认关闭。开启后将保存:layout检测结果
|
|
|
+ # Debug 可视化(底图为 inference_image,与 Layout 检测输入一致)
|
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码)
|
|
|
-
|
|
|
+ enabled: false # 由命令行 --debug / --debug-layout 控制
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
+ prefix: ""
|
|
|
+ subdir: layout_detection # 输出至 debug/layout_detection/
|
|
|
+ save_raw: true # 后处理前
|
|
|
+ save_post_processed: true # 后处理后
|
|
|
+ save_json: true
|
|
|
+ image_format: "png"
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# OCR 识别配置
|
|
|
+# ============================================================
|
|
|
ocr_recognition:
|
|
|
module: "mineru"
|
|
|
language: "ch"
|
|
|
@@ -55,6 +76,16 @@ ocr_recognition:
|
|
|
batch_size: 8
|
|
|
device: "cpu"
|
|
|
|
|
|
+ # Debug 可视化(底图为 inference_image,与整页 OCR 输入一致)
|
|
|
+ debug_options:
|
|
|
+ enabled: false # 由命令行 --debug / --debug-ocr 控制
|
|
|
+ output_dir: null
|
|
|
+ prefix: ""
|
|
|
+ subdir: ocr_recognition # 输出至 debug/ocr_recognition/
|
|
|
+ save_json: true
|
|
|
+ image_format: "png"
|
|
|
+
|
|
|
+# ============================================================
|
|
|
# 表格分类配置(自动区分有线/无线表格)
|
|
|
# 启用后将自动调用分类模型,根据结果选择合适的表格识别器
|
|
|
table_classification:
|
|
|
@@ -63,16 +94,18 @@ table_classification:
|
|
|
confidence_threshold: 0.5 # 分类置信度阈值
|
|
|
batch_size: 16 # 批处理大小
|
|
|
|
|
|
- # Debug 可视化配置(与 MinerUWiredTableRecognizer.DebugOptions 对齐)
|
|
|
- # 默认关闭。开启后将保存:表格线
|
|
|
+ # Debug 可视化配置
|
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
- save_table_lines: true # 保存表格线可视化(unet横线/竖线叠加)
|
|
|
- image_format: "png" # 可视化图片格式:png/jpg
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码/表格序号)
|
|
|
-
|
|
|
-# 有线表格识别专用配置
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
+ prefix: ""
|
|
|
+ subdir: table_classification # 输出至 debug/table_classification/
|
|
|
+ save_table_lines: true # paddle 线条检测叠加图
|
|
|
+ image_format: "png"
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# 有线表格识别专用配置(MinerU UNet)
|
|
|
+# ============================================================
|
|
|
table_recognition_wired:
|
|
|
use_wired_unet: true
|
|
|
upscale_ratio: 3.333
|
|
|
@@ -106,27 +139,46 @@ table_recognition_wired:
|
|
|
# 功能开关
|
|
|
enable_ocr_compensation: true # 启用OCR边缘补偿
|
|
|
|
|
|
-
|
|
|
- # 单元格二次 OCR(det 分行 + 整格兜底 + 低分块过滤)
|
|
|
+ # 单元格二次 OCR(参数对齐 cell_sweep lab:threshold_t150_cl_1.0_8_ob_u128 / Pass2 tile=4)
|
|
|
second_pass_ocr:
|
|
|
+ reocr_mode: bank_statement
|
|
|
line_min_score: 0.8
|
|
|
- drop_low_score_blocks: true
|
|
|
- whole_cell_fallback: true
|
|
|
- prefer_whole_on_tie: true
|
|
|
-
|
|
|
- # Debug 可视化配置(与 MinerUWiredTableRecognizer.DebugOptions 对齐)
|
|
|
- # 默认关闭。开启后将保存:表格线、连通域、逻辑网格结构、文本覆盖可视化。
|
|
|
+ cell_preprocess:
|
|
|
+ watermark:
|
|
|
+ enabled: true
|
|
|
+ method: threshold
|
|
|
+ threshold: 150
|
|
|
+ contrast: # Pass1:去水印后 CLAHE
|
|
|
+ enabled: true
|
|
|
+ method: clahe
|
|
|
+ clip_limit: 1.0
|
|
|
+ tile_grid_size: 8
|
|
|
+ upscale_min_side: 96 # Pass1:常规二次 OCR 放大最短边
|
|
|
+ enhance_retry: # Pass2:低分/难例再试(可单独配置 upscale + contrast)
|
|
|
+ enabled: true
|
|
|
+ upscale_min_side: 128 # Pass2 放大最短边;未配置时沿用 Pass1
|
|
|
+ contrast:
|
|
|
+ enabled: true
|
|
|
+ method: clahe
|
|
|
+ clip_limit: 1.0
|
|
|
+ tile_grid_size: 4
|
|
|
+
|
|
|
+ # Debug 可视化配置
|
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug 统一控制,勿在此 hardcode true
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
- save_table_lines: true # 保存表格线可视化(unet横线/竖线叠加)
|
|
|
- save_connected_components: true # 保存连通域提取的单元格图
|
|
|
- save_grid_structure: true # 保存逻辑网格结构(row/col/rowspan/colspan)
|
|
|
- save_text_overlay: true # 保存文本填充覆盖图
|
|
|
- image_format: "png" # 可视化图片格式:png/jpg
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码/表格序号)
|
|
|
-
|
|
|
-# VLM 表格识别配置(当分类为 'wireless' 时使用)
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
+ prefix: ""
|
|
|
+ subdir: table_recognition_wired # 输出至 debug/table_recognition_wired/
|
|
|
+ save_table_lines: true
|
|
|
+ save_connected_components: true
|
|
|
+ save_grid_structure: true
|
|
|
+ save_text_overlay: true
|
|
|
+ image_format: "png"
|
|
|
+ # 单元格二次 OCR 裁剪图:debug/table_recognition_wired/tablecell_ocr/
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# VL识别配置 - 使用 GLM-OCR(无线表格 + seal识别)
|
|
|
+# ============================================================
|
|
|
vl_recognition:
|
|
|
# 可选: "mineru" (MinerU VLM) 或 "paddle" (PaddleOCR-VL)
|
|
|
module: "mineru"
|
|
|
@@ -153,6 +205,9 @@ vl_recognition:
|
|
|
# 表格识别特定配置
|
|
|
table_recognition:
|
|
|
|
|
|
+# ============================================================
|
|
|
+# 输出配置
|
|
|
+# ============================================================
|
|
|
output:
|
|
|
create_subdir: false
|
|
|
save_pdf_images: true
|