|
|
@@ -21,35 +21,27 @@ preprocessor:
|
|
|
model_dir: null # 使用默认路径
|
|
|
unwarping:
|
|
|
enabled: false
|
|
|
- # -------------------------------------------------------
|
|
|
- # 水印去除配置(适用于银行流水浅色斜向文字水印)
|
|
|
- # -------------------------------------------------------
|
|
|
+ # 页级水印(详见 ocr_utils/watermark/presets.py PAGE_WATERMARK_PRESETS)
|
|
|
watermark_removal:
|
|
|
- enabled: false # 是否启用水印去除
|
|
|
- method: threshold # threshold | masked | masked_adaptive
|
|
|
- threshold: 175 # 全局阈值或掩膜失败时的回退阈值(140-180)
|
|
|
- morph_close_kernel: 0 # 去水印后灰度图闭运算,0 跳过
|
|
|
- # 去水印后对比度增强(text_restore 将笔画拉深,比全局 gamma 更接近原图)
|
|
|
+ enabled: false
|
|
|
+ detect_before_remove: true
|
|
|
+ method: threshold # threshold | masked | masked_adaptive
|
|
|
+ threshold: 175
|
|
|
contrast_enhancement:
|
|
|
- enabled: true
|
|
|
- method: text_restore # text_restore | clahe | gamma | linear
|
|
|
- text_black_target: 85 # 略提高,减轻去水印后笔画被拉花(原 75 过深)
|
|
|
- background_threshold: 248
|
|
|
- text_lo_percentile: 1.0
|
|
|
- text_hi_percentile: 99.0
|
|
|
- gamma: 0.75 # method=gamma 时生效
|
|
|
- clip_limit: 2.0 # method=clahe
|
|
|
- tile_grid_size: 8
|
|
|
- black_percentile: 2.0 # method=linear
|
|
|
- white_percentile: 98.0
|
|
|
+ enabled: false
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 85
|
|
|
debug_options:
|
|
|
- enabled: false # 由命令行 --debug / --debug-layout 统一控制
|
|
|
- output_dir: null # null 时使用 pipeline 输出目录
|
|
|
- prefix: "" # 文件名前缀(运行时注入 page_name)
|
|
|
- subdir: watermark_removal # 输出至 debug/watermark_removal/
|
|
|
- save_compare: true # 保存左右对比图 *_watermark_compare.*
|
|
|
- image_format: "png" # jpg / png
|
|
|
-
|
|
|
+ enabled: false
|
|
|
+ output_dir: null
|
|
|
+ prefix: ""
|
|
|
+ subdir: watermark_removal
|
|
|
+ save_compare: true
|
|
|
+ image_format: "png"
|
|
|
+
|
|
|
+# ============================================================
|
|
|
+# Layout 检测配置 - 智能路由器(按场景直接选择模型)
|
|
|
+# ============================================================
|
|
|
layout_detection:
|
|
|
module: "smart_router"
|
|
|
strategy: "ocr_eval" # ocr_eval(推荐,基于OCR评估选择最佳), auto(快速模式,基于文档特征)
|
|
|
@@ -73,14 +65,6 @@ layout_detection:
|
|
|
model_dir: null # 使用默认路径
|
|
|
device: "cpu"
|
|
|
|
|
|
- # Debug 可视化配置(与 MinerUWiredTableRecognizer.DebugOptions 对齐)
|
|
|
- # 默认关闭。开启后将保存:layout检测结果
|
|
|
- debug_options:
|
|
|
- enabled: true # 是否开启调试可视化输出
|
|
|
- output_dir: null # 调试输出目录;null不输出
|
|
|
- prefix: "" # 保存文件名前缀(如设置为页码)
|
|
|
-
|
|
|
-
|
|
|
# 可选:回退模型(当所有模型都失败时使用)
|
|
|
fallback_model:
|
|
|
module: "mineru"
|
|
|
@@ -90,11 +74,25 @@ layout_detection:
|
|
|
# 后处理配置
|
|
|
post_process:
|
|
|
# 将大面积文本块转换为表格(后处理)
|
|
|
- convert_large_text_to_table: true
|
|
|
- min_text_area_ratio: 0.25
|
|
|
- min_text_width_ratio: 0.4
|
|
|
- min_text_height_ratio: 0.3
|
|
|
+ convert_large_text_to_table: true # 是否启用
|
|
|
+ min_text_area_ratio: 0.25 # 最小面积占比(25%)
|
|
|
+ min_text_width_ratio: 0.4 # 最小宽度占比(40%)
|
|
|
+ min_text_height_ratio: 0.3 # 最小高度占比(30%)
|
|
|
+
|
|
|
+ # Debug 可视化(底图为 inference_image,与 Layout 检测输入一致)
|
|
|
+ debug_options:
|
|
|
+ enabled: false # 由命令行 --debug / --debug-layout 控制
|
|
|
+ output_dir: null # null 时由 pipeline 按页注入
|
|
|
+ prefix: ""
|
|
|
+ subdir: layout_detection # 输出至 debug/layout_detection/
|
|
|
+ save_raw: true # 后处理前
|
|
|
+ save_post_processed: true # 后处理后
|
|
|
+ save_json: true
|
|
|
+ image_format: "png"
|
|
|
|
|
|
+# ============================================================
|
|
|
+# OCR 识别配置
|
|
|
+# ============================================================
|
|
|
ocr_recognition:
|
|
|
module: "mineru"
|
|
|
language: "ch"
|
|
|
@@ -104,7 +102,6 @@ ocr_recognition:
|
|
|
batch_size: 8
|
|
|
device: "cpu"
|
|
|
|
|
|
-
|
|
|
# Debug 可视化(底图为 inference_image,与整页 OCR 输入一致)
|
|
|
debug_options:
|
|
|
enabled: true # 由命令行 --debug / --debug-ocr 控制
|
|
|
@@ -114,56 +111,100 @@ ocr_recognition:
|
|
|
save_json: true
|
|
|
image_format: png
|
|
|
|
|
|
+# ============================================================
|
|
|
# 表格分类配置(自动区分有线/无线表格)
|
|
|
+# ============================================================
|
|
|
table_classification:
|
|
|
enabled: true # 是否启用自动表格分类(默认关闭,使用手动配置)
|
|
|
module: "paddle" # 分类模型:paddle(MinerU PaddleTableClsModel)
|
|
|
confidence_threshold: 0.5 # 分类置信度阈值
|
|
|
batch_size: 16 # 批处理大小
|
|
|
|
|
|
-
|
|
|
-
|
|
|
- # Debug 可视化(底图为 inference_image,与 Layout 检测输入一致)
|
|
|
+ # Debug 可视化配置
|
|
|
debug_options:
|
|
|
- enabled: true # 由命令行 --debug / --debug-layout 控制
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
output_dir: null # null 时由 pipeline 按页注入
|
|
|
prefix: ""
|
|
|
- subdir: layout_detection # 输出至 debug/layout_detection/
|
|
|
- save_raw: true # 后处理前
|
|
|
- save_post_processed: true # 后处理后
|
|
|
- save_json: true
|
|
|
+ subdir: table_classification # 输出至 debug/table_classification/
|
|
|
+ save_table_lines: true # paddle 线条检测叠加图
|
|
|
image_format: "png"
|
|
|
|
|
|
-# 有线表格识别专用配置
|
|
|
+# ============================================================
|
|
|
+# 有线表格识别专用配置(MinerU UNet)
|
|
|
+# ============================================================
|
|
|
table_recognition_wired:
|
|
|
use_wired_unet: true
|
|
|
upscale_ratio: 3.333
|
|
|
need_ocr: true
|
|
|
row_threshold: 10
|
|
|
col_threshold: 15
|
|
|
- ocr_conf_threshold: 0.8
|
|
|
+ ocr_conf_threshold: 0.9 # 单元格 OCR 置信度阈值
|
|
|
cell_crop_margin: 2
|
|
|
use_custom_postprocess: true # 是否使用自定义后处理(默认启用)
|
|
|
|
|
|
# 是否启用倾斜矫正
|
|
|
enable_deskew: true
|
|
|
|
|
|
+ # 🆕 启用多源单元格融合
|
|
|
+ use_cell_fusion: true
|
|
|
+
|
|
|
+ # 融合引擎配置
|
|
|
+ cell_fusion:
|
|
|
+ # RT-DETR 模型路径(必需)
|
|
|
+ rtdetr_model_path: "/Users/zhch158/models/pytorch_models/Table/RT-DETR-L_wired_table_cell_det.onnx"
|
|
|
+
|
|
|
+ # 融合权重
|
|
|
+ unet_weight: 0.6 # UNet 权重(结构性强)
|
|
|
+ rtdetr_weight: 0.4 # RT-DETR 权重(鲁棒性强)
|
|
|
+
|
|
|
+ # 阈值配置
|
|
|
+ iou_merge_threshold: 0.7 # 高IoU合并阈值(>0.7则加权平均)
|
|
|
+ iou_nms_threshold: 0.5 # NMS去重阈值
|
|
|
+ rtdetr_conf_threshold: 0.5 # RT-DETR置信度阈值
|
|
|
+
|
|
|
+ # 功能开关
|
|
|
+ enable_ocr_compensation: true # 启用OCR边缘补偿
|
|
|
|
|
|
- # 单元格二次 OCR(det 分行 + 整格兜底 + 低分块过滤)
|
|
|
+ # 单元格二次 OCR(det 分行 + 整格/条带兜底 + 低分笔画增强重试)
|
|
|
second_pass_ocr:
|
|
|
- line_min_score: 0.8
|
|
|
+ reocr_mode: bank_statement # 表体空单元必跑 + 同行多数非空则空格也跑
|
|
|
+ header_row: 0 # 表头行号(0=首行)
|
|
|
+ row_peer_min_nonempty: 5 # 同行至少 N 个非空格时,本格空也触发二次 OCR
|
|
|
+ line_min_score: 0.8 # 低于此分的分行从文本与计分中丢弃
|
|
|
drop_low_score_blocks: true
|
|
|
- whole_cell_fallback: true
|
|
|
+ whole_cell_fallback: true # 整格 det=False 兜底 + 条带扫描
|
|
|
prefer_whole_on_tie: true
|
|
|
+ whole_longer_min_extra_chars: 2 # 整格/条带文本比分行多长至少 N 字则优先
|
|
|
+ strip_fallback_aspect_ratio: 1.8 # 高/宽>=该值且仅检出<=1行时滑动条带分行
|
|
|
+ suspicious_short_min_chars: 4 # 高分但过短仍跑整格/条带兜底(与 enhance_retry 无关)
|
|
|
+ cell_preprocess:
|
|
|
+ watermark:
|
|
|
+ enabled: true
|
|
|
+ method: threshold
|
|
|
+ denoise:
|
|
|
+ enabled: false # 小格 median 易糊笔画;lab 用 --denoise 对比
|
|
|
+ contrast:
|
|
|
+ enabled: false # Pass1 去水印后可选;lab 对比 text_restore
|
|
|
+ method: text_restore
|
|
|
+ text_black_target: 88
|
|
|
+ light:
|
|
|
+ upscale_min_side: 192 # 128, 192 用于难例日期列
|
|
|
+ enhance_retry:
|
|
|
+ enabled: false
|
|
|
+ # enabled: true 时 Pass2 预处理,默认见代码(cell_preprocess.enhance_retry 已废弃)
|
|
|
|
|
|
# Debug 可视化配置
|
|
|
debug_options:
|
|
|
- enabled: true # 由命令行 --debug / --debug-table 统一控制
|
|
|
+ enabled: false # 由命令行 --debug / --debug-table 统一控制
|
|
|
output_dir: null # null 时由 pipeline 按页注入
|
|
|
prefix: ""
|
|
|
- subdir: table_classification # 输出至 debug/table_classification/
|
|
|
- save_table_lines: true # paddle 线条检测叠加图
|
|
|
+ subdir: table_recognition_wired # 输出至 debug/table_recognition_wired/
|
|
|
+ save_table_lines: true
|
|
|
+ save_connected_components: true
|
|
|
+ save_grid_structure: true
|
|
|
+ save_text_overlay: true
|
|
|
image_format: "png"
|
|
|
+ # 单元格二次 OCR 裁剪图:debug/table_recognition_wired/tablecell_ocr/
|
|
|
|
|
|
# VLM 表格识别配置(当分类为 'wireless' 时使用)
|
|
|
vl_recognition:
|
|
|
@@ -187,6 +228,9 @@ vl_recognition:
|
|
|
# 表格识别特定配置
|
|
|
table_recognition:
|
|
|
|
|
|
+# ============================================================
|
|
|
+# 输出配置
|
|
|
+# ============================================================
|
|
|
output:
|
|
|
create_subdir: false
|
|
|
save_pdf_images: true
|