|
|
@@ -45,14 +45,15 @@ if str(_repo_root) not in sys.path:
|
|
|
sys.path.insert(0, str(_repo_root))
|
|
|
|
|
|
from loguru import logger
|
|
|
-from ocr_utils.watermark_utils import (
|
|
|
+from ocr_utils.watermark import (
|
|
|
+ WatermarkProcessor,
|
|
|
detect_watermark,
|
|
|
- remove_watermark_from_image_rgb,
|
|
|
+ merge_watermark_config,
|
|
|
+ remove_txt_pdf_watermark,
|
|
|
render_watermark_mask_overlay,
|
|
|
- save_watermark_removal_debug,
|
|
|
save_watermark_mask_debug_layers,
|
|
|
+ save_watermark_removal_debug,
|
|
|
scan_pdf_watermark_xobjs,
|
|
|
- remove_txt_pdf_watermark,
|
|
|
)
|
|
|
|
|
|
# 支持的图片后缀(小写)
|
|
|
@@ -72,6 +73,7 @@ class WatermarkToolSettings:
|
|
|
morph_close_kernel: int = 0
|
|
|
dpi: int = 200
|
|
|
method: str = "threshold"
|
|
|
+ scope: str = "page"
|
|
|
contrast_enhancement: Optional[Dict[str, Any]] = None
|
|
|
debug_options: Optional[Dict[str, Any]] = None
|
|
|
watermark_enabled: bool = True
|
|
|
@@ -83,11 +85,15 @@ class WatermarkToolSettings:
|
|
|
return str(opts.get("image_format") or "png").lstrip(".")
|
|
|
|
|
|
|
|
|
-def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
|
|
|
+def load_watermark_settings(
|
|
|
+ config_path: Path,
|
|
|
+ *,
|
|
|
+ scope: str = "page",
|
|
|
+) -> WatermarkToolSettings:
|
|
|
"""
|
|
|
从 universal_doc_parser 场景配置读取 preprocessor.watermark_removal 与 input.dpi。
|
|
|
|
|
|
- 不依赖完整 ConfigManager,避免仅调试水印时强依赖 layout/ocr 等段。
|
|
|
+ scope=cell 时读取 table_recognition_wired.second_pass_ocr.cell_preprocess.watermark。
|
|
|
"""
|
|
|
config_path = Path(config_path)
|
|
|
if not config_path.is_file():
|
|
|
@@ -96,25 +102,33 @@ def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
|
|
|
with open(config_path, encoding="utf-8") as f:
|
|
|
raw = yaml.safe_load(f) or {}
|
|
|
|
|
|
- preprocessor = raw.get("preprocessor") or {}
|
|
|
- wm = preprocessor.get("watermark_removal") or {}
|
|
|
input_cfg = raw.get("input") or {}
|
|
|
+ if scope == "cell":
|
|
|
+ wired = raw.get("table_recognition_wired") or {}
|
|
|
+ sp = wired.get("second_pass_ocr") or {}
|
|
|
+ cpp = sp.get("cell_preprocess") or {}
|
|
|
+ wm_user = cpp.get("watermark") or {}
|
|
|
+ wm_full = merge_watermark_config("cell", wm_user)
|
|
|
+ else:
|
|
|
+ preprocessor = raw.get("preprocessor") or {}
|
|
|
+ wm_user = preprocessor.get("watermark_removal") or {}
|
|
|
+ wm_full = merge_watermark_config("page", wm_user)
|
|
|
|
|
|
- contrast = wm.get("contrast_enhancement")
|
|
|
+ contrast = wm_full.get("contrast_enhancement")
|
|
|
if contrast is not None and not isinstance(contrast, dict):
|
|
|
contrast = None
|
|
|
|
|
|
- wm_full = copy.deepcopy(wm)
|
|
|
return WatermarkToolSettings(
|
|
|
- threshold=int(wm.get("threshold", 160)),
|
|
|
- morph_close_kernel=int(wm.get("morph_close_kernel", 0)),
|
|
|
+ threshold=int(wm_full.get("threshold", 160)),
|
|
|
+ morph_close_kernel=int(wm_full.get("morph_close_kernel", 0)),
|
|
|
dpi=int(input_cfg.get("dpi", 200)),
|
|
|
- method=str(wm.get("method") or "threshold"),
|
|
|
+ method=str(wm_full.get("method") or "masked_adaptive"),
|
|
|
+ scope=scope,
|
|
|
contrast_enhancement=copy.deepcopy(contrast) if contrast else None,
|
|
|
- debug_options=copy.deepcopy(wm.get("debug_options"))
|
|
|
- if wm.get("debug_options")
|
|
|
+ debug_options=copy.deepcopy(wm_full.get("debug_options"))
|
|
|
+ if wm_full.get("debug_options")
|
|
|
else None,
|
|
|
- watermark_enabled=bool(wm.get("enabled", True)),
|
|
|
+ watermark_enabled=bool(wm_full.get("enabled", True)),
|
|
|
watermark_config=wm_full,
|
|
|
)
|
|
|
|
|
|
@@ -122,6 +136,7 @@ def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
|
|
|
def resolve_watermark_settings(
|
|
|
config_path: Path,
|
|
|
*,
|
|
|
+ scope: str = "page",
|
|
|
threshold: Optional[int] = None,
|
|
|
morph_close_kernel: Optional[int] = None,
|
|
|
dpi: Optional[int] = None,
|
|
|
@@ -130,7 +145,7 @@ def resolve_watermark_settings(
|
|
|
method: Optional[str] = None,
|
|
|
) -> WatermarkToolSettings:
|
|
|
"""加载配置并应用命令行覆盖。"""
|
|
|
- settings = load_watermark_settings(config_path)
|
|
|
+ settings = load_watermark_settings(config_path, scope=scope)
|
|
|
|
|
|
if threshold is not None:
|
|
|
settings.threshold = threshold
|
|
|
@@ -176,21 +191,19 @@ def _apply_image_watermark_removal(
|
|
|
contrast_enhancement: Optional[Dict[str, Any]] = None,
|
|
|
apply_watermark_removal: bool = True,
|
|
|
removal_debug: Optional[Dict[str, Any]] = None,
|
|
|
+ scope: str = "page",
|
|
|
) -> np.ndarray:
|
|
|
"""与 universal_doc_parser 一致的 RGB 去水印 + 可选对比度增强。"""
|
|
|
- wm_cfg = _watermark_removal_cfg_for_method(settings, settings.method)
|
|
|
- return np.asarray(
|
|
|
- remove_watermark_from_image_rgb(
|
|
|
- img_np,
|
|
|
- threshold=settings.threshold,
|
|
|
- morph_close_kernel=settings.morph_close_kernel,
|
|
|
- contrast_enhancement=contrast_enhancement,
|
|
|
- apply_watermark_removal=apply_watermark_removal,
|
|
|
- watermark_removal_cfg=wm_cfg,
|
|
|
- removal_debug=removal_debug,
|
|
|
- return_pil=False,
|
|
|
- )
|
|
|
+ proc = WatermarkProcessor(settings.watermark_config or {}, scope=scope) # type: ignore[arg-type]
|
|
|
+ apply_contrast = contrast_enhancement is not None
|
|
|
+ cleaned, _ = proc.process(
|
|
|
+ img_np,
|
|
|
+ apply_removal=apply_watermark_removal,
|
|
|
+ contrast_override=contrast_enhancement,
|
|
|
+ removal_debug=removal_debug,
|
|
|
+ force=scope == "cell",
|
|
|
)
|
|
|
+ return np.asarray(cleaned)
|
|
|
|
|
|
|
|
|
def _active_contrast_enhancement(
|
|
|
@@ -418,6 +431,7 @@ def process_document(
|
|
|
contrast_enhancement=contrast_enhancement,
|
|
|
apply_watermark_removal=apply_watermark_removal,
|
|
|
removal_debug=removal_dbg,
|
|
|
+ scope=settings.scope,
|
|
|
)
|
|
|
if save_debug:
|
|
|
_maybe_save_watermark_debug(
|
|
|
@@ -459,6 +473,7 @@ def process_document(
|
|
|
contrast_enhancement=contrast_enhancement,
|
|
|
apply_watermark_removal=apply_watermark_removal,
|
|
|
removal_debug=removal_dbg,
|
|
|
+ scope=settings.scope,
|
|
|
)
|
|
|
if save_debug:
|
|
|
_maybe_save_watermark_debug(
|
|
|
@@ -519,6 +534,7 @@ def preview_page(
|
|
|
settings=settings,
|
|
|
contrast_enhancement=contrast,
|
|
|
apply_watermark_removal=settings.watermark_enabled,
|
|
|
+ scope=settings.scope,
|
|
|
)
|
|
|
cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)
|
|
|
|
|
|
@@ -584,11 +600,13 @@ def compare_watermark_methods(
|
|
|
sub = copy.deepcopy(settings)
|
|
|
sub.method = method
|
|
|
dbg: Dict[str, Any] = {}
|
|
|
+ sub.watermark_config = _watermark_removal_cfg_for_method(sub, method)
|
|
|
out = _apply_image_watermark_removal(
|
|
|
img_rgb,
|
|
|
settings=sub,
|
|
|
contrast_enhancement=contrast,
|
|
|
removal_debug=dbg,
|
|
|
+ scope=settings.scope,
|
|
|
)
|
|
|
out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
|
|
|
results[method] = out_rgb
|
|
|
@@ -726,6 +744,13 @@ def main():
|
|
|
help="覆盖 watermark_removal.method",
|
|
|
)
|
|
|
parser.add_argument(
|
|
|
+ "--scope",
|
|
|
+ type=str,
|
|
|
+ default="page",
|
|
|
+ choices=["page", "cell"],
|
|
|
+ help="page=页级 preprocessor;cell=二次 OCR 单元格 preset",
|
|
|
+ )
|
|
|
+ parser.add_argument(
|
|
|
"--compare-methods",
|
|
|
action="store_true",
|
|
|
help="对比 threshold 与 masked_adaptive,输出三联图到 -o 目录",
|
|
|
@@ -736,6 +761,7 @@ def main():
|
|
|
try:
|
|
|
settings = resolve_watermark_settings(
|
|
|
args.config,
|
|
|
+ scope=args.scope,
|
|
|
threshold=args.threshold,
|
|
|
morph_close_kernel=args.morph_kernel,
|
|
|
dpi=args.dpi,
|