| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- """水印 调试图保存(由 ocr_utils.watermark_utils 迁入)。"""
- from __future__ import annotations
- import json
- import re
- from pathlib import Path
- from typing import Any, Dict, Optional, Tuple, Union
- import cv2
- import numpy as np
- from loguru import logger
- from PIL import Image
- from ocr_utils.watermark.removal import render_watermark_mask_overlay
def _image_to_bgr_for_debug(img: np.ndarray) -> np.ndarray:
    """Convert an image array to the channel order ``cv2.imwrite`` expects.

    Colour input is treated as RGB(A) (for example the result of
    ``np.array(PIL.Image)``); its red and blue channels are swapped so the
    file written by OpenCV shows the intended colours. Grayscale input is
    expanded to three identical channels so it can be stacked next to
    colour images.

    Args:
        img: ``(H, W)`` grayscale, ``(H, W, 1)`` single-channel,
            ``(H, W, 3)`` RGB or ``(H, W, 4)`` RGBA array.

    Returns:
        A new array that never shares memory with ``img``: ``(H, W, 3)``
        BGR for grayscale / single-channel / RGB input and ``(H, W, 4)``
        BGRA for RGBA input. Any other layout is returned as a plain copy.
    """
    if img.ndim == 2:
        # Replicate the single plane into B, G and R.
        return np.repeat(img[:, :, np.newaxis], 3, axis=2)

    channels = img.shape[2] if img.ndim == 3 else None
    if channels == 1:
        return np.repeat(img, 3, axis=2)
    if channels == 3:
        # RGB -> BGR; ascontiguousarray materialises the reversed view so
        # OpenCV receives an ordinary C-contiguous buffer.
        return np.ascontiguousarray(img[:, :, ::-1])
    if channels == 4:
        # RGBA -> BGRA: swap red and blue, keep alpha in the last slot.
        # Without this, cv2.imwrite would read the data as BGRA and write
        # the image with red and blue exchanged.
        return np.ascontiguousarray(img[:, :, [2, 1, 0, 3]])
    return img.copy()
# Keys of ``processing_params`` that are left out of meta.json.
_WATERMARK_META_SKIP_KEYS = frozenset(
    {
        "midtone_mask",
        "wm_mask",
        "wm_candidate",
        "geom_region",
        "geom_candidate",
        "diag_region",
        "text_protect",
        "seal_protect",
        "hough_lines_bgr",
        "diag_ratio_heatmap",
        "hv_ratio_heatmap",
    }
)


def _write_debug_image(path: Path, image: np.ndarray) -> None:
    """Write *image* to *path* with OpenCV and log a warning if it fails."""
    # cv2.imwrite reports most failures only through its boolean result.
    if not cv2.imwrite(str(path), image):
        logger.warning(f"Failed to write watermark debug image: {path}")


def _meta_json_default(value: Any) -> Any:
    """Encode numpy scalars/arrays and paths that ``json`` rejects natively."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, Path):
        return str(value)
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def save_watermark_removal_debug(
    before: Union[np.ndarray, Image.Image],
    after: Union[np.ndarray, Image.Image],
    output_dir: Union[str, Path],
    page_name: str,
    *,
    processing_params: Optional[Dict[str, Any]] = None,
    image_format: str = "png",
    save_compare: bool = True,
    subdir: str = "watermark_removal",
    mask_overlay: Optional[np.ndarray] = None,
) -> Dict[str, str]:
    """Save watermark-removal debug artefacts for one page.

    Writes the before / after images, an optional side-by-side comparison,
    an optional mask overlay and a ``meta.json`` file into the directory
    returned by ``resolve_module_debug_dir(output_dir, subdir)`` (described
    by the original author as ``{output_dir}/debug/{subdir}/``, matching
    the module debug layout of universal_doc_parser).

    Args:
        before: Image before processing (ndarray or PIL image). It is passed
            through ``_image_to_bgr_for_debug`` before being written.
        after: Image after processing; handled the same way as ``before``.
        output_dir: Output root directory handed to
            ``resolve_module_debug_dir``.
        page_name: File-name prefix, e.g. ``doc_page_002``.
        processing_params: Parameters copied into meta.json. Keys listed in
            ``_WATERMARK_META_SKIP_KEYS`` are omitted; numpy scalars/arrays
            and ``Path`` values are converted to JSON-compatible types.
        image_format: Image file extension, with or without a leading dot;
            an empty value falls back to ``png``.
        save_compare: Whether to write the horizontally stacked
            before/after comparison image.
        subdir: Sub-directory name handed to ``resolve_module_debug_dir``.
        mask_overlay: Optional mask visualisation. When given it is passed
            through ``_image_to_bgr_for_debug`` and written as
            ``{page_name}_watermark_mask.{fmt}``.

    Returns:
        Mapping of artefact name to file path. ``before``, ``after`` and
        ``meta`` are always present; ``compare`` and ``mask`` only when the
        corresponding file was requested. A failed image write is reported
        with a warning log, the path is still returned.
    """
    if isinstance(before, Image.Image):
        before = np.array(before)
    if isinstance(after, Image.Image):
        after = np.array(after)

    # Imported lazily, as in the original implementation.
    from ocr_utils.module_debug_viz import resolve_module_debug_dir

    debug_dir = Path(resolve_module_debug_dir(output_dir, subdir))
    # NOTE(review): resolve_module_debug_dir presumably creates the
    # directory already; creating it here is idempotent and guards against
    # cv2.imwrite failing quietly on a missing directory.
    debug_dir.mkdir(parents=True, exist_ok=True)

    fmt = (image_format or "png").lstrip(".")
    before_bgr = _image_to_bgr_for_debug(before)
    after_bgr = _image_to_bgr_for_debug(after)

    paths: Dict[str, str] = {}
    before_path = debug_dir / f"{page_name}_watermark_before.{fmt}"
    after_path = debug_dir / f"{page_name}_watermark_after.{fmt}"
    _write_debug_image(before_path, before_bgr)
    _write_debug_image(after_path, after_bgr)
    paths["before"] = str(before_path)
    paths["after"] = str(after_path)

    if save_compare:
        if before_bgr.shape[2:] != after_bgr.shape[2:]:
            # np.hstack cannot join images with different channel counts.
            logger.warning(
                "Skipped watermark compare: channel mismatch "
                f"{before_bgr.shape} vs {after_bgr.shape}"
            )
        else:
            # Stretch the shorter image vertically so both share one height.
            h = max(before_bgr.shape[0], after_bgr.shape[0])
            if before_bgr.shape[0] != h:
                before_bgr = cv2.resize(before_bgr, (before_bgr.shape[1], h))
            if after_bgr.shape[0] != h:
                after_bgr = cv2.resize(after_bgr, (after_bgr.shape[1], h))
            compare = np.hstack([before_bgr, after_bgr])
            compare_path = debug_dir / f"{page_name}_watermark_compare.{fmt}"
            _write_debug_image(compare_path, compare)
            paths["compare"] = str(compare_path)
            logger.info(f"Saved watermark compare: {compare_path}")

    if mask_overlay is not None:
        mask_bgr = _image_to_bgr_for_debug(mask_overlay)
        mask_path = debug_dir / f"{page_name}_watermark_mask.{fmt}"
        _write_debug_image(mask_path, mask_bgr)
        paths["mask"] = str(mask_path)

    meta: Dict[str, Any] = {"page_name": page_name}
    if processing_params:
        meta.update(
            {
                k: v
                for k, v in processing_params.items()
                if k not in _WATERMARK_META_SKIP_KEYS
            }
        )
    # File paths are assigned last so they win over same-named parameters.
    meta["before"] = paths["before"]
    meta["after"] = paths["after"]
    for optional_key in ("compare", "mask"):
        if optional_key in paths:
            meta[optional_key] = paths[optional_key]

    meta_path = debug_dir / f"{page_name}_watermark_meta.json"
    meta_path.write_text(
        json.dumps(
            meta,
            ensure_ascii=False,
            indent=2,
            default=_meta_json_default,
        ),
        encoding="utf-8",
    )
    paths["meta"] = str(meta_path)
    logger.info(f"Saved watermark debug: {before_path}, {after_path}")
    return paths
|