# Source repository: zhengchun/ocr_platform
"""
银行流水水印去除工具

支持 PDF 和常见图片格式（jpg/png/tif/bmp/webp）。
参数默认从与 main_v2 相同的场景 YAML 读取（preprocessor.watermark_removal），
命令行仅用于输入/输出、批量、预览及少量覆盖项。

用法:
    # 使用默认场景配置（bank_statement_yusys_local.yaml）
    python remove_watermark.py input.pdf

    # 指定场景配置（与 Pipeline 一致）
    python remove_watermark.py input.png -c ../universal_doc_parser/config/bank_statement_yusys_local.yaml

    # 保存调试图（before/after/compare/meta）
    python remove_watermark.py input.png -o ./out --debug

    # 临时覆盖阈值（其余仍来自配置文件）
    python remove_watermark.py input.pdf --threshold 170

    # 预览
    python remove_watermark.py input.pdf --preview --page 0

    # 批量
    python remove_watermark.py /path/to/dir/ --batch -o ./cleaned

    # 对比 threshold vs masked_adaptive（输出三联图）
    python remove_watermark.py page_002.png --compare-methods -o ./method_compare
"""
import argparse
import copy
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional

import cv2
import numpy as np
import yaml

# Add the ocr_platform repository root to sys.path so that ocr_utils can be imported.
# The root is taken to be two directory levels above the folder containing this file.
_repo_root = Path(__file__).parents[2]
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

from loguru import logger
from ocr_utils.watermark_utils import (
    detect_watermark,
    remove_watermark_from_image_rgb,
    render_watermark_mask_overlay,
    save_watermark_removal_debug,
    save_watermark_mask_debug_layers,
    scan_pdf_watermark_xobjs,
    remove_txt_pdf_watermark,
)

# Image file extensions accepted by this tool (lower-case, leading dot included).
IMAGE_SUFFIXES = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".webp",
}

# Scene configuration file used when -c/--config is not supplied on the command line.
_DEFAULT_CONFIG_PATH = _repo_root.joinpath(
    "ocr_tools",
    "universal_doc_parser",
    "config",
    "bank_statement_yusys_local.yaml",
)


@dataclass
class WatermarkToolSettings:
    """Watermark-processing parameters parsed from a scene YAML file.

    The fields correspond to the ``preprocessor.watermark_removal`` section
    (plus ``input.dpi``) so that this tool runs with the same values as the
    pipeline preprocessor.
    """

    # Gray-level threshold forwarded to the removal routine.
    threshold: int = 160
    # Morphological-close kernel size forwarded to the removal routine.
    morph_close_kernel: int = 0
    # Rendering resolution used when rasterising PDF pages.
    dpi: int = 200
    # Name of the removal method (e.g. "threshold").
    method: str = "threshold"
    # Optional contrast-enhancement sub-config; None when absent.
    contrast_enhancement: Optional[Dict[str, Any]] = None
    # Optional debug sub-config; "image_format" is the only key read here.
    debug_options: Optional[Dict[str, Any]] = None
    # Whether watermark removal itself is switched on.
    watermark_enabled: bool = True
    # Full copy of the watermark_removal section, if one was loaded.
    watermark_config: Optional[Dict[str, Any]] = None

    @property
    def debug_image_format(self) -> str:
        """Return the debug image extension without a leading dot ("png" by default)."""
        configured = None
        if self.debug_options:
            configured = self.debug_options.get("image_format")
        extension = str(configured) if configured else "png"
        return extension.lstrip(".")


def _mapping_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _int_setting(section: Dict[str, Any], key: str, default: int) -> int:
    """Read an integer option, using *default* when the key is missing or null."""
    value = section.get(key)
    return default if value is None else int(value)


def _copied_mapping_or_none(value: Any) -> Optional[Dict[str, Any]]:
    """Deep-copy a non-empty dict; anything else (None, scalars, empty) becomes None."""
    if isinstance(value, dict) and value:
        return copy.deepcopy(value)
    return None


def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
    """
    Read ``preprocessor.watermark_removal`` and ``input.dpi`` from a scene YAML.

    The file is parsed directly with ``yaml.safe_load`` instead of going through
    the full ConfigManager, so debugging watermark handling does not require the
    layout/ocr sections to be present.

    Args:
        config_path: Path of the scene configuration file.

    Returns:
        A WatermarkToolSettings populated from the file; options that are
        missing or explicitly null fall back to the dataclass defaults.

    Raises:
        FileNotFoundError: ``config_path`` is not an existing file.
        ValueError: the YAML top level is not a mapping, or a numeric option
            cannot be converted to ``int``.
    """
    config_path = Path(config_path)
    if not config_path.is_file():
        raise FileNotFoundError(f"配置文件不存在: {config_path}")

    with open(config_path, encoding="utf-8") as f:
        raw = yaml.safe_load(f) or {}
    if not isinstance(raw, dict):
        # A list/scalar at the top level means this is not a scene config at all.
        raise ValueError(f"配置文件顶层应为映射: {config_path}")

    # Sections that are absent, null or of the wrong type are treated as empty
    # so that the defaults below apply instead of failing on ``.get``.
    preprocessor = _mapping_or_empty(raw.get("preprocessor"))
    wm = _mapping_or_empty(preprocessor.get("watermark_removal"))
    input_cfg = _mapping_or_empty(raw.get("input"))

    return WatermarkToolSettings(
        threshold=_int_setting(wm, "threshold", 160),
        morph_close_kernel=_int_setting(wm, "morph_close_kernel", 0),
        dpi=_int_setting(input_cfg, "dpi", 200),
        method=str(wm.get("method") or "threshold"),
        contrast_enhancement=_copied_mapping_or_none(wm.get("contrast_enhancement")),
        debug_options=_copied_mapping_or_none(wm.get("debug_options")),
        watermark_enabled=bool(wm.get("enabled", True)),
        # Independent copy of the whole section so later overrides cannot
        # alias the parsed YAML structure.
        watermark_config=copy.deepcopy(wm),
    )


def resolve_watermark_settings(
    config_path: Path,
    *,
    threshold: Optional[int] = None,
    morph_close_kernel: Optional[int] = None,
    dpi: Optional[int] = None,
    no_contrast: bool = False,
    text_black_target: Optional[int] = None,
    method: Optional[str] = None,
) -> WatermarkToolSettings:
    """Load the scene configuration and apply command-line overrides.

    Args:
        config_path: Scene YAML passed to ``load_watermark_settings``.
        threshold: Replacement for ``settings.threshold`` when not None.
        morph_close_kernel: Replacement for ``settings.morph_close_kernel``
            when not None.
        dpi: Replacement for ``settings.dpi`` when not None.
        no_contrast: Disable contrast enhancement. Takes precedence over
            ``text_black_target``.
        text_black_target: When given (and ``no_contrast`` is False), enable
            contrast enhancement and set its ``text_black_target``; a
            ``text_restore`` config is created if the file had none.
        method: Replacement removal method; also written into
            ``settings.watermark_config["method"]``.

    Returns:
        The loaded settings with the overrides applied.
    """
    settings = load_watermark_settings(config_path)

    if threshold is not None:
        settings.threshold = threshold
    if morph_close_kernel is not None:
        settings.morph_close_kernel = morph_close_kernel
    if dpi is not None:
        settings.dpi = dpi
    if method is not None:
        settings.method = method
        if settings.watermark_config is not None:
            settings.watermark_config["method"] = method

    if no_contrast:
        # no_contrast always wins: even when the file has no contrast section,
        # text_black_target must not silently switch enhancement back on.
        if settings.contrast_enhancement:
            disabled = copy.deepcopy(settings.contrast_enhancement)
            disabled["enabled"] = False
            settings.contrast_enhancement = disabled
    elif text_black_target is not None:
        current = settings.contrast_enhancement
        enabled = (
            copy.deepcopy(current)
            if current
            else {"enabled": True, "method": "text_restore"}
        )
        enabled["enabled"] = True
        enabled["text_black_target"] = text_black_target
        settings.contrast_enhancement = enabled

    return settings


def _watermark_removal_cfg_for_method(
    settings: WatermarkToolSettings,
    method: str,
) -> Dict[str, Any]:
    """Return a private copy of the watermark_removal config pinned to *method*.

    The copy also carries the current ``threshold`` and ``morph_close_kernel``
    from *settings*, so overrides applied to the settings object are reflected.
    """
    base_cfg = settings.watermark_config or {}
    merged = copy.deepcopy(base_cfg)
    merged.update(
        method=method,
        threshold=settings.threshold,
        morph_close_kernel=settings.morph_close_kernel,
    )
    return merged


def _apply_image_watermark_removal(
    img_np: np.ndarray,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    apply_watermark_removal: bool = True,
    removal_debug: Optional[Dict[str, Any]] = None,
) -> np.ndarray:
    """Run watermark removal (and optional contrast enhancement) on one image.

    Thin adapter around ``remove_watermark_from_image_rgb`` that builds the
    per-method config from *settings* and always returns a NumPy array.

    Args:
        img_np: Input image array.
        settings: Source of method, threshold and morph kernel.
        contrast_enhancement: Contrast config forwarded unchanged (may be None).
        apply_watermark_removal: Forwarded flag controlling the removal step.
        removal_debug: Optional dict forwarded to the helper for debug output.

    Returns:
        The helper's result converted with ``np.asarray``.
    """
    removal_cfg = _watermark_removal_cfg_for_method(settings, settings.method)
    processed = remove_watermark_from_image_rgb(
        img_np,
        threshold=settings.threshold,
        morph_close_kernel=settings.morph_close_kernel,
        contrast_enhancement=contrast_enhancement,
        apply_watermark_removal=apply_watermark_removal,
        watermark_removal_cfg=removal_cfg,
        removal_debug=removal_debug,
        return_pil=False,
    )
    return np.asarray(processed)


def _active_contrast_enhancement(
    settings: WatermarkToolSettings,
) -> Optional[Dict[str, Any]]:
    """Return the contrast-enhancement config only when it exists and is enabled."""
    enhancement = settings.contrast_enhancement
    if enhancement and enhancement.get("enabled", False):
        return enhancement
    return None


def _maybe_save_watermark_debug(
    before: np.ndarray,
    after: np.ndarray,
    debug_output_dir: Path,
    page_name: str,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    removal_debug: Optional[Dict[str, Any]] = None,
) -> None:
    """Write before/after/compare debug images through save_watermark_removal_debug.

    Args:
        before: Image prior to processing.
        after: Image after processing.
        debug_output_dir: Root directory handed to the debug writer.
        page_name: Name used for this page's debug files.
        settings: Source of method, threshold, morph kernel and image format.
        contrast_enhancement: Recorded in the parameters when truthy.
        removal_debug: Optional diagnostics from the removal step; selected
            scalar entries are recorded and ``wm_mask`` (if present) is
            rendered as an overlay on *before*.
    """
    processing_params: Dict[str, Any] = dict(
        method=settings.method,
        threshold=settings.threshold,
        morph_close_kernel=settings.morph_close_kernel,
    )
    if contrast_enhancement:
        processing_params["contrast_enhancement"] = contrast_enhancement

    # Treat a missing/empty diagnostics dict uniformly as "nothing to report".
    diagnostics = removal_debug or {}
    reported_keys = ("mode", "T_wm", "T_protect", "wm_mask_ratio", "white_pixel_ratio")
    processing_params.update(
        {name: diagnostics[name] for name in reported_keys if name in diagnostics}
    )

    overlay = (
        render_watermark_mask_overlay(before, diagnostics["wm_mask"])
        if "wm_mask" in diagnostics
        else None
    )

    save_watermark_removal_debug(
        before,
        after,
        debug_output_dir,
        page_name,
        processing_params=processing_params,
        image_format=settings.debug_image_format,
        save_compare=True,
        mask_overlay=overlay,
    )


def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
    """
    Attempt native watermark removal on a text PDF, keeping its text searchable.

    The work is delegated to ``ocr_utils.watermark_utils``:

    1. ``scan_pdf_watermark_xobjs`` is called with ``sample_pages=3``; a falsy
       result means no watermark was found and nothing is written.
    2. ``remove_txt_pdf_watermark`` processes the PDF bytes and returns the
       cleaned bytes, or None.
    3. Cleaned bytes are written to *output_path*.

    Args:
        input_path: PDF file to read.
        output_path: Destination written only when removal succeeds.

    Returns:
        1 when a cleaned PDF was written, 0 when no watermark was found.
    """
    source_bytes = input_path.read_bytes()

    found = scan_pdf_watermark_xobjs(source_bytes, sample_pages=3)
    cleaned_bytes = remove_txt_pdf_watermark(source_bytes) if found else None
    if cleaned_bytes is None:
        return 0

    output_path.write_bytes(cleaned_bytes)
    return 1


def _require_pages(images: Any, input_path: Path) -> None:
    """Raise ``ValueError`` when the loader returned no page/image to process."""
    if not images:
        raise ValueError(f"未加载到任何页面或图片: {input_path}")


def _clean_page_rgb(
    img_np: np.ndarray,
    page_name: str,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]],
    apply_watermark_removal: bool,
    save_debug: bool,
    debug_root: Path,
) -> np.ndarray:
    """Clean one page image, optionally dump debug images, return the PIL-ready array."""
    # Snapshot the input only when it is needed for the debug comparison.
    before = img_np.copy() if save_debug else None
    removal_dbg: Dict[str, Any] = {}
    cleaned = _apply_image_watermark_removal(
        img_np,
        settings=settings,
        contrast_enhancement=contrast_enhancement,
        apply_watermark_removal=apply_watermark_removal,
        removal_debug=removal_dbg,
    )
    if save_debug:
        _maybe_save_watermark_debug(
            before,
            cleaned,
            debug_root,
            page_name,
            settings=settings,
            contrast_enhancement=contrast_enhancement,
            removal_debug=removal_dbg,
        )
    # The removal output is channel-swapped before being handed to PIL.
    # NOTE(review): confirm the channel order returned by
    # remove_watermark_from_image_rgb(return_pil=False).
    return cv2.cvtColor(cleaned, cv2.COLOR_BGR2RGB)


def process_document(
    input_path: Path,
    output_path: Path,
    settings: WatermarkToolSettings,
    page_range: Optional[str] = None,
    force_image: bool = False,
    save_debug: bool = False,
    debug_output_dir: Optional[Path] = None,
    apply_watermark_removal: Optional[bool] = None,
) -> int:
    """
    Remove watermarks from one PDF or image file and save the result.

    The input is loaded and classified with
    ``PDFUtils.load_and_classify_document``:

    - Text PDF (``pdf_type == 'txt'``): native XObject watermark removal is
      tried first (keeps the text searchable). If that finds nothing, the first
      page is checked with image-based detection; a hit falls back to the image
      path, a miss copies the input unchanged. ``force_image=True`` skips the
      native attempt and goes straight to the image path.
    - Other PDFs: every rendered page is cleaned and the pages are re-packed
      into a new image-only PDF.
    - Images: the watermark is detected, removed and the image is saved; an
      image without a watermark is copied as-is.

    Detection looks at the first page/image only.

    Args:
        input_path: Input file (PDF or image).
        output_path: Output file path; parent directories are created.
        settings: Watermark settings (method / threshold / dpi / contrast ...).
        page_range: Page-range string such as "1-5,7,9-12" (1-based), forwarded
            to the loader.
        force_image: Force image-based processing for text PDFs (the output
            loses text searchability).
        save_debug: Save before/after/compare debug images.
        debug_output_dir: Root directory for debug images; defaults to the
            parent of *output_path*.
        apply_watermark_removal: Defaults to ``settings.watermark_enabled``.

    Returns:
        The value returned by the native removal helper when that path
        succeeds, 0 when a text PDF without watermark was copied, 1 when an
        image without watermark was copied, otherwise the number of
        pages/images loaded.

    Raises:
        ValueError: the loader produced no page/image to work on.
        ImportError: PyMuPDF is required for PDF output but not installed.
    """
    import shutil
    from io import BytesIO
    from PIL import Image
    from ocr_utils.pdf_utils import PDFUtils

    is_pdf = input_path.suffix.lower() == ".pdf"
    dpi = settings.dpi
    contrast_enhancement = _active_contrast_enhancement(settings)
    if apply_watermark_removal is None:
        apply_watermark_removal = settings.watermark_enabled

    # Load and classify in one step.
    # NOTE(review): the document handle and renderer returned by the loader are
    # neither used nor closed here — confirm whether the loader owns them.
    images, pdf_type, _pdf_doc, _renderer = PDFUtils.load_and_classify_document(
        input_path, dpi=dpi, page_range=page_range
    )
    is_txt_pdf = is_pdf and pdf_type == "txt"

    # Set to True when a text-PDF branch already decided that a watermark is
    # present, so the stricter generic detection below is skipped.
    _known_has_wm: Optional[bool] = None

    if is_txt_pdf and not force_image:
        # Text PDF: try native XObject removal first to keep searchability.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        removed = _try_remove_txt_pdf_watermark(input_path, output_path)
        if removed > 0:
            logger.info(
                f"✅ 文字型 PDF '{input_path.name}'：删除 {removed} 个水印 XObject，"
                "保留文字可搜索性，已保存。"
            )
            return removed

        # Nothing found natively: double-check with image detection using a
        # low ratio threshold (0.5%), since text PDFs have a clean background
        # and sparse text watermarks would otherwise be missed.
        _require_pages(images, input_path)
        first_np = np.array(images[0]["img_pil"])
        if detect_watermark(first_np, ratio_threshold=0.005):
            logger.warning(
                f"⚠️  文字型 PDF '{input_path.name}'：未找到 XObject 水印，"
                "但图像检测发现水印（内联内容流水印），"
                "回退为图像化处理（输出将失去文字可搜索性）。"
            )
            _known_has_wm = True
        else:
            logger.info(
                f"✅ 文字型 PDF '{input_path.name}'：未检测到水印，直接复制。"
            )
            shutil.copy2(str(input_path), str(output_path))
            return 0
    elif is_txt_pdf:
        logger.warning(
            f"⚠️  文字型 PDF '{input_path.name}'：--force-image 模式，"
            "强制图像化处理（输出将失去文字可搜索性）。"
        )
        # force_image never re-detects; removal is applied unconditionally.
        _known_has_wm = True

    # Everything below needs at least one rendered page/image.
    _require_pages(images, input_path)

    logger.info(
        f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name}  "
        f"共 {len(images)} {'页' if is_pdf else '张'}  "
        f"method={settings.method} threshold={settings.threshold}"
    )

    contrast_only = bool(
        not apply_watermark_removal
        and contrast_enhancement
        and contrast_enhancement.get("enabled", False)
    )

    # Watermark detection uses the first page/image only.
    if contrast_only:
        has_wm = True
        logger.info("📋 配置关闭去水印，仅应用 contrast_enhancement")
    elif _known_has_wm is not None:
        has_wm = _known_has_wm
        logger.info("🔍 检测到水印，启动去水印处理" if has_wm else "✅ 未检测到水印，跳过")
    else:
        first_np = np.array(images[0]["img_pil"])
        # Scanned PDFs / images use a 2.5% ratio threshold for detection.
        has_wm = detect_watermark(first_np, ratio_threshold=0.025)
        if has_wm:
            logger.info("🔍 检测到水印，启动去水印处理")
        else:
            logger.info("✅ 未检测到水印，跳过去水印处理")
            if not is_pdf:
                # Image without watermark: copy it unchanged.
                output_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(str(input_path), str(output_path))
                return 1

    output_path.parent.mkdir(parents=True, exist_ok=True)
    debug_root = debug_output_dir or output_path.parent

    if is_pdf:
        # Clean page by page, then re-pack the pages into a new PDF.
        try:
            import fitz
        except ImportError as exc:
            raise ImportError("请安装 PyMuPDF: pip install PyMuPDF") from exc

        new_doc = fitz.open()
        try:
            total = len(images)
            for i, img_dict in enumerate(images):
                pil_img = img_dict["img_pil"]
                if has_wm:
                    out_pil = Image.fromarray(
                        _clean_page_rgb(
                            np.array(pil_img),
                            f"{input_path.stem}_page_{i + 1:03d}",
                            settings=settings,
                            contrast_enhancement=contrast_enhancement,
                            apply_watermark_removal=apply_watermark_removal,
                            save_debug=save_debug,
                            debug_root=debug_root,
                        )
                    )
                else:
                    out_pil = pil_img

                buf = BytesIO()
                out_pil.save(buf, format="PNG", optimize=False)

                # Size the new page from the rendered pixel size and the DPI
                # so the page keeps its physical dimensions (72 pt per inch).
                w_px, h_px = out_pil.size
                new_page = new_doc.new_page(
                    width=w_px * 72 / dpi, height=h_px * 72 / dpi
                )
                new_page.insert_image(new_page.rect, stream=buf.getvalue())

                if (i + 1) % 10 == 0 or i == total - 1:
                    logger.info(f"  进度: {i + 1}/{total}")

            new_doc.save(str(output_path), garbage=4, deflate=True)
        finally:
            # Release the in-memory document even when a page fails.
            new_doc.close()
    else:
        # Image input: this point is only reached when processing is required.
        out_rgb = _clean_page_rgb(
            np.array(images[0]["img_pil"]),
            input_path.stem,
            settings=settings,
            contrast_enhancement=contrast_enhancement,
            apply_watermark_removal=apply_watermark_removal,
            save_debug=save_debug,
            debug_root=debug_root,
        )
        Image.fromarray(out_rgb).save(str(output_path))

    logger.info(f"✅ 保存到: {output_path}")
    return len(images)


def preview_page(
    input_path: Path,
    settings: WatermarkToolSettings,
    page_idx: int = 0,
):
    """Show the original page next to the cleaned result (requires matplotlib).

    Args:
        input_path: PDF or image file to preview.
        settings: Watermark settings used for rendering and removal.
        page_idx: 0-based page index; only used for PDF input.

    Raises:
        ImportError: matplotlib (or PyMuPDF for PDF input) is not installed.
        ValueError: the page index is past the end of the PDF, or the file
            suffix is not supported.
    """
    try:
        import matplotlib
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(f"预览需要 matplotlib: {e}") from e
    # Prefer fonts that can render the CJK titles used below.
    matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
    matplotlib.rcParams['axes.unicode_minus'] = False

    suffix = input_path.suffix.lower()

    if suffix == ".pdf":
        try:
            import fitz
        except ImportError as e:
            raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF") from e
        doc = fitz.open(str(input_path))
        try:
            if page_idx >= len(doc):
                raise ValueError(f"页码 {page_idx} 超出范围（共 {len(doc)} 页）")
            zoom = settings.dpi / 72
            pix = doc[page_idx].get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            # Copy so the array owns its pixels once the document is closed.
            img_np = (
                np.frombuffer(pix.samples, dtype=np.uint8)
                .reshape(pix.h, pix.w, 3)
                .copy()
            )
        finally:
            # The document handle was previously left open.
            doc.close()
        title_orig = f"原图  第 {page_idx + 1} 页"
    elif suffix in IMAGE_SUFFIXES:
        from PIL import Image
        # Context manager closes the underlying file once pixels are loaded.
        with Image.open(str(input_path)) as src:
            img_np = np.array(src.convert("RGB"))
        title_orig = f"原图  {input_path.name}"
    else:
        raise ValueError(f"不支持的文件格式: {suffix}")

    contrast = _active_contrast_enhancement(settings)
    cleaned_rgb = _apply_image_watermark_removal(
        img_np,
        settings=settings,
        contrast_enhancement=contrast,
        apply_watermark_removal=settings.watermark_enabled,
    )
    cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)

    fig, axes = plt.subplots(1, 2, figsize=(20, 14))
    axes[0].imshow(img_np)
    axes[0].set_title(title_orig, fontsize=14)
    axes[0].axis('off')

    subtitle = f"method={settings.method}, threshold={settings.threshold}"
    if contrast:
        subtitle += f", contrast={contrast.get('method', 'on')}"
    axes[1].imshow(cleaned, cmap='gray')
    axes[1].set_title(f"去水印后  {subtitle}", fontsize=14)
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()


def _run_process_document(
    input_path: Path,
    output_path: Path,
    settings: WatermarkToolSettings,
    *,
    page_range: Optional[str] = None,
    force_image: bool = False,
    save_debug: bool = False,
    debug_output_dir: Optional[Path] = None,
) -> int:
    """Forward to process_document with keyword-only options; return its result."""
    options = {
        "page_range": page_range,
        "force_image": force_image,
        "save_debug": save_debug,
        "debug_output_dir": debug_output_dir,
    }
    return process_document(input_path, output_path, settings, **options)


def compare_watermark_methods(
    input_path: Path,
    output_dir: Path,
    settings: WatermarkToolSettings,
) -> Dict[str, str]:
    """
    Run ``threshold`` and ``masked_adaptive`` on one image and save a comparison.

    For each method the cleaned image and a JSON file with selected
    diagnostics are written to *output_dir*; for ``masked_adaptive`` the mask
    debug layers are saved as well. Finally the original and both results are
    placed side by side in one PNG.

    Args:
        input_path: Image file to process.
        output_dir: Directory for all outputs (created if missing).
        settings: Base settings; only the method is varied per run.

    Returns:
        Mapping from output label to the written file path.
    """
    from PIL import Image

    output_dir.mkdir(parents=True, exist_ok=True)
    stem = input_path.stem
    # Context manager closes the source file once the pixels are loaded.
    with Image.open(str(input_path)) as src:
        img_rgb = np.array(src.convert("RGB"))
    contrast = _active_contrast_enhancement(settings)

    methods = ("threshold", "masked_adaptive")
    # Diagnostics copied from the removal debug dict into the meta JSON
    # (missing entries are recorded as null).
    meta_keys = (
        "mask_mode",
        "direction_filter",
        "whiten_mode",
        "T_wm",
        "T_protect",
        "mode",
        "midtone_ratio",
        "wm_candidate_ratio",
        "geom_mask_ratio",
        "geom_candidate_ratio",
        "wm_mask_ratio",
        "white_pixel_ratio",
        "hough_kept_lines",
        "hough_diag_candidates",
        "hough_total_lines",
        "dominant_angles",
        "whiten_gray_low",
    )

    paths: Dict[str, str] = {}
    results: Dict[str, np.ndarray] = {}

    for method in methods:
        sub = copy.deepcopy(settings)
        sub.method = method
        dbg: Dict[str, Any] = {}
        out = _apply_image_watermark_removal(
            img_rgb,
            settings=sub,
            contrast_enhancement=contrast,
            removal_debug=dbg,
        )
        out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
        results[method] = out_rgb

        out_path = output_dir / f"{stem}_cleaned_{method}.png"
        Image.fromarray(out_rgb).save(str(out_path))
        paths[method] = str(out_path)

        meta = {"method": method, "threshold": settings.threshold}
        meta.update({key: dbg.get(key) for key in meta_keys})
        meta_path = output_dir / f"{stem}_meta_{method}.json"
        meta_path.write_text(
            json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        paths[f"meta_{method}"] = str(meta_path)

        if method == "masked_adaptive":
            layer_paths = save_watermark_mask_debug_layers(
                img_rgb, output_dir, stem, dbg, image_format="png"
            )
            paths.update(layer_paths)

    # All three panels are scaled to the taller of the two results.
    target_h = max(results[m].shape[0] for m in methods)

    def _resize_rgb(arr: np.ndarray) -> np.ndarray:
        """Scale *arr* to ``target_h`` rows, keeping its aspect ratio."""
        if arr.shape[0] == target_h:
            return arr
        scale = target_h / arr.shape[0]
        return cv2.resize(arr, (int(arr.shape[1] * scale), target_h))

    triple = np.hstack(
        [_resize_rgb(img_rgb)] + [_resize_rgb(results[m]) for m in methods]
    )
    compare_path = output_dir / f"{stem}_compare_orig_threshold_masked.png"
    # Saved with PIL like the per-method outputs: cv2.imwrite only returns
    # False on failure, so a failed write would still have been logged as saved.
    Image.fromarray(triple).save(str(compare_path))
    paths["compare_triple"] = str(compare_path)
    logger.info(f"✅ 方法对比已保存: {compare_path}")
    return paths


def _collect_batch_inputs(input_dir: Path) -> list:
    """Return the sorted PDF/image files directly inside *input_dir* (any suffix case)."""
    supported = IMAGE_SUFFIXES | {".pdf"}
    return sorted(
        entry
        for entry in input_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in supported
    )


def _single_file_output_path(input_path: Path, requested: Optional[Path]) -> Path:
    """Resolve the output file for single-file mode.

    ``-o`` may name a file or a directory. An existing directory, or a path
    that does not exist yet and has no suffix, is treated as a directory and
    receives ``<stem>_cleaned<suffix>``.
    """
    default_name = f"{input_path.stem}_cleaned{input_path.suffix}"
    if requested is None:
        return input_path.with_name(default_name)
    if requested.is_dir() or (not requested.suffix and not requested.exists()):
        return requested / default_name
    return requested


def main():
    """Command-line entry point.

    Parses the arguments, loads the scene configuration with the command-line
    overrides applied, then dispatches to exactly one mode: method comparison,
    preview, batch processing of a directory, or single-file processing.
    Exits with status 1 on invalid input paths, an unsupported file type, or a
    missing configuration file.
    """
    parser = argparse.ArgumentParser(
        description="银行流水水印去除工具（参数默认来自场景 YAML，与 main_v2 Pipeline 一致）",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录（批量模式）")
    parser.add_argument(
        "-c",
        "--config",
        type=Path,
        default=_DEFAULT_CONFIG_PATH,
        help=f"场景配置文件，读取 preprocessor.watermark_removal（默认: {_DEFAULT_CONFIG_PATH.name}）",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="输出路径（单文件模式；默认在原文件名后加 _cleaned）",
    )
    parser.add_argument("--batch", action="store_true", help="批量处理目录下所有 PDF 和图片")
    parser.add_argument("--preview", action="store_true", help="预览模式：展示单页对比图（不保存）")
    parser.add_argument("--page", type=int, default=0, help="预览页码（0-based）")
    parser.add_argument(
        "--page-range",
        type=str,
        default=None,
        help="PDF 页面范围，如 '1-3,5,7-9'（从 1 开始）",
    )
    parser.add_argument(
        "--force-image",
        action="store_true",
        help="文字型 PDF 强制走图像去水印（失去可搜索性）",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="保存调试图到 debug/watermark_removal/",
    )
    parser.add_argument(
        "--debug-dir",
        type=Path,
        default=None,
        help="调试图根目录（默认 -o 的父目录；格式见配置文件 debug_options.image_format）",
    )
    # A small set of knobs that override the configuration file; anything not
    # given here comes entirely from the YAML.
    override = parser.add_argument_group("覆盖配置文件（可选）")
    override.add_argument(
        "--threshold",
        type=int,
        default=None,
        help="覆盖 watermark_removal.threshold（140-180）",
    )
    override.add_argument(
        "--morph-kernel",
        type=int,
        default=None,
        help="覆盖 watermark_removal.morph_close_kernel",
    )
    override.add_argument("--dpi", type=int, default=None, help="覆盖 input.dpi")
    override.add_argument("--no-contrast", action="store_true", help="关闭 contrast_enhancement")
    override.add_argument(
        "--text-black-target",
        type=int,
        default=None,
        help="覆盖 contrast_enhancement.text_black_target（text_restore）",
    )
    override.add_argument(
        "--method",
        type=str,
        default=None,
        choices=["threshold", "masked", "masked_adaptive"],
        help="覆盖 watermark_removal.method",
    )
    parser.add_argument(
        "--compare-methods",
        action="store_true",
        help="对比 threshold 与 masked_adaptive，输出三联图到 -o 目录",
    )

    args = parser.parse_args()

    try:
        settings = resolve_watermark_settings(
            args.config,
            threshold=args.threshold,
            morph_close_kernel=args.morph_kernel,
            dpi=args.dpi,
            no_contrast=args.no_contrast,
            text_black_target=args.text_black_target,
            method=args.method,
        )
    except FileNotFoundError as e:
        logger.error(str(e))
        sys.exit(1)

    logger.info(
        f"📋 配置: {args.config} | method={settings.method} | "
        f"threshold={settings.threshold} | morph_kernel={settings.morph_close_kernel} | "
        f"dpi={settings.dpi} | contrast={settings.contrast_enhancement}"
    )

    if args.compare_methods:
        input_path = args.input
        if not input_path.is_file():
            logger.error(f"文件不存在: {input_path}")
            sys.exit(1)
        out_dir = args.output or (
            input_path.parent / "debug" / "watermark_method_compare"
        )
        paths = compare_watermark_methods(input_path, out_dir, settings)
        for k, v in paths.items():
            logger.info(f"  {k}: {v}")
        return

    if args.preview:
        # Same existence check as the other single-file modes, so a typo in
        # the path gives a clear message instead of a library traceback.
        if not args.input.is_file():
            logger.error(f"文件不存在: {args.input}")
            sys.exit(1)
        preview_page(args.input, settings, page_idx=args.page)
        return

    if args.batch:
        # Batch mode: process every PDF and image directly inside the directory.
        input_dir = args.input
        if not input_dir.is_dir():
            logger.error(f"批量模式需要传入目录: {input_dir}")
            sys.exit(1)

        # Suffixes are compared case-insensitively so that e.g. "scan.PDF"
        # or "page.Jpg" are picked up as well.
        all_files = _collect_batch_inputs(input_dir)

        if not all_files:
            logger.warning(f"目录中没有可处理的文件（PDF/图片）: {input_dir}")
            return
        out_dir = args.output or input_dir / "cleaned"
        out_dir.mkdir(parents=True, exist_ok=True)
        for file in all_files:
            out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
            try:
                _run_process_document(
                    file,
                    out_file,
                    settings,
                    page_range=args.page_range,
                    force_image=args.force_image,
                    save_debug=args.debug,
                    debug_output_dir=args.debug_dir or out_dir,
                )
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                logger.error(f"❌ 处理失败 {file.name}: {e}")
        logger.info(f"✅ 批量处理完成，共 {len(all_files)} 个文件 -> {out_dir}")
    else:
        # Single-file mode.
        input_path = args.input
        if not input_path.is_file():
            logger.error(f"文件不存在: {input_path}")
            sys.exit(1)
        suffix = input_path.suffix.lower()
        if suffix != ".pdf" and suffix not in IMAGE_SUFFIXES:
            logger.error(f"不支持的文件格式: {suffix}，支持 PDF 和 {IMAGE_SUFFIXES}")
            sys.exit(1)
        # "-o ./out" (a directory) is accepted: the image writer needs a real
        # file name with a suffix to choose the output format.
        output_path = _single_file_output_path(input_path, args.output)
        _run_process_document(
            input_path,
            output_path,
            settings,
            page_range=args.page_range,
            force_image=args.force_image,
            save_debug=args.debug,
            debug_output_dir=args.debug_dir or output_path.parent,
        )


if __name__ == "__main__":
    # With no command-line arguments, synthesise an argv from a built-in
    # development configuration before handing over to main().
    if len(sys.argv) == 1:
        print("ℹ️  未提供命令行参数，使用默认配置运行...")

        # Default configuration (for development testing).
        default_config = {
            # Test inputs
            # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
            # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",

            # Text-PDF tests
            # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
            # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
            # "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台（ocr_platform）-交易流水识别，财报识别.pdf",
            "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/彭_广东兴宁农村商业银行/彭_广东兴宁农村商业银行_page_002.png",
            # "output": "./output/杨万益_福建农信",
            # Page range (optional, "1-5,7" syntax, PDF only; maps to --page-range)
            # "page_range": "3",
            "config": str(_DEFAULT_CONFIG_PATH),
            "preview": True,
            "debug": True,
            "compare-methods": True,
        }

        # "input" is the positional argument; every other entry becomes a
        # --flag, with underscores turned into hyphens
        # (e.g. morph_kernel -> --morph-kernel).
        synthesized_argv = [sys.argv[0], default_config["input"]]
        for option, setting in default_config.items():
            if option == "input":
                continue
            switch = "--" + option.replace("_", "-")
            if not isinstance(setting, bool):
                synthesized_argv += [switch, str(setting)]
            elif setting:
                # Boolean options are bare switches, emitted only when True.
                synthesized_argv.append(switch)
        sys.argv = synthesized_argv

    sys.exit(main())