# zhengchun / ocr_platform
"""
银行流水水印去除工具

支持 PDF 和常见图片格式（jpg/png/tif/bmp/webp）。
- 输入 PDF → 输出去水印 PDF（扫描件）或直接复制（文字型）
- 输入图片 → 输出去水印图片（保持原格式）
适用于福建农信、邮储银行等带有半透明文字水印的银行流水单。

用法:
    # 处理单个 PDF 或图片
    python remove_watermark.py input.pdf
    python remove_watermark.py input.jpg

    # 指定输出路径
    python remove_watermark.py input.pdf -o output.pdf

    # 指定页面范围（支持 "1-5,7,9-12" 格式）
    python remove_watermark.py input.pdf --page-range 1-3

    # 调整去除阈值（默认 160，范围建议 140-180）
    python remove_watermark.py input.pdf --threshold 170

    # 批量处理目录下所有 PDF 和图片
    python remove_watermark.py /path/to/dir/ --batch

    # 预览单页/图片效果（不保存，直接展示对比图）
    python remove_watermark.py input.pdf --preview --page 0
    python remove_watermark.py input.jpg --preview
"""
import argparse
import sys
from pathlib import Path
from typing import Optional

# 将 ocr_platform 根目录加入 sys.path，以便导入 ocr_utils
_repo_root = Path(__file__).parents[2]
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

from loguru import logger
from ocr_utils.watermark_utils import (
    detect_watermark,
    remove_watermark_from_image,
    scan_pdf_watermark_xobjs,
    remove_txt_pdf_watermark,
)

# 支持的图片后缀（小写）
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}


def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
    """
    Try the native (non-rasterising) watermark removal on a text-based PDF.

    The input file is read into memory once. ``scan_pdf_watermark_xobjs`` is
    asked to sample 3 pages first; only when it reports something is the full
    pass ``remove_txt_pdf_watermark`` executed on the same bytes. The result
    of that pass is written to ``output_path`` unless it is ``None``.

    Args:
        input_path: PDF file to read.
        output_path: Destination of the cleaned PDF. It is not touched when
            nothing is removed. The parent directory is not created here.

    Returns:
        1 when cleaned bytes were written to ``output_path``; 0 when the
        quick scan found nothing or the removal pass returned ``None``.
    """
    raw = input_path.read_bytes()

    # The (potentially expensive) full pass is gated behind the quick scan.
    cleaned = (
        remove_txt_pdf_watermark(raw)
        if scan_pdf_watermark_xobjs(raw, sample_pages=3)
        else None
    )
    if cleaned is None:
        return 0

    output_path.write_bytes(cleaned)
    return 1


def process_document(
    input_path: Path,
    output_path: Path,
    threshold: int = 160,
    morph_close_kernel: int = 0,
    dpi: int = 200,
    page_range: Optional[str] = None,
    force_image: bool = False,
) -> int:
    """
    Remove the watermark from one PDF or image file and save the result.

    The input is loaded and classified with
    ``PDFUtils.load_and_classify_document``; what happens next depends on
    the kind of input:

    - Text PDF (``pdf_type == "txt"``) without ``force_image``: native
      removal via ``_try_remove_txt_pdf_watermark`` is tried first. If it
      removes nothing, the first rendered page is checked with
      ``detect_watermark`` at a low ratio threshold (0.005); a hit falls
      back to the image pipeline, a miss copies the input file unchanged.
    - Text PDF with ``force_image``: goes straight to the image pipeline.
    - Any other PDF (scanned, per the original notes): the first page is
      checked at ratio threshold 0.025; every page is then re-embedded as a
      PNG in a new PDF, cleaned when a watermark was detected and unchanged
      otherwise.
    - Image: checked at ratio threshold 0.025; copied unchanged when clean,
      otherwise cleaned and saved as a grayscale (mode "L") image.

    Detection always looks at the first page/image only.

    Note that ``page_range`` only affects which pages are rendered for the
    image pipeline. The native text-PDF removal and the plain copy both
    operate on the whole input file.

    Args:
        input_path: Input file (PDF or image).
        output_path: Output file; its parent directory is created if needed.
        threshold: Gray threshold forwarded to ``remove_watermark_from_image``
            (the original notes suggest 140-180: larger is more conservative,
            smaller is more aggressive).
        morph_close_kernel: Morphological closing kernel size forwarded to
            ``remove_watermark_from_image``; 0 skips it per the original notes.
        dpi: Rendering resolution for PDFs; also used to size output pages.
        page_range: 1-based page range string such as "1-5,7,9-12" (PDF only).
        force_image: Force the image pipeline for text PDFs (the output is
            made of page images, so text searchability is lost).

    Returns:
        The value returned by ``_try_remove_txt_pdf_watermark`` (1) when
        native removal succeeded; 0 when a text PDF was copied unchanged;
        1 when an image was copied unchanged; otherwise ``len(images)``,
        the number of pages/images written through the image pipeline.

    Raises:
        ValueError: If no page/image was loaded and the image pipeline or
            image-based detection would be needed.
        ImportError: If PyMuPDF is required but not installed.
    """
    import shutil
    import numpy as np
    from io import BytesIO
    from PIL import Image
    from ocr_utils.pdf_utils import PDFUtils

    is_pdf = input_path.suffix.lower() == ".pdf"

    # Unified load + classification. pdf_doc and renderer are part of the
    # returned tuple but are not used by this function.
    images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
        input_path, dpi=dpi, page_range=page_range
    )

    is_txt_pdf = is_pdf and pdf_type == "txt"

    # Set to True once the text-PDF branch has already decided a watermark is
    # present, so the shared detection below (stricter threshold) is skipped.
    _known_has_wm: Optional[bool] = None

    # Text PDF: try the native removal first, which needs no rendered pages.
    if is_txt_pdf and not force_image:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        removed = _try_remove_txt_pdf_watermark(input_path, output_path)
        if removed > 0:
            logger.info(
                f"✅ 文字型 PDF '{input_path.name}'：删除 {removed} 个水印 XObject，"
                "保留文字可搜索性，已保存。"
            )
            return removed

    # Everything below indexes images[0] or packs the pages into a PDF, so an
    # empty load is reported explicitly instead of surfacing as an IndexError
    # (or as a failure to save a zero-page document).
    if len(images) == 0:
        raise ValueError(
            f"未加载到任何页面/图片: {input_path}（page_range={page_range}）"
        )

    if is_txt_pdf:
        if force_image:
            logger.warning(
                f"⚠️  文字型 PDF '{input_path.name}'：--force-image 模式，"
                "强制图像化处理（输出将失去文字可搜索性）。"
            )
            _known_has_wm = True  # forced mode: no detection, always clean
        else:
            # Native removal found nothing: double-check on the rendered page
            # with a low ratio threshold (0.5%) to catch sparse watermarks.
            first_np = np.array(images[0]["img_pil"])
            if detect_watermark(first_np, ratio_threshold=0.005):
                logger.warning(
                    f"⚠️  文字型 PDF '{input_path.name}'：未找到 XObject 水印，"
                    "但图像检测发现水印（内联内容流水印），"
                    "回退为图像化处理（输出将失去文字可搜索性）。"
                )
                _known_has_wm = True  # confirmed; skip the shared detection
            else:
                logger.info(
                    f"✅ 文字型 PDF '{input_path.name}'：未检测到水印，直接复制。"
                )
                shutil.copy2(str(input_path), str(output_path))
                return 0

    logger.info(
        f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name}  "
        f"共 {len(images)} {'页' if is_pdf else '张'}  threshold={threshold}"
    )

    # Watermark detection uses the first page/image only.
    if _known_has_wm is not None:
        has_wm = _known_has_wm
        logger.info("🔍 检测到水印，启动去水印处理" if has_wm else "✅ 未检测到水印，跳过")
    else:
        first_np = np.array(images[0]["img_pil"])
        # Scanned PDF / image path: ratio threshold of 2.5%.
        has_wm = detect_watermark(first_np, ratio_threshold=0.025)
        if has_wm:
            logger.info("🔍 检测到水印，启动去水印处理")
        else:
            logger.info("✅ 未检测到水印，跳过去水印处理")
            if not is_pdf:
                # Clean image: copy the file as-is.
                output_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(str(input_path), str(output_path))
                return 1

    output_path.parent.mkdir(parents=True, exist_ok=True)

    if is_pdf:
        # Process page by page and re-pack the page images into a new PDF.
        try:
            import fitz
        except ImportError as err:
            raise ImportError("请安装 PyMuPDF: pip install PyMuPDF") from err

        new_doc = fitz.open()
        try:
            for i, img_dict in enumerate(images):
                pil_img = img_dict["img_pil"]

                if has_wm:
                    cleaned_gray = remove_watermark_from_image(
                        np.array(pil_img), threshold=threshold,
                        morph_close_kernel=morph_close_kernel, return_pil=False,
                    )
                    out_pil = Image.fromarray(cleaned_gray).convert("RGB")
                else:
                    out_pil = pil_img

                buf = BytesIO()
                out_pil.save(buf, format="PNG", optimize=False)

                # Page size in points derived from the rendered pixel size,
                # so the page keeps its physical size at the given DPI.
                w_px, h_px = out_pil.size
                new_page = new_doc.new_page(width=w_px * 72 / dpi, height=h_px * 72 / dpi)
                new_page.insert_image(new_page.rect, stream=buf.getvalue())

                if (i + 1) % 10 == 0 or i == len(images) - 1:
                    logger.info(f"  进度: {i + 1}/{len(images)}")

            new_doc.save(str(output_path), garbage=4, deflate=True)
        finally:
            # Release the in-memory document even if a page or the save fails.
            new_doc.close()
    else:
        # Image with a watermark: clean it and save the grayscale result.
        img_np = np.array(images[0]["img_pil"])
        cleaned_gray = remove_watermark_from_image(
            img_np, threshold=threshold,
            morph_close_kernel=morph_close_kernel, return_pil=False,
        )
        Image.fromarray(cleaned_gray, mode="L").save(str(output_path))

    logger.info(f"✅ 保存到: {output_path}")
    return len(images)


def preview_page(
    input_path: Path,
    page_idx: int = 0,
    threshold: int = 160,
    dpi: int = 200,
):
    """
    Show the original next to the watermark-removed result for one page/image.

    Nothing is saved; a matplotlib window with two panels is opened.

    Args:
        input_path: PDF file or image file (suffix in ``IMAGE_SUFFIXES``).
        page_idx: 0-based page index; only used for PDFs.
        threshold: Gray threshold forwarded to ``remove_watermark_from_image``.
        dpi: Rendering resolution; only used for PDFs.

    Raises:
        ImportError: If numpy/matplotlib (or PyMuPDF for PDFs) is missing.
        ValueError: If ``page_idx`` is past the last page, or the file
            suffix is not supported.
    """
    try:
        import numpy as np
        import matplotlib.pyplot as plt
        import matplotlib
        # Font fallbacks so the Chinese panel titles can be rendered.
        matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
        matplotlib.rcParams['axes.unicode_minus'] = False
    except ImportError as e:
        raise ImportError(f"预览需要 matplotlib: {e}") from e

    suffix = input_path.suffix.lower()

    if suffix == ".pdf":
        try:
            import fitz
        except ImportError as e:
            raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF") from e
        doc = fitz.open(str(input_path))
        try:
            if page_idx >= len(doc):
                raise ValueError(f"页码 {page_idx} 超出范围（共 {len(doc)} 页）")
            # Scale from PDF points (72 per inch) to the requested DPI.
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = doc[page_idx].get_pixmap(matrix=mat, alpha=False)
            # alpha=False: the buffer is reshaped as 3 channels per pixel.
            img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
        finally:
            # The document is only needed for rendering; always release it.
            doc.close()
        title_orig = f"原图  第 {page_idx + 1} 页"
    elif suffix in IMAGE_SUFFIXES:
        from PIL import Image
        with Image.open(str(input_path)) as src_img:
            img_np = np.array(src_img.convert("RGB"))
        title_orig = f"原图  {input_path.name}"
    else:
        raise ValueError(f"不支持的文件格式: {suffix}")

    cleaned = remove_watermark_from_image(img_np, threshold=threshold, return_pil=False)

    fig, axes = plt.subplots(1, 2, figsize=(20, 14))
    axes[0].imshow(img_np)
    axes[0].set_title(title_orig, fontsize=14)
    axes[0].axis('off')

    axes[1].imshow(cleaned, cmap='gray')
    axes[1].set_title(f"去水印后  threshold={threshold}", fontsize=14)
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()


def main():
    """
    Command-line entry point: parse arguments and dispatch to one of three modes.

    - ``--preview``: show a before/after comparison via ``preview_page``.
    - ``--batch``: process every supported file directly inside the input
      directory (non-recursive), writing ``<stem>_cleaned<suffix>`` files to
      ``--output`` or to ``<input>/cleaned``. A failure on one file is logged
      and does not stop the batch.
    - default: process a single PDF/image, writing to ``--output`` or to
      ``<stem>_cleaned<suffix>`` next to the input.

    Returns None on success. Calls ``sys.exit(1)`` when the input path is of
    the wrong kind for the chosen mode or the file format is unsupported.
    """
    parser = argparse.ArgumentParser(
        description="银行流水水印去除工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录（批量模式）")
    parser.add_argument("-o", "--output", type=Path, default=None,
                        help="输出路径（单文件模式；默认在原文件名后加 _cleaned）")
    parser.add_argument("--threshold", type=int, default=160,
                        help="灰度阈值 (140-180)，默认 160")
    parser.add_argument("--morph-kernel", type=int, default=2,
                        help="形态学闭运算核大小，0 跳过，默认 2")
    parser.add_argument("--dpi", type=int, default=200,
                        help="渲染 DPI，默认 200")
    parser.add_argument("--batch", action="store_true",
                        help="批量模式：处理目录下所有 PDF 和图片")
    parser.add_argument("--preview", action="store_true",
                        help="预览模式：展示单页对比图（不保存）")
    parser.add_argument("--page", type=int, default=0,
                        help="预览页码（0-based），默认第 0 页")
    parser.add_argument("--page-range", type=str, default=None,
                        help="处理页面范围，如 '1-3,5,7-9'（从 1 开始，仅对 PDF 有效）")
    parser.add_argument("--force-image", action="store_true",
                        help="强制对文字型 PDF 使用图像化处理（会失去可搜索性，适用于 XObject 方法无法去除的内联水印）")

    args = parser.parse_args()

    if args.preview:
        preview_page(
            args.input,
            page_idx=args.page,
            threshold=args.threshold,
            dpi=args.dpi,
        )
        return

    if args.batch:
        # Batch mode: every PDF and image directly inside the directory.
        input_dir = args.input
        if not input_dir.is_dir():
            logger.error(f"批量模式需要传入目录: {input_dir}")
            sys.exit(1)

        # Match on the lower-cased suffix, the same rule single-file mode
        # uses, so names like "X.PDF" or "a.Jpg" are not skipped on
        # case-sensitive filesystems. Directories are ignored.
        supported = IMAGE_SUFFIXES | {".pdf"}
        all_files: list[Path] = sorted(
            entry for entry in input_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in supported
        )

        if not all_files:
            logger.warning(f"目录中没有可处理的文件（PDF/图片）: {input_dir}")
            return
        out_dir = args.output or input_dir / "cleaned"
        out_dir.mkdir(parents=True, exist_ok=True)
        failed = 0
        for file in all_files:
            out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
            try:
                process_document(
                    file,
                    out_file,
                    threshold=args.threshold,
                    morph_close_kernel=args.morph_kernel,
                    dpi=args.dpi,
                    page_range=args.page_range,
                    force_image=args.force_image,
                )
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                failed += 1
                logger.error(f"❌ 处理失败 {file.name}: {e}")
        logger.info(f"✅ 批量处理完成，共 {len(all_files)} 个文件 -> {out_dir}")
        if failed:
            logger.warning(f"⚠️  其中 {failed} 个文件处理失败")
    else:
        # Single-file mode.
        input_path = args.input
        if not input_path.is_file():
            logger.error(f"文件不存在: {input_path}")
            sys.exit(1)
        output_path = args.output or input_path.with_name(
            f"{input_path.stem}_cleaned{input_path.suffix}"
        )
        suffix = input_path.suffix.lower()
        if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
            process_document(
                input_path,
                output_path,
                threshold=args.threshold,
                morph_close_kernel=args.morph_kernel,
                dpi=args.dpi,
                page_range=args.page_range,
                force_image=args.force_image,
            )
        else:
            logger.error(f"不支持的文件格式: {suffix}，支持 PDF 和 {IMAGE_SUFFIXES}")
            sys.exit(1)


if __name__ == "__main__":
    # Script entry. When launched without any CLI argument, a development
    # configuration is translated into sys.argv before main() parses it.
    if len(sys.argv) == 1:
        print("ℹ️  未提供命令行参数，使用默认配置运行...")

        # Development/test defaults. Commented entries are alternatives kept
        # for quick switching.
        default_config = {
            # Sample inputs
            # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
            # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",

            # Text-PDF samples
            # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
            # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
            "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台（ocr_platform）-交易流水识别，财报识别.pdf",
            # "output": "./output/杨万益_福建农信",
            # Optional page range ("1-5,7" syntax, PDF only); maps to --page-range
            # "page_range": "3",
            "dpi": 200,
            "threshold": 160,
            "morph_kernel": 0,  # closing is not needed in mask-replacement mode
            # "preview": True,
        }

        # "input" is the positional argument; every other key becomes a
        # --flag with underscores turned into hyphens.
        argv = [sys.argv[0], default_config["input"]]
        for key, value in default_config.items():
            if key == "input":
                continue
            flag = "--" + key.replace("_", "-")
            if isinstance(value, bool):
                # Booleans are store_true switches: emitted only when True.
                if value:
                    argv.append(flag)
                continue
            argv += [flag, str(value)]
        sys.argv = argv

    sys.exit(main())