| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896 |
- """
- 银行流水水印去除工具
- 支持 PDF 和常见图片格式(jpg/png/tif/bmp/webp)。
- 参数默认从与 main_v2 相同的场景 YAML 读取(preprocessor.watermark_removal),
- 命令行仅用于输入/输出、批量、预览及少量覆盖项。
- 用法:
- # 使用默认场景配置(bank_statement_yusys_local.yaml)
- python remove_watermark.py input.pdf
- # 指定场景配置(与 Pipeline 一致)
- python remove_watermark.py input.png -c ../universal_doc_parser/config/bank_statement_yusys_local.yaml
- # 保存调试图(before/after/compare/meta)
- python remove_watermark.py input.png -o ./out --debug
- # 临时覆盖阈值(其余仍来自配置文件)
- python remove_watermark.py input.pdf --threshold 170
- # 预览
- python remove_watermark.py input.pdf --preview --page 0
- # 批量
- python remove_watermark.py /path/to/dir/ --batch -o ./cleaned
- # 对比 threshold vs masked_adaptive(输出三联图)
- python remove_watermark.py page_002.png --compare-methods -o ./method_compare
- """
- import argparse
- import copy
- import json
- import sys
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Any, Dict, Optional
- import cv2
- import numpy as np
- import yaml
- # 将 ocr_platform 根目录加入 sys.path,以便导入 ocr_utils
- _repo_root = Path(__file__).parents[2]
- if str(_repo_root) not in sys.path:
- sys.path.insert(0, str(_repo_root))
- from loguru import logger
- from ocr_utils.watermark import (
- WatermarkProcessor,
- detect_watermark,
- merge_watermark_config,
- remove_txt_pdf_watermark,
- render_watermark_mask_overlay,
- save_watermark_mask_debug_layers,
- save_watermark_removal_debug,
- scan_pdf_watermark_xobjs,
- )
# Image file suffixes (lower-case) accepted in addition to PDF input.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}

# Scene configuration used when -c/--config is not supplied on the command line.
_DEFAULT_CONFIG_PATH = _repo_root.joinpath(
    "ocr_tools/universal_doc_parser/config/bank_statement_yusys_local.yaml"
)
@dataclass
class WatermarkToolSettings:
    """Watermark-processing parameters resolved from a scene YAML file.

    The fields mirror the values read by ``load_watermark_settings``; the
    complete merged configuration dictionary is kept in ``watermark_config``.
    """

    threshold: int = 160
    morph_close_kernel: int = 0
    dpi: int = 200
    method: str = "threshold"
    scope: str = "page"
    contrast_enhancement: Optional[Dict[str, Any]] = None
    debug_options: Optional[Dict[str, Any]] = None
    watermark_enabled: bool = True
    watermark_config: Optional[Dict[str, Any]] = None

    @property
    def debug_image_format(self) -> str:
        """Return the debug image extension without a leading dot ("png" by default)."""
        configured = None
        if self.debug_options:
            configured = self.debug_options.get("image_format")
        extension = str(configured) if configured else "png"
        return extension.lstrip(".")
def load_watermark_settings(
    config_path: Path,
    *,
    scope: str = "page",
) -> WatermarkToolSettings:
    """Build ``WatermarkToolSettings`` from a universal_doc_parser scene YAML.

    For ``scope == "cell"`` the user section is read from
    ``table_recognition_wired.second_pass_ocr.cell_preprocess.watermark``;
    for any other scope it is read from ``preprocessor.watermark_removal``.
    The user section is passed through ``merge_watermark_config`` and the DPI
    comes from ``input.dpi``.

    Raises:
        FileNotFoundError: if ``config_path`` is not an existing file.
    """
    config_path = Path(config_path)
    if not config_path.is_file():
        raise FileNotFoundError(f"配置文件不存在: {config_path}")
    with open(config_path, encoding="utf-8") as fh:
        document = yaml.safe_load(fh) or {}

    input_section = document.get("input") or {}

    if scope == "cell":
        # Walk the nested path, treating every missing/empty level as {}.
        user_section = document
        for key in (
            "table_recognition_wired",
            "second_pass_ocr",
            "cell_preprocess",
            "watermark",
        ):
            user_section = user_section.get(key) or {}
        merged = merge_watermark_config("cell", user_section)
    else:
        preprocessor = document.get("preprocessor") or {}
        merged = merge_watermark_config(
            "page", preprocessor.get("watermark_removal") or {}
        )

    # Only a non-empty dict is accepted as a contrast configuration.
    contrast = merged.get("contrast_enhancement")
    contrast_copy = (
        copy.deepcopy(contrast) if isinstance(contrast, dict) and contrast else None
    )

    debug_opts = merged.get("debug_options")
    debug_copy = copy.deepcopy(debug_opts) if debug_opts else None

    return WatermarkToolSettings(
        threshold=int(merged.get("threshold", 160)),
        morph_close_kernel=int(merged.get("morph_close_kernel", 0)),
        dpi=int(input_section.get("dpi", 200)),
        method=str(merged.get("method") or "masked_adaptive"),
        scope=scope,
        contrast_enhancement=contrast_copy,
        debug_options=debug_copy,
        watermark_enabled=bool(merged.get("enabled", True)),
        watermark_config=merged,
    )
def resolve_watermark_settings(
    config_path: Path,
    *,
    scope: str = "page",
    threshold: Optional[int] = None,
    morph_close_kernel: Optional[int] = None,
    dpi: Optional[int] = None,
    no_contrast: bool = False,
    text_black_target: Optional[int] = None,
    method: Optional[str] = None,
) -> WatermarkToolSettings:
    """Load settings from ``config_path`` and apply command-line overrides.

    Every override is written both to the dataclass field and to
    ``settings.watermark_config``: the latter is the dictionary handed to
    ``WatermarkProcessor``, so an override that only touched the field would
    be reported in logs but not used for the actual processing.

    Args:
        config_path: Scene YAML file.
        scope: "page" or "cell"; forwarded to ``load_watermark_settings``.
        threshold: Replacement for ``threshold`` when not None.
        morph_close_kernel: Replacement for ``morph_close_kernel`` when not None.
        dpi: Replacement for the render DPI when not None.
        no_contrast: Disable an existing contrast_enhancement section.
        text_black_target: Enable contrast enhancement and set this target;
            ignored when ``no_contrast`` disabled an existing section.
        method: Replacement for ``method`` when not None.

    Raises:
        FileNotFoundError: propagated from ``load_watermark_settings``.
    """
    settings = load_watermark_settings(config_path, scope=scope)
    wm_cfg = settings.watermark_config

    if threshold is not None:
        settings.threshold = threshold
        if wm_cfg is not None:
            wm_cfg["threshold"] = threshold
    if morph_close_kernel is not None:
        settings.morph_close_kernel = morph_close_kernel
        if wm_cfg is not None:
            wm_cfg["morph_close_kernel"] = morph_close_kernel
    if dpi is not None:
        settings.dpi = dpi
    if method is not None:
        settings.method = method
        if wm_cfg is not None:
            wm_cfg["method"] = method

    contrast_changed = False
    if no_contrast and settings.contrast_enhancement:
        settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
        settings.contrast_enhancement["enabled"] = False
        contrast_changed = True
    elif text_black_target is not None:
        if not settings.contrast_enhancement:
            settings.contrast_enhancement = {"enabled": True, "method": "text_restore"}
        else:
            settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
            settings.contrast_enhancement["enabled"] = True
        settings.contrast_enhancement["text_black_target"] = text_black_target
        contrast_changed = True

    if contrast_changed and wm_cfg is not None:
        # NOTE(review): keeps the processor config in line with the override in
        # case WatermarkProcessor falls back to its own contrast section when
        # no explicit override is passed — confirm against ocr_utils.watermark.
        wm_cfg["contrast_enhancement"] = copy.deepcopy(settings.contrast_enhancement)

    return settings
def _watermark_removal_cfg_for_method(
    settings: WatermarkToolSettings,
    method: str,
) -> Dict[str, Any]:
    """Return a deep copy of the watermark config pinned to ``method``.

    ``threshold`` and ``morph_close_kernel`` are taken from the dataclass
    fields of ``settings`` so the copy reflects any overrides applied there.
    """
    pinned = {
        "method": method,
        "threshold": settings.threshold,
        "morph_close_kernel": settings.morph_close_kernel,
    }
    merged = copy.deepcopy(settings.watermark_config or {})
    merged.update(pinned)
    return merged
def _apply_image_watermark_removal(
    img_np: np.ndarray,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    apply_watermark_removal: bool = True,
    removal_debug: Optional[Dict[str, Any]] = None,
    scope: str = "page",
) -> np.ndarray:
    """Run ``WatermarkProcessor`` on one image and return the result as an array.

    Args:
        img_np: Input image array.
        settings: Source of the processor configuration (``watermark_config``).
        contrast_enhancement: Passed to the processor as ``contrast_override``.
        apply_watermark_removal: Passed to the processor as ``apply_removal``.
        removal_debug: Optional dict handed to the processor as
            ``removal_debug`` (callers read diagnostic values back from it).
        scope: Processor scope; ``force`` is set only for ``"cell"``.

    Returns:
        The first element of ``WatermarkProcessor.process(...)`` as ``np.ndarray``.
    """
    proc = WatermarkProcessor(settings.watermark_config or {}, scope=scope)  # type: ignore[arg-type]
    cleaned, _ = proc.process(
        img_np,
        apply_removal=apply_watermark_removal,
        contrast_override=contrast_enhancement,
        removal_debug=removal_debug,
        force=scope == "cell",
    )
    return np.asarray(cleaned)
def _active_contrast_enhancement(
    settings: WatermarkToolSettings,
) -> Optional[Dict[str, Any]]:
    """Return the contrast_enhancement dict only when it exists and is enabled."""
    contrast_cfg = settings.contrast_enhancement
    if contrast_cfg and contrast_cfg.get("enabled", False):
        return contrast_cfg
    return None
def _maybe_save_watermark_debug(
    before: np.ndarray,
    after: np.ndarray,
    debug_output_dir: Path,
    page_name: str,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    removal_debug: Optional[Dict[str, Any]] = None,
) -> None:
    """Write before/after debug artefacts through ``save_watermark_removal_debug``.

    The recorded parameters contain the method, threshold and morph kernel
    from ``settings``, the contrast configuration when given, and a fixed
    subset of diagnostic values copied from ``removal_debug``. When
    ``removal_debug`` holds a ``wm_mask`` entry, a mask overlay is rendered
    and passed along as well.
    """
    reported_keys = ("mode", "T_wm", "T_protect", "wm_mask_ratio", "white_pixel_ratio")

    processing_params: Dict[str, Any] = {
        "method": settings.method,
        "threshold": settings.threshold,
        "morph_close_kernel": settings.morph_close_kernel,
    }
    if contrast_enhancement:
        processing_params["contrast_enhancement"] = contrast_enhancement
    if removal_debug:
        processing_params.update(
            {key: removal_debug[key] for key in reported_keys if key in removal_debug}
        )

    overlay = (
        render_watermark_mask_overlay(before, removal_debug["wm_mask"])
        if removal_debug and "wm_mask" in removal_debug
        else None
    )

    save_watermark_removal_debug(
        before,
        after,
        debug_output_dir,
        page_name,
        processing_params=processing_params,
        image_format=settings.debug_image_format,
        save_compare=True,
        mask_overlay=overlay,
    )
- def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
- """
- 对文字型 PDF 执行原生水印去除,保留文字可搜索性。
- 内部委托给 watermark_utils.remove_txt_pdf_watermark() 完成内存流处理,
- 有水印时将结果写入 output_path。
- 流程:
- 1. scan_pdf_watermark_xobjs() 快速扫描前 3 页,无水印直接返回 0
- 2. remove_txt_pdf_watermark() 执行全量去除,返回 bytes 或 None
- 3. 有水印时写 output_path
- Returns:
- 1 表示去除成功,0 表示未发现水印
- """
- pdf_bytes = input_path.read_bytes()
- if not scan_pdf_watermark_xobjs(pdf_bytes, sample_pages=3):
- return 0
- cleaned = remove_txt_pdf_watermark(pdf_bytes)
- if cleaned is None:
- return 0
- output_path.write_bytes(cleaned)
- return 1
def process_document(
    input_path: Path,
    output_path: Path,
    settings: WatermarkToolSettings,
    page_range: Optional[str] = None,
    force_image: bool = False,
    save_debug: bool = False,
    debug_output_dir: Optional[Path] = None,
    apply_watermark_removal: Optional[bool] = None,
) -> int:
    """Remove watermarks from one PDF or image file and save the result.

    The document is loaded and classified by
    ``PDFUtils.load_and_classify_document``:

    - Text PDF (``pdf_type == 'txt'``): native XObject removal is tried first
      (keeps the text searchable). If nothing is removed, the first page is
      checked with ``detect_watermark``; a hit falls back to image processing,
      otherwise the input is copied unchanged. ``force_image=True`` skips the
      native attempt and goes straight to image processing.
    - Other PDFs: every page is processed and repacked into a new PDF.
    - Images: the watermark is detected, removed and the image saved; an
      image without a watermark is copied unchanged.

    Args:
        input_path: Input file (PDF or image).
        output_path: Output file.
        settings: Watermark configuration.
        page_range: Page range string such as "1-5,7,9-12", forwarded to the
            loader.
        force_image: Force image-based processing for text PDFs (the output
            loses text searchability).
        save_debug: Save debug artefacts through ``_maybe_save_watermark_debug``.
        debug_output_dir: Debug root; defaults to the parent of ``output_path``.
        apply_watermark_removal: Defaults to ``settings.watermark_enabled``.

    Returns:
        The value of ``_try_remove_txt_pdf_watermark`` when native removal
        succeeded, 0 when a text PDF was copied unchanged, 1 when an image was
        copied unchanged, otherwise the number of loaded pages/images.

    Raises:
        ValueError: if the loader returned no pages/images where at least one
            is needed.
        ImportError: if PyMuPDF is required but not installed.
    """
    import shutil
    from io import BytesIO

    from PIL import Image

    from ocr_utils.pdf_utils import PDFUtils

    is_pdf = input_path.suffix.lower() == ".pdf"
    dpi = settings.dpi
    contrast_enhancement = _active_contrast_enhancement(settings)
    if apply_watermark_removal is None:
        apply_watermark_removal = settings.watermark_enabled

    # Unified load + classification; only the images and the PDF type are used here.
    images, pdf_type, _pdf_doc, _renderer = PDFUtils.load_and_classify_document(
        input_path, dpi=dpi, page_range=page_range
    )

    def _require_images() -> None:
        """Fail with a clear message instead of an IndexError on images[0]."""
        if not images:
            raise ValueError(f"未能从文件加载任何页面/图片: {input_path}")

    # Set to True once a text-PDF branch has decided the document carries a
    # watermark, so the generic detection (stricter threshold) is skipped.
    known_has_wm: Optional[bool] = None

    if is_pdf and pdf_type == "txt" and not force_image:
        # Text PDF: try native XObject removal first to keep searchability.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        removed = _try_remove_txt_pdf_watermark(input_path, output_path)
        if removed > 0:
            logger.info(
                f"✅ 文字型 PDF '{input_path.name}':删除 {removed} 个水印 XObject,"
                "保留文字可搜索性,已保存。"
            )
            return removed
        # No XObject watermark: confirm with image detection at a low ratio
        # threshold (0.5%), since text PDFs have a clean background.
        _require_images()
        first_np = np.array(images[0]["img_pil"])
        if detect_watermark(first_np, ratio_threshold=0.005):
            logger.warning(
                f"⚠️ 文字型 PDF '{input_path.name}':未找到 XObject 水印,"
                "但图像检测发现水印(内联内容流水印),"
                "回退为图像化处理(输出将失去文字可搜索性)。"
            )
            known_has_wm = True
        else:
            logger.info(
                f"✅ 文字型 PDF '{input_path.name}':未检测到水印,直接复制。"
            )
            shutil.copy2(str(input_path), str(output_path))
            return 0
    elif is_pdf and pdf_type == "txt" and force_image:
        logger.warning(
            f"⚠️ 文字型 PDF '{input_path.name}':--force-image 模式,"
            "强制图像化处理(输出将失去文字可搜索性)。"
        )
        known_has_wm = True  # force_image: no detection, always process

    _require_images()
    logger.info(
        f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name} "
        f"共 {len(images)} {'页' if is_pdf else '张'} "
        f"method={settings.method} threshold={settings.threshold}"
    )

    contrast_only = (
        not apply_watermark_removal
        and contrast_enhancement
        and contrast_enhancement.get("enabled", False)
    )

    # Watermark detection uses the first page/image only.
    if contrast_only:
        has_wm = True
        logger.info("📋 配置关闭去水印,仅应用 contrast_enhancement")
    elif known_has_wm is not None:
        has_wm = known_has_wm
        logger.info("🔍 检测到水印,启动去水印处理" if has_wm else "✅ 未检测到水印,跳过")
    else:
        first_np = np.array(images[0]["img_pil"])
        # Scanned PDF / image path: ratio threshold of 2.5%.
        has_wm = detect_watermark(first_np, ratio_threshold=0.025)
        if has_wm:
            logger.info("🔍 检测到水印,启动去水印处理")
        else:
            logger.info("✅ 未检测到水印,跳过去水印处理")
            if not is_pdf:
                # Image without watermark: plain copy.
                output_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(str(input_path), str(output_path))
                return 1

    output_path.parent.mkdir(parents=True, exist_ok=True)
    debug_root = debug_output_dir or output_path.parent

    def _clean_to_pil(img_np: np.ndarray, page_name: str):
        """Clean one page array, optionally save debug output, return a PIL image."""
        before = img_np.copy()
        removal_dbg: Dict[str, Any] = {}
        cleaned_rgb = _apply_image_watermark_removal(
            img_np,
            settings=settings,
            contrast_enhancement=contrast_enhancement,
            apply_watermark_removal=apply_watermark_removal,
            removal_debug=removal_dbg,
            scope=settings.scope,
        )
        if save_debug:
            _maybe_save_watermark_debug(
                before,
                cleaned_rgb,
                debug_root,
                page_name,
                settings=settings,
                contrast_enhancement=contrast_enhancement,
                removal_debug=removal_dbg,
            )
        return Image.fromarray(cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB))

    if is_pdf:
        # Process page by page and repack into a new PDF.
        try:
            import fitz
        except ImportError as exc:
            raise ImportError("请安装 PyMuPDF: pip install PyMuPDF") from exc

        new_doc = fitz.open()
        try:
            for i, img_dict in enumerate(images):
                pil_img = img_dict["img_pil"]
                if has_wm:
                    page_name = f"{input_path.stem}_page_{i + 1:03d}"
                    out_pil = _clean_to_pil(np.array(pil_img), page_name)
                else:
                    out_pil = pil_img

                buf = BytesIO()
                out_pil.save(buf, format="PNG", optimize=False)
                buf.seek(0)

                # Page size in points derived from pixel size at the render DPI.
                w_px, h_px = out_pil.size
                new_page = new_doc.new_page(
                    width=w_px * 72 / dpi, height=h_px * 72 / dpi
                )
                new_page.insert_image(new_page.rect, stream=buf.read())

                if (i + 1) % 10 == 0 or i == len(images) - 1:
                    logger.info(f" 进度: {i + 1}/{len(images)}")

            new_doc.save(str(output_path), garbage=4, deflate=True)
        finally:
            # Release the in-memory document even when a page fails.
            new_doc.close()
    else:
        # Image with watermark (or contrast-only mode): clean and save.
        out_pil = _clean_to_pil(np.array(images[0]["img_pil"]), input_path.stem)
        out_pil.save(str(output_path))

    logger.info(f"✅ 保存到: {output_path}")
    return len(images)
def preview_page(
    input_path: Path,
    settings: WatermarkToolSettings,
    page_idx: int = 0,
):
    """Show the original and the cleaned page side by side (needs matplotlib).

    Args:
        input_path: PDF or image file.
        settings: Watermark configuration; ``settings.dpi`` is the PDF render DPI.
        page_idx: 0-based page index, used for PDF input only.

    Raises:
        ImportError: if matplotlib (or PyMuPDF for PDF input) is missing.
        ValueError: if ``page_idx`` is past the last page or the file suffix
            is not supported.
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib

        # Fonts able to render the CJK titles used below.
        matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
        matplotlib.rcParams['axes.unicode_minus'] = False
    except ImportError as e:
        raise ImportError(f"预览需要 matplotlib: {e}") from e

    suffix = input_path.suffix.lower()
    if suffix == ".pdf":
        try:
            import fitz
        except ImportError as exc:
            raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF") from exc

        doc = fitz.open(str(input_path))
        try:
            if page_idx >= len(doc):
                raise ValueError(f"页码 {page_idx} 超出范围(共 {len(doc)} 页)")
            mat = fitz.Matrix(settings.dpi / 72, settings.dpi / 72)
            pix = doc[page_idx].get_pixmap(matrix=mat, alpha=False)
            # Copy so the array owns its pixels before the document is closed.
            img_np = (
                np.frombuffer(pix.samples, dtype=np.uint8)
                .reshape(pix.h, pix.w, 3)
                .copy()
            )
        finally:
            doc.close()
        title_orig = f"原图 第 {page_idx + 1} 页"
    elif suffix in IMAGE_SUFFIXES:
        from PIL import Image

        # The context manager closes the file handle once pixels are loaded.
        with Image.open(str(input_path)) as pil_img:
            img_np = np.array(pil_img.convert("RGB"))
        title_orig = f"原图 {input_path.name}"
    else:
        raise ValueError(f"不支持的文件格式: {suffix}")

    contrast = _active_contrast_enhancement(settings)
    cleaned_rgb = _apply_image_watermark_removal(
        img_np,
        settings=settings,
        contrast_enhancement=contrast,
        apply_watermark_removal=settings.watermark_enabled,
        scope=settings.scope,
    )
    cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)

    fig, axes = plt.subplots(1, 2, figsize=(20, 14))
    axes[0].imshow(img_np)
    axes[0].set_title(title_orig, fontsize=14)
    axes[0].axis('off')

    subtitle = f"method={settings.method}, threshold={settings.threshold}"
    if contrast:
        subtitle += f", contrast={contrast.get('method', 'on')}"
    axes[1].imshow(cleaned, cmap='gray')
    axes[1].set_title(f"去水印后 {subtitle}", fontsize=14)
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()
def _run_process_document(
    input_path: Path,
    output_path: Path,
    settings: WatermarkToolSettings,
    *,
    page_range: Optional[str] = None,
    force_image: bool = False,
    save_debug: bool = False,
    debug_output_dir: Optional[Path] = None,
) -> int:
    """Forward to ``process_document`` with the options as keyword arguments.

    ``apply_watermark_removal`` is not passed, so ``process_document`` uses
    its own default for it.
    """
    options = {
        "page_range": page_range,
        "force_image": force_image,
        "save_debug": save_debug,
        "debug_output_dir": debug_output_dir,
    }
    return process_document(input_path, output_path, settings, **options)
def compare_watermark_methods(
    input_path: Path,
    output_dir: Path,
    settings: WatermarkToolSettings,
) -> Dict[str, str]:
    """Run ``threshold`` and ``masked_adaptive`` on one image and save a comparison.

    For each method the cleaned image and a JSON meta file are written to
    ``output_dir``; for ``masked_adaptive`` the mask debug layers are saved as
    well. Finally a side-by-side image (original | threshold |
    masked_adaptive) is written.

    Args:
        input_path: Image file to process.
        output_dir: Directory for all outputs (created if missing).
        settings: Base settings; a deep copy is specialised per method.

    Returns:
        Mapping from output label to file path.
    """
    from PIL import Image

    methods = ("threshold", "masked_adaptive")
    # Diagnostic values copied from the processor's debug dict into the meta
    # file; absent keys are recorded as null.
    debug_keys = (
        "mask_mode",
        "direction_filter",
        "whiten_mode",
        "T_wm",
        "T_protect",
        "mode",
        "midtone_ratio",
        "wm_candidate_ratio",
        "geom_mask_ratio",
        "geom_candidate_ratio",
        "wm_mask_ratio",
        "white_pixel_ratio",
        "hough_kept_lines",
        "hough_diag_candidates",
        "hough_total_lines",
        "dominant_angles",
        "whiten_gray_low",
    )

    output_dir.mkdir(parents=True, exist_ok=True)
    stem = input_path.stem
    # The context manager closes the file handle once pixels are loaded.
    with Image.open(str(input_path)) as pil_img:
        img_rgb = np.array(pil_img.convert("RGB"))
    contrast = _active_contrast_enhancement(settings)

    paths: Dict[str, str] = {}
    results: Dict[str, np.ndarray] = {}

    for method in methods:
        sub = copy.deepcopy(settings)
        sub.method = method
        dbg: Dict[str, Any] = {}
        sub.watermark_config = _watermark_removal_cfg_for_method(sub, method)
        out = _apply_image_watermark_removal(
            img_rgb,
            settings=sub,
            contrast_enhancement=contrast,
            removal_debug=dbg,
            scope=settings.scope,
        )
        out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
        results[method] = out_rgb

        out_path = output_dir / f"{stem}_cleaned_{method}.png"
        Image.fromarray(out_rgb).save(str(out_path))
        paths[method] = str(out_path)

        meta: Dict[str, Any] = {"method": method, "threshold": settings.threshold}
        meta.update({key: dbg.get(key) for key in debug_keys})
        meta_path = output_dir / f"{stem}_meta_{method}.json"
        meta_path.write_text(
            json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        paths[f"meta_{method}"] = str(meta_path)

        if method == "masked_adaptive":
            layer_paths = save_watermark_mask_debug_layers(
                img_rgb, output_dir, stem, dbg, image_format="png"
            )
            paths.update(layer_paths)

    h = max(results[m].shape[0] for m in methods)

    def _resize_rgb(arr: np.ndarray) -> np.ndarray:
        """Scale ``arr`` to height ``h`` (keeping aspect ratio) for stacking."""
        if arr.shape[0] == h:
            return arr
        scale = h / arr.shape[0]
        w = int(arr.shape[1] * scale)
        return cv2.resize(arr, (w, h))

    triple = np.hstack(
        [_resize_rgb(img_rgb)] + [_resize_rgb(results[m]) for m in methods]
    )
    compare_path = output_dir / f"{stem}_compare_orig_threshold_masked.png"
    # Saved with PIL like the other outputs: cv2.imwrite only returns False on
    # failure (its result was ignored), so a failed write would still have
    # been logged as success. PIL raises instead.
    Image.fromarray(triple).save(str(compare_path))
    paths["compare_triple"] = str(compare_path)
    logger.info(f"✅ 方法对比已保存: {compare_path}")
    return paths
def main():
    """Command-line entry point.

    Parses the arguments, resolves the settings from the scene YAML plus
    overrides, then runs exactly one mode: method comparison, preview, batch
    directory processing, or single-file processing. Exits with status 1 when
    the configuration file or the input is missing or unsupported.
    """
    parser = argparse.ArgumentParser(
        description="银行流水水印去除工具(参数默认来自场景 YAML,与 main_v2 Pipeline 一致)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录(批量模式)")
    parser.add_argument(
        "-c",
        "--config",
        type=Path,
        default=_DEFAULT_CONFIG_PATH,
        help=f"场景配置文件,读取 preprocessor.watermark_removal(默认: {_DEFAULT_CONFIG_PATH.name})",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="输出路径(单文件模式;默认在原文件名后加 _cleaned)",
    )
    parser.add_argument("--batch", action="store_true", help="批量处理目录下所有 PDF 和图片")
    parser.add_argument("--preview", action="store_true", help="预览模式:展示单页对比图(不保存)")
    parser.add_argument("--page", type=int, default=0, help="预览页码(0-based)")
    parser.add_argument(
        "--page-range",
        type=str,
        default=None,
        help="PDF 页面范围,如 '1-3,5,7-9'(从 1 开始)",
    )
    parser.add_argument(
        "--force-image",
        action="store_true",
        help="文字型 PDF 强制走图像去水印(失去可搜索性)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="保存调试图到 debug/watermark_removal/",
    )
    parser.add_argument(
        "--debug-dir",
        type=Path,
        default=None,
        help="调试图根目录(默认 -o 的父目录;格式见配置文件 debug_options.image_format)",
    )

    # Optional knobs that override the YAML; unset means "use the YAML value".
    override = parser.add_argument_group("覆盖配置文件(可选)")
    override.add_argument(
        "--threshold",
        type=int,
        default=None,
        help="覆盖 watermark_removal.threshold(140-180)",
    )
    override.add_argument(
        "--morph-kernel",
        type=int,
        default=None,
        help="覆盖 watermark_removal.morph_close_kernel",
    )
    override.add_argument("--dpi", type=int, default=None, help="覆盖 input.dpi")
    override.add_argument("--no-contrast", action="store_true", help="关闭 contrast_enhancement")
    override.add_argument(
        "--text-black-target",
        type=int,
        default=None,
        help="覆盖 contrast_enhancement.text_black_target(text_restore)",
    )
    override.add_argument(
        "--method",
        type=str,
        default=None,
        choices=["threshold", "masked", "masked_adaptive"],
        help="覆盖 watermark_removal.method",
    )
    parser.add_argument(
        "--scope",
        type=str,
        default="page",
        choices=["page", "cell"],
        help="page=页级 preprocessor;cell=二次 OCR 单元格 preset",
    )
    parser.add_argument(
        "--compare-methods",
        action="store_true",
        help="对比 threshold 与 masked_adaptive,输出三联图到 -o 目录",
    )
    args = parser.parse_args()

    try:
        settings = resolve_watermark_settings(
            args.config,
            scope=args.scope,
            threshold=args.threshold,
            morph_close_kernel=args.morph_kernel,
            dpi=args.dpi,
            no_contrast=args.no_contrast,
            text_black_target=args.text_black_target,
            method=args.method,
        )
    except FileNotFoundError as e:
        logger.error(str(e))
        sys.exit(1)

    logger.info(
        f"📋 配置: {args.config} | method={settings.method} | "
        f"threshold={settings.threshold} | morph_kernel={settings.morph_close_kernel} | "
        f"dpi={settings.dpi} | contrast={settings.contrast_enhancement}"
    )

    if args.compare_methods:
        input_path = args.input
        if not input_path.is_file():
            logger.error(f"文件不存在: {input_path}")
            sys.exit(1)
        out_dir = args.output or (
            input_path.parent / "debug" / "watermark_method_compare"
        )
        paths = compare_watermark_methods(input_path, out_dir, settings)
        for k, v in paths.items():
            logger.info(f" {k}: {v}")
        return

    if args.preview:
        preview_page(args.input, settings, page_idx=args.page)
        return

    if args.batch:
        # Batch mode: every supported file directly inside the directory.
        input_dir = args.input
        if not input_dir.is_dir():
            logger.error(f"批量模式需要传入目录: {input_dir}")
            sys.exit(1)

        # Match on the lower-cased suffix so ".PDF" or ".Jpg" are picked up
        # too; only regular files are considered.
        supported_suffixes = {".pdf"} | IMAGE_SUFFIXES
        all_files: list[Path] = sorted(
            entry
            for entry in input_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in supported_suffixes
        )
        if not all_files:
            logger.warning(f"目录中没有可处理的文件(PDF/图片): {input_dir}")
            return

        out_dir = args.output or input_dir / "cleaned"
        out_dir.mkdir(parents=True, exist_ok=True)
        for file in all_files:
            out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
            try:
                _run_process_document(
                    file,
                    out_file,
                    settings,
                    page_range=args.page_range,
                    force_image=args.force_image,
                    save_debug=args.debug,
                    debug_output_dir=args.debug_dir or out_dir,
                )
            except Exception as e:
                # Best effort: one failing file must not abort the whole batch.
                logger.error(f"❌ 处理失败 {file.name}: {e}")
        logger.info(f"✅ 批量处理完成,共 {len(all_files)} 个文件 -> {out_dir}")
    else:
        # Single-file mode.
        input_path = args.input
        if not input_path.is_file():
            logger.error(f"文件不存在: {input_path}")
            sys.exit(1)
        output_path = args.output or input_path.with_name(
            f"{input_path.stem}_cleaned{input_path.suffix}"
        )
        suffix = input_path.suffix.lower()
        if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
            _run_process_document(
                input_path,
                output_path,
                settings,
                page_range=args.page_range,
                force_image=args.force_image,
                save_debug=args.debug,
                debug_output_dir=args.debug_dir or output_path.parent,
            )
        else:
            logger.error(f"不支持的文件格式: {suffix},支持 PDF 和 {IMAGE_SUFFIXES}")
            sys.exit(1)
if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("ℹ️ 未提供命令行参数,使用默认配置运行...")
        # Developer defaults used when the script is started without arguments.
        default_config = {
            # Sample inputs
            # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
            # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",

            # Text-PDF samples
            # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
            # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
            # "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台(ocr_platform)-交易流水识别,财报识别.pdf",
            "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/彭_广东兴宁农村商业银行/彭_广东兴宁农村商业银行_page_002.png",
            # "output": "./output/杨万益_福建农信",
            # Optional page range ("1-5,7" syntax, PDF only); maps to --page-range
            # "page_range": "3",
            "config": str(_DEFAULT_CONFIG_PATH),
            "preview": True,
            "debug": True,
            "compare-methods": True,
        }

        # "input" is the positional argument; every other key becomes a
        # "--flag" with underscores turned into hyphens. True booleans are
        # bare switches, False booleans are omitted, other values follow
        # their flag as a string.
        cli_args = [sys.argv[0], default_config["input"]]
        for option, option_value in default_config.items():
            if option == "input":
                continue
            flag = "--" + option.replace("_", "-")
            if isinstance(option_value, bool):
                if option_value:
                    cli_args.append(flag)
            else:
                cli_args += [flag, str(option_value)]
        sys.argv = cli_args

    sys.exit(main())
|