# remove_watermark.py (32 KB)
  1. """
  2. 银行流水水印去除工具
  3. 支持 PDF 和常见图片格式(jpg/png/tif/bmp/webp)。
  4. 参数默认从与 main_v2 相同的场景 YAML 读取(preprocessor.watermark_removal),
  5. 命令行仅用于输入/输出、批量、预览及少量覆盖项。
  6. 用法:
  7. # 使用默认场景配置(bank_statement_yusys_local.yaml)
  8. python remove_watermark.py input.pdf
  9. # 指定场景配置(与 Pipeline 一致)
  10. python remove_watermark.py input.png -c ../universal_doc_parser/config/bank_statement_yusys_local.yaml
  11. # 保存调试图(before/after/compare/meta)
  12. python remove_watermark.py input.png -o ./out --debug
  13. # 临时覆盖阈值(其余仍来自配置文件)
  14. python remove_watermark.py input.pdf --threshold 170
  15. # 预览
  16. python remove_watermark.py input.pdf --preview --page 0
  17. # 批量
  18. python remove_watermark.py /path/to/dir/ --batch -o ./cleaned
  19. # 对比 threshold vs masked_adaptive(输出三联图)
  20. python remove_watermark.py page_002.png --compare-methods -o ./method_compare
  21. """
  22. import argparse
  23. import copy
  24. import json
  25. import sys
  26. from dataclasses import dataclass
  27. from pathlib import Path
  28. from typing import Any, Dict, Optional
  29. import cv2
  30. import numpy as np
  31. import yaml
  32. # 将 ocr_platform 根目录加入 sys.path,以便导入 ocr_utils
  33. _repo_root = Path(__file__).parents[2]
  34. if str(_repo_root) not in sys.path:
  35. sys.path.insert(0, str(_repo_root))
  36. from loguru import logger
  37. from ocr_utils.watermark_utils import (
  38. detect_watermark,
  39. remove_watermark_from_image_rgb,
  40. render_watermark_mask_overlay,
  41. save_watermark_removal_debug,
  42. save_watermark_mask_debug_layers,
  43. scan_pdf_watermark_xobjs,
  44. remove_txt_pdf_watermark,
  45. )
  46. # 支持的图片后缀(小写)
  47. IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
  48. _DEFAULT_CONFIG_PATH = (
  49. _repo_root
  50. / "ocr_tools/universal_doc_parser/config/bank_statement_yusys_local.yaml"
  51. )
  52. @dataclass
  53. class WatermarkToolSettings:
  54. """从场景 YAML 解析的水印处理参数(与 Pipeline preprocessor 对齐)。"""
  55. threshold: int = 160
  56. morph_close_kernel: int = 0
  57. dpi: int = 200
  58. method: str = "threshold"
  59. contrast_enhancement: Optional[Dict[str, Any]] = None
  60. debug_options: Optional[Dict[str, Any]] = None
  61. watermark_enabled: bool = True
  62. watermark_config: Optional[Dict[str, Any]] = None
  63. @property
  64. def debug_image_format(self) -> str:
  65. opts = self.debug_options or {}
  66. return str(opts.get("image_format") or "png").lstrip(".")
  67. def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
  68. """
  69. 从 universal_doc_parser 场景配置读取 preprocessor.watermark_removal 与 input.dpi。
  70. 不依赖完整 ConfigManager,避免仅调试水印时强依赖 layout/ocr 等段。
  71. """
  72. config_path = Path(config_path)
  73. if not config_path.is_file():
  74. raise FileNotFoundError(f"配置文件不存在: {config_path}")
  75. with open(config_path, encoding="utf-8") as f:
  76. raw = yaml.safe_load(f) or {}
  77. preprocessor = raw.get("preprocessor") or {}
  78. wm = preprocessor.get("watermark_removal") or {}
  79. input_cfg = raw.get("input") or {}
  80. contrast = wm.get("contrast_enhancement")
  81. if contrast is not None and not isinstance(contrast, dict):
  82. contrast = None
  83. wm_full = copy.deepcopy(wm)
  84. return WatermarkToolSettings(
  85. threshold=int(wm.get("threshold", 160)),
  86. morph_close_kernel=int(wm.get("morph_close_kernel", 0)),
  87. dpi=int(input_cfg.get("dpi", 200)),
  88. method=str(wm.get("method") or "threshold"),
  89. contrast_enhancement=copy.deepcopy(contrast) if contrast else None,
  90. debug_options=copy.deepcopy(wm.get("debug_options"))
  91. if wm.get("debug_options")
  92. else None,
  93. watermark_enabled=bool(wm.get("enabled", True)),
  94. watermark_config=wm_full,
  95. )
  96. def resolve_watermark_settings(
  97. config_path: Path,
  98. *,
  99. threshold: Optional[int] = None,
  100. morph_close_kernel: Optional[int] = None,
  101. dpi: Optional[int] = None,
  102. no_contrast: bool = False,
  103. text_black_target: Optional[int] = None,
  104. method: Optional[str] = None,
  105. ) -> WatermarkToolSettings:
  106. """加载配置并应用命令行覆盖。"""
  107. settings = load_watermark_settings(config_path)
  108. if threshold is not None:
  109. settings.threshold = threshold
  110. if morph_close_kernel is not None:
  111. settings.morph_close_kernel = morph_close_kernel
  112. if dpi is not None:
  113. settings.dpi = dpi
  114. if method is not None:
  115. settings.method = method
  116. if settings.watermark_config is not None:
  117. settings.watermark_config["method"] = method
  118. if no_contrast and settings.contrast_enhancement:
  119. settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
  120. settings.contrast_enhancement["enabled"] = False
  121. elif text_black_target is not None:
  122. if not settings.contrast_enhancement:
  123. settings.contrast_enhancement = {"enabled": True, "method": "text_restore"}
  124. else:
  125. settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
  126. settings.contrast_enhancement["enabled"] = True
  127. settings.contrast_enhancement["text_black_target"] = text_black_target
  128. return settings
  129. def _watermark_removal_cfg_for_method(
  130. settings: WatermarkToolSettings,
  131. method: str,
  132. ) -> Dict[str, Any]:
  133. """构造指定 method 的 watermark_removal 配置副本。"""
  134. cfg = copy.deepcopy(settings.watermark_config or {})
  135. cfg["method"] = method
  136. cfg["threshold"] = settings.threshold
  137. cfg["morph_close_kernel"] = settings.morph_close_kernel
  138. return cfg
  139. def _apply_image_watermark_removal(
  140. img_np: np.ndarray,
  141. *,
  142. settings: WatermarkToolSettings,
  143. contrast_enhancement: Optional[Dict[str, Any]] = None,
  144. apply_watermark_removal: bool = True,
  145. removal_debug: Optional[Dict[str, Any]] = None,
  146. ) -> np.ndarray:
  147. """与 universal_doc_parser 一致的 RGB 去水印 + 可选对比度增强。"""
  148. wm_cfg = _watermark_removal_cfg_for_method(settings, settings.method)
  149. return np.asarray(
  150. remove_watermark_from_image_rgb(
  151. img_np,
  152. threshold=settings.threshold,
  153. morph_close_kernel=settings.morph_close_kernel,
  154. contrast_enhancement=contrast_enhancement,
  155. apply_watermark_removal=apply_watermark_removal,
  156. watermark_removal_cfg=wm_cfg,
  157. removal_debug=removal_debug,
  158. return_pil=False,
  159. )
  160. )
  161. def _active_contrast_enhancement(
  162. settings: WatermarkToolSettings,
  163. ) -> Optional[Dict[str, Any]]:
  164. ce = settings.contrast_enhancement
  165. if not ce or not ce.get("enabled", False):
  166. return None
  167. return ce
  168. def _maybe_save_watermark_debug(
  169. before: np.ndarray,
  170. after: np.ndarray,
  171. debug_output_dir: Path,
  172. page_name: str,
  173. *,
  174. settings: WatermarkToolSettings,
  175. contrast_enhancement: Optional[Dict[str, Any]] = None,
  176. removal_debug: Optional[Dict[str, Any]] = None,
  177. ) -> None:
  178. """保存调试图到 debug/watermark_removal/(与 pipeline 相同布局)。"""
  179. params: Dict[str, Any] = {
  180. "method": settings.method,
  181. "threshold": settings.threshold,
  182. "morph_close_kernel": settings.morph_close_kernel,
  183. }
  184. if contrast_enhancement:
  185. params["contrast_enhancement"] = contrast_enhancement
  186. if removal_debug:
  187. for key in ("mode", "T_wm", "T_protect", "wm_mask_ratio", "white_pixel_ratio"):
  188. if key in removal_debug:
  189. params[key] = removal_debug[key]
  190. mask_overlay = None
  191. if removal_debug and "wm_mask" in removal_debug:
  192. mask_overlay = render_watermark_mask_overlay(
  193. before, removal_debug["wm_mask"]
  194. )
  195. save_watermark_removal_debug(
  196. before,
  197. after,
  198. debug_output_dir,
  199. page_name,
  200. processing_params=params,
  201. image_format=settings.debug_image_format,
  202. save_compare=True,
  203. mask_overlay=mask_overlay,
  204. )
  205. def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
  206. """
  207. 对文字型 PDF 执行原生水印去除,保留文字可搜索性。
  208. 内部委托给 watermark_utils.remove_txt_pdf_watermark() 完成内存流处理,
  209. 有水印时将结果写入 output_path。
  210. 流程:
  211. 1. scan_pdf_watermark_xobjs() 快速扫描前 3 页,无水印直接返回 0
  212. 2. remove_txt_pdf_watermark() 执行全量去除,返回 bytes 或 None
  213. 3. 有水印时写 output_path
  214. Returns:
  215. 1 表示去除成功,0 表示未发现水印
  216. """
  217. pdf_bytes = input_path.read_bytes()
  218. if not scan_pdf_watermark_xobjs(pdf_bytes, sample_pages=3):
  219. return 0
  220. cleaned = remove_txt_pdf_watermark(pdf_bytes)
  221. if cleaned is None:
  222. return 0
  223. output_path.write_bytes(cleaned)
  224. return 1
  225. def process_document(
  226. input_path: Path,
  227. output_path: Path,
  228. settings: WatermarkToolSettings,
  229. page_range: Optional[str] = None,
  230. force_image: bool = False,
  231. save_debug: bool = False,
  232. debug_output_dir: Optional[Path] = None,
  233. apply_watermark_removal: Optional[bool] = None,
  234. ) -> int:
  235. """
  236. 统一处理函数:支持 PDF(扫描件)和图片,去除水印后保存。
  237. 使用 PDFUtils.load_and_classify_document 加载并分类:
  238. - 文字型 PDF(pdf_type='txt'):优先尝试原生 XObject 水印去除(保留可搜索性);
  239. 失败时自动回退图像化处理,或 force_image=True 时直接走图像处理
  240. - 扫描件 PDF(pdf_type='ocr'):逐页去水印后重新打包为 PDF
  241. - 图片:检测水印后去除并保存
  242. Args:
  243. input_path: 输入文件路径(PDF 或图片)
  244. output_path: 输出文件路径
  245. settings: 水印配置(含 method / threshold / mask / adaptive)
  246. page_range: 页面范围字符串,如 "1-5,7,9-12"(从 1 开始,仅对 PDF 有效)
  247. force_image: 强制对文字型 PDF 使用图像化处理(会失去文字可搜索性,
  248. 但能处理水印嵌在内容流中的情况)
  249. save_debug: 是否保存 before/after/compare/meta 到 debug/watermark_removal/
  250. debug_output_dir: 调试图根目录,默认 output_path 的父目录
  251. apply_watermark_removal: 默认取 settings.watermark_enabled
  252. Returns:
  253. 实际处理的页/图片数
  254. """
  255. import shutil
  256. from io import BytesIO
  257. from PIL import Image
  258. from ocr_utils.pdf_utils import PDFUtils
  259. is_pdf = input_path.suffix.lower() == ".pdf"
  260. dpi = settings.dpi
  261. contrast_enhancement = _active_contrast_enhancement(settings)
  262. if apply_watermark_removal is None:
  263. apply_watermark_removal = settings.watermark_enabled
  264. # 统一加载 + 分类(PDF 用 MinerU pdf_classify,图片直接读取)
  265. images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
  266. input_path, dpi=dpi, page_range=page_range
  267. )
  268. # _known_has_wm: 当 txt 分支已确认有水印时设为 True,避免公共段用更严格阈值误判
  269. _known_has_wm: Optional[bool] = None
  270. # 文字型 PDF:优先尝试原生 XObject 水印去除,保留可搜索性
  271. if is_pdf and pdf_type == "txt" and not force_image:
  272. output_path.parent.mkdir(parents=True, exist_ok=True)
  273. removed = _try_remove_txt_pdf_watermark(input_path, output_path)
  274. if removed > 0:
  275. logger.info(
  276. f"✅ 文字型 PDF '{input_path.name}':删除 {removed} 个水印 XObject,"
  277. "保留文字可搜索性,已保存。"
  278. )
  279. return removed
  280. # XObject 扫描无结果,用较低阈值(0.5%)做图像水印检测二次确认
  281. # 文字 PDF 背景干净,降低阈值以检测稀疏文字水印
  282. first_np = np.array(images[0]["img_pil"])
  283. if detect_watermark(first_np, ratio_threshold=0.005):
  284. logger.warning(
  285. f"⚠️ 文字型 PDF '{input_path.name}':未找到 XObject 水印,"
  286. "但图像检测发现水印(内联内容流水印),"
  287. "回退为图像化处理(输出将失去文字可搜索性)。"
  288. )
  289. _known_has_wm = True # 明确检测到水印,跳过公共段二次检测
  290. else:
  291. logger.info(
  292. f"✅ 文字型 PDF '{input_path.name}':未检测到水印,直接复制。"
  293. )
  294. shutil.copy2(str(input_path), str(output_path))
  295. return 0
  296. elif is_pdf and pdf_type == "txt" and force_image:
  297. logger.warning(
  298. f"⚠️ 文字型 PDF '{input_path.name}':--force-image 模式,"
  299. "强制图像化处理(输出将失去文字可搜索性)。"
  300. )
  301. _known_has_wm = True # force_image 模式不再检测,直接去除
  302. logger.info(
  303. f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name} "
  304. f"共 {len(images)} {'页' if is_pdf else '张'} "
  305. f"method={settings.method} threshold={settings.threshold}"
  306. )
  307. contrast_only = (
  308. not apply_watermark_removal
  309. and contrast_enhancement
  310. and contrast_enhancement.get("enabled", False)
  311. )
  312. # 水印检测(仅用第一页/图判断,同一文档水印通常一致)
  313. # _known_has_wm 已在 txt 分支设置时,跳过重复检测
  314. if contrast_only:
  315. has_wm = True
  316. logger.info("📋 配置关闭去水印,仅应用 contrast_enhancement")
  317. elif _known_has_wm is not None:
  318. has_wm = _known_has_wm
  319. logger.info("🔍 检测到水印,启动去水印处理" if has_wm else "✅ 未检测到水印,跳过")
  320. else:
  321. first_np = np.array(images[0]["img_pil"])
  322. # 扫描件/图片路径:使用宽松一档的中间调阈值(2.5%)以避免边界误判,
  323. # 斜向直线验证仍作为双重保险防止误报
  324. has_wm = detect_watermark(first_np, ratio_threshold=0.025)
  325. if has_wm:
  326. logger.info("🔍 检测到水印,启动去水印处理")
  327. else:
  328. logger.info("✅ 未检测到水印,跳过去水印处理")
  329. if not is_pdf:
  330. # 图片无水印:直接复制
  331. output_path.parent.mkdir(parents=True, exist_ok=True)
  332. shutil.copy2(str(input_path), str(output_path))
  333. return 1
  334. output_path.parent.mkdir(parents=True, exist_ok=True)
  335. debug_root = debug_output_dir or output_path.parent
  336. if is_pdf:
  337. # 逐页处理后重新打包为 PDF
  338. try:
  339. import fitz
  340. except ImportError:
  341. raise ImportError("请安装 PyMuPDF: pip install PyMuPDF")
  342. new_doc = fitz.open()
  343. for i, img_dict in enumerate(images):
  344. pil_img = img_dict["img_pil"]
  345. img_np = np.array(pil_img)
  346. page_name = f"{input_path.stem}_page_{i + 1:03d}"
  347. if has_wm:
  348. before = img_np.copy()
  349. removal_dbg: Dict[str, Any] = {}
  350. cleaned_rgb = _apply_image_watermark_removal(
  351. img_np,
  352. settings=settings,
  353. contrast_enhancement=contrast_enhancement,
  354. apply_watermark_removal=apply_watermark_removal,
  355. removal_debug=removal_dbg,
  356. )
  357. if save_debug:
  358. _maybe_save_watermark_debug(
  359. before,
  360. cleaned_rgb,
  361. debug_root,
  362. page_name,
  363. settings=settings,
  364. contrast_enhancement=contrast_enhancement,
  365. removal_debug=removal_dbg,
  366. )
  367. out_pil = Image.fromarray(
  368. cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
  369. )
  370. else:
  371. out_pil = pil_img
  372. buf = BytesIO()
  373. out_pil.save(buf, format="PNG", optimize=False)
  374. buf.seek(0)
  375. # 按渲染图尺寸创建新页面(保持原始 DPI 尺寸)
  376. w_px, h_px = out_pil.size
  377. new_page = new_doc.new_page(width=w_px * 72 / dpi, height=h_px * 72 / dpi)
  378. new_page.insert_image(new_page.rect, stream=buf.read())
  379. if (i + 1) % 10 == 0 or i == len(images) - 1:
  380. logger.info(f" 进度: {i + 1}/{len(images)}")
  381. new_doc.save(str(output_path), garbage=4, deflate=True)
  382. else:
  383. # 图片:有水印则去除后保存
  384. img_np = np.array(images[0]["img_pil"])
  385. before = img_np.copy()
  386. removal_dbg = {}
  387. cleaned_rgb = _apply_image_watermark_removal(
  388. img_np,
  389. settings=settings,
  390. contrast_enhancement=contrast_enhancement,
  391. apply_watermark_removal=apply_watermark_removal,
  392. removal_debug=removal_dbg,
  393. )
  394. if save_debug:
  395. _maybe_save_watermark_debug(
  396. before,
  397. cleaned_rgb,
  398. debug_root,
  399. input_path.stem,
  400. settings=settings,
  401. contrast_enhancement=contrast_enhancement,
  402. removal_debug=removal_dbg,
  403. )
  404. out_rgb = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
  405. Image.fromarray(out_rgb).save(str(output_path))
  406. logger.info(f"✅ 保存到: {output_path}")
  407. return len(images)
  408. def preview_page(
  409. input_path: Path,
  410. settings: WatermarkToolSettings,
  411. page_idx: int = 0,
  412. ):
  413. """展示单页原图与去水印对比(需要 matplotlib)。支持 PDF 和图片文件。"""
  414. try:
  415. import matplotlib.pyplot as plt
  416. import matplotlib
  417. matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
  418. matplotlib.rcParams['axes.unicode_minus'] = False
  419. except ImportError as e:
  420. raise ImportError(f"预览需要 matplotlib: {e}")
  421. suffix = input_path.suffix.lower()
  422. if suffix == ".pdf":
  423. try:
  424. import fitz
  425. except ImportError:
  426. raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF")
  427. doc = fitz.open(str(input_path))
  428. if page_idx >= len(doc):
  429. raise ValueError(f"页码 {page_idx} 超出范围(共 {len(doc)} 页)")
  430. mat = fitz.Matrix(settings.dpi / 72, settings.dpi / 72)
  431. page = doc[page_idx]
  432. pix = page.get_pixmap(matrix=mat, alpha=False)
  433. img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
  434. title_orig = f"原图 第 {page_idx + 1} 页"
  435. elif suffix in IMAGE_SUFFIXES:
  436. from PIL import Image
  437. img_np = np.array(Image.open(str(input_path)).convert("RGB"))
  438. title_orig = f"原图 {input_path.name}"
  439. else:
  440. raise ValueError(f"不支持的文件格式: {suffix}")
  441. contrast = _active_contrast_enhancement(settings)
  442. cleaned_rgb = _apply_image_watermark_removal(
  443. img_np,
  444. settings=settings,
  445. contrast_enhancement=contrast,
  446. apply_watermark_removal=settings.watermark_enabled,
  447. )
  448. cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)
  449. fig, axes = plt.subplots(1, 2, figsize=(20, 14))
  450. axes[0].imshow(img_np)
  451. axes[0].set_title(title_orig, fontsize=14)
  452. axes[0].axis('off')
  453. subtitle = f"method={settings.method}, threshold={settings.threshold}"
  454. if contrast:
  455. subtitle += f", contrast={contrast.get('method', 'on')}"
  456. axes[1].imshow(cleaned, cmap='gray')
  457. axes[1].set_title(f"去水印后 {subtitle}", fontsize=14)
  458. axes[1].axis('off')
  459. plt.tight_layout()
  460. plt.show()
  461. def _run_process_document(
  462. input_path: Path,
  463. output_path: Path,
  464. settings: WatermarkToolSettings,
  465. *,
  466. page_range: Optional[str] = None,
  467. force_image: bool = False,
  468. save_debug: bool = False,
  469. debug_output_dir: Optional[Path] = None,
  470. ) -> int:
  471. return process_document(
  472. input_path,
  473. output_path,
  474. settings,
  475. page_range=page_range,
  476. force_image=force_image,
  477. save_debug=save_debug,
  478. debug_output_dir=debug_output_dir,
  479. )
  480. def compare_watermark_methods(
  481. input_path: Path,
  482. output_dir: Path,
  483. settings: WatermarkToolSettings,
  484. ) -> Dict[str, str]:
  485. """
  486. 同一张图对比 threshold 与 masked_adaptive,输出三联图与 meta。
  487. Returns:
  488. 各输出文件路径
  489. """
  490. from PIL import Image
  491. output_dir.mkdir(parents=True, exist_ok=True)
  492. stem = input_path.stem
  493. img_rgb = np.array(Image.open(str(input_path)).convert("RGB"))
  494. contrast = _active_contrast_enhancement(settings)
  495. paths: Dict[str, str] = {}
  496. results: Dict[str, np.ndarray] = {}
  497. for method in ("threshold", "masked_adaptive"):
  498. sub = copy.deepcopy(settings)
  499. sub.method = method
  500. dbg: Dict[str, Any] = {}
  501. out = _apply_image_watermark_removal(
  502. img_rgb,
  503. settings=sub,
  504. contrast_enhancement=contrast,
  505. removal_debug=dbg,
  506. )
  507. out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
  508. results[method] = out_rgb
  509. out_path = output_dir / f"{stem}_cleaned_{method}.png"
  510. Image.fromarray(out_rgb).save(str(out_path))
  511. paths[method] = str(out_path)
  512. meta_path = output_dir / f"{stem}_meta_{method}.json"
  513. meta = {
  514. "method": method,
  515. "threshold": settings.threshold,
  516. "mask_mode": dbg.get("mask_mode"),
  517. "direction_filter": dbg.get("direction_filter"),
  518. "whiten_mode": dbg.get("whiten_mode"),
  519. "T_wm": dbg.get("T_wm"),
  520. "T_protect": dbg.get("T_protect"),
  521. "mode": dbg.get("mode"),
  522. "midtone_ratio": dbg.get("midtone_ratio"),
  523. "wm_candidate_ratio": dbg.get("wm_candidate_ratio"),
  524. "geom_mask_ratio": dbg.get("geom_mask_ratio"),
  525. "geom_candidate_ratio": dbg.get("geom_candidate_ratio"),
  526. "wm_mask_ratio": dbg.get("wm_mask_ratio"),
  527. "white_pixel_ratio": dbg.get("white_pixel_ratio"),
  528. "hough_kept_lines": dbg.get("hough_kept_lines"),
  529. "hough_diag_candidates": dbg.get("hough_diag_candidates"),
  530. "hough_total_lines": dbg.get("hough_total_lines"),
  531. "dominant_angles": dbg.get("dominant_angles"),
  532. "whiten_gray_low": dbg.get("whiten_gray_low"),
  533. }
  534. meta_path.write_text(
  535. json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
  536. )
  537. paths[f"meta_{method}"] = str(meta_path)
  538. if method == "masked_adaptive":
  539. layer_paths = save_watermark_mask_debug_layers(
  540. img_rgb, output_dir, stem, dbg, image_format="png"
  541. )
  542. paths.update(layer_paths)
  543. h = max(results["threshold"].shape[0], results["masked_adaptive"].shape[0])
  544. def _resize_rgb(arr: np.ndarray) -> np.ndarray:
  545. if arr.shape[0] == h:
  546. return arr
  547. scale = h / arr.shape[0]
  548. w = int(arr.shape[1] * scale)
  549. return cv2.resize(arr, (w, h))
  550. triple = np.hstack(
  551. [_resize_rgb(img_rgb)]
  552. + [_resize_rgb(results[m]) for m in ("threshold", "masked_adaptive")]
  553. )
  554. compare_path = output_dir / f"{stem}_compare_orig_threshold_masked.png"
  555. cv2.imwrite(
  556. str(compare_path),
  557. cv2.cvtColor(triple, cv2.COLOR_RGB2BGR),
  558. )
  559. paths["compare_triple"] = str(compare_path)
  560. logger.info(f"✅ 方法对比已保存: {compare_path}")
  561. return paths
  562. def main():
  563. parser = argparse.ArgumentParser(
  564. description="银行流水水印去除工具(参数默认来自场景 YAML,与 main_v2 Pipeline 一致)",
  565. formatter_class=argparse.RawDescriptionHelpFormatter,
  566. epilog=__doc__,
  567. )
  568. parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录(批量模式)")
  569. parser.add_argument(
  570. "-c",
  571. "--config",
  572. type=Path,
  573. default=_DEFAULT_CONFIG_PATH,
  574. help=f"场景配置文件,读取 preprocessor.watermark_removal(默认: {_DEFAULT_CONFIG_PATH.name})",
  575. )
  576. parser.add_argument(
  577. "-o",
  578. "--output",
  579. type=Path,
  580. default=None,
  581. help="输出路径(单文件模式;默认在原文件名后加 _cleaned)",
  582. )
  583. parser.add_argument("--batch", action="store_true", help="批量处理目录下所有 PDF 和图片")
  584. parser.add_argument("--preview", action="store_true", help="预览模式:展示单页对比图(不保存)")
  585. parser.add_argument("--page", type=int, default=0, help="预览页码(0-based)")
  586. parser.add_argument(
  587. "--page-range",
  588. type=str,
  589. default=None,
  590. help="PDF 页面范围,如 '1-3,5,7-9'(从 1 开始)",
  591. )
  592. parser.add_argument(
  593. "--force-image",
  594. action="store_true",
  595. help="文字型 PDF 强制走图像去水印(失去可搜索性)",
  596. )
  597. parser.add_argument(
  598. "--debug",
  599. action="store_true",
  600. help="保存调试图到 debug/watermark_removal/",
  601. )
  602. parser.add_argument(
  603. "--debug-dir",
  604. type=Path,
  605. default=None,
  606. help="调试图根目录(默认 -o 的父目录;格式见配置文件 debug_options.image_format)",
  607. )
  608. # 以下为覆盖配置文件的少量旋钮(未指定则完全使用 YAML)
  609. override = parser.add_argument_group("覆盖配置文件(可选)")
  610. override.add_argument(
  611. "--threshold",
  612. type=int,
  613. default=None,
  614. help="覆盖 watermark_removal.threshold(140-180)",
  615. )
  616. override.add_argument(
  617. "--morph-kernel",
  618. type=int,
  619. default=None,
  620. help="覆盖 watermark_removal.morph_close_kernel",
  621. )
  622. override.add_argument("--dpi", type=int, default=None, help="覆盖 input.dpi")
  623. override.add_argument("--no-contrast", action="store_true", help="关闭 contrast_enhancement")
  624. override.add_argument(
  625. "--text-black-target",
  626. type=int,
  627. default=None,
  628. help="覆盖 contrast_enhancement.text_black_target(text_restore)",
  629. )
  630. override.add_argument(
  631. "--method",
  632. type=str,
  633. default=None,
  634. choices=["threshold", "masked", "masked_adaptive"],
  635. help="覆盖 watermark_removal.method",
  636. )
  637. parser.add_argument(
  638. "--compare-methods",
  639. action="store_true",
  640. help="对比 threshold 与 masked_adaptive,输出三联图到 -o 目录",
  641. )
  642. args = parser.parse_args()
  643. try:
  644. settings = resolve_watermark_settings(
  645. args.config,
  646. threshold=args.threshold,
  647. morph_close_kernel=args.morph_kernel,
  648. dpi=args.dpi,
  649. no_contrast=args.no_contrast,
  650. text_black_target=args.text_black_target,
  651. method=args.method,
  652. )
  653. except FileNotFoundError as e:
  654. logger.error(str(e))
  655. sys.exit(1)
  656. logger.info(
  657. f"📋 配置: {args.config} | method={settings.method} | "
  658. f"threshold={settings.threshold} | morph_kernel={settings.morph_close_kernel} | "
  659. f"dpi={settings.dpi} | contrast={settings.contrast_enhancement}"
  660. )
  661. if args.compare_methods:
  662. input_path = args.input
  663. if not input_path.is_file():
  664. logger.error(f"文件不存在: {input_path}")
  665. sys.exit(1)
  666. out_dir = args.output or (
  667. input_path.parent / "debug" / "watermark_method_compare"
  668. )
  669. paths = compare_watermark_methods(input_path, out_dir, settings)
  670. for k, v in paths.items():
  671. logger.info(f" {k}: {v}")
  672. return
  673. if args.preview:
  674. preview_page(args.input, settings, page_idx=args.page)
  675. return
  676. if args.batch:
  677. # 批量模式:处理目录下所有 PDF 和图片
  678. input_dir = args.input
  679. if not input_dir.is_dir():
  680. logger.error(f"批量模式需要传入目录: {input_dir}")
  681. sys.exit(1)
  682. # 收集所有支持的文件
  683. all_files: list[Path] = sorted(input_dir.glob("*.pdf"))
  684. for ext in IMAGE_SUFFIXES:
  685. all_files.extend(sorted(input_dir.glob(f"*{ext}")))
  686. all_files.extend(sorted(input_dir.glob(f"*{ext.upper()}")))
  687. all_files = sorted(set(all_files))
  688. if not all_files:
  689. logger.warning(f"目录中没有可处理的文件(PDF/图片): {input_dir}")
  690. return
  691. out_dir = args.output or input_dir / "cleaned"
  692. out_dir.mkdir(parents=True, exist_ok=True)
  693. for file in all_files:
  694. out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
  695. try:
  696. _run_process_document(
  697. file,
  698. out_file,
  699. settings,
  700. page_range=args.page_range,
  701. force_image=args.force_image,
  702. save_debug=args.debug,
  703. debug_output_dir=args.debug_dir or out_dir,
  704. )
  705. except Exception as e:
  706. logger.error(f"❌ 处理失败 {file.name}: {e}")
  707. logger.info(f"✅ 批量处理完成,共 {len(all_files)} 个文件 -> {out_dir}")
  708. else:
  709. # 单文件模式
  710. input_path = args.input
  711. if not input_path.is_file():
  712. logger.error(f"文件不存在: {input_path}")
  713. sys.exit(1)
  714. output_path = args.output or input_path.with_name(
  715. f"{input_path.stem}_cleaned{input_path.suffix}"
  716. )
  717. suffix = input_path.suffix.lower()
  718. if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
  719. _run_process_document(
  720. input_path,
  721. output_path,
  722. settings,
  723. page_range=args.page_range,
  724. force_image=args.force_image,
  725. save_debug=args.debug,
  726. debug_output_dir=args.debug_dir or output_path.parent,
  727. )
  728. else:
  729. logger.error(f"不支持的文件格式: {suffix},支持 PDF 和 {IMAGE_SUFFIXES}")
  730. sys.exit(1)
  731. if __name__ == "__main__":
  732. if len(sys.argv) == 1:
  733. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  734. # 默认配置(用于开发测试)
  735. default_config = {
  736. # 测试输入
  737. # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
  738. # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",
  739. # 文字PDF测试
  740. # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
  741. # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
  742. # "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台(ocr_platform)-交易流水识别,财报识别.pdf",
  743. "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/彭_广东兴宁农村商业银行/彭_广东兴宁农村商业银行_page_002.png",
  744. # "output": "./output/杨万益_福建农信",
  745. # 页面范围(可选,支持 "1-5,7" 语法,仅对 PDF 有效)
  746. # "page_range": "3", # 仅处理第 1 页(对应 --page-range 参数)
  747. "config": str(_DEFAULT_CONFIG_PATH),
  748. "preview": True,
  749. "debug": True,
  750. "compare-methods": True,
  751. }
  752. # 构造参数(注意 input 是位置参数,morph_kernel 对应 --morph-kernel)
  753. sys.argv = [sys.argv[0], default_config["input"]]
  754. skip_keys = {"input"}
  755. for key, value in default_config.items():
  756. if key in skip_keys:
  757. continue
  758. # 将下划线转换为连字符(如 morph_kernel -> morph-kernel)
  759. flag = f"--{key.replace('_', '-')}"
  760. if isinstance(value, bool):
  761. if value:
  762. sys.argv.append(flag)
  763. else:
  764. sys.argv.extend([flag, str(value)])
  765. sys.exit(main())