# remove_watermark.py
  1. """
  2. 银行流水水印去除工具
  3. 支持 PDF 和常见图片格式(jpg/png/tif/bmp/webp)。
  4. 参数默认从与 main_v2 相同的场景 YAML 读取(preprocessor.watermark_removal),
  5. 命令行仅用于输入/输出、批量、预览及少量覆盖项。
  6. 用法:
  7. # 使用默认场景配置(bank_statement_yusys_local.yaml)
  8. python remove_watermark.py input.pdf
  9. # 指定场景配置(与 Pipeline 一致)
  10. python remove_watermark.py input.png -c ../universal_doc_parser/config/bank_statement_yusys_local.yaml
  11. # 保存调试图(before/after/compare/meta)
  12. python remove_watermark.py input.png -o ./out --debug
  13. # 临时覆盖阈值(其余仍来自配置文件)
  14. python remove_watermark.py input.pdf --threshold 170
  15. # 预览
  16. python remove_watermark.py input.pdf --preview --page 0
  17. # 批量
  18. python remove_watermark.py /path/to/dir/ --batch -o ./cleaned
  19. # 对比 threshold vs masked_adaptive(输出三联图)
  20. python remove_watermark.py page_002.png --compare-methods -o ./method_compare
  21. """
  22. import argparse
  23. import copy
  24. import json
  25. import sys
  26. from dataclasses import dataclass
  27. from pathlib import Path
  28. from typing import Any, Dict, Optional
  29. import cv2
  30. import numpy as np
  31. import yaml
# Add the ocr_platform repository root to sys.path so that ocr_utils can be imported.
_repo_root = Path(__file__).parents[2]
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))
  36. from loguru import logger
  37. from ocr_utils.watermark import (
  38. WatermarkProcessor,
  39. detect_watermark,
  40. merge_watermark_config,
  41. remove_txt_pdf_watermark,
  42. render_watermark_mask_overlay,
  43. save_watermark_mask_debug_layers,
  44. save_watermark_removal_debug,
  45. scan_pdf_watermark_xobjs,
  46. )
# Supported image file suffixes (lowercase, including the leading dot).
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
# Scenario YAML used as the default value of the -c/--config option,
# resolved relative to the repository root.
_DEFAULT_CONFIG_PATH = (
    _repo_root
    / "ocr_tools/universal_doc_parser/config/bank_statement_yusys_local.yaml"
)
  53. @dataclass
  54. class WatermarkToolSettings:
  55. """从场景 YAML 解析的水印处理参数(与 Pipeline preprocessor 对齐)。"""
  56. threshold: int = 160
  57. morph_close_kernel: int = 0
  58. dpi: int = 200
  59. method: str = "threshold"
  60. scope: str = "page"
  61. contrast_enhancement: Optional[Dict[str, Any]] = None
  62. debug_options: Optional[Dict[str, Any]] = None
  63. watermark_enabled: bool = True
  64. watermark_config: Optional[Dict[str, Any]] = None
  65. @property
  66. def debug_image_format(self) -> str:
  67. opts = self.debug_options or {}
  68. return str(opts.get("image_format") or "png").lstrip(".")
  69. def load_watermark_settings(
  70. config_path: Path,
  71. *,
  72. scope: str = "page",
  73. ) -> WatermarkToolSettings:
  74. """
  75. 从 universal_doc_parser 场景配置读取 preprocessor.watermark_removal 与 input.dpi。
  76. scope=cell 时读取 table_recognition_wired.second_pass_ocr.cell_preprocess.watermark。
  77. """
  78. config_path = Path(config_path)
  79. if not config_path.is_file():
  80. raise FileNotFoundError(f"配置文件不存在: {config_path}")
  81. with open(config_path, encoding="utf-8") as f:
  82. raw = yaml.safe_load(f) or {}
  83. input_cfg = raw.get("input") or {}
  84. if scope == "cell":
  85. wired = raw.get("table_recognition_wired") or {}
  86. sp = wired.get("second_pass_ocr") or {}
  87. cpp = sp.get("cell_preprocess") or {}
  88. wm_user = cpp.get("watermark") or {}
  89. wm_full = merge_watermark_config("cell", wm_user)
  90. else:
  91. preprocessor = raw.get("preprocessor") or {}
  92. wm_user = preprocessor.get("watermark_removal") or {}
  93. wm_full = merge_watermark_config("page", wm_user)
  94. contrast = wm_full.get("contrast_enhancement")
  95. if contrast is not None and not isinstance(contrast, dict):
  96. contrast = None
  97. return WatermarkToolSettings(
  98. threshold=int(wm_full.get("threshold", 160)),
  99. morph_close_kernel=int(wm_full.get("morph_close_kernel", 0)),
  100. dpi=int(input_cfg.get("dpi", 200)),
  101. method=str(wm_full.get("method") or "masked_adaptive"),
  102. scope=scope,
  103. contrast_enhancement=copy.deepcopy(contrast) if contrast else None,
  104. debug_options=copy.deepcopy(wm_full.get("debug_options"))
  105. if wm_full.get("debug_options")
  106. else None,
  107. watermark_enabled=bool(wm_full.get("enabled", True)),
  108. watermark_config=wm_full,
  109. )
  110. def resolve_watermark_settings(
  111. config_path: Path,
  112. *,
  113. scope: str = "page",
  114. threshold: Optional[int] = None,
  115. morph_close_kernel: Optional[int] = None,
  116. dpi: Optional[int] = None,
  117. no_contrast: bool = False,
  118. text_black_target: Optional[int] = None,
  119. method: Optional[str] = None,
  120. ) -> WatermarkToolSettings:
  121. """加载配置并应用命令行覆盖。"""
  122. settings = load_watermark_settings(config_path, scope=scope)
  123. if threshold is not None:
  124. settings.threshold = threshold
  125. if morph_close_kernel is not None:
  126. settings.morph_close_kernel = morph_close_kernel
  127. if dpi is not None:
  128. settings.dpi = dpi
  129. if method is not None:
  130. settings.method = method
  131. if settings.watermark_config is not None:
  132. settings.watermark_config["method"] = method
  133. if no_contrast and settings.contrast_enhancement:
  134. settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
  135. settings.contrast_enhancement["enabled"] = False
  136. elif text_black_target is not None:
  137. if not settings.contrast_enhancement:
  138. settings.contrast_enhancement = {"enabled": True, "method": "text_restore"}
  139. else:
  140. settings.contrast_enhancement = copy.deepcopy(settings.contrast_enhancement)
  141. settings.contrast_enhancement["enabled"] = True
  142. settings.contrast_enhancement["text_black_target"] = text_black_target
  143. return settings
  144. def _watermark_removal_cfg_for_method(
  145. settings: WatermarkToolSettings,
  146. method: str,
  147. ) -> Dict[str, Any]:
  148. """构造指定 method 的 watermark_removal 配置副本。"""
  149. cfg = copy.deepcopy(settings.watermark_config or {})
  150. cfg["method"] = method
  151. cfg["threshold"] = settings.threshold
  152. cfg["morph_close_kernel"] = settings.morph_close_kernel
  153. return cfg
  154. def _apply_image_watermark_removal(
  155. img_np: np.ndarray,
  156. *,
  157. settings: WatermarkToolSettings,
  158. contrast_enhancement: Optional[Dict[str, Any]] = None,
  159. apply_watermark_removal: bool = True,
  160. removal_debug: Optional[Dict[str, Any]] = None,
  161. scope: str = "page",
  162. ) -> np.ndarray:
  163. """与 universal_doc_parser 一致的 RGB 去水印 + 可选对比度增强。"""
  164. proc = WatermarkProcessor(settings.watermark_config or {}, scope=scope) # type: ignore[arg-type]
  165. apply_contrast = contrast_enhancement is not None
  166. cleaned, _ = proc.process(
  167. img_np,
  168. apply_removal=apply_watermark_removal,
  169. contrast_override=contrast_enhancement,
  170. removal_debug=removal_debug,
  171. force=scope == "cell",
  172. )
  173. return np.asarray(cleaned)
  174. def _active_contrast_enhancement(
  175. settings: WatermarkToolSettings,
  176. ) -> Optional[Dict[str, Any]]:
  177. ce = settings.contrast_enhancement
  178. if not ce or not ce.get("enabled", False):
  179. return None
  180. return ce
  181. def _maybe_save_watermark_debug(
  182. before: np.ndarray,
  183. after: np.ndarray,
  184. debug_output_dir: Path,
  185. page_name: str,
  186. *,
  187. settings: WatermarkToolSettings,
  188. contrast_enhancement: Optional[Dict[str, Any]] = None,
  189. removal_debug: Optional[Dict[str, Any]] = None,
  190. ) -> None:
  191. """保存调试图到 debug/watermark_removal/(与 pipeline 相同布局)。"""
  192. params: Dict[str, Any] = {
  193. "method": settings.method,
  194. "threshold": settings.threshold,
  195. "morph_close_kernel": settings.morph_close_kernel,
  196. }
  197. if contrast_enhancement:
  198. params["contrast_enhancement"] = contrast_enhancement
  199. if removal_debug:
  200. for key in ("mode", "T_wm", "T_protect", "wm_mask_ratio", "white_pixel_ratio"):
  201. if key in removal_debug:
  202. params[key] = removal_debug[key]
  203. mask_overlay = None
  204. if removal_debug and "wm_mask" in removal_debug:
  205. mask_overlay = render_watermark_mask_overlay(
  206. before, removal_debug["wm_mask"]
  207. )
  208. save_watermark_removal_debug(
  209. before,
  210. after,
  211. debug_output_dir,
  212. page_name,
  213. processing_params=params,
  214. image_format=settings.debug_image_format,
  215. save_compare=True,
  216. mask_overlay=mask_overlay,
  217. )
  218. def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
  219. """
  220. 对文字型 PDF 执行原生水印去除,保留文字可搜索性。
  221. 内部委托给 watermark_utils.remove_txt_pdf_watermark() 完成内存流处理,
  222. 有水印时将结果写入 output_path。
  223. 流程:
  224. 1. scan_pdf_watermark_xobjs() 快速扫描前 3 页,无水印直接返回 0
  225. 2. remove_txt_pdf_watermark() 执行全量去除,返回 bytes 或 None
  226. 3. 有水印时写 output_path
  227. Returns:
  228. 1 表示去除成功,0 表示未发现水印
  229. """
  230. pdf_bytes = input_path.read_bytes()
  231. if not scan_pdf_watermark_xobjs(pdf_bytes, sample_pages=3):
  232. return 0
  233. cleaned = remove_txt_pdf_watermark(pdf_bytes)
  234. if cleaned is None:
  235. return 0
  236. output_path.write_bytes(cleaned)
  237. return 1
  238. def process_document(
  239. input_path: Path,
  240. output_path: Path,
  241. settings: WatermarkToolSettings,
  242. page_range: Optional[str] = None,
  243. force_image: bool = False,
  244. save_debug: bool = False,
  245. debug_output_dir: Optional[Path] = None,
  246. apply_watermark_removal: Optional[bool] = None,
  247. ) -> int:
  248. """
  249. 统一处理函数:支持 PDF(扫描件)和图片,去除水印后保存。
  250. 使用 PDFUtils.load_and_classify_document 加载并分类:
  251. - 文字型 PDF(pdf_type='txt'):优先尝试原生 XObject 水印去除(保留可搜索性);
  252. 失败时自动回退图像化处理,或 force_image=True 时直接走图像处理
  253. - 扫描件 PDF(pdf_type='ocr'):逐页去水印后重新打包为 PDF
  254. - 图片:检测水印后去除并保存
  255. Args:
  256. input_path: 输入文件路径(PDF 或图片)
  257. output_path: 输出文件路径
  258. settings: 水印配置(含 method / threshold / mask / adaptive)
  259. page_range: 页面范围字符串,如 "1-5,7,9-12"(从 1 开始,仅对 PDF 有效)
  260. force_image: 强制对文字型 PDF 使用图像化处理(会失去文字可搜索性,
  261. 但能处理水印嵌在内容流中的情况)
  262. save_debug: 是否保存 before/after/compare/meta 到 debug/watermark_removal/
  263. debug_output_dir: 调试图根目录,默认 output_path 的父目录
  264. apply_watermark_removal: 默认取 settings.watermark_enabled
  265. Returns:
  266. 实际处理的页/图片数
  267. """
  268. import shutil
  269. from io import BytesIO
  270. from PIL import Image
  271. from ocr_utils.pdf_utils import PDFUtils
  272. is_pdf = input_path.suffix.lower() == ".pdf"
  273. dpi = settings.dpi
  274. contrast_enhancement = _active_contrast_enhancement(settings)
  275. if apply_watermark_removal is None:
  276. apply_watermark_removal = settings.watermark_enabled
  277. # 统一加载 + 分类(PDF 用 MinerU pdf_classify,图片直接读取)
  278. images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
  279. input_path, dpi=dpi, page_range=page_range
  280. )
  281. # _known_has_wm: 当 txt 分支已确认有水印时设为 True,避免公共段用更严格阈值误判
  282. _known_has_wm: Optional[bool] = None
  283. # 文字型 PDF:优先尝试原生 XObject 水印去除,保留可搜索性
  284. if is_pdf and pdf_type == "txt" and not force_image:
  285. output_path.parent.mkdir(parents=True, exist_ok=True)
  286. removed = _try_remove_txt_pdf_watermark(input_path, output_path)
  287. if removed > 0:
  288. logger.info(
  289. f"✅ 文字型 PDF '{input_path.name}':删除 {removed} 个水印 XObject,"
  290. "保留文字可搜索性,已保存。"
  291. )
  292. return removed
  293. # XObject 扫描无结果,用较低阈值(0.5%)做图像水印检测二次确认
  294. # 文字 PDF 背景干净,降低阈值以检测稀疏文字水印
  295. first_np = np.array(images[0]["img_pil"])
  296. if detect_watermark(first_np, ratio_threshold=0.005):
  297. logger.warning(
  298. f"⚠️ 文字型 PDF '{input_path.name}':未找到 XObject 水印,"
  299. "但图像检测发现水印(内联内容流水印),"
  300. "回退为图像化处理(输出将失去文字可搜索性)。"
  301. )
  302. _known_has_wm = True # 明确检测到水印,跳过公共段二次检测
  303. else:
  304. logger.info(
  305. f"✅ 文字型 PDF '{input_path.name}':未检测到水印,直接复制。"
  306. )
  307. shutil.copy2(str(input_path), str(output_path))
  308. return 0
  309. elif is_pdf and pdf_type == "txt" and force_image:
  310. logger.warning(
  311. f"⚠️ 文字型 PDF '{input_path.name}':--force-image 模式,"
  312. "强制图像化处理(输出将失去文字可搜索性)。"
  313. )
  314. _known_has_wm = True # force_image 模式不再检测,直接去除
  315. logger.info(
  316. f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name} "
  317. f"共 {len(images)} {'页' if is_pdf else '张'} "
  318. f"method={settings.method} threshold={settings.threshold}"
  319. )
  320. contrast_only = (
  321. not apply_watermark_removal
  322. and contrast_enhancement
  323. and contrast_enhancement.get("enabled", False)
  324. )
  325. # 水印检测(仅用第一页/图判断,同一文档水印通常一致)
  326. # _known_has_wm 已在 txt 分支设置时,跳过重复检测
  327. if contrast_only:
  328. has_wm = True
  329. logger.info("📋 配置关闭去水印,仅应用 contrast_enhancement")
  330. elif _known_has_wm is not None:
  331. has_wm = _known_has_wm
  332. logger.info("🔍 检测到水印,启动去水印处理" if has_wm else "✅ 未检测到水印,跳过")
  333. else:
  334. first_np = np.array(images[0]["img_pil"])
  335. # 扫描件/图片路径:使用宽松一档的中间调阈值(2.5%)以避免边界误判,
  336. # 斜向直线验证仍作为双重保险防止误报
  337. has_wm = detect_watermark(first_np, ratio_threshold=0.025)
  338. if has_wm:
  339. logger.info("🔍 检测到水印,启动去水印处理")
  340. else:
  341. logger.info("✅ 未检测到水印,跳过去水印处理")
  342. if not is_pdf:
  343. # 图片无水印:直接复制
  344. output_path.parent.mkdir(parents=True, exist_ok=True)
  345. shutil.copy2(str(input_path), str(output_path))
  346. return 1
  347. output_path.parent.mkdir(parents=True, exist_ok=True)
  348. debug_root = debug_output_dir or output_path.parent
  349. if is_pdf:
  350. # 逐页处理后重新打包为 PDF
  351. try:
  352. import fitz
  353. except ImportError:
  354. raise ImportError("请安装 PyMuPDF: pip install PyMuPDF")
  355. new_doc = fitz.open()
  356. for i, img_dict in enumerate(images):
  357. pil_img = img_dict["img_pil"]
  358. img_np = np.array(pil_img)
  359. page_name = f"{input_path.stem}_page_{i + 1:03d}"
  360. if has_wm:
  361. before = img_np.copy()
  362. removal_dbg: Dict[str, Any] = {}
  363. cleaned_rgb = _apply_image_watermark_removal(
  364. img_np,
  365. settings=settings,
  366. contrast_enhancement=contrast_enhancement,
  367. apply_watermark_removal=apply_watermark_removal,
  368. removal_debug=removal_dbg,
  369. scope=settings.scope,
  370. )
  371. if save_debug:
  372. _maybe_save_watermark_debug(
  373. before,
  374. cleaned_rgb,
  375. debug_root,
  376. page_name,
  377. settings=settings,
  378. contrast_enhancement=contrast_enhancement,
  379. removal_debug=removal_dbg,
  380. )
  381. out_pil = Image.fromarray(
  382. cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
  383. )
  384. else:
  385. out_pil = pil_img
  386. buf = BytesIO()
  387. out_pil.save(buf, format="PNG", optimize=False)
  388. buf.seek(0)
  389. # 按渲染图尺寸创建新页面(保持原始 DPI 尺寸)
  390. w_px, h_px = out_pil.size
  391. new_page = new_doc.new_page(width=w_px * 72 / dpi, height=h_px * 72 / dpi)
  392. new_page.insert_image(new_page.rect, stream=buf.read())
  393. if (i + 1) % 10 == 0 or i == len(images) - 1:
  394. logger.info(f" 进度: {i + 1}/{len(images)}")
  395. new_doc.save(str(output_path), garbage=4, deflate=True)
  396. else:
  397. # 图片:有水印则去除后保存
  398. img_np = np.array(images[0]["img_pil"])
  399. before = img_np.copy()
  400. removal_dbg = {}
  401. cleaned_rgb = _apply_image_watermark_removal(
  402. img_np,
  403. settings=settings,
  404. contrast_enhancement=contrast_enhancement,
  405. apply_watermark_removal=apply_watermark_removal,
  406. removal_debug=removal_dbg,
  407. scope=settings.scope,
  408. )
  409. if save_debug:
  410. _maybe_save_watermark_debug(
  411. before,
  412. cleaned_rgb,
  413. debug_root,
  414. input_path.stem,
  415. settings=settings,
  416. contrast_enhancement=contrast_enhancement,
  417. removal_debug=removal_dbg,
  418. )
  419. out_rgb = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
  420. Image.fromarray(out_rgb).save(str(output_path))
  421. logger.info(f"✅ 保存到: {output_path}")
  422. return len(images)
  423. def preview_page(
  424. input_path: Path,
  425. settings: WatermarkToolSettings,
  426. page_idx: int = 0,
  427. ):
  428. """展示单页原图与去水印对比(需要 matplotlib)。支持 PDF 和图片文件。"""
  429. try:
  430. import matplotlib.pyplot as plt
  431. import matplotlib
  432. matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
  433. matplotlib.rcParams['axes.unicode_minus'] = False
  434. except ImportError as e:
  435. raise ImportError(f"预览需要 matplotlib: {e}")
  436. suffix = input_path.suffix.lower()
  437. if suffix == ".pdf":
  438. try:
  439. import fitz
  440. except ImportError:
  441. raise ImportError("PDF 预览需要 PyMuPDF: pip install PyMuPDF")
  442. doc = fitz.open(str(input_path))
  443. if page_idx >= len(doc):
  444. raise ValueError(f"页码 {page_idx} 超出范围(共 {len(doc)} 页)")
  445. mat = fitz.Matrix(settings.dpi / 72, settings.dpi / 72)
  446. page = doc[page_idx]
  447. pix = page.get_pixmap(matrix=mat, alpha=False)
  448. img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
  449. title_orig = f"原图 第 {page_idx + 1} 页"
  450. elif suffix in IMAGE_SUFFIXES:
  451. from PIL import Image
  452. img_np = np.array(Image.open(str(input_path)).convert("RGB"))
  453. title_orig = f"原图 {input_path.name}"
  454. else:
  455. raise ValueError(f"不支持的文件格式: {suffix}")
  456. contrast = _active_contrast_enhancement(settings)
  457. cleaned_rgb = _apply_image_watermark_removal(
  458. img_np,
  459. settings=settings,
  460. contrast_enhancement=contrast,
  461. apply_watermark_removal=settings.watermark_enabled,
  462. scope=settings.scope,
  463. )
  464. cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)
  465. fig, axes = plt.subplots(1, 2, figsize=(20, 14))
  466. axes[0].imshow(img_np)
  467. axes[0].set_title(title_orig, fontsize=14)
  468. axes[0].axis('off')
  469. subtitle = f"method={settings.method}, threshold={settings.threshold}"
  470. if contrast:
  471. subtitle += f", contrast={contrast.get('method', 'on')}"
  472. axes[1].imshow(cleaned, cmap='gray')
  473. axes[1].set_title(f"去水印后 {subtitle}", fontsize=14)
  474. axes[1].axis('off')
  475. plt.tight_layout()
  476. plt.show()
  477. def _run_process_document(
  478. input_path: Path,
  479. output_path: Path,
  480. settings: WatermarkToolSettings,
  481. *,
  482. page_range: Optional[str] = None,
  483. force_image: bool = False,
  484. save_debug: bool = False,
  485. debug_output_dir: Optional[Path] = None,
  486. ) -> int:
  487. return process_document(
  488. input_path,
  489. output_path,
  490. settings,
  491. page_range=page_range,
  492. force_image=force_image,
  493. save_debug=save_debug,
  494. debug_output_dir=debug_output_dir,
  495. )
  496. def compare_watermark_methods(
  497. input_path: Path,
  498. output_dir: Path,
  499. settings: WatermarkToolSettings,
  500. ) -> Dict[str, str]:
  501. """
  502. 同一张图对比 threshold 与 masked_adaptive,输出三联图与 meta。
  503. Returns:
  504. 各输出文件路径
  505. """
  506. from PIL import Image
  507. output_dir.mkdir(parents=True, exist_ok=True)
  508. stem = input_path.stem
  509. img_rgb = np.array(Image.open(str(input_path)).convert("RGB"))
  510. contrast = _active_contrast_enhancement(settings)
  511. paths: Dict[str, str] = {}
  512. results: Dict[str, np.ndarray] = {}
  513. for method in ("threshold", "masked_adaptive"):
  514. sub = copy.deepcopy(settings)
  515. sub.method = method
  516. dbg: Dict[str, Any] = {}
  517. sub.watermark_config = _watermark_removal_cfg_for_method(sub, method)
  518. out = _apply_image_watermark_removal(
  519. img_rgb,
  520. settings=sub,
  521. contrast_enhancement=contrast,
  522. removal_debug=dbg,
  523. scope=settings.scope,
  524. )
  525. out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
  526. results[method] = out_rgb
  527. out_path = output_dir / f"{stem}_cleaned_{method}.png"
  528. Image.fromarray(out_rgb).save(str(out_path))
  529. paths[method] = str(out_path)
  530. meta_path = output_dir / f"{stem}_meta_{method}.json"
  531. meta = {
  532. "method": method,
  533. "threshold": settings.threshold,
  534. "mask_mode": dbg.get("mask_mode"),
  535. "direction_filter": dbg.get("direction_filter"),
  536. "whiten_mode": dbg.get("whiten_mode"),
  537. "T_wm": dbg.get("T_wm"),
  538. "T_protect": dbg.get("T_protect"),
  539. "mode": dbg.get("mode"),
  540. "midtone_ratio": dbg.get("midtone_ratio"),
  541. "wm_candidate_ratio": dbg.get("wm_candidate_ratio"),
  542. "geom_mask_ratio": dbg.get("geom_mask_ratio"),
  543. "geom_candidate_ratio": dbg.get("geom_candidate_ratio"),
  544. "wm_mask_ratio": dbg.get("wm_mask_ratio"),
  545. "white_pixel_ratio": dbg.get("white_pixel_ratio"),
  546. "hough_kept_lines": dbg.get("hough_kept_lines"),
  547. "hough_diag_candidates": dbg.get("hough_diag_candidates"),
  548. "hough_total_lines": dbg.get("hough_total_lines"),
  549. "dominant_angles": dbg.get("dominant_angles"),
  550. "whiten_gray_low": dbg.get("whiten_gray_low"),
  551. }
  552. meta_path.write_text(
  553. json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
  554. )
  555. paths[f"meta_{method}"] = str(meta_path)
  556. if method == "masked_adaptive":
  557. layer_paths = save_watermark_mask_debug_layers(
  558. img_rgb, output_dir, stem, dbg, image_format="png"
  559. )
  560. paths.update(layer_paths)
  561. h = max(results["threshold"].shape[0], results["masked_adaptive"].shape[0])
  562. def _resize_rgb(arr: np.ndarray) -> np.ndarray:
  563. if arr.shape[0] == h:
  564. return arr
  565. scale = h / arr.shape[0]
  566. w = int(arr.shape[1] * scale)
  567. return cv2.resize(arr, (w, h))
  568. triple = np.hstack(
  569. [_resize_rgb(img_rgb)]
  570. + [_resize_rgb(results[m]) for m in ("threshold", "masked_adaptive")]
  571. )
  572. compare_path = output_dir / f"{stem}_compare_orig_threshold_masked.png"
  573. cv2.imwrite(
  574. str(compare_path),
  575. cv2.cvtColor(triple, cv2.COLOR_RGB2BGR),
  576. )
  577. paths["compare_triple"] = str(compare_path)
  578. logger.info(f"✅ 方法对比已保存: {compare_path}")
  579. return paths
  580. def main():
  581. parser = argparse.ArgumentParser(
  582. description="银行流水水印去除工具(参数默认来自场景 YAML,与 main_v2 Pipeline 一致)",
  583. formatter_class=argparse.RawDescriptionHelpFormatter,
  584. epilog=__doc__,
  585. )
  586. parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录(批量模式)")
  587. parser.add_argument(
  588. "-c",
  589. "--config",
  590. type=Path,
  591. default=_DEFAULT_CONFIG_PATH,
  592. help=f"场景配置文件,读取 preprocessor.watermark_removal(默认: {_DEFAULT_CONFIG_PATH.name})",
  593. )
  594. parser.add_argument(
  595. "-o",
  596. "--output",
  597. type=Path,
  598. default=None,
  599. help="输出路径(单文件模式;默认在原文件名后加 _cleaned)",
  600. )
  601. parser.add_argument("--batch", action="store_true", help="批量处理目录下所有 PDF 和图片")
  602. parser.add_argument("--preview", action="store_true", help="预览模式:展示单页对比图(不保存)")
  603. parser.add_argument("--page", type=int, default=0, help="预览页码(0-based)")
  604. parser.add_argument(
  605. "--page-range",
  606. type=str,
  607. default=None,
  608. help="PDF 页面范围,如 '1-3,5,7-9'(从 1 开始)",
  609. )
  610. parser.add_argument(
  611. "--force-image",
  612. action="store_true",
  613. help="文字型 PDF 强制走图像去水印(失去可搜索性)",
  614. )
  615. parser.add_argument(
  616. "--debug",
  617. action="store_true",
  618. help="保存调试图到 debug/watermark_removal/",
  619. )
  620. parser.add_argument(
  621. "--debug-dir",
  622. type=Path,
  623. default=None,
  624. help="调试图根目录(默认 -o 的父目录;格式见配置文件 debug_options.image_format)",
  625. )
  626. # 以下为覆盖配置文件的少量旋钮(未指定则完全使用 YAML)
  627. override = parser.add_argument_group("覆盖配置文件(可选)")
  628. override.add_argument(
  629. "--threshold",
  630. type=int,
  631. default=None,
  632. help="覆盖 watermark_removal.threshold(140-180)",
  633. )
  634. override.add_argument(
  635. "--morph-kernel",
  636. type=int,
  637. default=None,
  638. help="覆盖 watermark_removal.morph_close_kernel",
  639. )
  640. override.add_argument("--dpi", type=int, default=None, help="覆盖 input.dpi")
  641. override.add_argument("--no-contrast", action="store_true", help="关闭 contrast_enhancement")
  642. override.add_argument(
  643. "--text-black-target",
  644. type=int,
  645. default=None,
  646. help="覆盖 contrast_enhancement.text_black_target(text_restore)",
  647. )
  648. override.add_argument(
  649. "--method",
  650. type=str,
  651. default=None,
  652. choices=["threshold", "masked", "masked_adaptive"],
  653. help="覆盖 watermark_removal.method",
  654. )
  655. parser.add_argument(
  656. "--scope",
  657. type=str,
  658. default="page",
  659. choices=["page", "cell"],
  660. help="page=页级 preprocessor;cell=二次 OCR 单元格 preset",
  661. )
  662. parser.add_argument(
  663. "--compare-methods",
  664. action="store_true",
  665. help="对比 threshold 与 masked_adaptive,输出三联图到 -o 目录",
  666. )
  667. args = parser.parse_args()
  668. try:
  669. settings = resolve_watermark_settings(
  670. args.config,
  671. scope=args.scope,
  672. threshold=args.threshold,
  673. morph_close_kernel=args.morph_kernel,
  674. dpi=args.dpi,
  675. no_contrast=args.no_contrast,
  676. text_black_target=args.text_black_target,
  677. method=args.method,
  678. )
  679. except FileNotFoundError as e:
  680. logger.error(str(e))
  681. sys.exit(1)
  682. logger.info(
  683. f"📋 配置: {args.config} | method={settings.method} | "
  684. f"threshold={settings.threshold} | morph_kernel={settings.morph_close_kernel} | "
  685. f"dpi={settings.dpi} | contrast={settings.contrast_enhancement}"
  686. )
  687. if args.compare_methods:
  688. input_path = args.input
  689. if not input_path.is_file():
  690. logger.error(f"文件不存在: {input_path}")
  691. sys.exit(1)
  692. out_dir = args.output or (
  693. input_path.parent / "debug" / "watermark_method_compare"
  694. )
  695. paths = compare_watermark_methods(input_path, out_dir, settings)
  696. for k, v in paths.items():
  697. logger.info(f" {k}: {v}")
  698. return
  699. if args.preview:
  700. preview_page(args.input, settings, page_idx=args.page)
  701. return
  702. if args.batch:
  703. # 批量模式:处理目录下所有 PDF 和图片
  704. input_dir = args.input
  705. if not input_dir.is_dir():
  706. logger.error(f"批量模式需要传入目录: {input_dir}")
  707. sys.exit(1)
  708. # 收集所有支持的文件
  709. all_files: list[Path] = sorted(input_dir.glob("*.pdf"))
  710. for ext in IMAGE_SUFFIXES:
  711. all_files.extend(sorted(input_dir.glob(f"*{ext}")))
  712. all_files.extend(sorted(input_dir.glob(f"*{ext.upper()}")))
  713. all_files = sorted(set(all_files))
  714. if not all_files:
  715. logger.warning(f"目录中没有可处理的文件(PDF/图片): {input_dir}")
  716. return
  717. out_dir = args.output or input_dir / "cleaned"
  718. out_dir.mkdir(parents=True, exist_ok=True)
  719. for file in all_files:
  720. out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
  721. try:
  722. _run_process_document(
  723. file,
  724. out_file,
  725. settings,
  726. page_range=args.page_range,
  727. force_image=args.force_image,
  728. save_debug=args.debug,
  729. debug_output_dir=args.debug_dir or out_dir,
  730. )
  731. except Exception as e:
  732. logger.error(f"❌ 处理失败 {file.name}: {e}")
  733. logger.info(f"✅ 批量处理完成,共 {len(all_files)} 个文件 -> {out_dir}")
  734. else:
  735. # 单文件模式
  736. input_path = args.input
  737. if not input_path.is_file():
  738. logger.error(f"文件不存在: {input_path}")
  739. sys.exit(1)
  740. output_path = args.output or input_path.with_name(
  741. f"{input_path.stem}_cleaned{input_path.suffix}"
  742. )
  743. suffix = input_path.suffix.lower()
  744. if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
  745. _run_process_document(
  746. input_path,
  747. output_path,
  748. settings,
  749. page_range=args.page_range,
  750. force_image=args.force_image,
  751. save_debug=args.debug,
  752. debug_output_dir=args.debug_dir or output_path.parent,
  753. )
  754. else:
  755. logger.error(f"不支持的文件格式: {suffix},支持 PDF 和 {IMAGE_SUFFIXES}")
  756. sys.exit(1)
  757. if __name__ == "__main__":
  758. if len(sys.argv) == 1:
  759. print("ℹ️ 未提供命令行参数,使用默认配置运行...")
  760. # 默认配置(用于开发测试)
  761. default_config = {
  762. # 测试输入
  763. # "input": "/Users/zhch158/workspace/data/流水分析/杨万益_福建农信.pdf",
  764. # "input": "Users/zhch158/workspace/data/流水分析/提取自杨万益_福建农信.png",
  765. # 文字PDF测试
  766. # "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
  767. # "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
  768. # "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台(ocr_platform)-交易流水识别,财报识别.pdf",
  769. "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/彭_广东兴宁农村商业银行/彭_广东兴宁农村商业银行_page_002.png",
  770. # "output": "./output/杨万益_福建农信",
  771. # 页面范围(可选,支持 "1-5,7" 语法,仅对 PDF 有效)
  772. # "page_range": "3", # 仅处理第 1 页(对应 --page-range 参数)
  773. "config": str(_DEFAULT_CONFIG_PATH),
  774. "preview": True,
  775. "debug": True,
  776. "compare-methods": True,
  777. }
  778. # 构造参数(注意 input 是位置参数,morph_kernel 对应 --morph-kernel)
  779. sys.argv = [sys.argv[0], default_config["input"]]
  780. skip_keys = {"input"}
  781. for key, value in default_config.items():
  782. if key in skip_keys:
  783. continue
  784. # 将下划线转换为连字符(如 morph_kernel -> morph-kernel)
  785. flag = f"--{key.replace('_', '-')}"
  786. if isinstance(value, bool):
  787. if value:
  788. sys.argv.append(flag)
  789. else:
  790. sys.argv.extend([flag, str(value)])
  791. sys.exit(main())