|
@@ -2,35 +2,42 @@
|
|
|
银行流水水印去除工具
|
|
银行流水水印去除工具
|
|
|
|
|
|
|
|
支持 PDF 和常见图片格式(jpg/png/tif/bmp/webp)。
|
|
支持 PDF 和常见图片格式(jpg/png/tif/bmp/webp)。
|
|
|
-- 输入 PDF → 输出去水印 PDF(扫描件)或直接复制(文字型)
|
|
|
|
|
-- 输入图片 → 输出去水印图片(保持原格式)
|
|
|
|
|
-适用于福建农信、邮储银行等带有半透明文字水印的银行流水单。
|
|
|
|
|
|
|
+参数默认从与 main_v2 相同的场景 YAML 读取(preprocessor.watermark_removal),
|
|
|
|
|
+命令行仅用于输入/输出、批量、预览及少量覆盖项。
|
|
|
|
|
|
|
|
用法:
|
|
用法:
|
|
|
- # 处理单个 PDF 或图片
|
|
|
|
|
|
|
+ # 使用默认场景配置(bank_statement_yusys_local.yaml)
|
|
|
python remove_watermark.py input.pdf
|
|
python remove_watermark.py input.pdf
|
|
|
- python remove_watermark.py input.jpg
|
|
|
|
|
|
|
|
|
|
- # 指定输出路径
|
|
|
|
|
- python remove_watermark.py input.pdf -o output.pdf
|
|
|
|
|
|
|
+ # 指定场景配置(与 Pipeline 一致)
|
|
|
|
|
+ python remove_watermark.py input.png -c ../universal_doc_parser/config/bank_statement_yusys_local.yaml
|
|
|
|
|
|
|
|
- # 指定页面范围(支持 "1-5,7,9-12" 格式)
|
|
|
|
|
- python remove_watermark.py input.pdf --page-range 1-3
|
|
|
|
|
|
|
+ # 保存调试图(before/after/compare/meta)
|
|
|
|
|
+ python remove_watermark.py input.png -o ./out --debug
|
|
|
|
|
|
|
|
- # 调整去除阈值(默认 160,范围建议 140-180)
|
|
|
|
|
|
|
+ # 临时覆盖阈值(其余仍来自配置文件)
|
|
|
python remove_watermark.py input.pdf --threshold 170
|
|
python remove_watermark.py input.pdf --threshold 170
|
|
|
|
|
|
|
|
- # 批量处理目录下所有 PDF 和图片
|
|
|
|
|
- python remove_watermark.py /path/to/dir/ --batch
|
|
|
|
|
-
|
|
|
|
|
- # 预览单页/图片效果(不保存,直接展示对比图)
|
|
|
|
|
|
|
+ # 预览
|
|
|
python remove_watermark.py input.pdf --preview --page 0
|
|
python remove_watermark.py input.pdf --preview --page 0
|
|
|
- python remove_watermark.py input.jpg --preview
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 批量
|
|
|
|
|
+ python remove_watermark.py /path/to/dir/ --batch -o ./cleaned
|
|
|
|
|
+
|
|
|
|
|
+ # 对比 threshold vs masked_adaptive(输出三联图)
|
|
|
|
|
+ python remove_watermark.py page_002.png --compare-methods -o ./method_compare
|
|
|
"""
|
|
"""
|
|
|
import argparse
|
|
import argparse
|
|
|
|
|
+import copy
|
|
|
|
|
+import json
|
|
|
import sys
|
|
import sys
|
|
|
|
|
+from dataclasses import dataclass
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
-from typing import Optional
|
|
|
|
|
|
|
+from typing import Any, Dict, Optional
|
|
|
|
|
+
|
|
|
|
|
+import cv2
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+import yaml
|
|
|
|
|
|
|
|
# 将 ocr_platform 根目录加入 sys.path,以便导入 ocr_utils
|
|
# 将 ocr_platform 根目录加入 sys.path,以便导入 ocr_utils
|
|
|
_repo_root = Path(__file__).parents[2]
|
|
_repo_root = Path(__file__).parents[2]
|
|
@@ -40,7 +47,10 @@ if str(_repo_root) not in sys.path:
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
from ocr_utils.watermark_utils import (
|
|
from ocr_utils.watermark_utils import (
|
|
|
detect_watermark,
|
|
detect_watermark,
|
|
|
- remove_watermark_from_image,
|
|
|
|
|
|
|
+ remove_watermark_from_image_rgb,
|
|
|
|
|
+ render_watermark_mask_overlay,
|
|
|
|
|
+ save_watermark_removal_debug,
|
|
|
|
|
+ save_watermark_mask_debug_layers,
|
|
|
scan_pdf_watermark_xobjs,
|
|
scan_pdf_watermark_xobjs,
|
|
|
remove_txt_pdf_watermark,
|
|
remove_txt_pdf_watermark,
|
|
|
)
|
|
)
|
|
@@ -48,6 +58,190 @@ from ocr_utils.watermark_utils import (
|
|
|
# 支持的图片后缀(小写)
|
|
# 支持的图片后缀(小写)
|
|
|
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
|
|
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
|
|
|
|
|
|
|
|
|
|
# Scenario YAML used when -c/--config is not given on the command line.
_DEFAULT_CONFIG_PATH = _repo_root.joinpath(
    "ocr_tools",
    "universal_doc_parser",
    "config",
    "bank_statement_yusys_local.yaml",
)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class WatermarkToolSettings:
    """Watermark-processing parameters parsed from a scenario YAML.

    The fields mirror the ``preprocessor.watermark_removal`` section (plus
    ``input.dpi``) so that this tool stays aligned with the pipeline
    preprocessor.
    """

    # Grayscale threshold handed to the watermark-removal routine.
    threshold: int = 160
    # Kernel size of the morphological closing step.
    morph_close_kernel: int = 0
    # Rendering resolution used when rasterising PDF pages.
    dpi: int = 200
    # Name of the removal method (the CLI offers threshold / masked / masked_adaptive).
    method: str = "threshold"
    # Raw ``contrast_enhancement`` sub-section, or None when absent.
    contrast_enhancement: Optional[Dict[str, Any]] = None
    # Raw ``debug_options`` sub-section, or None when absent.
    debug_options: Optional[Dict[str, Any]] = None
    # Value of ``watermark_removal.enabled``.
    watermark_enabled: bool = True
    # Copy of the whole ``watermark_removal`` section.
    watermark_config: Optional[Dict[str, Any]] = None

    @property
    def debug_image_format(self) -> str:
        """Return the debug image extension without a leading dot.

        Falls back to ``"png"`` when ``debug_options`` is missing, is not a
        mapping, or carries an empty / dot-only ``image_format``.
        """
        opts = self.debug_options if isinstance(self.debug_options, dict) else {}
        fmt = str(opts.get("image_format") or "").strip().lstrip(".")
        # A value such as "." strips down to "", which is not a usable format.
        return fmt or "png"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def load_watermark_settings(config_path: Path) -> WatermarkToolSettings:
    """Read watermark settings from a universal_doc_parser scenario config.

    Only ``preprocessor.watermark_removal`` and ``input.dpi`` are consulted.
    The full ConfigManager is deliberately not used, so debugging watermark
    removal does not require the layout/ocr sections to be present.

    Args:
        config_path: Path of the scenario YAML file.

    Returns:
        A ``WatermarkToolSettings`` populated from the file. Keys that are
        missing or explicitly null fall back to the built-in defaults
        (threshold 160, morph_close_kernel 0, dpi 200, method "threshold").

    Raises:
        FileNotFoundError: ``config_path`` is not an existing file.
        ValueError: A numeric option holds a value ``int()`` cannot convert.
    """
    config_path = Path(config_path)
    if not config_path.is_file():
        raise FileNotFoundError(f"配置文件不存在: {config_path}")

    with open(config_path, encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    def _section(parent: Any, key: str) -> Dict[str, Any]:
        # Anything that is not a mapping (null, scalar, list) counts as empty,
        # so a malformed section cannot trigger an AttributeError further down.
        value = parent.get(key) if isinstance(parent, dict) else None
        return value if isinstance(value, dict) else {}

    def _int_option(section: Dict[str, Any], key: str, default: int) -> int:
        # ``key:`` with no value parses as None; treat it like a missing key
        # instead of letting int(None) raise TypeError.
        value = section.get(key)
        return default if value is None else int(value)

    wm = _section(_section(raw, "preprocessor"), "watermark_removal")
    input_cfg = _section(raw, "input")

    contrast = _section(wm, "contrast_enhancement")
    debug_options = _section(wm, "debug_options")

    return WatermarkToolSettings(
        threshold=_int_option(wm, "threshold", 160),
        morph_close_kernel=_int_option(wm, "morph_close_kernel", 0),
        dpi=_int_option(input_cfg, "dpi", 200),
        method=str(wm.get("method") or "threshold"),
        # Empty sub-sections are reported as None, matching "not configured".
        contrast_enhancement=copy.deepcopy(contrast) if contrast else None,
        debug_options=copy.deepcopy(debug_options) if debug_options else None,
        watermark_enabled=bool(wm.get("enabled", True)),
        # Independent copy so later overrides never touch the parsed YAML tree.
        watermark_config=copy.deepcopy(wm),
    )
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def resolve_watermark_settings(
    config_path: Path,
    *,
    threshold: Optional[int] = None,
    morph_close_kernel: Optional[int] = None,
    dpi: Optional[int] = None,
    no_contrast: bool = False,
    text_black_target: Optional[int] = None,
    method: Optional[str] = None,
) -> WatermarkToolSettings:
    """Load the scenario config and apply command-line overrides on top.

    Args:
        config_path: Scenario YAML passed to ``load_watermark_settings``.
        threshold: Replaces ``settings.threshold`` when not None.
        morph_close_kernel: Replaces ``settings.morph_close_kernel`` when not None.
        dpi: Replaces ``settings.dpi`` when not None.
        no_contrast: Marks an existing contrast_enhancement section as
            disabled. Takes precedence over ``text_black_target``.
        text_black_target: Enables contrast enhancement and sets its
            ``text_black_target``; a ``text_restore`` section is created when
            the config has none. Ignored when ``no_contrast`` is set.
        method: Replaces ``settings.method`` (and the ``method`` key of
            ``settings.watermark_config``) when not None.

    Returns:
        The resolved ``WatermarkToolSettings``.

    Raises:
        FileNotFoundError: Propagated from ``load_watermark_settings``.
    """
    settings = load_watermark_settings(config_path)

    if threshold is not None:
        settings.threshold = threshold
    if morph_close_kernel is not None:
        settings.morph_close_kernel = morph_close_kernel
    if dpi is not None:
        settings.dpi = dpi
    if method is not None:
        settings.method = method
        if settings.watermark_config is not None:
            settings.watermark_config["method"] = method

    if no_contrast:
        # Checked on its own so that "--no-contrast --text-black-target N"
        # can never fall through and switch contrast enhancement back on
        # when the config has no contrast section.
        if settings.contrast_enhancement:
            disabled = copy.deepcopy(settings.contrast_enhancement)
            disabled["enabled"] = False
            settings.contrast_enhancement = disabled
    elif text_black_target is not None:
        if settings.contrast_enhancement:
            enhanced = copy.deepcopy(settings.contrast_enhancement)
        else:
            enhanced = {"method": "text_restore"}
        enhanced["enabled"] = True
        enhanced["text_black_target"] = text_black_target
        settings.contrast_enhancement = enhanced

    return settings
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _watermark_removal_cfg_for_method(
    settings: WatermarkToolSettings,
    method: str,
) -> Dict[str, Any]:
    """Return a copy of the watermark_removal config pinned to ``method``.

    The copy starts from ``settings.watermark_config`` (or an empty dict) and
    has ``method``, ``threshold`` and ``morph_close_kernel`` overwritten with
    the given method and the current values on ``settings``.
    """
    merged: Dict[str, Any] = copy.deepcopy(settings.watermark_config or {})
    merged.update(
        {
            "method": method,
            "threshold": settings.threshold,
            "morph_close_kernel": settings.morph_close_kernel,
        }
    )
    return merged
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _apply_image_watermark_removal(
    img_np: np.ndarray,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    apply_watermark_removal: bool = True,
    removal_debug: Optional[Dict[str, Any]] = None,
) -> np.ndarray:
    """Run RGB watermark removal plus optional contrast enhancement.

    Delegates to ``remove_watermark_from_image_rgb`` with the values held in
    ``settings`` (kept consistent with universal_doc_parser) and returns the
    result as a NumPy array.

    Args:
        img_np: Input image array.
        settings: Source of method, threshold and morph_close_kernel.
        contrast_enhancement: Forwarded unchanged to the removal routine.
        apply_watermark_removal: Forwarded unchanged to the removal routine.
        removal_debug: Optional dict forwarded to the removal routine.
    """
    removal_cfg = _watermark_removal_cfg_for_method(settings, settings.method)
    processed = remove_watermark_from_image_rgb(
        img_np,
        threshold=settings.threshold,
        morph_close_kernel=settings.morph_close_kernel,
        contrast_enhancement=contrast_enhancement,
        apply_watermark_removal=apply_watermark_removal,
        watermark_removal_cfg=removal_cfg,
        removal_debug=removal_debug,
        return_pil=False,
    )
    return np.asarray(processed)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _active_contrast_enhancement(
    settings: WatermarkToolSettings,
) -> Optional[Dict[str, Any]]:
    """Return the contrast_enhancement config only when it is switched on.

    Yields ``None`` when the section is absent/empty or its ``enabled`` flag
    is missing or falsy.
    """
    section = settings.contrast_enhancement
    if section and section.get("enabled", False):
        return section
    return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _maybe_save_watermark_debug(
    before: np.ndarray,
    after: np.ndarray,
    debug_output_dir: Path,
    page_name: str,
    *,
    settings: WatermarkToolSettings,
    contrast_enhancement: Optional[Dict[str, Any]] = None,
    removal_debug: Optional[Dict[str, Any]] = None,
) -> None:
    """Save debug images for one page via ``save_watermark_removal_debug``.

    The images are intended to land under debug_comparison/watermark_removal/
    with the same layout as the pipeline; the actual layout is decided by
    ``save_watermark_removal_debug``.

    Args:
        before: Image prior to processing.
        after: Image after processing.
        debug_output_dir: Root directory handed to the debug writer.
        page_name: Name used to label the saved artefacts.
        settings: Supplies method, threshold, morph_close_kernel and the
            debug image format.
        contrast_enhancement: Recorded in the parameters when truthy.
        removal_debug: Optional diagnostics from the removal step; selected
            keys are recorded and ``wm_mask`` (if present) is rendered as an
            overlay on ``before``.
    """
    diagnostics = removal_debug or {}
    recorded_keys = ("mode", "T_wm", "T_protect", "wm_mask_ratio", "white_pixel_ratio")

    params: Dict[str, Any] = {
        "method": settings.method,
        "threshold": settings.threshold,
        "morph_close_kernel": settings.morph_close_kernel,
    }
    if contrast_enhancement:
        params["contrast_enhancement"] = contrast_enhancement
    params.update({key: diagnostics[key] for key in recorded_keys if key in diagnostics})

    if "wm_mask" in diagnostics:
        mask_overlay = render_watermark_mask_overlay(before, diagnostics["wm_mask"])
    else:
        mask_overlay = None

    save_watermark_removal_debug(
        before,
        after,
        debug_output_dir,
        page_name,
        processing_params=params,
        image_format=settings.debug_image_format,
        save_compare=True,
        mask_overlay=mask_overlay,
    )
|
|
|
|
|
+
|
|
|
|
|
|
|
|
def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
|
|
def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
|
|
|
"""
|
|
"""
|
|
@@ -81,11 +275,12 @@ def _try_remove_txt_pdf_watermark(input_path: Path, output_path: Path) -> int:
|
|
|
def process_document(
|
|
def process_document(
|
|
|
input_path: Path,
|
|
input_path: Path,
|
|
|
output_path: Path,
|
|
output_path: Path,
|
|
|
- threshold: int = 160,
|
|
|
|
|
- morph_close_kernel: int = 0,
|
|
|
|
|
- dpi: int = 200,
|
|
|
|
|
|
|
+ settings: WatermarkToolSettings,
|
|
|
page_range: Optional[str] = None,
|
|
page_range: Optional[str] = None,
|
|
|
force_image: bool = False,
|
|
force_image: bool = False,
|
|
|
|
|
+ save_debug: bool = False,
|
|
|
|
|
+ debug_output_dir: Optional[Path] = None,
|
|
|
|
|
+ apply_watermark_removal: Optional[bool] = None,
|
|
|
) -> int:
|
|
) -> int:
|
|
|
"""
|
|
"""
|
|
|
统一处理函数:支持 PDF(扫描件)和图片,去除水印后保存。
|
|
统一处理函数:支持 PDF(扫描件)和图片,去除水印后保存。
|
|
@@ -99,23 +294,27 @@ def process_document(
|
|
|
Args:
|
|
Args:
|
|
|
input_path: 输入文件路径(PDF 或图片)
|
|
input_path: 输入文件路径(PDF 或图片)
|
|
|
output_path: 输出文件路径
|
|
output_path: 输出文件路径
|
|
|
- threshold: 灰度阈值(140-180),越大保守,越小激进
|
|
|
|
|
- morph_close_kernel: 形态学闭运算核大小,0 跳过
|
|
|
|
|
- dpi: PDF 渲染分辨率
|
|
|
|
|
|
|
+ settings: 水印配置(含 method / threshold / mask / adaptive)
|
|
|
page_range: 页面范围字符串,如 "1-5,7,9-12"(从 1 开始,仅对 PDF 有效)
|
|
page_range: 页面范围字符串,如 "1-5,7,9-12"(从 1 开始,仅对 PDF 有效)
|
|
|
force_image: 强制对文字型 PDF 使用图像化处理(会失去文字可搜索性,
|
|
force_image: 强制对文字型 PDF 使用图像化处理(会失去文字可搜索性,
|
|
|
但能处理水印嵌在内容流中的情况)
|
|
但能处理水印嵌在内容流中的情况)
|
|
|
|
|
+ save_debug: 是否保存 before/after/compare/meta 到 debug_comparison/watermark_removal/
|
|
|
|
|
+ debug_output_dir: 调试图根目录,默认 output_path 的父目录
|
|
|
|
|
+ apply_watermark_removal: 默认取 settings.watermark_enabled
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
实际处理的页/图片数
|
|
实际处理的页/图片数
|
|
|
"""
|
|
"""
|
|
|
import shutil
|
|
import shutil
|
|
|
- import numpy as np
|
|
|
|
|
from io import BytesIO
|
|
from io import BytesIO
|
|
|
from PIL import Image
|
|
from PIL import Image
|
|
|
from ocr_utils.pdf_utils import PDFUtils
|
|
from ocr_utils.pdf_utils import PDFUtils
|
|
|
|
|
|
|
|
is_pdf = input_path.suffix.lower() == ".pdf"
|
|
is_pdf = input_path.suffix.lower() == ".pdf"
|
|
|
|
|
+ dpi = settings.dpi
|
|
|
|
|
+ contrast_enhancement = _active_contrast_enhancement(settings)
|
|
|
|
|
+ if apply_watermark_removal is None:
|
|
|
|
|
+ apply_watermark_removal = settings.watermark_enabled
|
|
|
|
|
|
|
|
# 统一加载 + 分类(PDF 用 MinerU pdf_classify,图片直接读取)
|
|
# 统一加载 + 分类(PDF 用 MinerU pdf_classify,图片直接读取)
|
|
|
images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
|
|
images, pdf_type, pdf_doc, renderer = PDFUtils.load_and_classify_document(
|
|
@@ -161,12 +360,22 @@ def process_document(
|
|
|
|
|
|
|
|
logger.info(
|
|
logger.info(
|
|
|
f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name} "
|
|
f"{'📄' if is_pdf else '🖼️ '} 处理: {input_path.name} "
|
|
|
- f"共 {len(images)} {'页' if is_pdf else '张'} threshold={threshold}"
|
|
|
|
|
|
|
+ f"共 {len(images)} {'页' if is_pdf else '张'} "
|
|
|
|
|
+ f"method={settings.method} threshold={settings.threshold}"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ contrast_only = (
|
|
|
|
|
+ not apply_watermark_removal
|
|
|
|
|
+ and contrast_enhancement
|
|
|
|
|
+ and contrast_enhancement.get("enabled", False)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 水印检测(仅用第一页/图判断,同一文档水印通常一致)
|
|
# 水印检测(仅用第一页/图判断,同一文档水印通常一致)
|
|
|
# _known_has_wm 已在 txt 分支设置时,跳过重复检测
|
|
# _known_has_wm 已在 txt 分支设置时,跳过重复检测
|
|
|
- if _known_has_wm is not None:
|
|
|
|
|
|
|
+ if contrast_only:
|
|
|
|
|
+ has_wm = True
|
|
|
|
|
+ logger.info("📋 配置关闭去水印,仅应用 contrast_enhancement")
|
|
|
|
|
+ elif _known_has_wm is not None:
|
|
|
has_wm = _known_has_wm
|
|
has_wm = _known_has_wm
|
|
|
logger.info("🔍 检测到水印,启动去水印处理" if has_wm else "✅ 未检测到水印,跳过")
|
|
logger.info("🔍 检测到水印,启动去水印处理" if has_wm else "✅ 未检测到水印,跳过")
|
|
|
else:
|
|
else:
|
|
@@ -185,6 +394,7 @@ def process_document(
|
|
|
return 1
|
|
return 1
|
|
|
|
|
|
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
+ debug_root = debug_output_dir or output_path.parent
|
|
|
|
|
|
|
|
if is_pdf:
|
|
if is_pdf:
|
|
|
# 逐页处理后重新打包为 PDF
|
|
# 逐页处理后重新打包为 PDF
|
|
@@ -197,13 +407,31 @@ def process_document(
|
|
|
for i, img_dict in enumerate(images):
|
|
for i, img_dict in enumerate(images):
|
|
|
pil_img = img_dict["img_pil"]
|
|
pil_img = img_dict["img_pil"]
|
|
|
img_np = np.array(pil_img)
|
|
img_np = np.array(pil_img)
|
|
|
|
|
+ page_name = f"{input_path.stem}_page_{i + 1:03d}"
|
|
|
|
|
|
|
|
if has_wm:
|
|
if has_wm:
|
|
|
- cleaned_gray = remove_watermark_from_image(
|
|
|
|
|
- img_np, threshold=threshold,
|
|
|
|
|
- morph_close_kernel=morph_close_kernel, return_pil=False,
|
|
|
|
|
|
|
+ before = img_np.copy()
|
|
|
|
|
+ removal_dbg: Dict[str, Any] = {}
|
|
|
|
|
+ cleaned_rgb = _apply_image_watermark_removal(
|
|
|
|
|
+ img_np,
|
|
|
|
|
+ settings=settings,
|
|
|
|
|
+ contrast_enhancement=contrast_enhancement,
|
|
|
|
|
+ apply_watermark_removal=apply_watermark_removal,
|
|
|
|
|
+ removal_debug=removal_dbg,
|
|
|
|
|
+ )
|
|
|
|
|
+ if save_debug:
|
|
|
|
|
+ _maybe_save_watermark_debug(
|
|
|
|
|
+ before,
|
|
|
|
|
+ cleaned_rgb,
|
|
|
|
|
+ debug_root,
|
|
|
|
|
+ page_name,
|
|
|
|
|
+ settings=settings,
|
|
|
|
|
+ contrast_enhancement=contrast_enhancement,
|
|
|
|
|
+ removal_debug=removal_dbg,
|
|
|
|
|
+ )
|
|
|
|
|
+ out_pil = Image.fromarray(
|
|
|
|
|
+ cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
|
|
|
)
|
|
)
|
|
|
- out_pil = Image.fromarray(cleaned_gray).convert("RGB")
|
|
|
|
|
else:
|
|
else:
|
|
|
out_pil = pil_img
|
|
out_pil = pil_img
|
|
|
|
|
|
|
@@ -223,11 +451,27 @@ def process_document(
|
|
|
else:
|
|
else:
|
|
|
# 图片:有水印则去除后保存
|
|
# 图片:有水印则去除后保存
|
|
|
img_np = np.array(images[0]["img_pil"])
|
|
img_np = np.array(images[0]["img_pil"])
|
|
|
- cleaned_gray = remove_watermark_from_image(
|
|
|
|
|
- img_np, threshold=threshold,
|
|
|
|
|
- morph_close_kernel=morph_close_kernel, return_pil=False,
|
|
|
|
|
|
|
+ before = img_np.copy()
|
|
|
|
|
+ removal_dbg = {}
|
|
|
|
|
+ cleaned_rgb = _apply_image_watermark_removal(
|
|
|
|
|
+ img_np,
|
|
|
|
|
+ settings=settings,
|
|
|
|
|
+ contrast_enhancement=contrast_enhancement,
|
|
|
|
|
+ apply_watermark_removal=apply_watermark_removal,
|
|
|
|
|
+ removal_debug=removal_dbg,
|
|
|
)
|
|
)
|
|
|
- Image.fromarray(cleaned_gray, mode="L").save(str(output_path))
|
|
|
|
|
|
|
+ if save_debug:
|
|
|
|
|
+ _maybe_save_watermark_debug(
|
|
|
|
|
+ before,
|
|
|
|
|
+ cleaned_rgb,
|
|
|
|
|
+ debug_root,
|
|
|
|
|
+ input_path.stem,
|
|
|
|
|
+ settings=settings,
|
|
|
|
|
+ contrast_enhancement=contrast_enhancement,
|
|
|
|
|
+ removal_debug=removal_dbg,
|
|
|
|
|
+ )
|
|
|
|
|
+ out_rgb = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2RGB)
|
|
|
|
|
+ Image.fromarray(out_rgb).save(str(output_path))
|
|
|
|
|
|
|
|
logger.info(f"✅ 保存到: {output_path}")
|
|
logger.info(f"✅ 保存到: {output_path}")
|
|
|
return len(images)
|
|
return len(images)
|
|
@@ -235,13 +479,11 @@ def process_document(
|
|
|
|
|
|
|
|
def preview_page(
|
|
def preview_page(
|
|
|
input_path: Path,
|
|
input_path: Path,
|
|
|
|
|
+ settings: WatermarkToolSettings,
|
|
|
page_idx: int = 0,
|
|
page_idx: int = 0,
|
|
|
- threshold: int = 160,
|
|
|
|
|
- dpi: int = 200,
|
|
|
|
|
):
|
|
):
|
|
|
"""展示单页原图与去水印对比(需要 matplotlib)。支持 PDF 和图片文件。"""
|
|
"""展示单页原图与去水印对比(需要 matplotlib)。支持 PDF 和图片文件。"""
|
|
|
try:
|
|
try:
|
|
|
- import numpy as np
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.pyplot as plt
|
|
|
import matplotlib
|
|
import matplotlib
|
|
|
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
|
|
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']
|
|
@@ -259,7 +501,7 @@ def preview_page(
|
|
|
doc = fitz.open(str(input_path))
|
|
doc = fitz.open(str(input_path))
|
|
|
if page_idx >= len(doc):
|
|
if page_idx >= len(doc):
|
|
|
raise ValueError(f"页码 {page_idx} 超出范围(共 {len(doc)} 页)")
|
|
raise ValueError(f"页码 {page_idx} 超出范围(共 {len(doc)} 页)")
|
|
|
- mat = fitz.Matrix(dpi / 72, dpi / 72)
|
|
|
|
|
|
|
+ mat = fitz.Matrix(settings.dpi / 72, settings.dpi / 72)
|
|
|
page = doc[page_idx]
|
|
page = doc[page_idx]
|
|
|
pix = page.get_pixmap(matrix=mat, alpha=False)
|
|
pix = page.get_pixmap(matrix=mat, alpha=False)
|
|
|
img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
|
|
img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
|
|
@@ -271,56 +513,261 @@ def preview_page(
|
|
|
else:
|
|
else:
|
|
|
raise ValueError(f"不支持的文件格式: {suffix}")
|
|
raise ValueError(f"不支持的文件格式: {suffix}")
|
|
|
|
|
|
|
|
- cleaned = remove_watermark_from_image(img_np, threshold=threshold, return_pil=False)
|
|
|
|
|
|
|
+ contrast = _active_contrast_enhancement(settings)
|
|
|
|
|
+ cleaned_rgb = _apply_image_watermark_removal(
|
|
|
|
|
+ img_np,
|
|
|
|
|
+ settings=settings,
|
|
|
|
|
+ contrast_enhancement=contrast,
|
|
|
|
|
+ apply_watermark_removal=settings.watermark_enabled,
|
|
|
|
|
+ )
|
|
|
|
|
+ cleaned = cv2.cvtColor(cleaned_rgb, cv2.COLOR_BGR2GRAY)
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(1, 2, figsize=(20, 14))
|
|
fig, axes = plt.subplots(1, 2, figsize=(20, 14))
|
|
|
axes[0].imshow(img_np)
|
|
axes[0].imshow(img_np)
|
|
|
axes[0].set_title(title_orig, fontsize=14)
|
|
axes[0].set_title(title_orig, fontsize=14)
|
|
|
axes[0].axis('off')
|
|
axes[0].axis('off')
|
|
|
|
|
|
|
|
|
|
+ subtitle = f"method={settings.method}, threshold={settings.threshold}"
|
|
|
|
|
+ if contrast:
|
|
|
|
|
+ subtitle += f", contrast={contrast.get('method', 'on')}"
|
|
|
axes[1].imshow(cleaned, cmap='gray')
|
|
axes[1].imshow(cleaned, cmap='gray')
|
|
|
- axes[1].set_title(f"去水印后 threshold={threshold}", fontsize=14)
|
|
|
|
|
|
|
+ axes[1].set_title(f"去水印后 {subtitle}", fontsize=14)
|
|
|
axes[1].axis('off')
|
|
axes[1].axis('off')
|
|
|
|
|
|
|
|
plt.tight_layout()
|
|
plt.tight_layout()
|
|
|
plt.show()
|
|
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run_process_document(
    input_path: Path,
    output_path: Path,
    settings: WatermarkToolSettings,
    *,
    page_range: Optional[str] = None,
    force_image: bool = False,
    save_debug: bool = False,
    debug_output_dir: Optional[Path] = None,
) -> int:
    """Forward one file to ``process_document`` with keyword-only options.

    Returns whatever ``process_document`` returns for the given file.
    """
    options = {
        "page_range": page_range,
        "force_image": force_image,
        "save_debug": save_debug,
        "debug_output_dir": debug_output_dir,
    }
    return process_document(input_path, output_path, settings, **options)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def compare_watermark_methods(
    input_path: Path,
    output_dir: Path,
    settings: WatermarkToolSettings,
) -> Dict[str, str]:
    """Compare ``threshold`` and ``masked_adaptive`` on the same image.

    For each method the cleaned image and a JSON meta file are written to
    ``output_dir``; for ``masked_adaptive`` the mask debug layers are saved as
    well. Finally a side-by-side picture (original | threshold |
    masked_adaptive) is written.

    Args:
        input_path: Image file to process (opened with PIL).
        output_dir: Directory for all outputs; created when missing.
        settings: Base settings; ``method`` is overridden per run on a copy.

    Returns:
        Mapping from output label to the path of the written file.
    """
    from PIL import Image

    # Diagnostics copied from the removal debug dict into each meta file.
    debug_meta_keys = (
        "mask_mode",
        "direction_filter",
        "whiten_mode",
        "T_wm",
        "T_protect",
        "mode",
        "midtone_ratio",
        "wm_candidate_ratio",
        "geom_mask_ratio",
        "geom_candidate_ratio",
        "wm_mask_ratio",
        "white_pixel_ratio",
        "hough_kept_lines",
        "hough_diag_candidates",
        "hough_total_lines",
        "dominant_angles",
        "whiten_gray_low",
    )

    def _json_default(value: Any) -> Any:
        # The debug dict may hold NumPy scalars/arrays, which json cannot
        # encode natively; convert exactly those and reject anything else.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    stem = input_path.stem
    # Context manager releases the file handle as soon as pixels are copied.
    with Image.open(str(input_path)) as src:
        img_rgb = np.array(src.convert("RGB"))
    contrast = _active_contrast_enhancement(settings)

    paths: Dict[str, str] = {}
    results: Dict[str, np.ndarray] = {}

    for method in ("threshold", "masked_adaptive"):
        sub = copy.deepcopy(settings)
        sub.method = method
        dbg: Dict[str, Any] = {}
        out = _apply_image_watermark_removal(
            img_rgb,
            settings=sub,
            contrast_enhancement=contrast,
            removal_debug=dbg,
        )
        # NOTE(review): assumes the removal routine returns BGR channel order
        # despite its "_rgb" name — confirm against ocr_utils.watermark_utils.
        out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
        results[method] = out_rgb

        out_path = output_dir / f"{stem}_cleaned_{method}.png"
        Image.fromarray(out_rgb).save(str(out_path))
        paths[method] = str(out_path)

        meta: Dict[str, Any] = {"method": method, "threshold": settings.threshold}
        meta.update({key: dbg.get(key) for key in debug_meta_keys})
        meta_path = output_dir / f"{stem}_meta_{method}.json"
        meta_path.write_text(
            json.dumps(meta, ensure_ascii=False, indent=2, default=_json_default),
            encoding="utf-8",
        )
        paths[f"meta_{method}"] = str(meta_path)

        if method == "masked_adaptive":
            layer_paths = save_watermark_mask_debug_layers(
                img_rgb, output_dir, stem, dbg, image_format="png"
            )
            paths.update(layer_paths)

    # All three panels are scaled to the taller of the two results so that
    # np.hstack receives arrays of equal height.
    h = max(results["threshold"].shape[0], results["masked_adaptive"].shape[0])

    def _resize_rgb(arr: np.ndarray) -> np.ndarray:
        if arr.shape[0] == h:
            return arr
        scale = h / arr.shape[0]
        w = int(arr.shape[1] * scale)
        return cv2.resize(arr, (w, h))

    triple = np.hstack(
        [_resize_rgb(img_rgb)]
        + [_resize_rgb(results[m]) for m in ("threshold", "masked_adaptive")]
    )
    compare_path = output_dir / f"{stem}_compare_orig_threshold_masked.png"
    # cv2.imwrite expects BGR, while the panels are RGB.
    cv2.imwrite(
        str(compare_path),
        cv2.cvtColor(triple, cv2.COLOR_RGB2BGR),
    )
    paths["compare_triple"] = str(compare_path)
    logger.info(f"✅ 方法对比已保存: {compare_path}")
    return paths
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def main():
|
|
def main():
|
|
|
parser = argparse.ArgumentParser(
|
|
parser = argparse.ArgumentParser(
|
|
|
- description="银行流水水印去除工具",
|
|
|
|
|
|
|
+ description="银行流水水印去除工具(参数默认来自场景 YAML,与 main_v2 Pipeline 一致)",
|
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
epilog=__doc__,
|
|
epilog=__doc__,
|
|
|
)
|
|
)
|
|
|
parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录(批量模式)")
|
|
parser.add_argument("input", type=Path, help="输入 PDF / 图片文件或目录(批量模式)")
|
|
|
- parser.add_argument("-o", "--output", type=Path, default=None,
|
|
|
|
|
- help="输出路径(单文件模式;默认在原文件名后加 _cleaned)")
|
|
|
|
|
- parser.add_argument("--threshold", type=int, default=160,
|
|
|
|
|
- help="灰度阈值 (140-180),默认 160")
|
|
|
|
|
- parser.add_argument("--morph-kernel", type=int, default=2,
|
|
|
|
|
- help="形态学闭运算核大小,0 跳过,默认 2")
|
|
|
|
|
- parser.add_argument("--dpi", type=int, default=200,
|
|
|
|
|
- help="渲染 DPI,默认 200")
|
|
|
|
|
- parser.add_argument("--batch", action="store_true",
|
|
|
|
|
- help="批量模式:处理目录下所有 PDF 和图片")
|
|
|
|
|
- parser.add_argument("--preview", action="store_true",
|
|
|
|
|
- help="预览模式:展示单页对比图(不保存)")
|
|
|
|
|
- parser.add_argument("--page", type=int, default=0,
|
|
|
|
|
- help="预览页码(0-based),默认第 0 页")
|
|
|
|
|
- parser.add_argument("--page-range", type=str, default=None,
|
|
|
|
|
- help="处理页面范围,如 '1-3,5,7-9'(从 1 开始,仅对 PDF 有效)")
|
|
|
|
|
- parser.add_argument("--force-image", action="store_true",
|
|
|
|
|
- help="强制对文字型 PDF 使用图像化处理(会失去可搜索性,适用于 XObject 方法无法去除的内联水印)")
|
|
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "-c",
|
|
|
|
|
+ "--config",
|
|
|
|
|
+ type=Path,
|
|
|
|
|
+ default=_DEFAULT_CONFIG_PATH,
|
|
|
|
|
+ help=f"场景配置文件,读取 preprocessor.watermark_removal(默认: {_DEFAULT_CONFIG_PATH.name})",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "-o",
|
|
|
|
|
+ "--output",
|
|
|
|
|
+ type=Path,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="输出路径(单文件模式;默认在原文件名后加 _cleaned)",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument("--batch", action="store_true", help="批量处理目录下所有 PDF 和图片")
|
|
|
|
|
+ parser.add_argument("--preview", action="store_true", help="预览模式:展示单页对比图(不保存)")
|
|
|
|
|
+ parser.add_argument("--page", type=int, default=0, help="预览页码(0-based)")
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--page-range",
|
|
|
|
|
+ type=str,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="PDF 页面范围,如 '1-3,5,7-9'(从 1 开始)",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--force-image",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="文字型 PDF 强制走图像去水印(失去可搜索性)",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--debug",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="保存调试图到 debug_comparison/watermark_removal/",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--debug-dir",
|
|
|
|
|
+ type=Path,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="调试图根目录(默认 -o 的父目录;格式见配置文件 debug_options.image_format)",
|
|
|
|
|
+ )
|
|
|
|
|
+ # 以下为覆盖配置文件的少量旋钮(未指定则完全使用 YAML)
|
|
|
|
|
+ override = parser.add_argument_group("覆盖配置文件(可选)")
|
|
|
|
|
+ override.add_argument(
|
|
|
|
|
+ "--threshold",
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="覆盖 watermark_removal.threshold(140-180)",
|
|
|
|
|
+ )
|
|
|
|
|
+ override.add_argument(
|
|
|
|
|
+ "--morph-kernel",
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="覆盖 watermark_removal.morph_close_kernel",
|
|
|
|
|
+ )
|
|
|
|
|
+ override.add_argument("--dpi", type=int, default=None, help="覆盖 input.dpi")
|
|
|
|
|
+ override.add_argument("--no-contrast", action="store_true", help="关闭 contrast_enhancement")
|
|
|
|
|
+ override.add_argument(
|
|
|
|
|
+ "--text-black-target",
|
|
|
|
|
+ type=int,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ help="覆盖 contrast_enhancement.text_black_target(text_restore)",
|
|
|
|
|
+ )
|
|
|
|
|
+ override.add_argument(
|
|
|
|
|
+ "--method",
|
|
|
|
|
+ type=str,
|
|
|
|
|
+ default=None,
|
|
|
|
|
+ choices=["threshold", "masked", "masked_adaptive"],
|
|
|
|
|
+ help="覆盖 watermark_removal.method",
|
|
|
|
|
+ )
|
|
|
|
|
+ parser.add_argument(
|
|
|
|
|
+ "--compare-methods",
|
|
|
|
|
+ action="store_true",
|
|
|
|
|
+ help="对比 threshold 与 masked_adaptive,输出三联图到 -o 目录",
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
args = parser.parse_args()
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
- if args.preview:
|
|
|
|
|
- preview_page(
|
|
|
|
|
- args.input,
|
|
|
|
|
- page_idx=args.page,
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ settings = resolve_watermark_settings(
|
|
|
|
|
+ args.config,
|
|
|
threshold=args.threshold,
|
|
threshold=args.threshold,
|
|
|
|
|
+ morph_close_kernel=args.morph_kernel,
|
|
|
dpi=args.dpi,
|
|
dpi=args.dpi,
|
|
|
|
|
+ no_contrast=args.no_contrast,
|
|
|
|
|
+ text_black_target=args.text_black_target,
|
|
|
|
|
+ method=args.method,
|
|
|
)
|
|
)
|
|
|
|
|
+ except FileNotFoundError as e:
|
|
|
|
|
+ logger.error(str(e))
|
|
|
|
|
+ sys.exit(1)
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ f"📋 配置: {args.config} | method={settings.method} | "
|
|
|
|
|
+ f"threshold={settings.threshold} | morph_kernel={settings.morph_close_kernel} | "
|
|
|
|
|
+ f"dpi={settings.dpi} | contrast={settings.contrast_enhancement}"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if args.compare_methods:
|
|
|
|
|
+ input_path = args.input
|
|
|
|
|
+ if not input_path.is_file():
|
|
|
|
|
+ logger.error(f"文件不存在: {input_path}")
|
|
|
|
|
+ sys.exit(1)
|
|
|
|
|
+ out_dir = args.output or (
|
|
|
|
|
+ input_path.parent / "debug_comparison" / "watermark_method_compare"
|
|
|
|
|
+ )
|
|
|
|
|
+ paths = compare_watermark_methods(input_path, out_dir, settings)
|
|
|
|
|
+ for k, v in paths.items():
|
|
|
|
|
+ logger.info(f" {k}: {v}")
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
|
|
+ if args.preview:
|
|
|
|
|
+ preview_page(args.input, settings, page_idx=args.page)
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
if args.batch:
|
|
if args.batch:
|
|
@@ -345,7 +792,15 @@ def main():
|
|
|
for file in all_files:
|
|
for file in all_files:
|
|
|
out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
|
|
out_file = out_dir / f"{file.stem}_cleaned{file.suffix}"
|
|
|
try:
|
|
try:
|
|
|
- process_document(file, out_file, args.threshold, args.morph_kernel, args.dpi, args.page_range, args.force_image)
|
|
|
|
|
|
|
+ _run_process_document(
|
|
|
|
|
+ file,
|
|
|
|
|
+ out_file,
|
|
|
|
|
+ settings,
|
|
|
|
|
+ page_range=args.page_range,
|
|
|
|
|
+ force_image=args.force_image,
|
|
|
|
|
+ save_debug=args.debug,
|
|
|
|
|
+ debug_output_dir=args.debug_dir or out_dir,
|
|
|
|
|
+ )
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"❌ 处理失败 {file.name}: {e}")
|
|
logger.error(f"❌ 处理失败 {file.name}: {e}")
|
|
|
logger.info(f"✅ 批量处理完成,共 {len(all_files)} 个文件 -> {out_dir}")
|
|
logger.info(f"✅ 批量处理完成,共 {len(all_files)} 个文件 -> {out_dir}")
|
|
@@ -360,7 +815,15 @@ def main():
|
|
|
)
|
|
)
|
|
|
suffix = input_path.suffix.lower()
|
|
suffix = input_path.suffix.lower()
|
|
|
if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
|
|
if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
|
|
|
- process_document(input_path, output_path, args.threshold, args.morph_kernel, args.dpi, args.page_range, args.force_image)
|
|
|
|
|
|
|
+ _run_process_document(
|
|
|
|
|
+ input_path,
|
|
|
|
|
+ output_path,
|
|
|
|
|
+ settings,
|
|
|
|
|
+ page_range=args.page_range,
|
|
|
|
|
+ force_image=args.force_image,
|
|
|
|
|
+ save_debug=args.debug,
|
|
|
|
|
+ debug_output_dir=args.debug_dir or output_path.parent,
|
|
|
|
|
+ )
|
|
|
else:
|
|
else:
|
|
|
logger.error(f"不支持的文件格式: {suffix},支持 PDF 和 {IMAGE_SUFFIXES}")
|
|
logger.error(f"不支持的文件格式: {suffix},支持 PDF 和 {IMAGE_SUFFIXES}")
|
|
|
sys.exit(1)
|
|
sys.exit(1)
|
|
@@ -379,14 +842,15 @@ if __name__ == "__main__":
|
|
|
# 文字PDF测试
|
|
# 文字PDF测试
|
|
|
# "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
|
|
# "input": "/Users/zhch158/workspace/data/流水分析/提取自赤峰黄金2023年报.pdf",
|
|
|
# "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
|
|
# "input": "/Users/zhch158/workspace/data/测试文字PDF-水印.pdf",
|
|
|
- "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台(ocr_platform)-交易流水识别,财报识别.pdf",
|
|
|
|
|
|
|
+ # "input": "/Users/zhch158/workspace/data/非结构化文档识别统一平台(ocr_platform)-交易流水识别,财报识别.pdf",
|
|
|
|
|
+ "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/彭_广东兴宁农村商业银行/彭_广东兴宁农村商业银行_page_002.png",
|
|
|
# "output": "./output/杨万益_福建农信",
|
|
# "output": "./output/杨万益_福建农信",
|
|
|
# 页面范围(可选,支持 "1-5,7" 语法,仅对 PDF 有效)
|
|
# 页面范围(可选,支持 "1-5,7" 语法,仅对 PDF 有效)
|
|
|
# "page_range": "3", # 仅处理第 1 页(对应 --page-range 参数)
|
|
# "page_range": "3", # 仅处理第 1 页(对应 --page-range 参数)
|
|
|
- "dpi": 200,
|
|
|
|
|
- "threshold": 160,
|
|
|
|
|
- "morph_kernel": 0, # 遮罩替换模式下不需要闭运算
|
|
|
|
|
- # "preview": True,
|
|
|
|
|
|
|
+ "config": str(_DEFAULT_CONFIG_PATH),
|
|
|
|
|
+ "preview": True,
|
|
|
|
|
+ "debug": True,
|
|
|
|
|
+ "compare-methods": True,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
# 构造参数(注意 input 是位置参数,morph_kernel 对应 --morph-kernel)
|
|
# 构造参数(注意 input 是位置参数,morph_kernel 对应 --morph-kernel)
|