|
|
@@ -18,7 +18,7 @@ if str(ocr_platform_root) not in sys.path:
|
|
|
|
|
|
from .base import BasePreprocessor, BaseLayoutDetector, BaseVLRecognizer, BaseOCRRecognizer
|
|
|
from ocr_utils.coordinate_utils import CoordinateUtils
|
|
|
-from ocr_utils.watermark_utils import remove_watermark_from_image_rgb
|
|
|
+from ocr_utils.watermark import WatermarkProcessor
|
|
|
|
|
|
# 导入MinerU组件
|
|
|
try:
|
|
|
@@ -41,6 +41,11 @@ class MinerUPreprocessor(BasePreprocessor):
|
|
|
|
|
|
self.atom_model_manager = AtomModelSingleton()
|
|
|
self.orientation_classifier = None
|
|
|
+ wm_user = config.get("watermark_removal") or {}
|
|
|
+ self._wm_processor = WatermarkProcessor.from_user_config(
|
|
|
+ wm_user if isinstance(wm_user, dict) else {},
|
|
|
+ scope="page",
|
|
|
+ )
|
|
|
|
|
|
def initialize(self):
|
|
|
"""初始化预处理组件"""
|
|
|
@@ -63,46 +68,37 @@ class MinerUPreprocessor(BasePreprocessor):
|
|
|
if isinstance(image, Image.Image):
|
|
|
image = np.array(image)
|
|
|
|
|
|
- watermark_cfg = self.config.get('watermark_removal', {})
|
|
|
- wm_enabled = bool(watermark_cfg.get('enabled', False))
|
|
|
- # 对比度增强只有在水印去除之后才能生效
|
|
|
- contrast_cfg = watermark_cfg.get('contrast_enhancement', {})
|
|
|
- contrast_enabled = bool(
|
|
|
- contrast_cfg.get('enabled', False) if isinstance(contrast_cfg, dict) else False
|
|
|
- )
|
|
|
+ if not self._wm_processor.enabled:
|
|
|
+ return image
|
|
|
|
|
|
- if not wm_enabled:
|
|
|
+ page_name = getattr(self, "page_name", None) or "?"
|
|
|
+ if not self._wm_processor.should_apply(image):
|
|
|
+ logger.info(
|
|
|
+ f"未检测到水印,跳过去水印 (page={page_name}, detect_before_remove=true)"
|
|
|
+ )
|
|
|
return image
|
|
|
|
|
|
- threshold = watermark_cfg.get('threshold', 175)
|
|
|
- morph_close_kernel = watermark_cfg.get('morph_close_kernel', 0)
|
|
|
before_image = image.copy()
|
|
|
try:
|
|
|
- cleaned = remove_watermark_from_image_rgb(
|
|
|
- image,
|
|
|
- threshold=threshold,
|
|
|
- morph_close_kernel=morph_close_kernel,
|
|
|
- return_pil=False,
|
|
|
- contrast_enhancement=contrast_cfg if isinstance(contrast_cfg, dict) else None,
|
|
|
- apply_watermark_removal=wm_enabled,
|
|
|
- watermark_removal_cfg=watermark_cfg,
|
|
|
- )
|
|
|
- if wm_enabled:
|
|
|
- method = watermark_cfg.get("method", "threshold")
|
|
|
+ cleaned, stages = self._wm_processor.process(image)
|
|
|
+ if "wm" in stages:
|
|
|
logger.info(
|
|
|
- f"🧹 Watermark removed (method={method}, threshold={threshold})"
|
|
|
+ f"🧹 Watermark removed (method={self._wm_processor.method}, "
|
|
|
+ f"threshold={self._wm_processor.threshold})"
|
|
|
)
|
|
|
- if contrast_enabled:
|
|
|
- method = contrast_cfg.get('method', 'clahe') if isinstance(contrast_cfg, dict) else 'clahe'
|
|
|
+ if "contrast" in stages:
|
|
|
+ ce = self._wm_processor.config.get("contrast_enhancement") or {}
|
|
|
+ method = ce.get("method", "clahe") if isinstance(ce, dict) else "clahe"
|
|
|
logger.info(f"📈 Contrast enhanced (method={method})")
|
|
|
if self._is_watermark_debug_enabled():
|
|
|
try:
|
|
|
+ ce = self._wm_processor.contrast_config()
|
|
|
self._save_watermark_debug_images(
|
|
|
before_image,
|
|
|
np.array(cleaned),
|
|
|
- threshold,
|
|
|
- morph_close_kernel,
|
|
|
- contrast_cfg if isinstance(contrast_cfg, dict) else None,
|
|
|
+ self._wm_processor.threshold,
|
|
|
+ self._wm_processor.morph_close_kernel,
|
|
|
+ ce,
|
|
|
)
|
|
|
except Exception as dbg_e:
|
|
|
logger.warning(f"⚠️ Watermark debug save failed: {dbg_e}")
|