瀏覽代碼

feat(优化水印处理与OCR逻辑): 重构MinerUPreprocessor类以整合WatermarkProcessor,简化水印去除流程并增强对比度调整功能,同时更新MinerUWiredTableRecognizer类以支持更灵活的单元格OCR处理,提升整体OCR准确性与灵活性。

zhch158_admin 4 天之前
父節點
當前提交
9dd99bce76

+ 24 - 28
ocr_tools/universal_doc_parser/models/adapters/mineru_adapter.py

@@ -18,7 +18,7 @@ if str(ocr_platform_root) not in sys.path:
 
 from .base import BasePreprocessor, BaseLayoutDetector, BaseVLRecognizer, BaseOCRRecognizer
 from ocr_utils.coordinate_utils import CoordinateUtils
-from ocr_utils.watermark_utils import remove_watermark_from_image_rgb
+from ocr_utils.watermark import WatermarkProcessor
 
 # 导入MinerU组件
 try:
@@ -41,6 +41,11 @@ class MinerUPreprocessor(BasePreprocessor):
             
         self.atom_model_manager = AtomModelSingleton()
         self.orientation_classifier = None
+        wm_user = config.get("watermark_removal") or {}
+        self._wm_processor = WatermarkProcessor.from_user_config(
+            wm_user if isinstance(wm_user, dict) else {},
+            scope="page",
+        )
         
     def initialize(self):
         """初始化预处理组件"""
@@ -63,46 +68,37 @@ class MinerUPreprocessor(BasePreprocessor):
         if isinstance(image, Image.Image):
             image = np.array(image)
 
-        watermark_cfg = self.config.get('watermark_removal', {})
-        wm_enabled = bool(watermark_cfg.get('enabled', False))
-        # 对比度增强只有在水印去除之后才能生效
-        contrast_cfg = watermark_cfg.get('contrast_enhancement', {})
-        contrast_enabled = bool(
-            contrast_cfg.get('enabled', False) if isinstance(contrast_cfg, dict) else False
-        )
+        if not self._wm_processor.enabled:
+            return image
 
-        if not wm_enabled:
+        page_name = getattr(self, "page_name", None) or "?"
+        if not self._wm_processor.should_apply(image):
+            logger.info(
+                f"未检测到水印,跳过去水印 (page={page_name}, detect_before_remove=true)"
+            )
             return image
 
-        threshold = watermark_cfg.get('threshold', 175)
-        morph_close_kernel = watermark_cfg.get('morph_close_kernel', 0)
         before_image = image.copy()
         try:
-            cleaned = remove_watermark_from_image_rgb(
-                image,
-                threshold=threshold,
-                morph_close_kernel=morph_close_kernel,
-                return_pil=False,
-                contrast_enhancement=contrast_cfg if isinstance(contrast_cfg, dict) else None,
-                apply_watermark_removal=wm_enabled,
-                watermark_removal_cfg=watermark_cfg,
-            )
-            if wm_enabled:
-                method = watermark_cfg.get("method", "threshold")
+            cleaned, stages = self._wm_processor.process(image)
+            if "wm" in stages:
                 logger.info(
-                    f"🧹 Watermark removed (method={method}, threshold={threshold})"
+                    f"🧹 Watermark removed (method={self._wm_processor.method}, "
+                    f"threshold={self._wm_processor.threshold})"
                 )
-            if contrast_enabled:
-                method = contrast_cfg.get('method', 'clahe') if isinstance(contrast_cfg, dict) else 'clahe'
+            if "contrast" in stages:
+                ce = self._wm_processor.config.get("contrast_enhancement") or {}
+                method = ce.get("method", "clahe") if isinstance(ce, dict) else "clahe"
                 logger.info(f"📈 Contrast enhanced (method={method})")
             if self._is_watermark_debug_enabled():
                 try:
+                    ce = self._wm_processor.contrast_config()
                     self._save_watermark_debug_images(
                         before_image,
                         np.array(cleaned),
-                        threshold,
-                        morph_close_kernel,
-                        contrast_cfg if isinstance(contrast_cfg, dict) else None,
+                        self._wm_processor.threshold,
+                        self._wm_processor.morph_close_kernel,
+                        ce,
                     )
                 except Exception as dbg_e:
                     logger.warning(f"⚠️ Watermark debug save failed: {dbg_e}")

+ 3 - 4
ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py

@@ -464,9 +464,7 @@ class MinerUWiredTableRecognizer:
             bboxes_merged = [cell["bbox"] for cell in merged_cells]
             texts, scores, matched_boxes_list, need_reocr_indices = self.text_filler.fill_text_by_center_point(bboxes_merged, ocr_boxes or [])
             
-            # Step 4.5: 二次 OCR 修正 (强制全量 OCR)
-            # 策略调整:默认对所有单元格进行 Cropped OCR,以解决 Header 误合并和文本分配错误问题。
-            # Full-page OCR 结果仅作为 Fallback(在 text_filling.py 中逻辑是: 如果 Cropped OCR 结果为空或低分,才保留原值)
+            # Step 4.5: 二次 OCR(银行流水:表体空单元必跑 + 低分/跨格;可选笔画增强重试)
             if hasattr(self, 'ocr_engine') and self.ocr_engine:
                 cell_ocr_dir = None
                 if debug_root is not None:
@@ -475,9 +473,10 @@ class MinerUWiredTableRecognizer:
                     table_image, bboxes_merged, texts, scores,
                     need_reocr_indices=need_reocr_indices,
                     pdf_type=pdf_type,
-                    force_all=False,  # Force Per-Cell OCR
+                    force_all=False,
                     output_dir=cell_ocr_dir,
                     debug_prefix=dbg.prefix or None,
+                    merged_cells=merged_cells,
                 )
 
             for i, cell in enumerate(merged_cells):