Sfoglia il codice sorgente

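fix(优化二次OCR配置): 将二次OCR逻辑参数硬编码,移除YAML配置覆盖;suspicious_short_min_chars 固定为 4(不再从 YAML 读取);优化光照增强配置:upscale_min_side 优先读取 cell_preprocess.upscale_min_side,其次读取 cell_preprocess.light.upscale_min_side,默认值由 192 改为 128;enhance_retry 仅从 cell_preprocess 读取,且 enabled 默认值由 True 改为 False;提升OCR处理的准确性与一致性。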

zhch158_admin 2 giorni fa
parent
commit
dbc00409e6

+ 18 - 28
ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py

@@ -47,39 +47,29 @@ class TextFiller:
         if not isinstance(sp_cfg, dict):
             sp_cfg = {}
         self.second_pass_line_min_score: float = float(sp_cfg.get("line_min_score", 0.8))
-        self.second_pass_drop_low: bool = bool(sp_cfg.get("drop_low_score_blocks", True))
-        self.second_pass_whole_fallback: bool = bool(sp_cfg.get("whole_cell_fallback", True))
-        self.second_pass_prefer_whole_on_tie: bool = bool(
-            sp_cfg.get("prefer_whole_on_tie", True)
-        )
         self.second_pass_reocr_mode: str = str(sp_cfg.get("reocr_mode", "default"))
-        self.second_pass_header_row: int = int(sp_cfg.get("header_row", 0))
-        self.second_pass_strip_aspect: float = float(
-            sp_cfg.get("strip_fallback_aspect_ratio", 1.8)
-        )
-        self.second_pass_whole_longer_extra: int = int(
-            sp_cfg.get("whole_longer_min_extra_chars", 2)
-        )
-        self.second_pass_row_peer_min_nonempty: int = int(
-            sp_cfg.get("row_peer_min_nonempty", 5)
-        )
-        _short_min = sp_cfg.get("suspicious_short_min_chars")
+        # 以下逻辑参数不再从 YAML 暴露,与 bank_statement 场景固定
+        self.second_pass_drop_low: bool = True
+        self.second_pass_whole_fallback: bool = True
+        self.second_pass_prefer_whole_on_tie: bool = True
+        self.second_pass_header_row: int = 0
+        self.second_pass_strip_aspect: float = 1.8
+        self.second_pass_whole_longer_extra: int = 2
+        self.second_pass_row_peer_min_nonempty: int = 5
+        self.second_pass_suspicious_short_min_chars: int = 4
         cpp = sp_cfg.get("cell_preprocess") or {}
         if not isinstance(cpp, dict):
             cpp = {}
-        light = cpp.get("light") or {}
-        if not isinstance(light, dict):
-            light = {}
-        self.second_pass_light_upscale_min: int = int(
-            light.get("upscale_min_side", 192)
-        )
-        er = sp_cfg.get("enhance_retry") or cpp.get("enhance_retry") or {}
+        upscale = cpp.get("upscale_min_side")
+        if upscale is None:
+            light = cpp.get("light") or {}
+            if isinstance(light, dict):
+                upscale = light.get("upscale_min_side")
+        self.second_pass_light_upscale_min: int = int(upscale if upscale is not None else 128)
+        er = cpp.get("enhance_retry") or {}
         if not isinstance(er, dict):
             er = {}
-        if _short_min is None:
-            _short_min = er.get("min_chars", 4)
-        self.second_pass_suspicious_short_min_chars: int = int(_short_min)
-        self.second_pass_enhance_retry_enabled: bool = bool(er.get("enabled", True))
+        self.second_pass_enhance_retry_enabled: bool = bool(er.get("enabled", False))
         self.second_pass_enhance_score_below: float = float(
             er.get("score_below", 0.90)
         )
@@ -508,7 +498,7 @@ class TextFiller:
                 stages.append("contrast")
         elif mode == "enhance":
             contrast_cfg = self.second_pass_enhance_contrast
-            if self._cell_contrast_cfg.get("enabled", False):
+            if not contrast_cfg.get("method") and self._cell_contrast_cfg.get("enabled", False):
                 contrast_cfg = self._cell_contrast_cfg
             if contrast_cfg.get("enabled", False) and "wm" in stages:
                 img = self._apply_cell_contrast(

+ 3 - 2
ocr_tools/universal_doc_parser/tests/test_second_pass_ocr_aggregate.py

@@ -95,12 +95,13 @@ class TestShouldRunWholeFallback:
 
 
 class TestCellPreprocessConfig:
-    def test_suspicious_short_from_top_level(self):
+    def test_suspicious_short_hardcoded(self):
+        """逻辑参数已硬编码,YAML 不再覆盖 suspicious_short_min_chars。"""
         f = TextFiller(
             ocr_engine=None,
             config={"second_pass_ocr": {"suspicious_short_min_chars": 6}},
         )
-        assert f.second_pass_suspicious_short_min_chars == 6
+        assert f.second_pass_suspicious_short_min_chars == 4
 
     def test_light_contrast_stage_when_enabled(self):
         import numpy as np