Selaa lähdekoodia

feat(增强二次OCR处理与单元格预处理功能): 在test_second_pass_ocr_aggregate.py中新增测试类和用例,验证短文本最小字符配置、单元格预处理的对比度调整及水印处理逻辑,提升OCR处理的准确性与灵活性。

zhch158_admin 3 päivää sitten
vanhempi
commit
a2311846f1

+ 56 - 1
ocr_tools/universal_doc_parser/tests/test_second_pass_ocr_aggregate.py

@@ -72,7 +72,7 @@ class TestShouldRunWholeFallback:
             config={
                 "second_pass_ocr": {
                     "whole_cell_fallback": True,
-                    "enhance_retry": {"min_chars": 4},
+                    "suspicious_short_min_chars": 4,
                 }
             },
         )
@@ -94,6 +94,61 @@ class TestShouldRunWholeFallback:
         assert f._should_run_whole_fallback("", 0.0, cell, [], 0.9)
 
 
+class TestCellPreprocessConfig:
+    def test_suspicious_short_from_top_level(self):
+        f = TextFiller(
+            ocr_engine=None,
+            config={"second_pass_ocr": {"suspicious_short_min_chars": 6}},
+        )
+        assert f.second_pass_suspicious_short_min_chars == 6
+
+    def test_light_contrast_stage_when_enabled(self):
+        import numpy as np
+
+        f = TextFiller(
+            ocr_engine=None,
+            config={
+                "second_pass_ocr": {
+                    "cell_preprocess": {
+                        "watermark": {"enabled": True, "method": "threshold"},
+                        "contrast": {
+                            "enabled": True,
+                            "method": "text_restore",
+                            "text_black_target": 88,
+                        },
+                    }
+                }
+            },
+        )
+        cell = np.ones((40, 80, 3), dtype=np.uint8) * 200
+        _, stages = f._preprocess_cell_for_ocr(cell, mode="light")
+        assert "wm" in stages
+        assert "contrast" in stages
+
+
+class TestWholeCellParse:
+    def test_parse_det_rec_item_uses_rec_not_box(self):
+        item = [
+            [[146.0, 15.0], [199.0, 15.0], [199.0, 85.0], [146.0, 85.0]],
+            ("/", 0.9213118553161621),
+        ]
+        t, s = TextFiller._parse_det_rec_item(item)
+        assert t == "/"
+        assert abs(s - 0.9213118553161621) < 1e-6
+
+    def test_normalize_rec_score_percent(self):
+        assert abs(TextFiller._normalize_rec_score(92.5) - 0.925) < 1e-6
+        assert TextFiller._normalize_rec_score(0.921) == 0.921
+        assert TextFiller._normalize_rec_score(999) == 0.0
+
+    def test_pick_line_when_whole_score_invalid(self):
+        f = TextFiller(ocr_engine=None, config={"second_pass_ocr": {}})
+        t, s, strat = f._pick_line_vs_whole("/", 0.92, "146.0199.0146.0/", 999.0)
+        assert t == "/"
+        assert strat == "lines"
+        assert abs(s - 0.92) < 1e-6
+
+
 class TestPickBetterOcrResult:
     def test_reject_invalid_pass2_score(self):
         pass1 = {"final_text": "取款", "final_score": 0.99, "accepted": True}