Quellcode durchsuchen

feat(新增二次OCR处理与测试用例): 在test_second_pass_ocr_aggregate.py中新增多个测试类和用例,验证整体OCR处理逻辑,包括短文本高分触发整体OCR和空行触发逻辑,增强对银行对账单的二次OCR触发条件的测试,提升OCR处理的准确性和可靠性。

zhch158_admin vor 4 Tagen
Ursprung
Commit
b68a0e5003

+ 73 - 0
ocr_tools/universal_doc_parser/tests/test_second_pass_ocr_aggregate.py

@@ -65,6 +65,43 @@ class TestPickLineVsWhole:
         assert strat == "whole"
 
 
+class TestShouldRunWholeFallback:
+    """Checks on ``TextFiller._should_run_whole_fallback``."""
+
+    def _filler(self) -> TextFiller:
+        """Return a ``TextFiller`` built without an OCR engine and with
+        ``second_pass_ocr.whole_cell_fallback`` switched on."""
+        return TextFiller(
+            ocr_engine=None,
+            config={
+                "second_pass_ocr": {
+                    "whole_cell_fallback": True,
+                    # The text used in the first test below has 2 characters,
+                    # fewer than this value.
+                    # NOTE(review): presumably this is the short-text
+                    # threshold the fallback check reads - confirm in TextFiller.
+                    "enhance_retry": {"min_chars": 4},
+                }
+            },
+        )
+
+    def test_high_score_short_text_triggers_whole(self):
+        """A two-character text with score 0.99 must yield a truthy result."""
+        f = self._filler()
+        import numpy as np
+
+        # 40x120x3 uint8 array in which every element is 255.
+        cell = np.ones((40, 120, 3), dtype=np.uint8) * 255
+        # NOTE(review): positional arguments look like (text, score, cell
+        # image, per-line (text, score) pairs, threshold) - confirm against
+        # the method signature.
+        assert f._should_run_whole_fallback(
+            "取款", 0.99, cell, [("取款", 0.99)], 0.9
+        )
+
+    def test_empty_line_triggers_whole(self):
+        """An empty text with score 0.0 and no line results must yield a
+        truthy result."""
+        f = self._filler()
+        import numpy as np
+
+        # 40x80x3 uint8 array in which every element is 255.
+        cell = np.ones((40, 80, 3), dtype=np.uint8) * 255
+        assert f._should_run_whole_fallback("", 0.0, cell, [], 0.9)
+
+
+class TestPickBetterOcrResult:
+    """Checks on ``TextFiller._pick_better_ocr_result``."""
+
+    def test_reject_invalid_pass2_score(self):
+        """The first-pass result object itself must be returned when the
+        second-pass result carries ``final_score`` 44.5 and ``accepted`` False."""
+        pass1 = {"final_text": "取款", "final_score": 0.99, "accepted": True}
+        # NOTE(review): 44.5 is presumably "invalid" because scores are
+        # expected to lie in [0, 1] - confirm against the implementation.
+        pass2 = {"final_text": "14.089", "final_score": 44.5, "accepted": False}
+        # Called on the class, not on an instance.
+        chosen = TextFiller._pick_better_ocr_result(pass1, pass2)
+        # Identity check: the very same dict object must come back, not a copy.
+        assert chosen is pass1
+
+
 class TestSanitizeDebugFilename:
     def test_illegal_chars(self):
         assert TextFiller.sanitize_debug_filename("a/b:c") == "a_b_c"
@@ -128,6 +165,42 @@ class TestStripFallbackHeuristic:
         assert score == 0.0
 
 
+class TestBankStatementReocrTrigger:
+    """Checks on ``TextFiller._should_second_pass_cell`` when
+    ``second_pass_ocr.reocr_mode`` is ``"bank_statement"``."""
+
+    def _filler(self) -> TextFiller:
+        """Return a ``TextFiller`` built without an OCR engine, configured with
+        ``reocr_mode="bank_statement"``, ``header_row=0`` and
+        ``row_peer_min_nonempty=3``."""
+        return TextFiller(
+            ocr_engine=None,
+            config={
+                "second_pass_ocr": {
+                    "reocr_mode": "bank_statement",
+                    "header_row": 0,
+                    "row_peer_min_nonempty": 3,
+                }
+            },
+        )
+
+    def test_body_row_empty_triggers(self):
+        """An empty cell in row 1 must return ``ok is True`` with
+        ``"body_row_empty"`` among the reasons."""
+        f = self._filler()
+        # Two cells in column 0: row 0 (the configured header_row) and row 1.
+        merged = [
+            {"row": 0, "col": 0, "bbox": [0, 0, 10, 10]},
+            {"row": 1, "col": 0, "bbox": [0, 10, 10, 20]},
+        ]
+        # texts/scores have one entry per cell in ``merged``; the row-1 cell is empty.
+        texts = ["header", ""]
+        scores = [0.99, 0.0]
+        # NOTE(review): the first argument (1) looks like the index of the cell
+        # under test; the meaning of the trailing [], "ocr", False, 0 arguments
+        # is not visible here - confirm against the method signature.
+        ok, reasons = f._should_second_pass_cell(
+            1, texts, scores, [], merged, "ocr", False, 0
+        )
+        assert ok is True
+        assert "body_row_empty" in reasons
+
+    def test_header_empty_not_body_row_forced(self):
+        """An empty cell in row 0 (the configured ``header_row``) must not
+        report the ``"body_row_empty"`` reason."""
+        f = self._filler()
+        merged = [{"row": 0, "col": 0, "bbox": [0, 0, 10, 10]}]
+        ok, reasons = f._should_second_pass_cell(
+            0, [""], [0.99], [], merged, "ocr", False, 0
+        )
+        # Only the reasons are checked; ``ok`` is unpacked but never asserted,
+        # so the cell may still be selected for a second pass on other grounds.
+        assert "body_row_empty" not in reasons
+
+
 class TestResolveCellMatchedBoxes:
     """空大框套小框:避免仅用碎片字触发高置信填格。"""