Procházet zdrojové kódy

feat(新增单元格匹配框处理单元测试): 在test_second_pass_ocr_aggregate.py中新增TestResolveCellMatchedBoxes类,包含多个测试用例以验证单元格匹配框的处理逻辑,确保在不同情况下的文本填充和分数计算的准确性,提升OCR处理的可靠性和可维护性。

zhch158_admin před 5 dny
rodič
revize
e2bb737026

+ 69 - 0
ocr_tools/universal_doc_parser/tests/test_second_pass_ocr_aggregate.py

@@ -126,3 +126,72 @@ class TestStripFallbackHeuristic:
         text, score = TextFiller._parse_single_rec_item(("", 1.0))
         assert text == ""
         assert score == 0.0
+
+
+class TestResolveCellMatchedBoxes:
+    """Tests for an empty large box paired with a small box.
+
+    Goal: avoid letting a lone fragment character trigger a
+    high-confidence cell fill.
+    """
+
+    def _matched_entry(self, text, bbox, score=1.0):
+        """Build one six-element matched-box tuple for the tests.
+
+        The tuple is (text, bbox[1], bbox[0], 1.0, score, box_dict), where
+        box_dict repeats the bbox, text and score under the keys shown below.
+        """
+        # NOTE(review): bbox is presumably [x1, y1, x2, y2], which would make
+        # positions 1 and 2 the top (y) and left (x) coordinates; the meaning
+        # of the constant 1.0 in position 3 is not visible here — confirm
+        # against TextFiller._resolve_cell_matched_boxes.
+        return (
+            text,
+            bbox[1],
+            bbox[0],
+            1.0,
+            score,
+            {
+                "bbox": bbox,
+                "original_bbox": bbox,
+                "text": text,
+                "confidence": score,
+            },
+        )
+
+    def test_empty_outer_with_inner_char_forces_zero_score(self):
+        """Empty-text large box plus a one-character small box.
+
+        Expects force_zero to be True and only the empty-text entry to remain.
+        """
+        # Similar to cell 196: large empty box [877,1593,1084,1671] + small "部" box [966,1644,998,1676]
+        outer = [877.0, 1593.0, 1084.0, 1671.0]
+        inner = [966.0, 1644.0, 998.0, 1676.0]
+        matched = [
+            self._matched_entry("", outer, 1.0),
+            self._matched_entry("部", inner, 0.994),
+        ]
+        resolved, force_zero = TextFiller._resolve_cell_matched_boxes(matched)
+        assert force_zero is True
+        assert len(resolved) == 1
+        assert resolved[0][0] == ""
+        # Joining the text of every surviving entry must still give "".
+        text = "".join(t for t, *_ in resolved)
+        assert text == ""
+
+    def test_outer_with_text_drops_inner_fragment(self):
+        """Large box that has text plus a one-character small box.
+
+        Expects force_zero to be False and only the large box's entry to remain.
+        """
+        outer = [100.0, 100.0, 400.0, 150.0]
+        inner = [350.0, 110.0, 380.0, 140.0]
+        matched = [
+            self._matched_entry("广东兴宁农村商业银", outer, 0.99),
+            self._matched_entry("行", inner, 0.95),
+        ]
+        resolved, force_zero = TextFiller._resolve_cell_matched_boxes(matched)
+        assert force_zero is False
+        assert len(resolved) == 1
+        assert resolved[0][0] == "广东兴宁农村商业银"
+
+    def test_fill_by_center_point_empty_container(self):
+        """Same empty-box-plus-fragment input, run through fill_text_by_center_point.
+
+        Expects an empty text, a 0.0 score and a single matched box with
+        empty text for the one cell passed in.
+        """
+        # No OCR engine and an empty config are passed to the constructor.
+        filler = TextFiller(ocr_engine=None, config={})
+        cell = [900.0, 1580.0, 1100.0, 1680.0]
+        # Same two boxes as in test_empty_outer_with_inner_char_forces_zero_score.
+        ocr_boxes = [
+            {
+                "bbox": [877.0, 1593.0, 1084.0, 1671.0],
+                "text": "",
+                "confidence": 1.0,
+            },
+            {
+                "bbox": [966.0, 1644.0, 998.0, 1676.0],
+                "text": "部",
+                "confidence": 0.994,
+            },
+        ]
+        # The fourth returned value is not checked by this test.
+        texts, scores, matched_boxes, _ = filler.fill_text_by_center_point(
+            [cell], ocr_boxes
+        )
+        assert texts[0] == ""
+        assert scores[0] == 0.0
+        assert len(matched_boxes[0]) == 1
+        assert matched_boxes[0][0].get("text", "") == ""