Przeglądaj źródła

feat(增强OCR框架): 在TextFiller类中添加新的配置参数以处理OCR框的宽度溢出和邻格重叠,优化跨单元格检测逻辑,提升文本填充的准确性。

zhch158_admin 2 dni temu
rodzic
commit
5f33763ee3

+ 54 - 9
ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py

@@ -36,6 +36,12 @@ class TextFiller:
         self.min_overlap_area: float = config.get("min_overlap_area", 50.0)
         self.center_cell_ratio: float = config.get("center_cell_ratio", 0.5)
         self.other_cell_max_ratio: float = config.get("other_cell_max_ratio", 0.3)
+        # OCR box 宽度超过中心单元格宽度 * 该比例 → 视为横向跨格误合并
+        self.ocr_bbox_width_overflow_ratio: float = config.get("ocr_bbox_width_overflow_ratio", 1.08)
+        # 相邻列单元格与 OCR box 的重叠比例下限(低于 other_cell_max_ratio,用于捕获 ~20-30% 的邻格重叠)
+        self.horizontal_secondary_overlap_ratio: float = config.get(
+            "horizontal_secondary_overlap_ratio", 0.15
+        )
     
     @staticmethod
     def calculate_dynamic_confidence_threshold(text: str, base_threshold: float = 0.9) -> float:
@@ -266,6 +272,34 @@ class TextFiller:
                 logger.debug(f"检测到 OCR box 跨 {len(overlapping_cells)} 个单元格[{', '.join(map(str, overlapping_cells))}]: {ocr_item['text'][:20]}...")
                 
                 processed_ocr_indices.add(ocr_idx)
+
+        # 已匹配到单元格但 OCR box 宽度明显超出单元格(漏检跨格的补充)— 当前已禁用:以下整段代码被注释,不会执行
+        # for cell_idx, cell_bbox in enumerate(bboxes):
+        #     if not matched_boxes_list[cell_idx]:
+        #         continue
+        #     cell_w = cell_bbox[2] - cell_bbox[0]
+        #     if cell_w <= 0:
+        #         continue
+        #     for box in matched_boxes_list[cell_idx]:
+        #         ocr_bbox = CoordinateUtils.poly_to_bbox(box.get("bbox", []))
+        #         if not ocr_bbox or len(ocr_bbox) < 4:
+        #             continue
+        #         ocr_w = ocr_bbox[2] - ocr_bbox[0]
+        #         if ocr_w <= cell_w * self.ocr_bbox_width_overflow_ratio:
+        #             continue
+        #         cx = (ocr_bbox[0] + ocr_bbox[2]) / 2
+        #         cy = (ocr_bbox[1] + ocr_bbox[3]) / 2
+        #         spanning = self.detect_ocr_box_spanning_cells(
+        #             ocr_bbox, bboxes, center_point=(cx, cy)
+        #         )
+        #         targets = spanning if len(spanning) >= 2 else [cell_idx]
+        #         for tidx in targets:
+        #             if tidx not in need_reocr_indices:
+        #                 need_reocr_indices.append(tidx)
+        #         logger.debug(
+        #             f"OCR box 宽度({ocr_w:.0f})超出单元格{cell_idx}宽度({cell_w:.0f}),"
+        #             f"标记重识别: {targets}"
+        #         )
         
         return texts, scores, matched_boxes_list, need_reocr_indices
     
@@ -383,19 +417,30 @@ class TextFiller:
                 if is_overlapping:
                     cell_overlaps.append((idx, overlap_ratio))
         
-        # 如果中心点在某个单元格内,且该单元格的重叠比例符合阈值,且没有其他单元格达到次要阈值,则不标记为跨单元格
+        # 中心单元格占主导时可豁免跨格标记,但横向误合并(OCR 框过宽 / 邻格有显著重叠)除外
         if center_cell_idx is not None and cell_overlaps:
-            # 找到中心点所在单元格的重叠比例
-            center_overlap = next((overlap for idx, overlap in cell_overlaps if idx == center_cell_idx), None)
+            center_overlap = next(
+                (overlap for idx, overlap in cell_overlaps if idx == center_cell_idx), None
+            )
             if center_overlap is not None and center_overlap >= self.center_cell_ratio:
-                # 检查是否有其他单元格的重叠比例也超过次要阈值
-                other_high_overlaps = [idx for idx, overlap in cell_overlaps 
-                                      if idx != center_cell_idx and overlap >= self.other_cell_max_ratio]
-                if not other_high_overlaps:
-                    # 中心点所在单元格占主导,不应该标记为跨单元格
+                other_high_overlaps = [
+                    idx for idx, overlap in cell_overlaps
+                    if idx != center_cell_idx and overlap >= self.other_cell_max_ratio
+                ]
+                other_horizontal_overlaps = [
+                    idx for idx, overlap in cell_overlaps
+                    if idx != center_cell_idx
+                    and overlap >= self.horizontal_secondary_overlap_ratio
+                ]
+                center_cell = cell_bboxes[center_cell_idx]
+                center_w = center_cell[2] - center_cell[0]
+                width_overflow = (
+                    center_w > 0
+                    and ocr_width > center_w * self.ocr_bbox_width_overflow_ratio
+                )
+                if not other_high_overlaps and not other_horizontal_overlaps and not width_overflow:
                     return []
         
-        # 返回所有满足阈值的单元格索引
         return [idx for idx, _ in cell_overlaps]
     
     def second_pass_ocr_fill(