6 месяцев назад · c628acd7b7
--- a/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
@@ -4,6 +4,7 @@
 
															 提供表格单元格文本填充功能，包括OCR文本匹配和二次OCR填充。
														
 
															 """
														
 
															 from typing import List, Dict, Any, Tuple, Optional
														
 
															+import bisect
														
 
															 import cv2
														
 
															 import numpy as np
														
 
															 from loguru import logger
														
@@ -26,17 +27,58 @@ class TextFiller:
 
															         self.cell_crop_margin: int = config.get("cell_crop_margin", 2)
														
 
															         self.ocr_conf_threshold: float = config.get("ocr_conf_threshold", 0.5)
														
 
															+    @staticmethod
														
 
															+    def calculate_overlap_ratio(ocr_bbox: List[float], cell_bbox: List[float]) -> float:
														
 
															+        """
														
 
															+        计算 OCR box 与单元格的重叠比例（重叠面积 / OCR box 面积）
														
 
															+        
														
 
															+        这个比例表示 OCR box 有多少部分在单元格内，用于判断 OCR box 是否主要属于该单元格。
														
 
															+        
														
 
															+        Args:
														
 
															+            ocr_bbox: OCR box 坐标 [x1, y1, x2, y2]
														
 
															+            cell_bbox: 单元格坐标 [x1, y1, x2, y2]
														
 
															+            
														
 
															+        Returns:
														
 
															+            重叠比例 (0.0 ~ 1.0)，表示 OCR box 有多少部分在单元格内
														
 
															+        """
														
 
															+        if not ocr_bbox or not cell_bbox or len(ocr_bbox) < 4 or len(cell_bbox) < 4:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        # 计算交集
														
 
															+        inter_x1 = max(ocr_bbox[0], cell_bbox[0])
														
 
															+        inter_y1 = max(ocr_bbox[1], cell_bbox[1])
														
 
															+        inter_x2 = min(ocr_bbox[2], cell_bbox[2])
														
 
															+        inter_y2 = min(ocr_bbox[3], cell_bbox[3])
														
 
															+        
														
 
															+        if inter_x2 <= inter_x1 or inter_y2 <= inter_y1:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
														
 
															+        ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
														
 
															+        
														
 
															+        if ocr_area <= 0:
														
 
															+            return 0.0
														
 
															+        
														
 
															+        return inter_area / ocr_area
														
 
															+    
														
 
															     def fill_text_by_center_point(
														
 
															         self,
														
 
															         bboxes: List[List[float]],
														
 
															         ocr_boxes: List[Dict[str, Any]],
														
 
															     ) -> Tuple[List[str], List[float], List[List[Dict[str, Any]]], List[int]]:
														
 
															         """
														
 
															-        使用中心点落格策略填充文本。
														
 
															+        使用混合匹配策略填充文本：中心点 + 重叠比例。
														
 
															+        
														
 
															+        策略说明：
														
 
															+        1. 首先用中心点快速筛选：OCR box 的中心点在单元格内
														
 
															+        2. 然后检查重叠比例：OCR box 与单元格的重叠面积 / OCR box 面积 >= 0.5
														
 
															+           （这确保 OCR box 主要属于该单元格，避免跨单元格匹配）
														
 
															+        3. 如果多个单元格都满足条件，选择重叠比例最高的
														
 
															-        参考 fill_html_with_ocr_by_bbox：
														
 
															-        - OCR文本中心点落入单元格bbox内则匹配
														
 
															-        - 多行文本按y坐标排序拼接
														
 
															+        优点：
														
 
															+        - 比纯 IOU 更宽松，能匹配到更多 OCR box
														
 
															+        - 比纯中心点更准确，能过滤跨单元格的 OCR box
														
 
															+        - 适合表格场景，OCR box 通常比单元格小或部分重叠
														
 
															         Args:
														
 
															             bboxes: 单元格坐标 [[x1,y1,x2,y2], ...]
														
@@ -45,7 +87,7 @@ class TextFiller:
 
															         Returns:
														
 
															             每个单元格的文本列表
														
 
															             每个单元格的置信度列表
														
 
															-            每个单元格匹配到的 OCR boxes 列表
														
 
															+            每个单元格匹配到的 OCR boxes 列表（已过滤跨单元格的 OCR box）
														
 
															             需要二次 OCR 的单元格索引列表（OCR box 跨多个单元格或过大）
														
 
															         """
														
 
															         texts: List[str] = ["" for _ in bboxes]
														
@@ -56,61 +98,96 @@ class TextFiller:
 
															         if not ocr_boxes:
														
 
															             return texts, scores, matched_boxes_list, need_reocr_indices
														
 
															-        # 预处理OCR结果：计算中心点
														
 
															+        # 预处理OCR结果：转换为 bbox 格式，并计算中心点
														
 
															         ocr_items: List[Dict[str, Any]] = []
														
 
															         for item in ocr_boxes:
														
 
															             # 使用 CoordinateUtils.poly_to_bbox() 替换 _normalize_bbox()
														
 
															             box = CoordinateUtils.poly_to_bbox(item.get("bbox", []))
														
 
															-            if not box:
														
 
															+            if not box or len(box) < 4:
														
 
															                 continue
														
 
															             cx = (box[0] + box[2]) / 2
														
 
															             cy = (box[1] + box[3]) / 2
														
 
															             ocr_items.append({
														
 
															+                "bbox": box,
														
 
															                 "center_x": cx,
														
 
															                 "center_y": cy,
														
 
															-                "y1": box[1],
														
 
															-                "bbox": box,  # 保存 bbox 用于跨单元格检测
														
 
															                 "text": item.get("text", ""),
														
 
															                 "confidence": float(item.get("confidence", item.get("score", 1.0))),
														
 
															                 "original_box": item,  # 保存完整的 OCR box 对象
														
 
															             })
														
 
															-        # 为每个单元格匹配OCR文本
														
 
															-        for idx, bbox in enumerate(bboxes):
														
 
															-            x1, y1, x2, y2 = bbox
														
 
															-            matched: List[Tuple[str, float, float, Dict[str, Any]]] = [] # (text, y1, score, original_box)
														
 
															+        # 按 (y1, x1) 排序，便于后续二分查找和提前退出
														
 
															+        # 排序只需要一次，对整体性能影响很小（O(n log n)）
														
 
															+        ocr_items.sort(key=lambda item: (item["bbox"][1], item["bbox"][0]))
														
 
															+        
														
 
															+        # 重叠比例阈值：OCR box 与单元格的重叠面积必须 >= OCR box 面积的 50%
														
 
															+        # 这确保 OCR box 主要属于该单元格
														
 
															+        overlap_ratio_threshold = 0.5
														
 
															+        
														
 
															+        # 为每个单元格匹配OCR文本（使用中心点 + 重叠比例）
														
 
															+        # 优化：使用二分查找和提前退出机制，减少遍历次数
														
 
															+        # 创建一个 y1 值的列表用于二分查找（兼容 Python < 3.10）
														
 
															+        ocr_y1_list = [item["bbox"][1] for item in ocr_items]
														
 
															+        
														
 
															+        for idx, cell_bbox in enumerate(bboxes):
														
 
															+            cell_x1, cell_y1, cell_x2, cell_y2 = cell_bbox
														
 
															+            matched: List[Tuple[str, float, float, float, float, Dict[str, Any]]] = [] # (text, y1, x1, overlap_ratio, score, original_box)
														
 
															-            for ocr in ocr_items:
														
 
															-                if x1 <= ocr["center_x"] <= x2 and y1 <= ocr["center_y"] <= y2:
														
 
															-                    matched.append((ocr["text"], ocr["y1"], ocr["confidence"], ocr["original_box"]))
														
 
															+            # 使用二分查找找到第一个 y1 >= cell_y1 的 OCR item
														
 
															+            # 由于 ocr_items 已按 (y1, x1) 排序，可以使用 bisect_left
														
 
															+            start_idx = bisect.bisect_left(ocr_y1_list, cell_y1)
														
 
															+            
														
 
															+            # 关键优化：OCR box 的 y1 可能 < cell_y1，但 y2 >= cell_y1（跨越单元格上边界）
														
 
															+            # 为了不遗漏这种情况，我们需要向前查找一些 items
														
 
															+            # 向前查找的最大数量：假设 OCR box 最大高度不超过 100 像素（可根据实际情况调整）
														
 
															+            max_lookback = 20  # 向前查找最多 20 个 items
														
 
															+            actual_start_idx = max(0, start_idx - max_lookback)
														
 
															+            
														
 
															+            # 从 actual_start_idx 开始遍历，当 y1 > cell_y2 时提前退出
														
 
															+            for i in range(actual_start_idx, len(ocr_items)):
														
 
															+                ocr_item = ocr_items[i]
														
 
															+                ocr_bbox = ocr_item["bbox"]
														
 
															+                
														
 
															+                # 提前退出：如果 y1 > cell_y2，后续的 items 都不可能在单元格内
														
 
															+                if ocr_bbox[1] > cell_y2:
														
 
															+                    break
														
 
															+                
														
 
															+                # 快速过滤：如果 OCR box 的 y2 < cell_y1，说明它完全在单元格上方，跳过
														
 
															+                if ocr_bbox[3] < cell_y1:
														
 
															+                    continue
														
 
															+                
														
 
															+                cx = ocr_item["center_x"]
														
 
															+                cy = ocr_item["center_y"]
														
 
															+                
														
 
															+                # 第一步：中心点必须在单元格内
														
 
															+                if not (cell_x1 <= cx <= cell_x2 and cell_y1 <= cy <= cell_y2):
														
 
															+                    continue
														
 
															+                
														
 
															+                # 第二步：检查重叠比例（OCR box 有多少部分在单元格内）
														
 
															+                overlap_ratio = self.calculate_overlap_ratio(ocr_bbox, cell_bbox)
														
 
															+                if overlap_ratio >= overlap_ratio_threshold:
														
 
															+                    matched.append((
														
 
															+                        ocr_item["text"], 
														
 
															+                        ocr_bbox[1],  # y1 坐标
														
 
															+                        ocr_bbox[0],  # 添加 x1 坐标
														
 
															+                        overlap_ratio,
														
 
															+                        ocr_item["confidence"], 
														
 
															+                        ocr_item["original_box"]
														
 
															+                    ))
														
 
															             if matched:
														
 
															-                # 按y坐标排序，确保多行文本顺序正确
														
 
															-                matched.sort(key=lambda x: x[1])
														
 
															-                texts[idx] = "".join([t for t, _, _, _ in matched])
														
 
															+                # 直接按 y1 和 x1 排序，确保文本顺序正确
														
 
															+                # y_tolerance 用于将相近的 y1 归为同一行（容差范围内视为同一行）
														
 
															+                # 同一行内按 x1 从左到右排序
														
 
															+                y_tolerance = 5
														
 
															+                matched.sort(key=lambda x: (round(x[1] / y_tolerance), x[2]))  # 先按 y_group，再按 x1
														
 
															+                
														
 
															+                texts[idx] = "".join([t for t, _, _, _, _, _ in matched])
														
 
															                 # 计算平均置信度
														
 
															-                avg_score = sum([s for _, _, s, _ in matched]) / len(matched)
														
 
															+                avg_score = sum([s for _, _, _, _, s, _ in matched]) / len(matched)
														
 
															                 scores[idx] = avg_score
														
 
															                 # 保存匹配到的 OCR boxes
														
 
															-                matched_boxes_list[idx] = [box for _, _, _, box in matched]
														
 
															-                
														
 
															-                # 检测 OCR box 是否跨多个单元格或过大
														
 
															-                for ocr_item in ocr_items:
														
 
															-                    ocr_bbox = ocr_item["bbox"]
														
 
															-                    # 检测是否跨多个单元格
														
 
															-                    overlapping_cells = self.detect_ocr_box_spanning_cells(ocr_bbox, bboxes, overlap_threshold=0.3)
														
 
															-                    if len(overlapping_cells) >= 2:
														
 
															-                        # OCR box 跨多个单元格，标记所有相关单元格需要二次 OCR
														
 
															-                        for cell_idx in overlapping_cells:
														
 
															-                            if cell_idx not in need_reocr_indices:
														
 
															-                                need_reocr_indices.append(cell_idx)
														
 
															-                        logger.debug(f"检测到 OCR box 跨 {len(overlapping_cells)} 个单元格: {ocr_item['text'][:20]}...")
														
 
															-                    
														
 
															-                    # 检测 OCR box 是否相对于当前单元格过大
														
 
															-                    if self.is_ocr_box_too_large(ocr_bbox, bbox, size_ratio_threshold=1.5):
														
 
															-                        if idx not in need_reocr_indices:
														
 
															-                            need_reocr_indices.append(idx)
														
 
															-                        logger.debug(f"检测到 OCR box 相对于单元格过大 (单元格 {idx}): {ocr_item['text'][:20]}...")
														
 
															+                matched_boxes_list[idx] = [box for _, _, _, _, _, box in matched]
														
 
															             else:
														
 
															                 scores[idx] = 0.0 # 无匹配文本，置信度为0
														
@@ -189,35 +266,6 @@ class TextFiller:
 
															         return overlapping_cells
														
 
															-    @staticmethod
														
 
															-    def is_ocr_box_too_large(
														
 
															-        ocr_bbox: List[float],
														
 
															-        cell_bbox: List[float],
														
 
															-        size_ratio_threshold: float = 1.5
														
 
															-    ) -> bool:
														
 
															-        """
														
 
															-        检测 OCR box 是否相对于单元格过大
														
 
															-        
														
 
															-        Args:
														
 
															-            ocr_bbox: OCR box 坐标 [x1, y1, x2, y2]
														
 
															-            cell_bbox: 单元格坐标 [x1, y1, x2, y2]
														
 
															-            size_ratio_threshold: 面积比阈值，如果 OCR box 面积 > 单元格面积 * 阈值，则认为过大
														
 
															-            
														
 
															-        Returns:
														
 
															-            是否过大
														
 
															-        """
														
 
															-        if not ocr_bbox or len(ocr_bbox) < 4 or not cell_bbox or len(cell_bbox) < 4:
														
 
															-            return False
														
 
															-        
														
 
															-        ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
														
 
															-        cell_area = (cell_bbox[2] - cell_bbox[0]) * (cell_bbox[3] - cell_bbox[1])
														
 
															-        
														
 
															-        if cell_area <= 0:
														
 
															-            return False
														
 
															-        
														
 
															-        size_ratio = ocr_area / cell_area
														
 
															-        return size_ratio > size_ratio_threshold
														
 
															-    
														
 
															     def second_pass_ocr_fill(
														
 
															         self,
														
 
															         table_image: np.ndarray,
														
@@ -377,6 +425,10 @@ class TextFiller:
 
															                     if x2 > x1 and y2 > y1:
														
 
															                         cropped = cell_img[y1:y2, x1:x2]
														
 
															+                        ch, cw = cropped.shape[:2]
														
 
															+                        # 小图放大
														
 
															+                        if ch < 64 or cw < 64:
														
 
															+                            cropped = cv2.resize(cropped, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
														
 
															                         if cropped.size > 0:
														
 
															                             rec_img_list.append(cropped)
														
 
															                             rec_indices.append((cell_idx, box_idx))