5 месяцев назад · 88a323bc84
--- a/ocr_tools/universal_doc_parser/models/adapters/wired_table/grid_recovery.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/wired_table/grid_recovery.py
@@ -479,11 +479,12 @@ class GridRecovery:
 
															             if cell_orig_h < 4.0 or cell_orig_w < 4.0:
														
 
															                 continue
														
 
															+            # 显式转换为Python float，避免numpy.float32导致JSON序列化错误
														
 
															             bboxes.append([
														
 
															-                x / scale_w,
														
 
															-                y / scale_h,
														
 
															-                (x + w_cell) / scale_w,
														
 
															-                (y + h_cell) / scale_h
														
 
															+                float(x / scale_w),
														
 
															+                float(y / scale_h),
														
 
															+                float((x + w_cell) / scale_w),
														
 
															+                float((y + h_cell) / scale_h)
														
 
															             ])
														
 
															         bboxes.sort(key=lambda b: (int(b[1] / 10), b[0]))
														
@@ -510,12 +511,54 @@ class GridRecovery:
 
															                 for line in colboxes
														
 
															             ]
														
 
															+            # 🆕 过滤线条：只保留与existing_bboxes边界对齐的线条
														
 
															+            # 因为OCR补偿只针对与现有单元格相邻的空单元格
														
 
															+            def filter_lines_by_bboxes(lines, bboxes, is_horizontal, tolerance=5.0):
														
 
															+                """Keep only the lines that coincide with an edge of some bbox.
														
 
															+
														
 
															+                A line's position is the midpoint of its two y values when
														
 
															+                ``is_horizontal`` is true, otherwise the midpoint of its two x
														
 
															+                values. The line is kept when that position is strictly closer
														
 
															+                than ``tolerance`` to at least one bbox edge on the same axis.
														
 
															+
														
 
															+                Args:
														
 
															+                    lines: Iterable of lines, each indexable at 0..3
														
 
															+                        (presumably [x1, y1, x2, y2] -- confirm against caller).
														
 
															+                    bboxes: Boxes indexed as [x1, y1, x2, y2].
														
 
															+                    is_horizontal: True to compare y coordinates (horizontal
														
 
															+                        lines), False to compare x coordinates (vertical lines).
														
 
															+                    tolerance: Exclusive upper bound on the distance between a
														
 
															+                        line's position and a bbox edge; defaults to 5.0.
														
 
															+
														
 
															+                Returns:
														
 
															+                    ``lines`` itself, unfiltered, when ``bboxes`` is empty;
														
 
															+                    otherwise a new list with the aligned lines in input order.
														
 
															+                """
														
 
															+                # Nothing to align against: hand the input back unfiltered.
														
 
															+                if not bboxes:
														
 
															+                    return lines
														
 
															+                
														
 
															+                # Collect the bbox edge coordinates on the axis being checked;
														
 
															+                # the set collapses repeated edge values.
														
 
															+                if is_horizontal:
														
 
															+                    # Horizontal lines are matched against each bbox's y1 and y2.
														
 
															+                    bbox_coords = set()
														
 
															+                    for bbox in bboxes:
														
 
															+                        bbox_coords.add(bbox[1])  # y1
														
 
															+                        bbox_coords.add(bbox[3])  # y2
														
 
															+                else:
														
 
															+                    # Vertical lines are matched against each bbox's x1 and x2.
														
 
															+                    bbox_coords = set()
														
 
															+                    for bbox in bboxes:
														
 
															+                        bbox_coords.add(bbox[0])  # x1
														
 
															+                        bbox_coords.add(bbox[2])  # x2
														
 
															+                
														
 
															+                # Keep the lines that pass the alignment check, in input order.
														
 
															+                filtered_lines = []
														
 
															+                for line in lines:
														
 
															+                    # Midpoint of the line on the checked axis
														
 
															+                    # (y for horizontal lines, x for vertical lines).
														
 
															+                    line_coord = (line[1] + line[3]) / 2 if is_horizontal else (line[0] + line[2]) / 2
														
 
															+                    
														
 
															+                    # Aligned when strictly within tolerance of at least one bbox edge.
														
 
															+                    is_aligned = any(abs(line_coord - coord) < tolerance for coord in bbox_coords)
														
 
															+                    if is_aligned:
														
 
															+                        filtered_lines.append(line)
														
 
															+                
														
 
															+                return filtered_lines
														
 
															+            
														
 
															+            # 过滤掉与existing_bboxes不对齐的干扰线条
														
 
															+            rowboxes_filtered = filter_lines_by_bboxes(rowboxes_orig, bboxes, is_horizontal=True)
														
 
															+            colboxes_filtered = filter_lines_by_bboxes(colboxes_orig, bboxes, is_horizontal=False)
														
 
															+            
														
 
															+            logger.debug(
														
 
															+                f"🔍 线条过滤: 横线 {len(rowboxes_orig)}→{len(rowboxes_filtered)}, "
														
 
															+                f"竖线 {len(colboxes_orig)}→{len(colboxes_filtered)}"
														
 
															+            )
														
 
															+            
														
 
															             # 调用OCR补偿算法 (所有坐标均为原图坐标系)
														
 
															             compensated_bboxes = GridRecovery._compensate_unclosed_cells(
														
 
															                 existing_bboxes=bboxes,  # 已有bbox (原图坐标系)
														
 
															                 ocr_bboxes=ocr_bboxes,   # OCR结果 (原图坐标系)
														
 
															-                rowboxes=rowboxes_orig,  # 水平线 (原图坐标系)
														
 
															-                colboxes=colboxes_orig,  # 垂直线 (原图坐标系)
														
 
															+                rowboxes=rowboxes_filtered,  # 🆕 使用过滤后的水平线
														
 
															+                colboxes=colboxes_filtered,  # 🆕 使用过滤后的垂直线
														
 
															                 img_h=orig_h,
														
 
															                 img_w=orig_w,
														
 
															                 debug_dir=debug_dir,
														
@@ -858,49 +901,130 @@ class GridRecovery:
 
															         logger.debug(f"📊 已占用: {len(grid)}个网格单元 (共{(len(row_dividers)-1)*(len(col_dividers)-1)}个)")
														
 
															-        # Step 4: 第一遍 - 为所有OCR找到其覆盖的空单元格（不扩展）
														
 
															-        ocr_to_empty_cells = {}  # {ocr_index: [empty_cells]}
														
 
															+        # Step 4: 迭代补偿 - 多轮查找有相邻单元格的OCR
														
 
															+        # 第一轮补偿的OCR成为"已占用"，让后续OCR能找到相邻单元格
														
 
															+        ocr_to_empty_cells = {}  # {ocr_index: {'ocr', 'empty_cells'}}
														
 
															+        remaining_ocr_indices = set(range(len(valid_ocr)))  # 剩余未处理的OCR索引
														
 
															+        iteration = 0
														
 
															+        max_iterations = 10  # 防止无限循环
														
 
															-        for idx, ocr in enumerate(valid_ocr):
														
 
															-            ocr_bbox = ocr['bbox']
														
 
															-            ocr_text = ocr.get('text', '')[:30]
														
 
															+        while remaining_ocr_indices and iteration < max_iterations:
														
 
															+            iteration += 1
														
 
															+            newly_added = {}  # 本轮新增的OCR
														
 
															-            # 找到OCR覆盖的所有网格单元
														
 
															-            overlapping_cells = find_overlapping_cells(ocr_bbox)
														
 
															-            
														
 
															-            if not overlapping_cells:
														
 
															-                continue
														
 
															-            
														
 
															-            # 找出未被占用的单元格
														
 
															-            empty_cells = [cell for cell in overlapping_cells if cell not in grid]
														
 
															+            for idx in list(remaining_ocr_indices):
														
 
															+                ocr = valid_ocr[idx]
														
 
															+                ocr_bbox = ocr['bbox']
														
 
															+                ocr_text = ocr.get('text', '')[:30]
														
 
															+                
														
 
															+                # 🆕 使用OCR bbox的中心点查找所在单元格，避免跨多行/列的错误映射
														
 
															+                ocr_center_x = (ocr_bbox[0] + ocr_bbox[2]) / 2
														
 
															+                ocr_center_y = (ocr_bbox[1] + ocr_bbox[3]) / 2
														
 
															+                
														
 
															+                # 找到中心点所在的行和列
														
 
															+                center_row = None
														
 
															+                center_col = None
														
 
															+                for i in range(len(row_dividers) - 1):
														
 
															+                    if row_dividers[i] <= ocr_center_y < row_dividers[i + 1]:
														
 
															+                        center_row = i
														
 
															+                        break
														
 
															+                for j in range(len(col_dividers) - 1):
														
 
															+                    if col_dividers[j] <= ocr_center_x < col_dividers[j + 1]:
														
 
															+                        center_col = j
														
 
															+                        break
														
 
															+                
														
 
															+                if center_row is None or center_col is None:
														
 
															+                    logger.debug(
														
 
															+                        f"⏭️ 跳过OCR '{ocr_text}': 中心点({ocr_center_x:.1f},{ocr_center_y:.1f})不在网格内"
														
 
															+                    )
														
 
															+                    remaining_ocr_indices.remove(idx)
														
 
															+                    continue
														
 
															+                
														
 
															+                # 检查中心点所在单元格是否为空
														
 
															+                center_cell = (center_row, center_col)
														
 
															+                if center_cell in grid:
														
 
															+                    logger.debug(
														
 
															+                        f"⏭️ 跳过OCR '{ocr_text}': 单元格[{center_row},{center_col}]已被占用"
														
 
															+                    )
														
 
															+                    remaining_ocr_indices.remove(idx)
														
 
															+                    continue
														
 
															+                
														
 
															+                # 只使用中心点所在的单元格作为初始empty_cells
														
 
															+                empty_cells = [center_cell]
														
 
															+                
														
 
															+                # 检查是否是边缘单元格（至少一个空单元格与已占用单元格相邻）
														
 
															+                has_neighbor = False
														
 
															+                for row, col in empty_cells:
														
 
															+                    for dr, dc in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
														
 
															+                        neighbor = (row + dr, col + dc)
														
 
															+                        if neighbor in grid:
														
 
															+                            has_neighbor = True
														
 
															+                            break
														
 
															+                    if has_neighbor:
														
 
															+                        break
														
 
															+                
														
 
															+                if not has_neighbor:
														
 
															+                    # 本轮没有相邻单元格，留到下一轮
														
 
															+                    continue
														
 
															+                
														
 
															+                # 找到有相邻单元格的OCR，添加到本轮结果
														
 
															+                newly_added[idx] = {
														
 
															+                    'ocr': ocr,
														
 
															+                    'empty_cells': empty_cells
														
 
															+                }
														
 
															+                remaining_ocr_indices.remove(idx)
														
 
															-            if not empty_cells:
														
 
															-                continue
														
 
															+            if not newly_added:
														
 
															+                # 本轮没有新增OCR，终止迭代
														
 
															+                logger.debug(f"📊 迭代终止: 第{iteration}轮无新增OCR")
														
 
															+                break
														
 
															-            # 检查是否是边缘单元格（至少一个空单元格与已占用单元格相邻）
														
 
															-            has_neighbor = False
														
 
															-            for row, col in empty_cells:
														
 
															-                for dr, dc in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
														
 
															-                    neighbor = (row + dr, col + dc)
														
 
															-                    if neighbor in grid:
														
 
															-                        has_neighbor = True
														
 
															-                        break
														
 
															-                if has_neighbor:
														
 
															-                    break
														
 
															+            # 将本轮新增的OCR添加到总结果
														
 
															+            ocr_to_empty_cells.update(newly_added)
														
 
															-            if not has_neighbor:
														
 
															-                logger.debug(f"⏭️ 跳过OCR '{ocr_text}': 无相邻单元格")
														
 
															-                continue
														
 
															+            # 🆕 立即将本轮新增的OCR标记到grid，作为下一轮的"已占用单元格"
														
 
															+            for idx, ocr_data in newly_added.items():
														
 
															+                for cell in ocr_data['empty_cells']:
														
 
															+                    grid[cell] = True
														
 
															-            # 记录这个OCR的初始空单元格
														
 
															-            ocr_to_empty_cells[idx] = {
														
 
															-                'ocr': ocr,
														
 
															-                'empty_cells': empty_cells
														
 
															-            }
														
 
															+            logger.debug(
														
 
															+                f"📊 第{iteration}轮: 新增{len(newly_added)}个OCR, "
														
 
															+                f"剩余{len(remaining_ocr_indices)}个待处理"
														
 
															+            )
														
 
															+        
														
 
															+        if remaining_ocr_indices:
														
 
															+            logger.debug(
														
 
															+                f"⏭️ {len(remaining_ocr_indices)}个OCR无法补偿（无相邻单元格或超出迭代次数）"
														
 
															+            )
														
 
															-        logger.debug(f"📊 第一遍完成: {len(ocr_to_empty_cells)}个OCR需要补偿")
														
 
															+        logger.debug(f"📊 Step 4完成: {len(ocr_to_empty_cells)}个OCR需要补偿（共{iteration}轮迭代）")
														
 
															-        # Step 5: 第二遍 - 对所有标记的OCR区域统一扩展
														
 
															+        # Step 5: grid已在迭代过程中更新，跳过
														
 
															+        # （不需要再次标记，因为每轮迭代都已经更新了grid）
														
 
															+        
														
 
															+        # Step 6: 去除边缘整行或整列的空网格（确定表格实际内容边界）
														
 
															+        occupied_rows = set(r for r, c in grid.keys())
														
 
															+        occupied_cols = set(c for r, c in grid.keys())
														
 
															+        
														
 
															+        if not occupied_rows or not occupied_cols:
														
 
															+            logger.warning("⚠️ 没有占用的单元格，无法确定表格边界")
														
 
															+            return []
														
 
															+        
														
 
															+        # 确定表格实际内容范围
														
 
															+        content_min_row = min(occupied_rows)
														
 
															+        content_max_row = max(occupied_rows)
														
 
															+        content_min_col = min(occupied_cols)
														
 
															+        content_max_col = max(occupied_cols)
														
 
															+        
														
 
															+        logger.debug(
														
 
															+            f"📊 Step 6完成: 表格内容边界 = "
														
 
															+            f"row[{content_min_row}-{content_max_row}] × col[{content_min_col}-{content_max_col}]"
														
 
															+        )
														
 
															+        
														
 
															+        # 🆕 不恢复grid状态，保持OCR单元格的临时标记
														
 
															+        # 这样在扩展时，每个OCR都能看到其他OCR的占用，避免重复扩展
														
 
															+        
														
 
															+        # Step 7: 对所有标记的OCR区域统一扩展（只能向表格内部扩展）
														
 
															         # 🆕 辅助函数：检查侧边相邻列/行的已占用单元格边界
														
 
															         def get_side_boundary_for_vertical_expansion(current_min_col, current_max_col, direction='up'):
														
 
															             """向上/下扩展时，检查左右两侧相邻列的单元格边界"""
														
@@ -954,13 +1078,14 @@ class GridRecovery:
 
															             return boundary_cols
														
 
															-        # 对每个OCR区域进行扩展
														
 
															+        # 🆕 逐个处理每个OCR，扩展完立即更新grid状态
														
 
															+        # 这样后续OCR能看到前面OCR已经扩展占据的单元格，避免重复扩展
														
 
															         for idx, ocr_data in ocr_to_empty_cells.items():
														
 
															             empty_cells = ocr_data['empty_cells']
														
 
															             ocr = ocr_data['ocr']
														
 
															             ocr_text = ocr.get('text', '')[:30]
														
 
															-            # 向上下左右扩展连续的空单元格（必须与侧边已有单元格对齐）
														
 
															+            # 向上下左右扩展连续的空单元格（只能在表格内容边界内扩展）
														
 
															             expanded = set(empty_cells)
														
 
															             changed = True
														
 
															             while changed:
														
@@ -970,12 +1095,12 @@ class GridRecovery:
 
															                 current_min_col = min(c for r, c in expanded)
														
 
															                 current_max_col = max(c for r, c in expanded)
														
 
															-                # 🆕 尝试向上扩展（整行都是空的，且不超过左右侧单元格的上边界）
														
 
															-                if current_min_row > 0:
														
 
															+                # 🆕 向上扩展（不能超过表格内容上边界 content_min_row）
														
 
															+                if current_min_row > content_min_row:
														
 
															                     row_above = current_min_row - 1
														
 
															                     # 检查该行是否都是空的
														
 
															                     if all((row_above, col) not in grid for col in range(current_min_col, current_max_col + 1)):
														
 
															-                        # 🆕 检查左右侧相邻列的单元格最小行（上边界）
														
 
															+                        # 检查左右侧相邻列的单元格最小行（上边界）
														
 
															                         side_boundaries = get_side_boundary_for_vertical_expansion(
														
 
															                             current_min_col, current_max_col, 'up'
														
 
															                         )
														
@@ -991,8 +1116,8 @@ class GridRecovery:
 
															                                 expanded.add((row_above, col))
														
 
															                             changed = True
														
 
															-                # 🆕 尝试向下扩展（整行都是空的，且不超过左右侧单元格的下边界）
														
 
															-                if current_max_row < len(row_dividers) - 2:
														
 
															+                # 🆕 向下扩展（不能超过表格内容下边界 content_max_row）
														
 
															+                if current_max_row < content_max_row:
														
 
															                     row_below = current_max_row + 1
														
 
															                     if all((row_below, col) not in grid for col in range(current_min_col, current_max_col + 1)):
														
 
															                         side_boundaries = get_side_boundary_for_vertical_expansion(
														
@@ -1009,8 +1134,8 @@ class GridRecovery:
 
															                                 expanded.add((row_below, col))
														
 
															                             changed = True
														
 
															-                # 🆕 尝试向左扩展（整列都是空的，且不超过上下侧单元格的左边界）
														
 
															-                if current_min_col > 0:
														
 
															+                # 🆕 向左扩展（不能超过表格内容左边界 content_min_col）
														
 
															+                if current_min_col > content_min_col:
														
 
															                     col_left = current_min_col - 1
														
 
															                     if all((row, col_left) not in grid for row in range(current_min_row, current_max_row + 1)):
														
 
															                         side_boundaries = get_side_boundary_for_horizontal_expansion(
														
@@ -1027,8 +1152,8 @@ class GridRecovery:
 
															                                 expanded.add((row, col_left))
														
 
															                             changed = True
														
 
															-                # 🆕 尝试向右扩展（整列都是空的，且不超过上下侧单元格的右边界）
														
 
															-                if current_max_col < len(col_dividers) - 2:
														
 
															+                # 🆕 向右扩展（不能超过表格内容右边界 content_max_col）
														
 
															+                if current_max_col < content_max_col:
														
 
															                     col_right = current_max_col + 1
														
 
															                     if all((row, col_right) not in grid for row in range(current_min_row, current_max_row + 1)):
														
 
															                         side_boundaries = get_side_boundary_for_horizontal_expansion(
														
@@ -1045,12 +1170,18 @@ class GridRecovery:
 
															                                 expanded.add((row, col_right))
														
 
															                             changed = True
														
 
															+            # 🆕 扩展完成后，立即将扩展后的单元格标记到grid中
														
 
															+            # 这样后续OCR扩展时能看到这个OCR占据的区域，避免重复扩展
														
 
															+            for cell in expanded:
														
 
															+                grid[cell] = True
														
 
															+            
														
 
															             # 更新扩展后的空单元格
														
 
															             ocr_to_empty_cells[idx]['expanded_cells'] = list(expanded)
														
 
															+            logger.debug(f"  OCR '{ocr_text}' 扩展完成: {list(expanded)}")
														
 
															-        logger.debug(f"📊 第二遍完成: 所有OCR区域已扩展")
														
 
															+        logger.debug(f"📊 Step 7完成: 所有OCR区域已扩展")
														
 
															-        # Step 6: 第三遍 - 生成补偿bbox
														
 
															+        # Step 8: 生成补偿bbox
														
 
															         compensated_bboxes = []
														
 
															         for idx, ocr_data in ocr_to_empty_cells.items():
														
@@ -1065,10 +1196,11 @@ class GridRecovery:
 
															             max_col = max(c for r, c in empty_cells)
														
 
															             # 使用网格边界作为bbox（精确对齐）
														
 
															-            y1 = row_dividers[min_row]
														
 
															-            y2 = row_dividers[max_row + 1]
														
 
															-            x1 = col_dividers[min_col]
														
 
															-            x2 = col_dividers[max_col + 1]
														
 
															+            # 显式转换为Python float，避免numpy.float32导致JSON序列化错误
														
 
															+            y1 = float(row_dividers[min_row])
														
 
															+            y2 = float(row_dividers[max_row + 1])
														
 
															+            x1 = float(col_dividers[min_col])
														
 
															+            x2 = float(col_dividers[max_col + 1])
														
 
															             compensated_bbox = [x1, y1, x2, y2]
														
 
															             compensated_bboxes.append(compensated_bbox)