@@ -544,7 +544,7 @@ class MinerUWiredTableRecognizer:
        return str(soup)

    # ========== Cell computation from table-line intersections ==========
-    def _compute_cells_from_lines(
+    def _compute_cells_from_lines_4_1(
        self,
        hpred_up: np.ndarray,
        vpred_up: np.ndarray,
@@ -626,18 +626,28 @@ class MinerUWiredTableRecognizer:
        for bbox in bboxes:
            h_cell = bbox[3] - bbox[1]
            w_cell = bbox[2] - bbox[0]
-            # 1. Absolute height filter: drop extremely short noise (e.g. < 6px)
+
+            # 1. Absolute height filter: drop extremely short noise
+            # Threshold lowered to 6px so that very small font sizes are not missed
            if h_cell < 6.0:
                continue

-            # 2. Relative height + shape filter: drop flat, elongated slivers
-            # Scenario: gaps from double or thick rules at the table bottom, much shorter than a normal row and fairly wide
-            # Thresholds: height below 0.6 of the median AND aspect ratio > 5
-            # e.g. this case: h=12.9, median~28. h < 16.8 and ratio=51 > 5 -> filtered
-            elif median_h > 0 and h_cell < median_h * 0.6 and w_cell > h_cell * 5:
-                continue
-            elif median_h > 0 and h_cell < median_h * 0.33:
-                continue
+            # 2. Relative height filter (more conservative strategy)
+            # Filter only when the height is both "relatively tiny" AND "absolutely small",
+            # so normal small rows (e.g. 25px) survive even when median_h is large (e.g. 160px)
+            if median_h > 0:
+                ratio = h_cell / median_h
+
+                # Strategy A: extremely short rows
+                # height < 10% of the median AND absolute height < 10px
+                # (this case: 25/164 = 0.15 > 0.1, and 25 > 10, so the row is kept)
+                if ratio < 0.1 and h_cell < 10.0:
+                    continue
+
+                # Strategy B: flat sliver gaps (usually produced by double rules)
+                # height < 20% of the median AND aspect ratio > 5 AND absolute height < 15px
+                if ratio < 0.2 and w_cell > h_cell * 5 and h_cell < 15.0:
+                    continue

            final_bboxes.append(bbox)

@@ -652,6 +662,104 @@ class MinerUWiredTableRecognizer:

        return bboxes

+    def _compute_cells_from_lines(
+        self,
+        hpred_up: np.ndarray,
+        vpred_up: np.ndarray,
+        upscale: float = 1.0,
+    ) -> List[List[float]]:
+        """
+        Extract cells from the table-line masks via connected-component analysis (robust version)
+
+        Improvements:
+        1. Use a morphological closing to repair broken lines instead of a plain dilation.
+        2. Drop the median-based statistical filtering in favour of geometric filters (aspect ratio, absolute size).
+        3. Explicitly handle the thin sliver noise produced by double-ruled tables.
+        """
+        h, w = hpred_up.shape[:2]
+
+        # 1. Preprocessing: binarize the masks
+        _, h_bin = cv2.threshold(hpred_up, 127, 255, cv2.THRESH_BINARY)
+        _, v_bin = cv2.threshold(vpred_up, 127, 255, cv2.THRESH_BINARY)
+
+        # 2. Morphological repair: use a closing to bridge broken lines
+        # Closing = dilation followed by erosion; it fills small gaps (breaks) without changing the overall contour size
+        # Horizontal lines get strong horizontal bridging, vertical lines strong vertical bridging
+        kernel_h = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 1))
+        kernel_v = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 7))
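+        # e.g. the 7x1 kernel bridges breaks of up to about 6px in a horizontal line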
+
+        h_bin = cv2.morphologyEx(h_bin, cv2.MORPH_CLOSE, kernel_h)
+        v_bin = cv2.morphologyEx(v_bin, cv2.MORPH_CLOSE, kernel_v)
+
+        # 3. Combine into a single grid mask
+        grid_mask = cv2.bitwise_or(h_bin, v_bin)
+
+        # 4. Invert the image (black lines on a white background) and extract the white connected components
+        inv_grid = cv2.bitwise_not(grid_mask)
+
+        # 5. Extract connected components
+        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(inv_grid, connectivity=8)
+
+        bboxes = []
+
+        # 6. Geometric filtering (per-component features only, no global statistics)
+        for i in range(1, num_labels):
+            x = stats[i, cv2.CC_STAT_LEFT]
+            y = stats[i, cv2.CC_STAT_TOP]
+            w_cell = stats[i, cv2.CC_STAT_WIDTH]
+            h_cell = stats[i, cv2.CC_STAT_HEIGHT]
+            area = stats[i, cv2.CC_STAT_AREA]
+
+            # --- Filtering rules ---
+
+            # A. Border and background filtering
+            # Drop the background component that covers almost the whole image, as well as tiny noise specks
+            if area < 50 or w_cell > w * 0.95 or h_cell > h * 0.95:
+                continue
+
+            # Convert to original-image scale for the size checks
+            orig_h = h_cell / upscale
+            orig_w = w_cell / upscale
+
+            # B. Absolute size filter (physical limit)
+            # Anything smaller than 4px cannot be a valid text cell
+            if orig_h < 4.0 or orig_w < 4.0:
+                continue
+
+            # C. Shape filter: double-rule slivers
+            # Signature: extreme aspect ratio combined with a very short short side
+            # e.g. width 100, height 5 -> ratio 20 and height < 15 -> horizontal sliver
+            # e.g. width 5, height 100 -> ratio 0.05 and width < 12 -> vertical sliver
+
+            ratio = w_cell / h_cell
+
+            # Horizontal sliver (extremely flat)
+            if ratio > 10.0 and orig_h < 15.0:
+                continue
+
+            # Vertical sliver (extremely thin)
+            if ratio < 0.1 and orig_w < 12.0:
+                continue
+
+            # D. Containment filtering (optional; CCA normally does not produce overlapping boxes)
+            # Nested tables would need more elaborate logic here, but ordinary tables do not.
+
+            # Passed every check, keep the box
+            bboxes.append([
+                x / upscale,
+                y / upscale,
+                (x + w_cell) / upscale,
+                (y + h_cell) / upscale
+            ])
+
+        # Sort in reading order (top to bottom, then left to right)
+        # Allow 10px of row tolerance so that slight skew does not scramble the ordering
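+        # e.g. rows at y1=103 and y1=108 share band int(y/10)=10 and are then
+        # ordered by x, while y1=112 falls into band 11 and sorts after them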
+        bboxes.sort(key=lambda b: (int(b[1] / 10), b[0]))
+
+        logger.info(f"Connected-component analysis extracted {len(bboxes)} cells")
+
+        return bboxes
+
    def _visualize_detected_lines(
        self,
        hpred: np.ndarray,
@@ -796,7 +904,7 @@ class MinerUWiredTableRecognizer:

        return new_cells

-    def _recover_grid_structure(self, bboxes: List[List[float]]) -> List[Dict]:
+    def _recover_grid_structure_4_1(self, bboxes: List[List[float]]) -> List[Dict]:
        """
        Recover the table's row/column structure (row, col, rowspan, colspan) from scattered cell bboxes
        Improved version: boundary-projection clustering, more robust
@@ -883,6 +991,314 @@ class MinerUWiredTableRecognizer:

        return structured_cells

+    def _recover_grid_structure_4_2(self, bboxes: List[List[float]]) -> List[Dict]:
+        """
+        Recover the table's row/column structure (row, col, rowspan, colspan) from scattered cell bboxes
+        Reworked version: matches cells against a standard-row skeleton, which handles tables mixing dense rows with row-spanning cells
+        """
+        if not bboxes:
+            return []
+
+        # --- 1. Identify the row structure ---
+
+        # Median height, used to separate "standard rows" from row-spanning cells
+        heights = [b[3] - b[1] for b in bboxes]
+        median_h = np.median(heights) if heights else 0
+
+        # Standard-row cells: height between 0.5x and 1.8x the median
+        # This excludes the large row-spanning cells as well as tiny noise
+        standard_cells = []
+        for i, bbox in enumerate(bboxes):
+            h = bbox[3] - bbox[1]
+            if median_h > 0 and 0.5 * median_h < h < 1.8 * median_h:
+                standard_cells.append({"bbox": bbox, "index": i})
+
+        # Fallback: if no standard rows are found (e.g. the table only has odd-sized cells), use all cells
+        if not standard_cells:
+            standard_cells = [{"bbox": b, "index": i} for i, b in enumerate(bboxes)]
+
+        # Sort the standard cells by Y center
+        standard_cells.sort(key=lambda x: (x["bbox"][1] + x["bbox"][3]) / 2)
+
+        # Greedy clustering to build the "row skeleton"
+        # rows_defs stores each row's vertical extent {'top': y1, 'bottom': y2, 'center': yc}
+        rows_defs = []
+
+        for item in standard_cells:
+            box = item["bbox"]
+            cy = (box[1] + box[3]) / 2
+
+            # Try to match an existing row
+            matched = False
+            for r_def in rows_defs:
+                # Match condition: center distance below 0.6 of the reference height (row height ~ median_h),
+                # i.e. the vertical overlap is high
+                r_h = r_def['bottom'] - r_def['top']
+                ref_h = max(r_h, median_h)  # reference height
+
+                if abs(cy - r_def['center']) < ref_h * 0.6:
+                    # Matched: extend the row's extent
+                    r_def['top'] = min(r_def['top'], box[1])
+                    r_def['bottom'] = max(r_def['bottom'], box[3])
+                    r_def['center'] = (r_def['top'] + r_def['bottom']) / 2
+                    matched = True
+                    break
+
+            if not matched:
+                rows_defs.append({
+                    'top': box[1],
+                    'bottom': box[3],
+                    'center': cy
+                })
+
+        # Sort the row skeleton by position
+        rows_defs.sort(key=lambda x: x['center'])
+
+        # Merge row skeletons that sit too close together (prevents over-splitting)
+        # Threshold: 0.5 * median_h
+        merged_rows = []
+        if rows_defs:
+            curr = rows_defs[0]
+            for next_row in rows_defs[1:]:
+                if next_row['center'] - curr['center'] < median_h * 0.5:
+                    # Merge
+                    curr['top'] = min(curr['top'], next_row['top'])
+                    curr['bottom'] = max(curr['bottom'], next_row['bottom'])
+                    curr['center'] = (curr['top'] + curr['bottom']) / 2
+                else:
+                    merged_rows.append(curr)
+                    curr = next_row
+            merged_rows.append(curr)
+            rows_defs = merged_rows
+
+        # --- 2. Identify the column structure ---
+        # The column-divider logic is unchanged; columns are usually regular
+        x_coords = []
+        for b in bboxes:
+            x_coords.append(b[0])
+            x_coords.append(b[2])
+        x_coords.sort()
+
+        col_dividers = []
+        if x_coords:
+            thresh = 5  # column gap threshold
+            curr_cluster = [x_coords[0]]
+            for x in x_coords[1:]:
+                if x - curr_cluster[-1] < thresh:
+                    curr_cluster.append(x)
+                else:
+                    col_dividers.append(sum(curr_cluster)/len(curr_cluster))
+                    curr_cluster = [x]
+            col_dividers.append(sum(curr_cluster)/len(curr_cluster))
+
+        # --- 3. Map cells onto the grid ---
+        structured_cells = []
+        for bbox in bboxes:
+            # --- Match rows ---
+            b_top, b_bottom = bbox[1], bbox[3]
+            b_h = b_bottom - b_top
+
+            matched_row_indices = []
+
+            for r_idx, r_def in enumerate(rows_defs):
+                # Vertical (Y) overlap
+                inter_top = max(b_top, r_def['top'])
+                inter_bottom = min(b_bottom, r_def['bottom'])
+                inter_h = max(0, inter_bottom - inter_top)
+
+                r_h = r_def['bottom'] - r_def['top']
+
+                # Coverage test:
+                # 1. the cell covers most of this row (row-spanning case) -> inter_h / r_h > 0.5
+                # 2. this row covers most of the cell (small-cell case) -> inter_h / b_h > 0.5
+                if r_h > 0 and (inter_h / r_h > 0.5 or inter_h / b_h > 0.5):
+                    matched_row_indices.append(r_idx)
+
+            if not matched_row_indices:
+                # Fallback: pick the row whose center is closest
+                cy = (b_top + b_bottom) / 2
+                closest_r = min(range(len(rows_defs)), key=lambda i: abs(rows_defs[i]['center'] - cy))
+                matched_row_indices = [closest_r]
+
+            row_start = min(matched_row_indices)
+            row_end = max(matched_row_indices)
+            rowspan = row_end - row_start + 1
+
+            # --- Match columns ---
+            # Snap the left and right edges to the nearest dividers
+            c1 = 0
+            c2 = 0
+            if len(col_dividers) >= 2:
+                c1 = min(range(len(col_dividers)), key=lambda i: abs(col_dividers[i] - bbox[0]))
+                c2 = min(range(len(col_dividers)), key=lambda i: abs(col_dividers[i] - bbox[2]))
+                if c1 > c2: c1, c2 = c2, c1
+
+            colspan = max(1, c2 - c1)
+
+            structured_cells.append({
+                "bbox": bbox,
+                "row": row_start,
+                "col": c1,
+                "rowspan": rowspan,
+                "colspan": colspan
+            })
+
+        # Sort by row, then by column
+        structured_cells.sort(key=lambda c: (c["row"], c["col"]))
+
+        # Compress the grid: drop empty rows and columns
+        structured_cells = self._compress_grid(structured_cells)
+
+        return structured_cells
+
+    def _recover_grid_structure(self, bboxes: List[List[float]]) -> List[Dict]:
+        """
+        Recover the table's row/column structure (row, col, rowspan, colspan) from scattered cell bboxes
+        Reworked version: algorithm based on projected grid lines
+        Suited to complex tables with widely varying row heights and dense small rows
+        """
+        if not bboxes:
+            return []
+
+        # 1. Identify row dividers (Y axis)
+        # Collect every cell's top and bottom edge
+        y_coords = []
+        for b in bboxes:
+            y_coords.append(b[1])
+            y_coords.append(b[3])
+
+        # Cluster them and keep only well-supported row grid lines
+        # Thresholds: 5px tolerance, at least 2 aligned cells (rejects noise)
+        row_dividers = self._find_grid_lines(y_coords, tolerance=5, min_support=2)
+
+        # 2. Identify column dividers (X axis)
+        x_coords = []
+        for b in bboxes:
+            x_coords.append(b[0])
+            x_coords.append(b[2])
+        col_dividers = self._find_grid_lines(x_coords, tolerance=5, min_support=2)
+
+        # 3. Build the grid structure
+        structured_cells = []
+
+        # Row intervals
+        row_intervals = []
+        for i in range(len(row_dividers) - 1):
+            row_intervals.append({
+                "top": row_dividers[i],
+                "bottom": row_dividers[i+1],
+                "height": row_dividers[i+1] - row_dividers[i],
+                "index": i
+            })
+
+        # Column intervals
+        col_intervals = []
+        for i in range(len(col_dividers) - 1):
+            col_intervals.append({
+                "left": col_dividers[i],
+                "right": col_dividers[i+1],
+                "width": col_dividers[i+1] - col_dividers[i],
+                "index": i
+            })
+
+        for bbox in bboxes:
+            b_top, b_bottom = bbox[1], bbox[3]
+            b_left, b_right = bbox[0], bbox[2]
+            b_h = b_bottom - b_top
+            b_w = b_right - b_left
+
+            # --- Match rows ---
+            matched_rows = []
+            for r in row_intervals:
+                # Vertical overlap
+                inter_top = max(b_top, r["top"])
+                inter_bottom = min(b_bottom, r["bottom"])
+                inter_h = max(0, inter_bottom - inter_top)
+
+                # A cell belongs to this row when:
+                # 1. the cell covers most of the row (inter_h / r["height"] > 0.5) -> row-spanning cell over a short row
+                # 2. the row covers most of the cell (inter_h / b_h > 0.5) -> cell lies entirely inside the row
+                if r["height"] > 0 and (inter_h / r["height"] > 0.5 or inter_h / b_h > 0.5):
+                    matched_rows.append(r["index"])
+
+            if not matched_rows:
+                # Fallback: the row containing the cell's center point
+                cy = (b_top + b_bottom) / 2
+                closest_r = min(row_intervals, key=lambda r: abs((r["top"]+r["bottom"])/2 - cy))
+                matched_rows = [closest_r["index"]]
+
+            row_start = min(matched_rows)
+            row_end = max(matched_rows)
+            rowspan = row_end - row_start + 1
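+            # e.g. a cell overlapping row intervals 2, 3 and 4 gets row_start=2 and rowspan=3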
+
+            # --- Match columns ---
+            matched_cols = []
+            for c in col_intervals:
+                inter_left = max(b_left, c["left"])
+                inter_right = min(b_right, c["right"])
+                inter_w = max(0, inter_right - inter_left)
+
+                if c["width"] > 0 and (inter_w / c["width"] > 0.5 or inter_w / b_w > 0.5):
+                    matched_cols.append(c["index"])
+
+            if not matched_cols:
+                cx = (b_left + b_right) / 2
+                closest_c = min(col_intervals, key=lambda c: abs((c["left"]+c["right"])/2 - cx))
+                matched_cols = [closest_c["index"]]
+
+            col_start = min(matched_cols)
+            col_end = max(matched_cols)
+            colspan = col_end - col_start + 1
+
+            structured_cells.append({
+                "bbox": bbox,
+                "row": row_start,
+                "col": col_start,
+                "rowspan": rowspan,
+                "colspan": colspan
+            })
+
+        # Sort by row, then by column
+        structured_cells.sort(key=lambda c: (c["row"], c["col"]))
+
+        # Compress the grid (drop empty rows and columns)
+        structured_cells = self._compress_grid(structured_cells)
+
+        return structured_cells
+
+    def _find_grid_lines(self, coords: List[float], tolerance: float = 5.0, min_support: int = 2) -> List[float]:
+        """
+        Cluster coordinate values and keep only the grid lines with enough support
+        """
+        if not coords:
+            return []
+
+        coords.sort()
+
+        # 1. Simple clustering
+        clusters = []
+        if coords:
+            curr_cluster = [coords[0]]
+            for x in coords[1:]:
+                if x - curr_cluster[-1] < tolerance:
+                    curr_cluster.append(x)
+                else:
+                    clusters.append(curr_cluster)
+                    curr_cluster = [x]
+            clusters.append(curr_cluster)
+
+        # 2. Compute cluster centers and support
+        grid_lines = []
+        for cluster in clusters:
+            # Support = how many coordinate values fall at this position
+            # Note: the input holds every box edge, so the support directly reflects how many cells align to this line
+            if len(cluster) >= min_support:
+                center = sum(cluster) / len(cluster)
+                grid_lines.append(center)
+
+        return grid_lines
+
+
    def _build_html_from_merged_cells(self, merged_cells: List[Dict]) -> str:
        """
        Generate HTML by matrix filling, which prevents cells from drifting out of place
|