|
|
@@ -578,6 +578,7 @@ class MinerUWiredTableRecognizer:
|
|
|
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(inv_grid, connectivity=8)
|
|
|
|
|
|
bboxes = []
|
|
|
+ heights = []
|
|
|
# 过滤掉背景(label=0)和过小的噪声
|
|
|
min_area = 50 # 最小面积阈值
|
|
|
|
|
|
@@ -613,7 +614,36 @@ class MinerUWiredTableRecognizer:
|
|
|
(x + w_cell) / upscale,
|
|
|
(y + h_cell) / upscale
|
|
|
])
|
|
|
+ heights.append(orig_h)
|
|
|
|
|
|
+ # --- 动态过滤逻辑开始 ---
|
|
|
+ # 计算中位数高度,代表正常行的典型高度
|
|
|
+ median_h = np.median(heights) if heights else 0
|
|
|
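+ # Dynamic filtering rules (a cell is dropped if ANY one of them matches):
|
|
|
+ # 1. Absolute: height below 6 px (very flat noise)
|
|
|
+ # 2. Relative: height < 0.6 * median with width > 5 * height, or height < 0.33 * median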
|
|
|
+ final_bboxes = []
|
|
|
+ for bbox in bboxes:
|
|
|
+ h_cell = bbox[3] - bbox[1]
|
|
|
+ w_cell = bbox[2] - bbox[0]
|
|
|
+ # 1. 绝对高度过滤:过滤极矮的噪点 (例如 < 6px)
|
|
|
+ if h_cell < 6.0:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 2. 相对高度+形态过滤:过滤扁长形的缝隙
|
|
|
+ # 场景:表格底部双线或粗线产生的缝隙,通常高度显著小于正常行,且宽度较大
|
|
|
+ # 阈值:高度小于中位数的 0.6 且 宽高比 > 5
|
|
|
+ # Example: h=12.9, median~28 -> h < 16.8 (= 28 * 0.6) and w/h = 51 > 5 -> filtered out
|
|
|
+ elif median_h > 0 and h_cell < median_h * 0.6 and w_cell > h_cell * 5:
|
|
|
+ continue
|
|
|
+ elif median_h > 0 and h_cell < median_h * 0.33:
|
|
|
+ continue
|
|
|
+
|
|
|
+ final_bboxes.append(bbox)
|
|
|
+
|
|
|
+ # --- 动态过滤逻辑结束 ---
|
|
|
+ bboxes = final_bboxes
|
|
|
+
|
|
|
# Sort in reading order (top to bottom first, then left to right)
|
|
|
# 允许一定的行误差
|
|
|
bboxes.sort(key=lambda b: (int(b[1] / 10), b[0]))
|