|
|
@@ -184,7 +184,7 @@ class WiredTableRecognition:
|
|
|
# 从img中截取对应的区域
|
|
|
x1, y1, x2, y2 = int(box[0][0])+1, int(box[0][1])+1, int(box[2][0])-1, int(box[2][1])-1
|
|
|
if x1 >= x2 or y1 >= y2:
|
|
|
- logger.warning(f"Invalid box coordinates: {box}")
|
|
|
+ # logger.warning(f"Invalid box coordinates: {x1, y1, x2, y2}")
|
|
|
continue
|
|
|
# 判断长宽比
|
|
|
if (x2 - x1) / (y2 - y1) > 20 or (y2 - y1) / (x2 - x1) > 20:
|
|
|
@@ -308,11 +308,24 @@ class UnetTableModel:
|
|
|
wired_blank_count = sum(1 for cell in wired_soup.find_all(['td', 'th']) if not cell.text.strip())
|
|
|
# logger.debug(f"wireless table blank cell count: {wireless_blank_count}, wired table blank cell count: {wired_blank_count}")
|
|
|
|
|
|
+ # 计算非空单元格数量
|
|
|
+ wireless_non_blank_count = wireless_len - wireless_blank_count
|
|
|
+ wired_non_blank_count = wired_len - wired_blank_count
|
|
|
+ # 无线表非空格数量大于有线表非空格数量时,才考虑切换
|
|
|
+ switch_flag = False
|
|
|
+ if wireless_non_blank_count > wired_non_blank_count:
|
|
|
+ # 假设非空表格是接近正方表,使用非空单元格数量开平方作为表格规模的估计
|
|
|
+ wired_table_scale = round(wired_non_blank_count ** 0.5)
|
|
|
+ # logger.debug(f"wireless non-blank cell count: {wireless_non_blank_count}, wired non-blank cell count: {wired_non_blank_count}, wired table scale: {wired_table_scale}")
|
|
|
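+ # Switch to the wireless table when its non-blank cell count, with a tolerance of 3 cells, reaches the wired table's count grown by two extra columns or two extra rows (whichever estimate is larger)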
|
|
|
+ wired_scale_plus_2_cols = wired_non_blank_count + (wired_table_scale * 2)
|
|
|
+ wired_scale_squared_plus_2_rows = wired_table_scale * (wired_table_scale + 2)
|
|
|
+ if (wireless_non_blank_count + 3) >= max(wired_scale_plus_2_cols, wired_scale_squared_plus_2_rows):
|
|
|
+ switch_flag = True
|
|
|
+
|
|
|
# 判断是否使用无线表格模型的结果
|
|
|
if (
|
|
|
- # (int(wireless_len * 0.04) <= wired_len <= int(wireless_len * 0.62)+1 and wireless_blank_count <= wired_blank_count+50)
|
|
|
- # or int(wireless_len * 0.04) <= wired_len <= int(wireless_len * 0.55)+1 # 有线模型检测到的单元格数太少(低于无线模型的55%)
|
|
|
- (int(wireless_len * 0.04) <= (wired_len-wired_blank_count) <= int((wireless_len-wireless_blank_count) * 0.76) and wired_len <= int(wireless_len * 0.5)) # 非空表数量有线表明显少于无线表模型60%
|
|
|
+ switch_flag
|
|
|
or (0 <= gap_of_len <= 5 and wired_len <= round(wireless_len * 0.75)) # 两者相差不大但有线模型结果较少
|
|
|
or (gap_of_len == 0 and wired_len <= 4) # 单元格数量完全相等且总量小于等于4
|
|
|
or (wired_text_count <= wireless_text_count * 0.6 and wireless_text_count >=10) # 有线模型填入的文字明显少于无线模型
|