|
|
@@ -23,7 +23,8 @@ class TableCellMatcher:
|
|
|
|
|
|
def __init__(self, text_matcher: TextMatcher,
|
|
|
x_tolerance: int = 3,
|
|
|
- y_tolerance: int = 10):
|
|
|
+ y_tolerance: int = 10,
|
|
|
+ inclination_threshold: float = 0.3):
|
|
|
"""
|
|
|
Args:
|
|
|
text_matcher: 文本匹配器
|
|
|
@@ -33,6 +34,7 @@ class TableCellMatcher:
|
|
|
self.text_matcher = text_matcher
|
|
|
self.x_tolerance = x_tolerance
|
|
|
self.y_tolerance = y_tolerance
|
|
|
+ self.inclination_threshold = inclination_threshold # 倾斜校正阈值(度数)
|
|
|
|
|
|
def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
|
|
|
start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
|
|
|
@@ -72,7 +74,8 @@ class TableCellMatcher:
|
|
|
grouped_boxes = self._group_paddle_boxes_by_rows(
|
|
|
table_region_boxes,
|
|
|
y_tolerance=self.y_tolerance,
|
|
|
- auto_correct_skew=True
|
|
|
+ auto_correct_skew=True,
|
|
|
+ inclination_threshold=self.inclination_threshold
|
|
|
)
|
|
|
|
|
|
# 🔑 第三步:在每组内按 x 坐标排序
|
|
|
@@ -501,7 +504,8 @@ class TableCellMatcher:
|
|
|
|
|
|
def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict],
|
|
|
y_tolerance: int = 10,
|
|
|
- auto_correct_skew: bool = True) -> List[Dict]:
|
|
|
+ auto_correct_skew: bool = True,
|
|
|
+ inclination_threshold: float = 0.3) -> List[Dict]:
|
|
|
"""
|
|
|
将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
|
|
|
|
|
|
@@ -520,7 +524,7 @@ class TableCellMatcher:
|
|
|
if auto_correct_skew:
|
|
|
rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
|
|
|
|
|
|
- if abs(rotation_angle) > 0.5:
|
|
|
+ if abs(rotation_angle) > inclination_threshold:
|
|
|
max_x = max(box['bbox'][2] for box in paddle_boxes)
|
|
|
max_y = max(box['bbox'][3] for box in paddle_boxes)
|
|
|
image_size = (max_x, max_y)
|
|
|
@@ -968,7 +972,7 @@ class TableCellMatcher:
|
|
|
return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
|
|
|
|
|
|
# 🔑 策略 2: 多个 boxes 合并匹配
|
|
|
- unused_boxes = [b for b in boxes if not b.get('used')]
|
|
|
+ unused_boxes = [b for b in boxes[first_unused_idx:] if not b.get('used')]
|
|
|
# 合并同列的 boxes 合并
|
|
|
merged_bboxes = []
|
|
|
for col_idx in range(len(col_boundaries)):
|
|
|
@@ -1003,6 +1007,59 @@ class TableCellMatcher:
|
|
|
# partial_ratio: 子串模糊匹配,解决 OCR 识别错误
|
|
|
partial_sim = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
|
|
|
|
|
|
+ # 🛡️ 增强版防御:防止“短文本”误匹配“长文本”
|
|
|
+ if partial_sim > 80:
|
|
|
+ len_cell = len(cell_text_normalized)
|
|
|
+ len_box = len(merged_text_normalized)
|
|
|
+
|
|
|
+ # 确定短方和长方
|
|
|
+ if len_cell < len_box:
|
|
|
+ len_short, len_long = len_cell, len_box
|
|
|
+ text_short = cell_text_normalized
|
|
|
+ text_long = merged_text_normalized
|
|
|
+ else:
|
|
|
+ len_short, len_long = len_box, len_cell
|
|
|
+ text_short = merged_text_normalized
|
|
|
+ text_long = cell_text_normalized
|
|
|
+
|
|
|
+ # 🎯 修正:检测有效内容 (字母、数字、汉字)
|
|
|
+ # 使用 Unicode 范围匹配汉字: \u4e00-\u9fa5
|
|
|
+ import re
|
|
|
+ def has_valid_content(text):
|
|
|
+ return bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', text))
|
|
|
+
|
|
|
+ short_has_content = has_valid_content(text_short)
|
|
|
+ long_has_content = has_valid_content(text_long)
|
|
|
+
|
|
|
+ # 🛑 拒绝条件 1: 短方是纯符号 (无有效内容),且长方有内容
|
|
|
+ # 例如: Cell="-" vs Box="-200" (拦截)
|
|
|
+ # 例如: Cell="中国银行" vs Box="中国银行储蓄卡" (不拦截,因为都有汉字)
|
|
|
+ if not short_has_content and long_has_content:
|
|
|
+ # 允许例外:如果长方也很短 (比如 Cell="-" Box="- "),可能只是多了个空格,不拦截
|
|
|
+ if len_long > len_short + 2:
|
|
|
+ print(f" ⚠️ 拒绝纯符号部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
|
|
|
+ partial_sim = 0.0
|
|
|
+
|
|
|
+ # 🛑 拒绝条件 2: 短方虽然有内容,但太短了 (信息量不足)
|
|
|
+ elif short_has_content:
|
|
|
+ # 如果短方只有 1 个字符,且长方超过 3 个字符 -> 拒绝
|
|
|
+ if len_short == 1 and len_long > 3:
|
|
|
+ print(f" ⚠️ 拒绝单字符部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
|
|
|
+ partial_sim = 0.0
|
|
|
+ # 如果短方只有 2 个字符,且长方超过 8 个字符 -> 拒绝
|
|
|
+ elif len_short == 2 and len_long > 8:
|
|
|
+ print(f" ⚠️ 拒绝微小碎片部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
|
|
|
+ partial_sim = 0.0
|
|
|
+
|
|
|
+ # 🆕 新增条件 3: 覆盖率过低 (防止 "2024" 匹配 "ID2024...")
|
|
|
+ # 场景: Cell 是长文本, Box 是短文本, 恰好包含在 Cell 中
|
|
|
+ # 逻辑: 如果覆盖率 < 30% 且 整体相似度(token_sort) < 45,说明 Box 缺失了 Cell 的绝大部分内容
|
|
|
+ else:
|
|
|
+ coverage = len_short / len_long if len_long > 0 else 0
|
|
|
+ if coverage < 0.3 and token_sort_sim < 45:
|
|
|
+ print(f" ⚠️ 拒绝低覆盖率部分匹配: '{text_short}' in '{text_long}' (cov={coverage:.2f})")
|
|
|
+ partial_sim = 0.0
|
|
|
+
|
|
|
# 🎯 新增:token_set_ratio (集合匹配)
|
|
|
# 专门解决:目标文本被 OCR 文本中的噪音隔开的情况
|
|
|
# 例如 Target="A B", OCR="A noise B" -> token_set_ratio 会很高
|
|
|
@@ -1042,12 +1099,17 @@ class TableCellMatcher:
|
|
|
|
|
|
# 1. 长度差异过大 (Box 比 Cell 长很多)
|
|
|
if len_box > len_cell * 1.5:
|
|
|
- # 2. 且 Cell 是数字/日期/时间类型 (容易在长ID中误配)
|
|
|
+ # 2. 且 Cell 是数字/日期/时间类型
|
|
|
import re
|
|
|
- # 匹配纯数字、日期时间格式
|
|
|
if re.match(r'^[\d\-\:\.\s]+$', cell_text_normalized):
|
|
|
- print(f" ⚠️ 拒绝子序列匹配: 长度差异大且为数字类型 (sim={subseq_sim})")
|
|
|
- subseq_sim = 0.0
|
|
|
+ # 🧠 智能豁免:如果 Cell 本身很长 (>= 12 字符),说明是长ID
|
|
|
+ # 长ID即使夹杂了噪音 (如 "ID...日期...文字"),只要子序列匹配高,通常也是对的
|
|
|
+ # 只有短文本 (如 "2024") 才需要严格防御
|
|
|
+ if len_cell < 12:
|
|
|
+ print(f" ⚠️ 拒绝子序列匹配: 长度差异大且为短数字类型 (sim={subseq_sim})")
|
|
|
+ subseq_sim = 0.0
|
|
|
+ else:
|
|
|
+ print(f" ✅ 接受长ID子序列匹配: 尽管长度差异大,但特征显著 (len={len_cell})")
|
|
|
|
|
|
if subseq_sim > 90:
|
|
|
print(f" 🔗 子序列匹配生效: '{cell_text[:10]}...' (sim={subseq_sim:.1f})")
|