Browse Source

feat: 添加倾斜校正阈值和增强的部分匹配防御逻辑,优化表格单元格匹配

zhch158_admin 11 hours ago
parent
commit
253b8b6635
1 changed file with 71 additions and 9 deletions
  1. 71 9
      merger/table_cell_matcher.py

+ 71 - 9
merger/table_cell_matcher.py

@@ -23,7 +23,8 @@ class TableCellMatcher:
     
     def __init__(self, text_matcher: TextMatcher, 
                  x_tolerance: int = 3, 
-                 y_tolerance: int = 10):
+                 y_tolerance: int = 10,
+                 inclination_threshold: float = 0.3):
         """
         Args:
             text_matcher: 文本匹配器
@@ -33,6 +34,7 @@ class TableCellMatcher:
         self.text_matcher = text_matcher
         self.x_tolerance = x_tolerance
         self.y_tolerance = y_tolerance
+        self.inclination_threshold = inclination_threshold  # 倾斜校正阈值(度数)
     
     def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
                                   start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
@@ -72,7 +74,8 @@ class TableCellMatcher:
         grouped_boxes = self._group_paddle_boxes_by_rows(
             table_region_boxes,
             y_tolerance=self.y_tolerance,
-            auto_correct_skew=True
+            auto_correct_skew=True,
+            inclination_threshold=self.inclination_threshold
         )
         
         # 🔑 第三步:在每组内按 x 坐标排序
@@ -501,7 +504,8 @@ class TableCellMatcher:
 
     def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict], 
                                     y_tolerance: int = 10,
-                                    auto_correct_skew: bool = True) -> List[Dict]:
+                                    auto_correct_skew: bool = True,
+                                    inclination_threshold: float = 0.3) -> List[Dict]:
         """
         将 paddle_text_boxes 按 y 坐标分组(聚类)- 增强版本
     
@@ -520,7 +524,7 @@ class TableCellMatcher:
         if auto_correct_skew:
             rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
             
-            if abs(rotation_angle) > 0.5:
+            if abs(rotation_angle) > inclination_threshold:
                 max_x = max(box['bbox'][2] for box in paddle_boxes)
                 max_y = max(box['bbox'][3] for box in paddle_boxes)
                 image_size = (max_x, max_y)
@@ -968,7 +972,7 @@ class TableCellMatcher:
                 return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
         
         # 🔑 策略 2: 多个 boxes 合并匹配
-        unused_boxes = [b for b in boxes if not b.get('used')]
+        unused_boxes = [b for b in boxes[first_unused_idx:] if not b.get('used')]
         # 合并同列的 boxes
         merged_bboxes = []
         for col_idx in range(len(col_boundaries)):
@@ -1003,6 +1007,59 @@ class TableCellMatcher:
             # partial_ratio: 子串模糊匹配,解决 OCR 识别错误
             partial_sim = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
             
+            # 🛡️ 增强版防御:防止“短文本”误匹配“长文本”
+            if partial_sim > 80:
+                len_cell = len(cell_text_normalized)
+                len_box = len(merged_text_normalized)
+                
+                # 确定短方和长方
+                if len_cell < len_box:
+                    len_short, len_long = len_cell, len_box
+                    text_short = cell_text_normalized
+                    text_long = merged_text_normalized
+                else:
+                    len_short, len_long = len_box, len_cell
+                    text_short = merged_text_normalized
+                    text_long = cell_text_normalized
+                
+                # 🎯 修正:检测有效内容 (字母、数字、汉字)
+                # 使用 Unicode 范围匹配汉字: \u4e00-\u9fa5
+                import re
+                def has_valid_content(text):
+                    return bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', text))
+
+                short_has_content = has_valid_content(text_short)
+                long_has_content = has_valid_content(text_long)
+                
+                # 🛑 拒绝条件 1: 短方是纯符号 (无有效内容),且长方有内容
+                # 例如: Cell="-" vs Box="-200" (拦截)
+                # 例如: Cell="中国银行" vs Box="中国银行储蓄卡" (不拦截,因为都有汉字)
+                if not short_has_content and long_has_content:
+                     # 允许例外:如果长方也很短 (比如 Cell="-" Box="- "),可能只是多了个空格,不拦截
+                     if len_long > len_short + 2:
+                        print(f"         ⚠️ 拒绝纯符号部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
+                        partial_sim = 0.0
+
+                # 🛑 拒绝条件 2: 短方虽然有内容,但太短了 (信息量不足)
+                elif short_has_content:
+                    # 如果短方只有 1 个字符,且长方超过 3 个字符 -> 拒绝
+                    if len_short == 1 and len_long > 3:
+                        print(f"         ⚠️ 拒绝单字符部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
+                        partial_sim = 0.0
+                    # 如果短方只有 2 个字符,且长方超过 8 个字符 -> 拒绝
+                    elif len_short == 2 and len_long > 8:
+                        print(f"         ⚠️ 拒绝微小碎片部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
+                        partial_sim = 0.0
+
+                    # 🆕 新增条件 3: 覆盖率过低 (防止 "2024" 匹配 "ID2024...")
+                    # 场景: Cell 是长文本, Box 是短文本, 恰好包含在 Cell 中
+                    # 逻辑: 如果覆盖率 < 30% 且 整体相似度(token_sort) < 45,说明 Box 缺失了 Cell 的绝大部分内容
+                    else:
+                        coverage = len_short / len_long if len_long > 0 else 0
+                        if coverage < 0.3 and token_sort_sim < 45:
+                             print(f"         ⚠️ 拒绝低覆盖率部分匹配: '{text_short}' in '{text_long}' (cov={coverage:.2f})")
+                             partial_sim = 0.0
+
             # 🎯 新增:token_set_ratio (集合匹配)
             # 专门解决:目标文本被 OCR 文本中的噪音隔开的情况
             # 例如 Target="A B", OCR="A noise B" -> token_set_ratio 会很高
@@ -1042,12 +1099,17 @@ class TableCellMatcher:
                     
                     # 1. 长度差异过大 (Box 比 Cell 长很多)
                     if len_box > len_cell * 1.5:
-                        # 2. 且 Cell 是数字/日期/时间类型 (容易在长ID中误配)
+                        # 2. 且 Cell 是数字/日期/时间类型
                         import re
-                        # 匹配纯数字、日期时间格式
                         if re.match(r'^[\d\-\:\.\s]+$', cell_text_normalized):
-                            print(f"         ⚠️ 拒绝子序列匹配: 长度差异大且为数字类型 (sim={subseq_sim})")
-                            subseq_sim = 0.0
+                            # 🧠 智能豁免:如果 Cell 本身很长 (例如 > 12字符),说明是长ID
+                            # 长ID即使夹杂了噪音 (如 "ID...日期...文字"),只要子序列匹配高,通常也是对的
+                            # 只有短文本 (如 "2024") 才需要严格防御
+                            if len_cell < 12:
+                                print(f"         ⚠️ 拒绝子序列匹配: 长度差异大且为短数字类型 (sim={subseq_sim})")
+                                subseq_sim = 0.0
+                            else:
+                                print(f"         ✅ 接受长ID子序列匹配: 尽管长度差异大,但特征显著 (len={len_cell})")
 
                 if subseq_sim > 90:
                     print(f"         🔗 子序列匹配生效: '{cell_text[:10]}...' (sim={subseq_sim:.1f})")