12 hours ago · 253b8b6635
--- a/merger/table_cell_matcher.py
+++ b/merger/table_cell_matcher.py
@@ -23,7 +23,8 @@ class TableCellMatcher:
 
				     
			
 
				     def __init__(self, text_matcher: TextMatcher, 
			
 
				                  x_tolerance: int = 3, 
			
 
				-                 y_tolerance: int = 10):
			
 
				+                 y_tolerance: int = 10,
			
 
				+                 inclination_threshold: float = 0.3):
			
 
				         """
			
 
				         Args:
			
 
				             text_matcher: 文本匹配器
			
@@ -33,6 +34,7 @@ class TableCellMatcher:
 
				         self.text_matcher = text_matcher
			
 
				         self.x_tolerance = x_tolerance
			
 
				         self.y_tolerance = y_tolerance
			
 
				+        self.inclination_threshold = inclination_threshold  # 倾斜校正阈值（度数）
			
 
				     
			
 
				     def enhance_table_html_with_bbox(self, html: str, paddle_text_boxes: List[Dict],
			
 
				                                   start_pointer: int, table_bbox: Optional[List[int]] = None) -> Tuple[str, List[Dict], int]:
			
@@ -72,7 +74,8 @@ class TableCellMatcher:
 
				         grouped_boxes = self._group_paddle_boxes_by_rows(
			
 
				             table_region_boxes,
			
 
				             y_tolerance=self.y_tolerance,
			
 
				-            auto_correct_skew=True
			
 
				+            auto_correct_skew=True,
			
 
				+            inclination_threshold=self.inclination_threshold
			
 
				         )
			
 
				         
			
 
				         # 🔑 第三步：在每组内按 x 坐标排序
			
@@ -501,7 +504,8 @@ class TableCellMatcher:
 
				 
			
 
				     def _group_paddle_boxes_by_rows(self, paddle_boxes: List[Dict], 
			
 
				                                     y_tolerance: int = 10,
			
 
				-                                    auto_correct_skew: bool = True) -> List[Dict]:
			
 
				+                                    auto_correct_skew: bool = True,
			
 
				+                                    inclination_threshold: float = 0.3) -> List[Dict]:
			
 
				         """
			
 
				         将 paddle_text_boxes 按 y 坐标分组（聚类）- 增强版本
			
 
				     
			
@@ -520,7 +524,7 @@ class TableCellMatcher:
 
				         if auto_correct_skew:
			
 
				             rotation_angle = BBoxExtractor.calculate_skew_angle(paddle_boxes)
			
 
				             
			
 
				-            if abs(rotation_angle) > 0.5:
			
 
				+            if abs(rotation_angle) > inclination_threshold:
			
 
				                 max_x = max(box['bbox'][2] for box in paddle_boxes)
			
 
				                 max_y = max(box['bbox'][3] for box in paddle_boxes)
			
 
				                 image_size = (max_x, max_y)
			
@@ -968,7 +972,7 @@ class TableCellMatcher:
 
				                 return self._build_match_result([box], box['text'], 100.0, boxes.index(box))
			
 
				         
			
 
				         # 🔑 策略 2: 多个 boxes 合并匹配
			
 
				-        unused_boxes = [b for b in boxes if not b.get('used')]
			
 
				+        unused_boxes = [b for b in boxes[first_unused_idx:] if not b.get('used')]
			
 
				         # 合并同列的 boxes
			
 
				         merged_bboxes = []
			
 
				         for col_idx in range(len(col_boundaries)):
			
@@ -1003,6 +1007,59 @@ class TableCellMatcher:
 
				             # partial_ratio: 子串模糊匹配，解决 OCR 识别错误
			
 
				             partial_sim = fuzz.partial_ratio(cell_text_normalized, merged_text_normalized)
			
 
				             
			
 
				+            # 🛡️ 增强版防御：防止“短文本”误匹配“长文本”
			
 
				+            if partial_sim > 80:
			
 
				+                len_cell = len(cell_text_normalized)
			
 
				+                len_box = len(merged_text_normalized)
			
 
				+                
			
 
				+                # 确定短方和长方
			
 
				+                if len_cell < len_box:
			
 
				+                    len_short, len_long = len_cell, len_box
			
 
				+                    text_short = cell_text_normalized
			
 
				+                    text_long = merged_text_normalized
			
 
				+                else:
			
 
				+                    len_short, len_long = len_box, len_cell
			
 
				+                    text_short = merged_text_normalized
			
 
				+                    text_long = cell_text_normalized
			
 
				+                
			
 
				+                # 🎯 修正：检测有效内容 (字母、数字、汉字)
			
 
				+                # 使用 Unicode 范围匹配汉字: \u4e00-\u9fa5
			
 
				+                import re
			
 
				+                def has_valid_content(text):
			
 
				+                    return bool(re.search(r'[a-zA-Z0-9\u4e00-\u9fa5]', text))
			
 
				+
			
 
				+                short_has_content = has_valid_content(text_short)
			
 
				+                long_has_content = has_valid_content(text_long)
			
 
				+                
			
 
				+                # 🛑 拒绝条件 1: 短方是纯符号 (无有效内容)，且长方有内容
			
 
				+                # 例如: Cell="-" vs Box="-200" (拦截)
			
 
				+                # 例如: Cell="中国银行" vs Box="中国银行储蓄卡" (不拦截，因为都有汉字)
			
 
				+                if not short_has_content and long_has_content:
			
 
				+                     # 允许例外：如果长方也很短 (比如 Cell="-" Box="- ")，可能只是多了个空格，不拦截
			
 
				+                     if len_long > len_short + 2:
			
 
				+                        print(f"         ⚠️ 拒绝纯符号部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
			
 
				+                        partial_sim = 0.0
			
 
				+
			
 
				+                # 🛑 拒绝条件 2: 短方虽然有内容，但太短了 (信息量不足)
			
 
				+                elif short_has_content:
			
 
				+                    # 如果短方只有 1 个字符，且长方超过 3 个字符 -> 拒绝
			
 
				+                    if len_short == 1 and len_long > 3:
			
 
				+                        print(f"         ⚠️ 拒绝单字符部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
			
 
				+                        partial_sim = 0.0
			
 
				+                    # 如果短方只有 2 个字符，且长方超过 8 个字符 -> 拒绝
			
 
				+                    elif len_short == 2 and len_long > 8:
			
 
				+                        print(f"         ⚠️ 拒绝微小碎片部分匹配: '{cell_text}' vs '{merged_text_normalized}'")
			
 
				+                        partial_sim = 0.0
			
 
				+
			
 
				+                    # 🆕 新增条件 3: 覆盖率过低 (防止 "2024" 匹配 "ID2024...")
			
 
				+                    # 场景: Cell 是长文本, Box 是短文本, 恰好包含在 Cell 中
			
 
				+                    # 逻辑: 如果覆盖率 < 30% 且 整体相似度(token_sort) < 45，说明 Box 缺失了 Cell 的绝大部分内容
			
 
				+                    else:
			
 
				+                        coverage = len_short / len_long if len_long > 0 else 0
			
 
				+                        if coverage < 0.3 and token_sort_sim < 45:
			
 
				+                             print(f"         ⚠️ 拒绝低覆盖率部分匹配: '{text_short}' in '{text_long}' (cov={coverage:.2f})")
			
 
				+                             partial_sim = 0.0
			
 
				+
			
 
				             # 🎯 新增：token_set_ratio (集合匹配)
			
 
				             # 专门解决：目标文本被 OCR 文本中的噪音隔开的情况
			
 
				             # 例如 Target="A B", OCR="A noise B" -> token_set_ratio 会很高
			
@@ -1042,12 +1099,17 @@ class TableCellMatcher:
 
				                     
			
 
				                     # 1. 长度差异过大 (Box 比 Cell 长很多)
			
 
				                     if len_box > len_cell * 1.5:
			
 
				-                        # 2. 且 Cell 是数字/日期/时间类型 (容易在长ID中误配)
			
 
				+                        # 2. 且 Cell 是数字/日期/时间类型
			
 
				                         import re
			
 
				-                        # 匹配纯数字、日期时间格式
			
 
				                         if re.match(r'^[\d\-\:\.\s]+$', cell_text_normalized):
			
 
				-                            print(f"         ⚠️ 拒绝子序列匹配: 长度差异大且为数字类型 (sim={subseq_sim})")
			
 
				-                            subseq_sim = 0.0
			
 
				+                            # 🧠 智能豁免：如果 Cell 本身很长 (>= 12 字符)，说明是长ID
			
 
				+                            # 长ID即使夹杂了噪音 (如 "ID...日期...文字")，只要子序列匹配高，通常也是对的
			
 
				+                            # 只有短文本 (如 "2024") 才需要严格防御
			
 
				+                            if len_cell < 12:
			
 
				+                                print(f"         ⚠️ 拒绝子序列匹配: 长度差异大且为短数字类型 (sim={subseq_sim})")
			
 
				+                                subseq_sim = 0.0
			
 
				+                            else:
			
 
				+                                print(f"         ✅ 接受长ID子序列匹配: 尽管长度差异大，但特征显著 (len={len_cell})")
			
 
				 
			
 
				                 if subseq_sim > 90:
			
 
				                     print(f"         🔗 子序列匹配生效: '{cell_text[:10]}...' (sim={subseq_sim:.1f})")