Ver Fonte

feat: 优化文本匹配逻辑,增强相似度计算和匹配准确性

zhch158_admin há 4 semanas atrás
pai
commit
0ecea1078f
1 ficheiros alterados com 38 adições e 37 exclusões
  1. 38 37
      merger/text_matcher.py

+ 38 - 37
merger/text_matcher.py

@@ -59,12 +59,20 @@ class TextMatcher:
         if len(target_text) < 2:
             return None, start_index, last_match_index
 
-        # 确定搜索范围
+        # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
+        # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
+        # 匹配时会遇到一些特殊情况,比如Paddle把两个连着的cell识别为一个字符串,MinerU将单元格上下2行识别为一行
+        #   例: 单元格内容 '1|2024-08-11|扫二维码付' 且下一行为 '款' 时,
+        #       minerU识别为'扫二维码付款',Paddle识别为'12024-08-11扫二维码付'
+        # 字符串的顺序极大概率是一致的,所以如果短字符串是长字符串的子串,可以增加相似权重
         search_start = self._find_search_start(
             text_boxes, last_match_index, start_index, look_ahead_window
         )
         search_end = min(start_index + look_ahead_window, len(text_boxes))
         
+        best_match = None
+        best_index = start_index
+
         # 在搜索范围内查找最佳匹配
         for i in range(search_start, search_end):
             if text_boxes[i]['used']:
@@ -74,22 +82,40 @@ class TextMatcher:
             
             # 精确匹配优先
             if target_text == box_text:
-                return self._return_match(text_boxes[i], i, start_index)
+                if i >= start_index:
+                    return text_boxes[i], i + 1, i
+                else:
+                    return text_boxes[i], start_index, i
             
-            # 过滤过短的候选文本
+            # 过滤过短的候选文本(避免单字符匹配)
             if len(box_text) < 2:
                 continue
             
-            # 长度比例检查
-            if not self._check_length_ratio(target_text, box_text):
+            # 长度比例检查 - 避免长度差异过大的匹配
+            length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
+            if length_ratio < 0.3:  # 长度差异超过70%则跳过
                 continue
+
+            # 子串检查
+            shorter = target_text if len(target_text) < len(box_text) else box_text
+            longer = box_text if len(target_text) < len(box_text) else target_text
+            is_substring = shorter in longer        
+
+            # 计算多种相似度
+            # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
+            partial_ratio = fuzz.partial_ratio(target_text, box_text)
+            if is_substring:
+                partial_ratio += 10  # 子串时提升相似度
             
-            # 计算相似度
-            if self._is_similar(target_text, box_text):
-                return self._return_match(text_boxes[i], i, start_index)
-        
-        return None, start_index, last_match_index
-    
+            # 相似度判定 - 仅使用partial_ratio(子串时已加分),达到阈值即视为匹配
+            if (partial_ratio >= self.similarity_threshold):
+                if i >= start_index:
+                    return text_boxes[i], i + 1, last_match_index
+                else:
+                    return text_boxes[i], start_index, last_match_index
+
+        return best_match, best_index, last_match_index    
+
     def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
                           start_index: int, look_ahead_window: int) -> int:
         """确定搜索起始位置"""
@@ -109,29 +135,4 @@ class TextMatcher:
                 search_start += 1
         
         return search_start
-    
-    def _check_length_ratio(self, text1: str, text2: str) -> bool:
-        """检查长度比例"""
-        length_ratio = min(len(text1), len(text2)) / max(len(text1), len(text2))
-        return length_ratio >= 0.3
-    
-    def _is_similar(self, text1: str, text2: str) -> bool:
-        """判断两个文本是否相似"""
-        # 子串检查
-        shorter = text1 if len(text1) < len(text2) else text2
-        longer = text2 if len(text1) < len(text2) else text1
-        is_substring = shorter in longer
-        
-        # 计算相似度
-        partial_ratio = fuzz.partial_ratio(text1, text2)
-        if is_substring:
-            partial_ratio += 10  # 子串时提升相似度
-        
-        return partial_ratio >= self.similarity_threshold
-    
-    def _return_match(self, text_box: Dict, index: int, start_index: int) -> tuple:
-        """返回匹配结果"""
-        if index >= start_index:
-            return text_box, index + 1, index
-        else:
-            return text_box, start_index, index
+