|
|
@@ -59,12 +59,20 @@ class TextMatcher:
|
|
|
if len(target_text) < 2:
|
|
|
return None, start_index, last_match_index
|
|
|
|
|
|
- # 确定搜索范围
|
|
|
+ # 由于MinerU和Paddle的顺序基本一致,但也有不一致的地方,所以需要向前找第一个未使用的位置
|
|
|
+ # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
|
|
|
+ # 匹配时会遇到一些特殊情况,比如Paddle把两个连着的cell识别为一个字符串,MinerU将单元格上下2行识别为一行
|
|
|
+ # '1|2024-08-11|扫二维码付' minerU识别为“扫二维码付款”,Paddle识别为'12024-08-11扫二维码付'
|
|
|
+ # 款
|
|
|
+ # 字符串的顺序极大概率是一致的,所以如果短字符串是长字符串的子串,可以增加相似权重
|
|
|
search_start = self._find_search_start(
|
|
|
text_boxes, last_match_index, start_index, look_ahead_window
|
|
|
)
|
|
|
search_end = min(start_index + look_ahead_window, len(text_boxes))
|
|
|
|
|
|
+ best_match = None
|
|
|
+ best_index = start_index
|
|
|
+
|
|
|
# 在搜索范围内查找最佳匹配
|
|
|
for i in range(search_start, search_end):
|
|
|
if text_boxes[i]['used']:
|
|
|
@@ -74,22 +82,40 @@ class TextMatcher:
|
|
|
|
|
|
# 精确匹配优先
|
|
|
if target_text == box_text:
|
|
|
- return self._return_match(text_boxes[i], i, start_index)
|
|
|
+ if i >= start_index:
|
|
|
+ return text_boxes[i], i + 1, i
|
|
|
+ else:
|
|
|
+ return text_boxes[i], start_index, i
|
|
|
|
|
|
- # 过滤过短的候选文本
|
|
|
+ # 过滤过短的候选文本(避免单字符匹配)
|
|
|
if len(box_text) < 2:
|
|
|
continue
|
|
|
|
|
|
- # 长度比例检查
|
|
|
- if not self._check_length_ratio(target_text, box_text):
|
|
|
+ # 长度比例检查 - 避免长度差异过大的匹配
|
|
|
+ length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
|
|
|
+ if length_ratio < 0.3: # 长度差异超过70%则跳过
|
|
|
continue
|
|
|
+
|
|
|
+ # 子串检查
|
|
|
+ shorter = target_text if len(target_text) < len(box_text) else box_text
|
|
|
+ longer = box_text if len(target_text) < len(box_text) else target_text
|
|
|
+ is_substring = shorter in longer
|
|
|
+
|
|
|
+ # 计算多种相似度
|
|
|
+ # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
|
|
|
+ partial_ratio = fuzz.partial_ratio(target_text, box_text)
|
|
|
+ if is_substring:
|
|
|
+ partial_ratio += 10 # 子串时提升相似度
|
|
|
|
|
|
- # 计算相似度
|
|
|
- if self._is_similar(target_text, box_text):
|
|
|
- return self._return_match(text_boxes[i], i, start_index)
|
|
|
-
|
|
|
- return None, start_index, last_match_index
|
|
|
-
|
|
|
+ # 相似度判定 - 仅使用partial_ratio(子串时已加分),达到阈值即视为匹配
|
|
|
+ if (partial_ratio >= self.similarity_threshold):
|
|
|
+ if i >= start_index:
|
|
|
+ return text_boxes[i], i + 1, last_match_index
|
|
|
+ else:
|
|
|
+ return text_boxes[i], start_index, last_match_index
|
|
|
+
|
|
|
+ return best_match, best_index, last_match_index
|
|
|
+
|
|
|
def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
|
|
|
start_index: int, look_ahead_window: int) -> int:
|
|
|
"""确定搜索起始位置"""
|
|
|
@@ -109,29 +135,4 @@ class TextMatcher:
|
|
|
search_start += 1
|
|
|
|
|
|
return search_start
|
|
|
-
|
|
|
- def _check_length_ratio(self, text1: str, text2: str) -> bool:
|
|
|
- """检查长度比例"""
|
|
|
- length_ratio = min(len(text1), len(text2)) / max(len(text1), len(text2))
|
|
|
- return length_ratio >= 0.3
|
|
|
-
|
|
|
- def _is_similar(self, text1: str, text2: str) -> bool:
|
|
|
- """判断两个文本是否相似"""
|
|
|
- # 子串检查
|
|
|
- shorter = text1 if len(text1) < len(text2) else text2
|
|
|
- longer = text2 if len(text1) < len(text2) else text1
|
|
|
- is_substring = shorter in longer
|
|
|
-
|
|
|
- # 计算相似度
|
|
|
- partial_ratio = fuzz.partial_ratio(text1, text2)
|
|
|
- if is_substring:
|
|
|
- partial_ratio += 10 # 子串时提升相似度
|
|
|
-
|
|
|
- return partial_ratio >= self.similarity_threshold
|
|
|
-
|
|
|
- def _return_match(self, text_box: Dict, index: int, start_index: int) -> tuple:
|
|
|
- """返回匹配结果"""
|
|
|
- if index >= start_index:
|
|
|
- return text_box, index + 1, index
|
|
|
- else:
|
|
|
- return text_box, start_index, index
|
|
|
+
|