""" 文本匹配工具模块 负责文本标准化、相似度计算等 """ import re from typing import Optional, List, Dict from fuzzywuzzy import fuzz class TextMatcher: """文本匹配器""" def __init__(self, similarity_threshold: int = 90): """ Args: similarity_threshold: 文本相似度阈值 """ self.similarity_threshold = similarity_threshold def normalize_text(self, text: str) -> str: """标准化文本(去除空格、标点等)""" # 移除所有空白字符 text = re.sub(r'\s+', '', text) # 转换全角数字和字母为半角 text = self._full_to_half(text) return text.lower() def _full_to_half(self, text: str) -> str: """全角转半角""" result = [] for char in text: code = ord(char) if code == 0x3000: # 全角空格 code = 0x0020 elif 0xFF01 <= code <= 0xFF5E: # 全角字符 code -= 0xFEE0 result.append(chr(code)) return ''.join(result) def find_matching_bbox(self, target_text: str, text_boxes: List[Dict], start_index: int, last_match_index: int, look_ahead_window: int = 10) -> tuple[Optional[Dict], int, int]: """ 查找匹配的文字框 Args: target_text: 目标文本 text_boxes: 文字框列表 start_index: 起始索引 last_match_index: 上次匹配成功的索引 look_ahead_window: 向前查找窗口 Returns: (匹配的文字框信息, 新的指针位置, last_match_index) """ target_text = self.normalize_text(target_text) # 过滤过短的目标文本 if len(target_text) < 2: return None, start_index, last_match_index # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置 # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配 # 匹配时会遇到一些特殊情况,比如Paddle把两个连着的cell识别为一个字符串,MinerU将单元格上下2行识别为一行 # '1|2024-08-11|扫二维码付' minerU识别为“扫二维码付款”,Paddle识别为'12024-08-11扫二维码付' # 款 # 字符串的顺序极大概率是一致的,所以如果短字符串是长字符串的子串,可以增加相似权重 search_start = self._find_search_start( text_boxes, last_match_index, start_index, look_ahead_window ) search_end = min(start_index + look_ahead_window, len(text_boxes)) best_match = None best_index = start_index # 在搜索范围内查找最佳匹配 for i in range(search_start, search_end): if text_boxes[i]['used']: continue box_text = self.normalize_text(text_boxes[i]['text']) # 精确匹配优先 if target_text == box_text: if i >= start_index: return text_boxes[i], i + 1, i else: return text_boxes[i], start_index, i # 过滤过短的候选文本(避免单字符匹配) if len(box_text) < 2: continue # 长度比例检查 - 避免长度差异过大的匹配 length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text)) if length_ratio < 0.3: # 长度差异超过70%则跳过 continue # 子串检查 shorter = target_text if len(target_text) < len(box_text) else box_text longer = box_text if len(target_text) < len(box_text) else target_text is_substring = shorter in longer # 计算多种相似度 # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text) partial_ratio = fuzz.partial_ratio(target_text, box_text) if is_substring: partial_ratio += 10 # 子串时提升相似度 # 综合相似度 - 两种算法都要达到阈值 if (partial_ratio >= self.similarity_threshold): if i >= start_index: return text_boxes[i], i + 1, last_match_index else: return text_boxes[i], start_index, last_match_index return best_match, best_index, last_match_index def _find_search_start(self, text_boxes: List[Dict], last_match_index: int, start_index: int, look_ahead_window: int) -> int: """确定搜索起始位置""" search_start = last_match_index - 1 unused_count = 0 while search_start >= 0: if not text_boxes[search_start]['used']: unused_count += 1 if unused_count >= look_ahead_window: break search_start -= 1 if search_start < 0: search_start = 0 while search_start < start_index and text_boxes[search_start]['used']: search_start += 1 return search_start