"""Text matching utilities.

Provides text normalization and fuzzy similarity matching between a target
string and a list of text boxes.
"""

import difflib
import re
from typing import Dict, List, Optional

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # fuzzywuzzy is treated as optional: when it is missing, similarity
    # scoring falls back to the difflib-based scorer defined below.
    fuzz = None

# Compiled once; matches any run of (Unicode) whitespace.
_WHITESPACE_RE = re.compile(r'\s+')


def _fallback_partial_ratio(s1: str, s2: str) -> int:
    """Return a 0-100 partial similarity score using only ``difflib``.

    The shorter string is compared against windows of the longer string that
    are aligned on their common matching blocks, and the best window score is
    returned.  Results may differ slightly from ``fuzz.partial_ratio``.
    """
    if s1 is None or s2 is None:
        return 0
    if s1 == s2:
        return 100
    if not s1 or not s2:
        return 0

    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    window = len(shorter)
    best = 0.0
    blocks = difflib.SequenceMatcher(None, shorter, longer).get_matching_blocks()
    for short_pos, long_pos, _size in blocks:
        # Align the window so the matching block lines up in both strings.
        start = max(long_pos - short_pos, 0)
        candidate = longer[start:start + window]
        ratio = difflib.SequenceMatcher(None, shorter, candidate).ratio()
        if ratio > 0.995:
            return 100
        best = max(best, ratio)
    return int(round(100 * best))


def _partial_ratio(s1: str, s2: str) -> int:
    """Score two strings with fuzzywuzzy when available, else the fallback."""
    if fuzz is not None:
        return fuzz.partial_ratio(s1, s2)
    return _fallback_partial_ratio(s1, s2)


class TextMatcher:
    """Matches a target text against a list of text boxes."""

    # Translation table: ideographic space -> ASCII space, and the full-width
    # range U+FF01..U+FF5E -> its half-width counterpart (offset 0xFEE0).
    _FULL_TO_HALF_TABLE = {
        0x3000: 0x0020,
        **{code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)},
    }

    def __init__(self, similarity_threshold: int = 90):
        """
        Args:
            similarity_threshold: Minimum similarity score for two texts to
                be considered a match.
        """
        self.similarity_threshold = similarity_threshold

    def normalize_text(self, text: str) -> str:
        """Normalize text for comparison.

        Removes all whitespace, converts full-width characters to half-width
        and lower-cases the result.  Punctuation is kept.  ``None`` or empty
        input yields an empty string.
        """
        if not text:
            return ''
        # Strip every whitespace character (including the ideographic space).
        text = _WHITESPACE_RE.sub('', text)
        # Convert full-width digits, letters and symbols to half-width.
        text = self._full_to_half(text)
        return text.lower()

    def _full_to_half(self, text: str) -> str:
        """Convert full-width characters in ``text`` to half-width."""
        return text.translate(self._FULL_TO_HALF_TABLE)

    def find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                           start_index: int, last_match_index: int,
                           look_ahead_window: int = 10
                           ) -> tuple[Optional[Dict], int, int]:
        """Find the first text box that matches ``target_text``.

        Boxes are scanned in list order and the first acceptable candidate is
        returned; for each candidate an exact match (after normalization) is
        tried before the fuzzy comparison.

        Args:
            target_text: Text to look for.
            text_boxes: List of box dicts; each is read through its ``'text'``
                and ``'used'`` keys.
            start_index: Current pointer position in ``text_boxes``.
            last_match_index: Index of the previous successful match.
            look_ahead_window: Number of boxes after ``start_index`` to
                examine; also the number of unused boxes before
                ``last_match_index`` that are re-examined.

        Returns:
            ``(matched box or None, new pointer position, last_match_index)``.
            When nothing matches, ``start_index`` and ``last_match_index``
            are returned unchanged.
        """
        target_text = self.normalize_text(target_text)

        # Targets that are too short are not matched at all.
        if len(target_text) < 2:
            return None, start_index, last_match_index

        # Determine the search range.
        search_start = self._find_search_start(
            text_boxes, last_match_index, start_index, look_ahead_window
        )
        search_end = min(start_index + look_ahead_window, len(text_boxes))

        for i in range(search_start, search_end):
            candidate = text_boxes[i]
            if candidate['used']:
                continue

            box_text = self.normalize_text(candidate['text'])

            # Exact match takes priority over the fuzzy comparison.
            if target_text == box_text:
                return self._return_match(candidate, i, start_index)

            # Skip candidates that are too short for fuzzy matching.
            if len(box_text) < 2:
                continue

            # Skip candidates whose length differs too much from the target.
            if not self._check_length_ratio(target_text, box_text):
                continue

            if self._is_similar(target_text, box_text):
                return self._return_match(candidate, i, start_index)

        return None, start_index, last_match_index

    def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
                           start_index: int, look_ahead_window: int) -> int:
        """Determine the index at which the search should begin.

        Walks backwards from the box just before ``last_match_index`` until
        ``look_ahead_window`` unused boxes have been seen or the start of the
        list is reached, then skips forward over used boxes without going
        past ``start_index``.
        """
        box_count = len(text_boxes)

        # Clamp so that an out-of-range last_match_index cannot index past
        # the end of the list.
        search_start = min(last_match_index - 1, box_count - 1)
        unused_count = 0

        while search_start >= 0:
            if not text_boxes[search_start]['used']:
                unused_count += 1
                if unused_count >= look_ahead_window:
                    break
            search_start -= 1

        if search_start < 0:
            search_start = 0

        # Never step beyond the list, even if start_index is out of range.
        forward_limit = min(start_index, box_count)
        while search_start < forward_limit and text_boxes[search_start]['used']:
            search_start += 1

        return search_start

    def _check_length_ratio(self, text1: str, text2: str) -> bool:
        """Return True if the shorter/longer length ratio is at least 0.3."""
        longest = max(len(text1), len(text2))
        if longest == 0:
            # Both strings are empty: equal lengths, and no division by zero.
            return True
        return min(len(text1), len(text2)) / longest >= 0.3

    def _is_similar(self, text1: str, text2: str) -> bool:
        """Return True if the two texts reach the similarity threshold."""
        # Substring check.
        if len(text1) < len(text2):
            shorter, longer = text1, text2
        else:
            shorter, longer = text2, text1
        is_substring = shorter in longer

        score = _partial_ratio(text1, text2)
        if is_substring:
            score += 10  # boost the score when one text contains the other

        return score >= self.similarity_threshold

    def _return_match(self, text_box: Dict, index: int,
                      start_index: int) -> tuple:
        """Build the result tuple for a successful match.

        The pointer advances to ``index + 1`` when the match is at or after
        ``start_index``; for a match before it, the pointer stays at
        ``start_index``.
        """
        if index >= start_index:
            return text_box, index + 1, index
        return text_box, start_index, index