| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- """
- 文本匹配工具模块
- 负责文本标准化、相似度计算等
- """
- import re
- from typing import Optional, List, Dict
- from fuzzywuzzy import fuzz
class TextMatcher:
    """Locate the text box that corresponds to a target string.

    Text is compared after normalization (whitespace removed, full-width
    characters folded to half-width, lower-cased). A box matches when its
    normalized text equals the target or is judged similar by
    ``fuzz.partial_ratio``.

    Each text box is a dict that must provide a ``'text'`` string and a
    ``'used'`` flag. This class only reads the flag; it never sets it.
    """

    # Normalized texts shorter than this are never fuzzy-matched.
    _MIN_TEXT_LENGTH = 2
    # Minimum shorter/longer length ratio for two texts to be compared at all.
    _MIN_LENGTH_RATIO = 0.3
    # Score bonus applied when one text is contained in the other.
    _SUBSTRING_BONUS = 10
    # Compiled once instead of on every normalize_text() call.
    _WHITESPACE_RE = re.compile(r'\s+')
    # Translation table: ideographic space -> ASCII space, and the full-width
    # block U+FF01..U+FF5E -> ASCII U+0021..U+007E (fixed offset 0xFEE0).
    _FULL_TO_HALF_TABLE = {
        0x3000: 0x0020,
        **{code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)},
    }

    def __init__(self, similarity_threshold: int = 90):
        """
        Args:
            similarity_threshold: Minimum score (``fuzz.partial_ratio`` plus
                the optional substring bonus) for two texts to count as
                similar.
        """
        self.similarity_threshold = similarity_threshold

    def normalize_text(self, text: str) -> str:
        """Return ``text`` with whitespace removed, width folded, lower-cased.

        Note that punctuation is kept; only full-width punctuation is folded
        to its half-width form.
        """
        # Strip every run of whitespace first, then fold full-width forms.
        stripped = self._WHITESPACE_RE.sub('', text)
        return self._full_to_half(stripped).lower()

    def _full_to_half(self, text: str) -> str:
        """Convert full-width characters in ``text`` to half-width ones.

        U+3000 becomes an ASCII space and U+FF01..U+FF5E are shifted down by
        0xFEE0; every other character is left untouched.
        """
        return text.translate(self._FULL_TO_HALF_TABLE)

    def find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                           start_index: int, last_match_index: int,
                           look_ahead_window: int = 10) -> tuple[Optional[Dict], int, int]:
        """Find the first unused text box that matches ``target_text``.

        The search runs from the position chosen by ``_find_search_start``
        (which may lie before ``start_index``) up to
        ``start_index + look_ahead_window``, clipped to the list length.

        Args:
            target_text: Text to look for (normalized internally).
            text_boxes: Boxes to search; each needs ``'text'`` and ``'used'``.
            start_index: Current pointer into ``text_boxes``.
            last_match_index: Index of the previous successful match.
            look_ahead_window: How many boxes past ``start_index`` to scan;
                also bounds how many unused boxes are revisited backwards.

        Returns:
            ``(box, new_pointer, new_last_match_index)`` on success, or
            ``(None, start_index, last_match_index)`` when nothing matches or
            the normalized target is shorter than two characters.
        """
        target = self.normalize_text(target_text)
        no_match = (None, start_index, last_match_index)

        # Very short targets are too ambiguous to match reliably.
        if len(target) < self._MIN_TEXT_LENGTH:
            return no_match

        search_start = self._find_search_start(
            text_boxes, last_match_index, start_index, look_ahead_window
        )
        search_end = min(start_index + look_ahead_window, len(text_boxes))

        # NOTE(review): the first acceptable box wins; a later exact match in
        # the window does not override an earlier fuzzy match.
        for i in range(search_start, search_end):
            box = text_boxes[i]
            if box['used']:
                continue

            candidate = self.normalize_text(box['text'])

            # An exact match short-circuits all further checks.
            if candidate == target:
                return self._return_match(box, i, start_index)

            # Skip candidates that are too short or too different in length
            # before paying for the fuzzy comparison.
            if len(candidate) < self._MIN_TEXT_LENGTH:
                continue
            if not self._check_length_ratio(target, candidate):
                continue

            if self._is_similar(target, candidate):
                return self._return_match(box, i, start_index)

        return no_match

    def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
                           start_index: int, look_ahead_window: int) -> int:
        """Choose the index at which the search should begin.

        Walks backwards from just before ``last_match_index`` counting unused
        boxes and stops at the ``look_ahead_window``-th one. If the walk runs
        off the front of the list, the start is instead the first unused box
        at or after index 0, but no later than ``start_index``.

        Indices beyond the end of ``text_boxes`` are clamped to its length so
        that stale pointers cannot raise ``IndexError``.
        """
        box_count = len(text_boxes)
        search_start = min(last_match_index, box_count) - 1
        unused_count = 0

        while search_start >= 0:
            if not text_boxes[search_start]['used']:
                unused_count += 1
                if unused_count >= look_ahead_window:
                    break
            search_start -= 1

        if search_start < 0:
            # Ran past the beginning: skip forward over already-used boxes.
            search_start = 0
            forward_limit = min(start_index, box_count)
            while search_start < forward_limit and text_boxes[search_start]['used']:
                search_start += 1

        return search_start

    def _check_length_ratio(self, text1: str, text2: str) -> bool:
        """Return True when the shorter text is at least 30% of the longer.

        Two empty strings return False instead of dividing by zero.
        """
        shorter_len, longer_len = sorted((len(text1), len(text2)))
        if longer_len == 0:
            return False
        return shorter_len / longer_len >= self._MIN_LENGTH_RATIO

    def _is_similar(self, text1: str, text2: str) -> bool:
        """Return True when the two texts score at or above the threshold.

        The score is ``fuzz.partial_ratio(text1, text2)``, increased by 10
        when the shorter text is a substring of the longer one.
        """
        if len(text1) < len(text2):
            shorter, longer = text1, text2
        else:
            shorter, longer = text2, text1

        score = fuzz.partial_ratio(text1, text2)
        if shorter in longer:
            score += self._SUBSTRING_BONUS

        return score >= self.similarity_threshold

    def _return_match(self, text_box: Dict, index: int,
                      start_index: int) -> tuple[Dict, int, int]:
        """Build the success tuple ``(box, new_pointer, matched_index)``.

        The pointer advances to ``index + 1`` for a match at or after
        ``start_index``; a match found before it leaves the pointer unchanged.
        """
        new_pointer = max(index + 1, start_index)
        return text_box, new_pointer, index
|