text_matcher.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. """
  2. 文本匹配工具模块
  3. 负责文本标准化、相似度计算等
  4. """
  5. import re
  6. from typing import Optional, List, Dict
  7. from fuzzywuzzy import fuzz
  8. class TextMatcher:
  9. """文本匹配器"""
  10. def __init__(self, similarity_threshold: int = 90):
  11. """
  12. Args:
  13. similarity_threshold: 文本相似度阈值
  14. """
  15. self.similarity_threshold = similarity_threshold
  16. def normalize_text(self, text: str) -> str:
  17. """标准化文本(去除空格、标点等)"""
  18. # 移除所有空白字符
  19. text = re.sub(r'\s+', '', text)
  20. # 转换全角数字和字母为半角
  21. text = self._full_to_half(text)
  22. return text.lower()
  23. def _full_to_half(self, text: str) -> str:
  24. """全角转半角"""
  25. result = []
  26. for char in text:
  27. code = ord(char)
  28. if code == 0x3000: # 全角空格
  29. code = 0x0020
  30. elif 0xFF01 <= code <= 0xFF5E: # 全角字符
  31. code -= 0xFEE0
  32. result.append(chr(code))
  33. return ''.join(result)
  34. def find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
  35. start_index: int, last_match_index: int,
  36. look_ahead_window: int = 10) -> tuple[Optional[Dict], int, int]:
  37. """
  38. 查找匹配的文字框
  39. Args:
  40. target_text: 目标文本
  41. text_boxes: 文字框列表
  42. start_index: 起始索引
  43. last_match_index: 上次匹配成功的索引
  44. look_ahead_window: 向前查找窗口
  45. Returns:
  46. (匹配的文字框信息, 新的指针位置, last_match_index)
  47. """
  48. target_text = self.normalize_text(target_text)
  49. # 过滤过短的目标文本
  50. if len(target_text) < 2:
  51. return None, start_index, last_match_index
  52. # 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
  53. # MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
  54. # 匹配时会遇到一些特殊情况,比如Paddle把两个连着的cell识别为一个字符串,MinerU将单元格上下2行识别为一行
  55. # '1|2024-08-11|扫二维码付' minerU识别为“扫二维码付款”,Paddle识别为'12024-08-11扫二维码付'
  56. # 款
  57. # 字符串的顺序极大概率是一致的,所以如果短字符串是长字符串的子串,可以增加相似权重
  58. search_start = self._find_search_start(
  59. text_boxes, last_match_index, start_index, look_ahead_window
  60. )
  61. search_end = min(start_index + look_ahead_window, len(text_boxes))
  62. best_match = None
  63. best_index = start_index
  64. # 在搜索范围内查找最佳匹配
  65. for i in range(search_start, search_end):
  66. if text_boxes[i]['used']:
  67. continue
  68. box_text = self.normalize_text(text_boxes[i]['text'])
  69. # 精确匹配优先
  70. if target_text == box_text:
  71. if i >= start_index:
  72. return text_boxes[i], i + 1, i
  73. else:
  74. return text_boxes[i], start_index, i
  75. # 过滤过短的候选文本(避免单字符匹配)
  76. # if len(box_text) < 2:
  77. # continue
  78. # 长度比例检查 - 避免长度差异过大的匹配
  79. length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
  80. if length_ratio < 0.3: # 长度差异超过70%则跳过
  81. continue
  82. # 子串检查
  83. shorter = target_text if len(target_text) < len(box_text) else box_text
  84. longer = box_text if len(target_text) < len(box_text) else target_text
  85. is_substring = shorter in longer
  86. # 计算多种相似度
  87. # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
  88. partial_ratio = fuzz.partial_ratio(target_text, box_text)
  89. if is_substring:
  90. partial_ratio += 10 # 子串时提升相似度
  91. # 综合相似度 - 两种算法都要达到阈值
  92. if (partial_ratio >= self.similarity_threshold):
  93. if i >= start_index:
  94. return text_boxes[i], i + 1, last_match_index
  95. else:
  96. return text_boxes[i], start_index, last_match_index
  97. return best_match, best_index, last_match_index
  98. def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
  99. start_index: int, look_ahead_window: int) -> int:
  100. """确定搜索起始位置"""
  101. search_start = last_match_index - 1
  102. unused_count = 0
  103. while search_start >= 0:
  104. if not text_boxes[search_start]['used']:
  105. unused_count += 1
  106. if unused_count >= look_ahead_window:
  107. break
  108. search_start -= 1
  109. if search_start < 0:
  110. search_start = 0
  111. while search_start < start_index and text_boxes[search_start]['used']:
  112. search_start += 1
  113. return search_start