# similarity_calculator.py
from typing import Callable, Dict, List

from fuzzywuzzy import fuzz
  3. class SimilarityCalculator:
  4. """文本相似度计算"""
  5. @staticmethod
  6. def calculate_text_similarity(text1: str, text2: str) -> float:
  7. """改进的相似度计算"""
  8. if not text1 and not text2:
  9. return 100.0
  10. if not text1 or not text2:
  11. return 0.0
  12. if text1 == text2:
  13. return 100.0
  14. similarity_scores = [fuzz.ratio(text1, text2)]
  15. return max(similarity_scores)
  16. @staticmethod
  17. def check_punctuation_differences(text1: str, text2: str, normalize_func) -> List[Dict]:
  18. """检查两段文本的标点符号差异"""
  19. differences = []
  20. normalized1 = normalize_func(text1)
  21. normalized2 = normalize_func(text2)
  22. if normalized1 == normalized2 and text1 != text2:
  23. min_len = min(len(text1), len(text2))
  24. for i in range(min_len):
  25. if text1[i] != text2[i]:
  26. char1 = text1[i]
  27. char2 = text2[i]
  28. if normalize_func(char1) == normalize_func(char2):
  29. start = max(0, i - 3)
  30. end = min(len(text1), i + 4)
  31. context1 = text1[start:end]
  32. context2 = text2[start:end]
  33. differences.append({
  34. 'position': i,
  35. 'char1': char1,
  36. 'char2': char2,
  37. 'context1': context1,
  38. 'context2': context2,
  39. 'type': 'full_half_width'
  40. })
  41. return differences