paragraph_comparator.py

from typing import Dict, List

# ✅ Support both relative (package) and absolute (script) imports
try:
    from .text_processor import TextProcessor
    from .similarity_calculator import SimilarityCalculator
except ImportError:
    from text_processor import TextProcessor
    from similarity_calculator import SimilarityCalculator

class ParagraphComparator:
    """Paragraph-level comparison between two documents."""

    def __init__(self):
        self.text_processor = TextProcessor()
        self.calculator = SimilarityCalculator()
        self.paragraph_match_threshold = 80      # minimum similarity (%) to treat paragraph windows as a match
        self.content_similarity_threshold = 95   # below this (%), a matched pair is reported as a content difference
        self.max_paragraph_window = 6            # maximum number of consecutive paragraphs merged into one window
    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Improved paragraph matching: align windows of consecutive paragraphs, then report differences."""
        differences = []

        # Preprocessing: normalized text for matching, markdown-stripped text for reporting
        normalized_paras1 = [self.text_processor.normalize_text_for_comparison(p) for p in paras1]
        normalized_paras2 = [self.text_processor.normalize_text_for_comparison(p) for p in paras2]
        original_paras1 = [self.text_processor.strip_markdown_formatting(p) for p in paras1]
        original_paras2 = [self.text_processor.strip_markdown_formatting(p) for p in paras2]

        used_paras1 = set()
        used_paras2 = set()
        start_index2 = 0
        last_match_index2 = 0

        # Slide windows of consecutive file-1 paragraphs over file 2, smallest windows first
        for window_size1 in range(1, min(self.max_paragraph_window, len(normalized_paras1) + 1)):
            for i in range(len(normalized_paras1) - window_size1 + 1):
                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
                    continue
                combined_normalized1 = "".join(normalized_paras1[i:i + window_size1])
                combined_original1 = "".join(original_paras1[i:i + window_size1])

                best_match = self._find_best_match(
                    combined_normalized1,
                    normalized_paras2,
                    start_index2,
                    last_match_index2,
                    used_paras2
                )

                if best_match and best_match['similarity'] >= self.paragraph_match_threshold:
                    matched_indices = best_match['indices']
                    last_match_index2 = matched_indices[-1]
                    start_index2 = last_match_index2 + 1
                    for idx in range(i, i + window_size1):
                        used_paras1.add(idx)
                    for idx in matched_indices:
                        used_paras2.add(idx)
                    combined_original2 = "".join([original_paras2[idx] for idx in matched_indices])

                    # Check for full-width / half-width punctuation differences
                    punctuation_diffs = self.calculator.check_punctuation_differences(
                        combined_original1,
                        combined_original2,
                        self.text_processor.normalize_punctuation
                    )
                    if punctuation_diffs:
                        diff_description = []
                        for pdiff in punctuation_diffs:
                            diff_description.append(
                                f"position {pdiff['position']}: '{pdiff['char1']}' vs '{pdiff['char2']}'"
                            )
                        differences.append({
                            'type': 'paragraph_punctuation',
                            'position': f'paragraph {i + 1}' + (f'-{i + window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'Full-width/half-width punctuation differences: {"; ".join(diff_description)}',
                            'punctuation_differences': punctuation_diffs,
                            'similarity': 100.0,
                            'severity': 'low'
                        })
                    elif best_match['similarity'] < self.content_similarity_threshold:
                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                        differences.append({
                            'type': 'paragraph',
                            'position': f'paragraph {i + 1}' + (f'-{i + window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'Paragraph content difference (similarity: {best_match["similarity"]:.1f}%)',
                            'similarity': best_match['similarity'],
                            'severity': severity
                        })

        # Report paragraphs that were never matched
        for i, para in enumerate(original_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {i + 1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': 'Paragraph only present in file 1',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        for j, para in enumerate(original_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'paragraph {j + 1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': 'Paragraph only present in file 2',
                    'similarity': 0.0,
                    'severity': 'medium'
                })
        return differences
    def _find_best_match(self, target_text: str, paras2: List[str],
                         start_index: int, last_match_index: int,
                         used_paras2: set) -> Dict:
        """Find the best-matching window of consecutive paragraphs in paras2 for the given text."""
        # Walk backwards from the last match to allow a limited amount of reordering
        search_start = last_match_index - 1
        unused_count = 0
        while search_start >= 0:
            if search_start not in used_paras2:
                unused_count += 1
                if unused_count >= self.max_paragraph_window:
                    break
            search_start -= 1
        if search_start < 0:
            search_start = 0
        # Skip positions that were already matched
        while search_start < start_index and search_start in used_paras2:
            search_start += 1

        search_end = min(start_index + self.max_paragraph_window, len(paras2))
        best_match = None
        for window_size in range(1, self.max_paragraph_window + 1):
            for j in range(search_start, search_end):
                if any(idx in used_paras2 for idx in range(j, min(j + window_size, len(paras2)))):
                    continue
                if j + window_size > len(paras2):
                    break
                combined_para2 = "".join(paras2[j:j + window_size])
                if target_text == combined_para2:
                    similarity = 100.0
                else:
                    similarity = self.calculator.calculate_text_similarity(target_text, combined_para2)
                if not best_match or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        'indices': list(range(j, j + window_size))
                    }
                if similarity == 100.0:
                    # Exact match: stop searching early
                    return best_match
        if best_match is None:
            return {
                'text': '',
                'similarity': 0.0,
                'indices': []
            }
        return best_match
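

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# It assumes TextProcessor and SimilarityCalculator are importable alongside this
# file and provide the methods used above (normalize_text_for_comparison,
# strip_markdown_formatting, normalize_punctuation, check_punctuation_differences,
# calculate_text_similarity), with similarities expressed as percentages (0-100).
# The sample paragraphs are made up; the full-width comma in paras_b is there
# only to exercise the punctuation check.
if __name__ == "__main__":
    comparator = ParagraphComparator()
    paras_a = ["Hello, world.", "This is the second paragraph."]
    paras_b = ["Hello，world.", "This is a second paragraph."]
    for diff in comparator.compare_paragraphs(paras_a, paras_b):
        print(f"[{diff['severity']}] {diff['position']}: {diff['description']}")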