- from typing import Dict, List
- # ✅ 兼容相对导入和绝对导入
- try:
- from .text_processor import TextProcessor
- from .similarity_calculator import SimilarityCalculator
- except ImportError:
- from text_processor import TextProcessor
- from similarity_calculator import SimilarityCalculator
class ParagraphComparator:
    """Paragraph-level comparison between two paragraph lists.

    Paragraphs from file 1 are grouped into sliding windows of up to
    ``max_paragraph_window`` consecutive paragraphs and matched against
    similar windows from file 2, so that paragraphs that were split or
    merged between the two files can still be paired.  Differences are
    reported as dicts describing punctuation-only changes, content
    changes, and paragraphs unique to one file.
    """

    def __init__(self):
        # Helpers for text normalization and similarity scoring.
        self.text_processor = TextProcessor()
        self.calculator = SimilarityCalculator()
        # Minimum similarity (%) for two windows to be considered a match.
        self.paragraph_match_threshold = 80
        # Matched windows below this similarity (%) are reported as content diffs.
        self.content_similarity_threshold = 95
        # Maximum number of consecutive paragraphs merged into one window.
        self.max_paragraph_window = 6

    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Match paragraphs of file 1 against file 2 and collect differences.

        Args:
            paras1: Paragraphs of the first file.
            paras2: Paragraphs of the second file.

        Returns:
            A list of difference dicts with keys 'type', 'position',
            'file1_value', 'file2_value', 'description', 'similarity'
            and 'severity' (plus 'punctuation_differences' for
            punctuation-only diffs).
        """
        differences = []

        # Pre-processing: normalized text is used for matching, while
        # markdown-stripped text is kept for human-readable reporting.
        normalized_paras1 = [self.text_processor.normalize_text_for_comparison(p) for p in paras1]
        normalized_paras2 = [self.text_processor.normalize_text_for_comparison(p) for p in paras2]

        original_paras1 = [self.text_processor.strip_markdown_formatting(p) for p in paras1]
        original_paras2 = [self.text_processor.strip_markdown_formatting(p) for p in paras2]

        used_paras1 = set()
        used_paras2 = set()

        # Matching proceeds roughly left-to-right through file 2; these
        # track where the next search should start.
        start_index2 = 0
        last_match_index2 = 0

        # BUGFIX: the upper bound was min(max_paragraph_window, len + 1),
        # which capped file-1 windows at max_paragraph_window - 1 whenever
        # enough paragraphs were available — inconsistent with
        # _find_best_match, which searches sizes 1..max_paragraph_window
        # inclusive.  Now both sides use the same inclusive maximum.
        for window_size1 in range(1, min(self.max_paragraph_window, len(normalized_paras1)) + 1):
            for i in range(len(normalized_paras1) - window_size1 + 1):
                # Skip windows overlapping already-matched paragraphs.
                if any(idx in used_paras1 for idx in range(i, i + window_size1)):
                    continue

                combined_normalized1 = "".join(normalized_paras1[i:i+window_size1])
                combined_original1 = "".join(original_paras1[i:i+window_size1])

                best_match = self._find_best_match(
                    combined_normalized1,
                    normalized_paras2,
                    start_index2,
                    last_match_index2,
                    used_paras2
                )

                if best_match and best_match['similarity'] >= self.paragraph_match_threshold:
                    matched_indices = best_match['indices']
                    last_match_index2 = matched_indices[-1]
                    start_index2 = last_match_index2 + 1

                    # Mark both sides as consumed so they are not re-matched.
                    for idx in range(i, i + window_size1):
                        used_paras1.add(idx)
                    for idx in matched_indices:
                        used_paras2.add(idx)

                    combined_original2 = "".join([original_paras2[idx] for idx in matched_indices])

                    # Check for full-width / half-width punctuation differences.
                    punctuation_diffs = self.calculator.check_punctuation_differences(
                        combined_original1,
                        combined_original2,
                        self.text_processor.normalize_punctuation
                    )

                    if punctuation_diffs:
                        diff_description = []
                        for pdiff in punctuation_diffs:
                            diff_description.append(
                                f"位置{pdiff['position']}: '{pdiff['char1']}' vs '{pdiff['char2']}'"
                            )

                        differences.append({
                            'type': 'paragraph_punctuation',
                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'段落全角半角标点差异: {"; ".join(diff_description)}',
                            'punctuation_differences': punctuation_diffs,
                            # Punctuation-only diff: texts agree after
                            # punctuation normalization, so report 100%.
                            'similarity': 100.0,
                            'severity': 'low'
                        })

                    elif best_match['similarity'] < self.content_similarity_threshold:
                        # Matched but not near-identical: report a content diff.
                        severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                        differences.append({
                            'type': 'paragraph',
                            'position': f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else ''),
                            'file1_value': combined_original1,
                            'file2_value': combined_original2,
                            'description': f'段落内容差异 (相似度: {best_match["similarity"]:.1f}%)',
                            'similarity': best_match['similarity'],
                            'severity': severity
                        })

        # Report paragraphs that never found a match on either side.
        for i, para in enumerate(original_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{i+1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': '文件1中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })

        for j, para in enumerate(original_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{j+1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': '文件2中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium'
                })

        return differences

    def _find_best_match(self, target_text: str, paras2: List[str],
                         start_index: int, last_match_index: int,
                         used_paras2: set) -> Dict:
        """Find the file-2 window most similar to ``target_text``.

        The search begins up to ``max_paragraph_window`` unused paragraphs
        before the last match (to tolerate small re-orderings) and extends
        ``max_paragraph_window`` paragraphs past ``start_index``.

        Returns:
            A dict with 'text', 'similarity' (percent) and 'indices'
            (matched file-2 paragraph indices); 'indices' is empty and
            similarity 0.0 when nothing could be compared.
        """
        # Step back over up to max_paragraph_window unused paragraphs
        # preceding the last match.
        search_start = last_match_index - 1
        unused_count = 0

        while search_start >= 0:
            if search_start not in used_paras2:
                unused_count += 1
                if unused_count >= self.max_paragraph_window:
                    break
            search_start -= 1

        if search_start < 0:
            search_start = 0
            # Skip leading paragraphs that are already consumed.
            while search_start < start_index and search_start in used_paras2:
                search_start += 1

        search_end = min(start_index + self.max_paragraph_window, len(paras2))
        best_match = None

        for window_size in range(1, self.max_paragraph_window + 1):
            for j in range(search_start, search_end):
                # Skip windows overlapping already-matched paragraphs.
                if any(idx in used_paras2 for idx in range(j, min(j + window_size, len(paras2)))):
                    continue

                # Window would run past the end of file 2; larger j only
                # makes it worse, so stop scanning this window size.
                if j + window_size > len(paras2):
                    break

                combined_para2 = "".join(paras2[j:j+window_size])

                # Exact equality short-circuits the similarity computation.
                if target_text == combined_para2:
                    similarity = 100.0
                else:
                    similarity = self.calculator.calculate_text_similarity(target_text, combined_para2)

                if not best_match or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        'indices': list(range(j, j + window_size))
                    }

                # A perfect match cannot be beaten — return immediately.
                if similarity == 100.0:
                    return best_match

        if best_match is None:
            return {
                'text': '',
                'similarity': 0.0,
                'indices': []
            }

        return best_match
|