from typing import Dict, List

# Support both package-relative imports and plain absolute imports
# (e.g. when the module is run from inside its own directory).
try:
    from .text_processor import TextProcessor
    from .similarity_calculator import SimilarityCalculator
except ImportError:
    from text_processor import TextProcessor
    from similarity_calculator import SimilarityCalculator


class ParagraphComparator:
    """Align the paragraphs of two documents and report their differences.

    Runs of consecutive paragraphs from file 1 are matched against runs of
    consecutive paragraphs from file 2, so a paragraph that was split or
    merged between the two files can still be paired up.
    """

    def __init__(self):
        self.text_processor = TextProcessor()
        self.calculator = SimilarityCalculator()
        # Minimum similarity (percent) for two spans to be treated as a match.
        self.paragraph_match_threshold = 80
        # Matched spans scoring below this (percent) are reported as content
        # differences.
        self.content_similarity_threshold = 95
        # Largest number of consecutive paragraphs joined into a single span.
        self.max_paragraph_window = 6

    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
        """Match paragraphs of two files and list the differences found.

        Args:
            paras1: Paragraphs of file 1, in document order.
            paras2: Paragraphs of file 2, in document order.

        Returns:
            A list of difference records (dicts). Matched spans produce a
            ``'paragraph_punctuation'`` record when the punctuation check
            reports differences, otherwise a ``'paragraph'`` record when the
            similarity is below ``content_similarity_threshold``. Every
            paragraph left unmatched in either file produces a ``'paragraph'``
            record with similarity ``0.0``.
        """
        differences = []

        # Normalized text drives the matching; markdown-stripped text is what
        # gets reported and checked for punctuation differences.
        normalize = self.text_processor.normalize_text_for_comparison
        strip_markdown = self.text_processor.strip_markdown_formatting
        normalized_paras1 = [normalize(p) for p in paras1]
        normalized_paras2 = [normalize(p) for p in paras2]
        original_paras1 = [strip_markdown(p) for p in paras1]
        original_paras2 = [strip_markdown(p) for p in paras2]

        used_paras1 = set()
        used_paras2 = set()
        start_index2 = 0
        last_match_index2 = 0

        # The "+ 1" sits outside min() so that window sizes run from 1 up to
        # and including max_paragraph_window, the same range that
        # _find_best_match tries on the file-2 side.
        largest_window1 = min(self.max_paragraph_window, len(normalized_paras1))
        for window_size1 in range(1, largest_window1 + 1):
            for i in range(len(normalized_paras1) - window_size1 + 1):
                span1 = range(i, i + window_size1)
                # Skip spans that overlap a paragraph that is already matched.
                if not used_paras1.isdisjoint(span1):
                    continue

                combined_normalized1 = "".join(normalized_paras1[i:i + window_size1])
                combined_original1 = "".join(original_paras1[i:i + window_size1])

                best_match = self._find_best_match(
                    combined_normalized1,
                    normalized_paras2,
                    start_index2,
                    last_match_index2,
                    used_paras2,
                )
                if not best_match or best_match['similarity'] < self.paragraph_match_threshold:
                    continue

                matched_indices = best_match['indices']
                # Later searches are anchored around the most recent match.
                last_match_index2 = matched_indices[-1]
                start_index2 = last_match_index2 + 1
                used_paras1.update(span1)
                used_paras2.update(matched_indices)

                combined_original2 = "".join(original_paras2[idx] for idx in matched_indices)
                position = f'段落{i+1}' + (f'-{i+window_size1}' if window_size1 > 1 else '')

                # Punctuation differences take precedence over a generic
                # content-difference record.
                punctuation_diffs = self.calculator.check_punctuation_differences(
                    combined_original1,
                    combined_original2,
                    self.text_processor.normalize_punctuation,
                )
                if punctuation_diffs:
                    diff_description = [
                        f"位置{pdiff['position']}: '{pdiff['char1']}' vs '{pdiff['char2']}'"
                        for pdiff in punctuation_diffs
                    ]
                    details = "; ".join(diff_description)
                    differences.append({
                        'type': 'paragraph_punctuation',
                        'position': position,
                        'file1_value': combined_original1,
                        'file2_value': combined_original2,
                        'description': f'段落全角半角标点差异: {details}',
                        'punctuation_differences': punctuation_diffs,
                        'similarity': 100.0,
                        'severity': 'low',
                    })
                elif best_match['similarity'] < self.content_similarity_threshold:
                    severity = 'low' if best_match['similarity'] >= 90 else 'medium'
                    differences.append({
                        'type': 'paragraph',
                        'position': position,
                        'file1_value': combined_original1,
                        'file2_value': combined_original2,
                        'description': f'段落内容差异 (相似度: {best_match["similarity"]:.1f}%)',
                        'similarity': best_match['similarity'],
                        'severity': severity,
                    })

        # Report paragraphs that never found a counterpart.
        for i, para in enumerate(original_paras1):
            if i not in used_paras1:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{i+1}',
                    'file1_value': para,
                    'file2_value': "",
                    'description': '文件1中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium',
                })

        for j, para in enumerate(original_paras2):
            if j not in used_paras2:
                differences.append({
                    'type': 'paragraph',
                    'position': f'段落{j+1}',
                    'file1_value': "",
                    'file2_value': para,
                    'description': '文件2中独有的段落',
                    'similarity': 0.0,
                    'severity': 'medium',
                })

        return differences

    def _find_best_match(self, target_text: str, paras2: List[str], start_index: int,
                         last_match_index: int, used_paras2: set) -> Dict:
        """Find the span of unused file-2 paragraphs most similar to a text.

        Args:
            target_text: Normalized text of the file-1 span being matched.
            paras2: Normalized paragraphs of file 2.
            start_index: Index just after the most recent match in file 2.
            last_match_index: Last file-2 index of the most recent match.
            used_paras2: Indices of file-2 paragraphs that are already matched.

        Returns:
            A dict with keys ``'text'``, ``'similarity'`` and ``'indices'``
            describing the best candidate span. When no candidate span is
            available the dict has empty text, similarity ``0.0`` and an
            empty index list.
        """
        # Walk backwards from the previous match until max_paragraph_window
        # unused paragraphs have been passed, so earlier paragraphs that were
        # skipped are still considered.
        search_start = last_match_index - 1
        unused_count = 0
        while search_start >= 0:
            if search_start not in used_paras2:
                unused_count += 1
                if unused_count >= self.max_paragraph_window:
                    break
            search_start -= 1
        search_start = max(search_start, 0)

        # Do not begin the scan on an already-used paragraph.
        while search_start < start_index and search_start in used_paras2:
            search_start += 1

        search_end = min(start_index + self.max_paragraph_window, len(paras2))

        best_match = None
        for window_size in range(1, self.max_paragraph_window + 1):
            for j in range(search_start, search_end):
                # Every later j would overrun the end of file 2 as well.
                if j + window_size > len(paras2):
                    break
                if any(idx in used_paras2 for idx in range(j, j + window_size)):
                    continue

                combined_para2 = "".join(paras2[j:j + window_size])
                if target_text == combined_para2:
                    similarity = 100.0
                else:
                    similarity = self.calculator.calculate_text_similarity(
                        target_text, combined_para2
                    )

                if best_match is None or similarity > best_match['similarity']:
                    best_match = {
                        'text': combined_para2,
                        'similarity': similarity,
                        'indices': list(range(j, j + window_size)),
                    }

                # A perfect score cannot be beaten; stop searching.
                if similarity == 100.0:
                    return best_match

        if best_match is None:
            return {'text': '', 'similarity': 0.0, 'indices': []}
        return best_match