# zhengchun/ocr_verify
from typing import Callable, Dict, List

from fuzzywuzzy import fuzz


class SimilarityCalculator:
    """Helpers for scoring text similarity and locating punctuation variants."""

    @staticmethod
    def calculate_text_similarity(text1: str, text2: str) -> float:
        """Return a similarity score for two texts.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            100.0 when both texts are empty (falsy) or the texts are equal,
            0.0 when exactly one of them is empty, and otherwise the value of
            ``fuzz.ratio(text1, text2)`` converted to ``float``.
        """
        if not text1 and not text2:
            return 100.0
        if not text1 or not text2:
            return 0.0

        # Equal texts are answered directly, without calling the fuzzy matcher.
        if text1 == text2:
            return 100.0

        # fuzz.ratio returns an int; convert so the declared return type holds.
        return float(fuzz.ratio(text1, text2))

    @staticmethod
    def check_punctuation_differences(
        text1: str,
        text2: str,
        normalize_func: Callable[[str], str],
    ) -> List[Dict]:
        """Find characters that differ only until they are normalized.

        The texts are inspected only when they differ in raw form yet compare
        equal after ``normalize_func``. Characters are then compared at the
        same index in both texts, up to the length of the shorter text; a
        position is reported when the raw characters differ but their
        normalized forms are equal.

        Args:
            text1: First text.
            text2: Second text.
            normalize_func: Callable applied both to the whole texts and to
                single characters; its results are only compared for
                equality. It is expected to map a string to its normalized
                form.

        Returns:
            One dict per reported position with the keys ``position``,
            ``char1``, ``char2``, ``context1``, ``context2`` and ``type``.
            ``context1``/``context2`` hold up to three characters on either
            side of the position, taken from the respective text. ``type`` is
            always ``'full_half_width'``. The list is empty when the
            precondition above is not met.
        """
        differences: List[Dict] = []

        normalized1 = normalize_func(text1)
        normalized2 = normalize_func(text2)

        # Nothing to report unless the texts differ only by normalizable characters.
        if normalized1 != normalized2 or text1 == text2:
            return differences

        # zip stops at the shorter text, so only the common prefix length is scanned.
        for position, (char1, char2) in enumerate(zip(text1, text2)):
            if char1 == char2:
                continue
            if normalize_func(char1) != normalize_func(char2):
                continue

            start = max(0, position - 3)
            # Slicing clamps to each text's own length, so the window of the
            # longer text is no longer truncated by the shorter text's length.
            end = position + 4

            differences.append({
                'position': position,
                'char1': char1,
                'char2': char2,
                'context1': text1[start:end],
                'context2': text2[start:end],
                'type': 'full_half_width',
            })

        return differences