# Repository: zhengchun/ocr_verify
"""
Text matching utilities.

Handles text normalization, similarity computation, and related helpers.
"""
import re
from typing import Optional, List, Dict
from fuzzywuzzy import fuzz


class TextMatcher:
    """Match a target string against a list of OCR text boxes.

    The matcher normalizes both sides (whitespace removal, full-width to
    half-width folding, lower-casing) and then scans a bounded window of
    text boxes for the first unused box that is either identical to the
    target or similar enough according to ``fuzz.partial_ratio``.
    """

    # Compiled once; matches any run of whitespace characters.
    _WHITESPACE_RE = re.compile(r'\s+')

    # Code-point translation table: the ideographic space (U+3000) becomes an
    # ASCII space, and U+FF01..U+FF5E are shifted down by 0xFEE0.
    _FULL_TO_HALF_TABLE = {
        0x3000: 0x0020,
        **{code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)},
    }

    # Normalized targets shorter than this are never matched.
    _MIN_TARGET_LENGTH = 2
    # Candidates whose length ratio (shorter / longer) is below this are skipped.
    _MIN_LENGTH_RATIO = 0.3
    # Added to the partial ratio when one string contains the other.
    _SUBSTRING_BONUS = 10

    def __init__(self, similarity_threshold: int = 90):
        """
        Args:
            similarity_threshold: Minimum similarity score a non-identical
                candidate must reach to be accepted.
        """
        self.similarity_threshold = similarity_threshold

    def normalize_text(self, text: str) -> str:
        """Return ``text`` in the canonical form used for comparison.

        All whitespace is removed, full-width characters are folded to their
        half-width counterparts, and the result is lower-cased. Punctuation
        is kept (only its full-width forms are folded).
        """
        text = self._WHITESPACE_RE.sub('', text)
        text = self._full_to_half(text)
        return text.lower()

    def _full_to_half(self, text: str) -> str:
        """Convert full-width characters in ``text`` to half-width ones."""
        return text.translate(self._FULL_TO_HALF_TABLE)

    def find_matching_bbox(self, target_text: str, text_boxes: List[Dict],
                           start_index: int, last_match_index: int,
                           look_ahead_window: int = 10) -> tuple[Optional[Dict], int, int]:
        """Find the text box that matches ``target_text``.

        The first unused box in the search window that qualifies is returned;
        candidates are not ranked against each other.

        Args:
            target_text: Text to look for (normalized internally).
            text_boxes: Text boxes; each is read through its ``'used'`` and
                ``'text'`` keys.
            start_index: Current pointer into ``text_boxes``.
            last_match_index: Index of the previous successful match.
            look_ahead_window: How many boxes past ``start_index`` are
                searched, and how many unused boxes the search may reach
                back before ``last_match_index``.

        Returns:
            ``(box, new_pointer, last_match_index)``. ``box`` is ``None``
            when nothing matched, in which case the pointer and
            ``last_match_index`` are returned unchanged. The pointer only
            moves forward (to just past the match). ``last_match_index`` is
            updated to the matched index on an exact match only; a fuzzy
            match returns it unchanged.
        """
        target_text = self.normalize_text(target_text)
        target_len = len(target_text)

        # Very short targets are too ambiguous to match.
        if target_len < self._MIN_TARGET_LENGTH:
            return None, start_index, last_match_index

        # MinerU and Paddle emit text in mostly, but not always, the same
        # order, so the search begins at an earlier unused position. Either
        # engine may misrecognize text, hence the look-ahead window to avoid
        # missing a match. Special cases occur: Paddle may merge two adjacent
        # cells into one string while MinerU joins two lines of a cell, e.g.
        # for the cells '1|2024-08-11|扫二维码付' + '款' MinerU yields
        # "扫二维码付款" and Paddle yields '12024-08-11扫二维码付'. Character
        # order is very likely preserved, so when the shorter string is a
        # substring of the longer one the similarity score is boosted.
        search_start = self._find_search_start(
            text_boxes, last_match_index, start_index, look_ahead_window
        )
        search_end = min(start_index + look_ahead_window, len(text_boxes))

        for i in range(search_start, search_end):
            box = text_boxes[i]
            if box['used']:
                continue

            box_text = self.normalize_text(box['text'])
            # A match behind the pointer must not move the pointer backwards.
            next_index = i + 1 if i >= start_index else start_index

            # An exact match wins immediately and becomes the new anchor.
            if target_text == box_text:
                return box, next_index, i

            # Skip candidates whose length differs too much from the target.
            # target_len >= 2 here, so the divisor is never zero.
            box_len = len(box_text)
            length_ratio = min(target_len, box_len) / max(target_len, box_len)
            if length_ratio < self._MIN_LENGTH_RATIO:
                continue

            if target_len < box_len:
                shorter, longer = target_text, box_text
            else:
                shorter, longer = box_text, target_text

            partial_ratio = fuzz.partial_ratio(target_text, box_text)
            if shorter in longer:
                partial_ratio += self._SUBSTRING_BONUS

            # First candidate reaching the threshold is accepted; the anchor
            # (last_match_index) is left unchanged for fuzzy matches.
            if partial_ratio >= self.similarity_threshold:
                return box, next_index, last_match_index

        return None, start_index, last_match_index

    def _find_search_start(self, text_boxes: List[Dict], last_match_index: int,
                           start_index: int, look_ahead_window: int) -> int:
        """Determine the index at which the candidate search begins.

        Walks backwards from just before ``last_match_index`` until
        ``look_ahead_window`` unused boxes have been seen. If the beginning
        of the list is reached first, the result is the first unused box at
        or after index 0, scanning no further than ``start_index``.

        Indices beyond the end of ``text_boxes`` are clamped to the list
        bounds instead of raising ``IndexError``.
        """
        box_count = len(text_boxes)
        # Clamp so an out-of-range last_match_index cannot index past the list.
        search_start = min(last_match_index - 1, box_count - 1)
        unused_count = 0

        while search_start >= 0:
            if not text_boxes[search_start]['used']:
                unused_count += 1
            if unused_count >= look_ahead_window:
                break
            search_start -= 1

        if search_start < 0:
            # Ran off the front: skip the leading run of already-used boxes.
            search_start = 0
            forward_limit = min(start_index, box_count)
            while search_start < forward_limit and text_boxes[search_start]['used']:
                search_start += 1

        return search_start