# zhengchun/ocr_verify

import re
from typing import List


class TextProcessor:
    """Text normalization and preprocessing helpers.

    Every method is a ``@staticmethod`` operating on a single string. Falsy
    input (``None`` or ``""``) is treated as empty text rather than raising.
    """

    # Any run of whitespace characters (spaces, tabs, newlines, ...).
    _WHITESPACE_RE = re.compile(r'\s+')

    # Whitespace padding on either side of a full-width punctuation mark.
    _CJK_PUNCT_PADDING_RE = re.compile(r'\s*([，。：；！？、])\s*')

    # Ordered (pattern, replacement) rules for strip_markdown_formatting.
    # Order matters:
    #   * block-level markers (quote, heading, list) go first so a leading
    #     "* " bullet is not mistaken for the opening of "*emphasis*";
    #   * images go before links, otherwise the link rule rewrites
    #     "![alt](url)" to "!alt" and the image rule can never match.
    # Block-level rules use MULTILINE so markers on every line are removed,
    # not only those on the first line.
    _MARKDOWN_RULES = (
        (re.compile(r'^\s*>\s+', re.MULTILINE), ''),       # blockquote marker
        (re.compile(r'^#+\s*', re.MULTILINE), ''),         # ATX heading marker
        (re.compile(r'^\s*[-*+]\s+', re.MULTILINE), ''),   # bullet list marker
        (re.compile(r'^\s*\d+\.\s+', re.MULTILINE), ''),   # ordered list marker
        (re.compile(r'!\[.*?\]\(.+?\)'), ''),              # image: drop entirely
        # Link: keep the text. Empty text is allowed so that what remains of
        # "[![badge](img)](url)" after image removal disappears as well.
        (re.compile(r'\[(.*?)\]\(.+?\)'), r'\1'),
        (re.compile(r'\*\*(.+?)\*\*'), r'\1'),             # **bold**
        (re.compile(r'__(.+?)__'), r'\1'),                 # __bold__
        (re.compile(r'\*(.+?)\*'), r'\1'),                 # *italic*
        (re.compile(r'_(.+?)_'), r'\1'),                   # _italic_
        (re.compile(r'`(.+?)`'), r'\1'),                   # `inline code`
        (re.compile(r'<[^>]+>'), ''),                      # HTML tags
    )

    # Full-width / typographic punctuation -> ASCII replacement. The curly
    # quotes are written as escapes so they cannot be silently flattened to
    # ASCII quotes (which would turn the entries into no-ops).
    _PUNCTUATION_TABLE = str.maketrans({
        '：': ':', '；': ';', '，': ',', '。': '.', '！': '!', '？': '?',
        '（': '(', '）': ')', '【': '[', '】': ']', '《': '<', '》': '>',
        '\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'",
        '、': ',', '—': '-',
        '…': '...', '～': '~',
    })

    # Substrings that mark a text as referring to an image.
    _IMAGE_KEYWORDS = (
        '图', '图片', '图像', 'image', 'figure', 'fig',
        '照片', '截图', '示意图', '流程图', '结构图',
    )
    _MARKDOWN_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
    _HTML_IMAGE_RE = re.compile(r'<img[^>]*>', re.IGNORECASE)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Collapse whitespace and remove padding around full-width punctuation.

        Leading/trailing whitespace is trimmed, every internal whitespace run
        becomes a single space, and whitespace adjacent to any of
        ``，。：；！？、`` is dropped.

        Args:
            text: Text to normalize; falsy values yield ``""``.

        Returns:
            The normalized text.
        """
        if not text:
            return ""
        collapsed = TextProcessor._WHITESPACE_RE.sub(' ', text.strip())
        return TextProcessor._CJK_PUNCT_PADDING_RE.sub(r'\1', collapsed)

    @staticmethod
    def strip_markdown_formatting(text: str) -> str:
        """Remove Markdown/HTML markup and keep only the plain text.

        Handles blockquote, heading and list markers on every line, images
        (removed entirely), links (text kept), bold/italic markers, inline
        code back-ticks and HTML tags. Whitespace is collapsed afterwards.

        Args:
            text: Markdown text; falsy values yield ``""``.

        Returns:
            The text with markup removed and whitespace collapsed.
        """
        if not text:
            return ""

        for pattern, replacement in TextProcessor._MARKDOWN_RULES:
            text = pattern.sub(replacement, text)

        return TextProcessor._WHITESPACE_RE.sub(' ', text.strip())

    @staticmethod
    def normalize_punctuation(text: str) -> str:
        """Convert full-width and typographic punctuation to ASCII.

        Args:
            text: Text to convert; falsy values yield ``""``.

        Returns:
            The text with each mapped character replaced (``…`` expands to
            ``...``); all other characters are left untouched.
        """
        if not text:
            return ""
        # Single pass; every key is one character and no replacement is
        # itself a key, so this equals applying the replacements one by one.
        return text.translate(TextProcessor._PUNCTUATION_TABLE)

    @staticmethod
    def normalize_text_for_comparison(text: str) -> str:
        """Normalize text so that two renderings of it can be compared.

        Pipeline: strip Markdown, normalize whitespace, convert punctuation.
        Whitespace normalization must precede punctuation conversion because
        the padding rule in ``normalize_text`` recognizes only the full-width
        marks; once they are converted to ASCII it would no longer apply.

        Args:
            text: Raw text; falsy values yield ``""``.

        Returns:
            The canonical form used for comparison.
        """
        text = TextProcessor.strip_markdown_formatting(text)
        text = TextProcessor.normalize_text(text)
        text = TextProcessor.normalize_punctuation(text)
        return text

    @staticmethod
    def is_image_reference(text: str) -> bool:
        """Tell whether the text looks like an image reference or caption.

        The text qualifies when it contains one of the image keywords
        (case-insensitive substring match), a Markdown image ``![..](..)``,
        or an HTML ``<img ...>`` tag.

        NOTE(review): keyword matching is by plain substring, so words such
        as "config" (contains "fig") and any text containing ``图`` also
        qualify. Confirm with callers before tightening.

        Args:
            text: Text to inspect; falsy values yield ``False``.

        Returns:
            ``True`` if the text looks like an image reference.
        """
        if not text:
            return False

        lowered = text.lower()
        if any(keyword in lowered for keyword in TextProcessor._IMAGE_KEYWORDS):
            return True

        if TextProcessor._MARKDOWN_IMAGE_RE.search(text):
            return True

        return TextProcessor._HTML_IMAGE_RE.search(text) is not None