import re
from typing import List

# Runs of whitespace (spaces, tabs, newlines) that are collapsed to one space.
_WHITESPACE_RE = re.compile(r'\s+')

# Optional whitespace around a punctuation mark. Covers the ASCII marks, the
# CJK full stop (U+3002) and enumeration comma (U+3001), and the full-width
# comma / colon / semicolon / exclamation mark / question mark. Non-ASCII
# members are written as escapes so the set survives re-encoding of the file.
_PUNCT_SPACING_RE = re.compile(
    r'\s*([,\u3002:;!?\u3001\uff0c\uff1a\uff1b\uff01\uff1f])\s*'
)

# Inline constructs that are removed or unwrapped before any line-start marker
# is examined. Images must come before links: an image is a link with a
# leading "!", so the link rule would otherwise leave "!alt" behind.
_MD_INLINE_PATTERNS = (
    (re.compile(r'!\[.*?\]\(.+?\)'), ''),        # images: drop entirely
    (re.compile(r'\[(.+?)\]\(.+?\)'), r'\1'),    # links: keep the label
    (re.compile(r'`(.+?)`'), r'\1'),             # inline code: keep the content
    (re.compile(r'<[^>]+>'), ''),                # HTML tags: drop
)

# Line-start markers. re.MULTILINE makes "^" match at the start of every line
# rather than only at the start of the whole string.
_MD_LINE_PATTERNS = (
    re.compile(r'^#+\s*', re.MULTILINE),         # ATX headers
    re.compile(r'^\s*[-*+]\s+', re.MULTILINE),   # bullet list markers
    re.compile(r'^\s*\d+\.\s+', re.MULTILINE),   # ordered list markers
    re.compile(r'^\s*>\s+', re.MULTILINE),       # blockquote markers
)

# Emphasis markers, unwrapped last so that a "* " bullet has already been
# removed and cannot be mistaken for an opening italic asterisk. The
# two-character forms must precede the one-character forms.
_MD_EMPHASIS_PATTERNS = (
    re.compile(r'\*\*(.+?)\*\*'),
    re.compile(r'__(.+?)__'),
    re.compile(r'\*(.+?)\*'),
    re.compile(r'_(.+?)_'),
)

# Chinese / full-width punctuation -> ASCII equivalent. Keys are explicit code
# points so they cannot be silently folded to their ASCII look-alikes.
_CN_TO_EN_PUNCT = str.maketrans({
    '\uff1a': ':',    # full-width colon
    '\uff1b': ';',    # full-width semicolon
    '\uff0c': ',',    # full-width comma
    '\u3002': '.',    # ideographic full stop
    '\uff01': '!',    # full-width exclamation mark
    '\uff1f': '?',    # full-width question mark
    '\uff08': '(',    # full-width left parenthesis
    '\uff09': ')',    # full-width right parenthesis
    '\u3010': '[',    # left black lenticular bracket
    '\u3011': ']',    # right black lenticular bracket
    '\u300a': '<',    # left double angle bracket
    '\u300b': '>',    # right double angle bracket
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark
    '\u3001': ',',    # ideographic (enumeration) comma
    '\u2014': '-',    # em dash
    '\u2026': '...',  # horizontal ellipsis
    '\uff5e': '~',    # full-width tilde
})

# Substrings whose presence (case-insensitively) marks text as image-related.
# NOTE(review): matching is by plain substring, so 'fig' also hits words such
# as "config" and the single character '图' subsumes the longer Chinese
# keywords. Left as-is because callers may rely on it; confirm before
# tightening.
_IMAGE_KEYWORDS = (
    '图', '图片', '图像', 'image', 'figure', 'fig',
    '照片', '截图', '示意图', '流程图', '结构图',
)

# Markdown image syntax; unlike the stripping rule above, an empty URL counts.
_MD_IMAGE_REF_RE = re.compile(r'!\[.*?\]\(.*?\)')

# HTML <img ...> tag in any letter case.
_HTML_IMG_RE = re.compile(r'<img[^>]*>', re.IGNORECASE)


class TextProcessor:
    """Text normalization and preprocessing helpers."""

    @staticmethod
    def normalize_text(text: str) -> str:
        """Collapse whitespace and remove spaces around punctuation.

        Leading and trailing whitespace is stripped, every internal run of
        whitespace (including newlines) becomes a single space, and any
        whitespace directly before or after one of the punctuation marks in
        ``_PUNCT_SPACING_RE`` is removed.

        Args:
            text: The text to normalize.

        Returns:
            The normalized text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""
        collapsed = _WHITESPACE_RE.sub(' ', text.strip())
        return _PUNCT_SPACING_RE.sub(r'\1', collapsed)

    @staticmethod
    def strip_markdown_formatting(text: str) -> str:
        """Remove Markdown markup and keep only the plain text.

        Processing order:

        1. Images are dropped; links, inline code and HTML tags are unwrapped
           or dropped.
        2. Header, list and blockquote markers are removed from the start of
           every line.
        3. Bold and italic markers are unwrapped.
        4. Whitespace is collapsed to single spaces and the result stripped.

        Args:
            text: Markdown text, possibly spanning several lines.

        Returns:
            The plain-text content on a single line, or ``""`` when ``text``
            is empty or ``None``.
        """
        if not text:
            return ""
        for pattern, replacement in _MD_INLINE_PATTERNS:
            text = pattern.sub(replacement, text)
        for pattern in _MD_LINE_PATTERNS:
            text = pattern.sub('', text)
        for pattern in _MD_EMPHASIS_PATTERNS:
            text = pattern.sub(r'\1', text)
        return _WHITESPACE_RE.sub(' ', text.strip())

    @staticmethod
    def normalize_punctuation(text: str) -> str:
        """Convert Chinese / full-width punctuation to ASCII punctuation.

        Every character listed in ``_CN_TO_EN_PUNCT`` is replaced in a single
        pass; all other characters are left untouched.

        Args:
            text: The text to convert.

        Returns:
            The converted text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""
        return text.translate(_CN_TO_EN_PUNCT)

    @staticmethod
    def normalize_text_for_comparison(text: str) -> str:
        """Normalize text so that two strings can be compared.

        Applies, in order, ``strip_markdown_formatting``,
        ``normalize_punctuation`` and ``normalize_text``.

        Args:
            text: The text to normalize.

        Returns:
            The normalized text.
        """
        text = TextProcessor.strip_markdown_formatting(text)
        text = TextProcessor.normalize_punctuation(text)
        return TextProcessor.normalize_text(text)

    @staticmethod
    def is_image_reference(text: str) -> bool:
        """Report whether the text is an image reference or description.

        The text qualifies when its lowercased form contains any keyword from
        ``_IMAGE_KEYWORDS``, or when it contains Markdown image syntax
        (``![alt](url)``) or an HTML ``<img>`` tag.

        Args:
            text: The text to inspect.

        Returns:
            ``True`` if any of the checks matches, otherwise ``False``
            (including for empty or ``None`` input).
        """
        if not text:
            return False
        lowered = text.lower()
        if any(keyword in lowered for keyword in _IMAGE_KEYWORDS):
            return True
        if _MD_IMAGE_REF_RE.search(text):
            return True
        return _HTML_IMG_RE.search(text) is not None