| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- import re
- from typing import List
class TextProcessor:
    """Text normalization and preprocessing helpers.

    Every method is a ``@staticmethod`` operating on a single string, so the
    class is used purely as a namespace and never needs to be instantiated.
    """

    # Any run of whitespace (spaces, tabs, newlines, ...).
    _WHITESPACE_RE = re.compile(r'\s+')

    # Optional whitespace around a punctuation mark. Covers the ASCII marks
    # , : ; ! ? and their CJK counterparts (ideographic comma / full stop and
    # the full-width comma, colon, semicolon, exclamation and question mark).
    # Non-ASCII characters are written as escapes so the pattern survives
    # tools that transliterate full-width characters to ASCII.
    _PUNCT_SPACING_RE = re.compile(
        r'\s*([,:;!?\u3001\u3002\uff0c\uff1a\uff1b\uff01\uff1f])\s*'
    )

    # Chinese / typographic punctuation -> ASCII equivalent, as a
    # ``str.translate`` table. Keys are escapes for the same robustness reason
    # as above.
    _PUNCTUATION_TABLE = str.maketrans({
        '\uff1a': ':',    # full-width colon
        '\uff1b': ';',    # full-width semicolon
        '\uff0c': ',',    # full-width comma
        '\u3002': '.',    # ideographic full stop
        '\uff01': '!',    # full-width exclamation mark
        '\uff1f': '?',    # full-width question mark
        '\uff08': '(',    # full-width left parenthesis
        '\uff09': ')',    # full-width right parenthesis
        '\u3010': '[',    # left black lenticular bracket
        '\u3011': ']',    # right black lenticular bracket
        '\u300a': '<',    # left double angle bracket
        '\u300b': '>',    # right double angle bracket
        '\u201c': '"',    # left double quotation mark
        '\u201d': '"',    # right double quotation mark
        '\u2018': "'",    # left single quotation mark
        '\u2019': "'",    # right single quotation mark
        '\u3001': ',',    # ideographic comma
        '\u2014': '-',    # em dash
        '\u2026': '...',  # horizontal ellipsis
        '\uff5e': '~',    # full-width tilde
    })

    # Substrings that mark a text as an image reference or caption. Matching
    # is done against the lower-cased text. Some entries are subsumed by
    # shorter ones (e.g. 'fig' covers 'figure'); they are kept as documentation.
    _IMAGE_KEYWORDS = (
        '图', '图片', '图像', 'image', 'figure', 'fig',
        '照片', '截图', '示意图', '流程图', '结构图',
    )

    _MARKDOWN_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
    _HTML_IMG_RE = re.compile(r'<img[^>]*>', re.IGNORECASE)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Collapse redundant whitespace in *text*.

        Leading/trailing whitespace is stripped, every internal whitespace run
        (including newlines) becomes a single space, and whitespace directly
        around the punctuation marks listed in ``_PUNCT_SPACING_RE`` is removed.

        Returns:
            The normalized string, or ``""`` when *text* is empty or ``None``.
        """
        if not text:
            return ""
        collapsed = TextProcessor._WHITESPACE_RE.sub(' ', text.strip())
        return TextProcessor._PUNCT_SPACING_RE.sub(r'\1', collapsed)

    @staticmethod
    def strip_markdown_formatting(text: str) -> str:
        """Remove Markdown/HTML markup from *text*, keeping the plain content.

        Block-level markers (headings, list bullets, ordered-list numbers and
        blockquote markers) are stripped on every line, inline markup (bold,
        italic, links, inline code, HTML tags) is unwrapped, images are dropped
        entirely, and finally all whitespace is collapsed to single spaces.

        Returns:
            The plain text, or ``""`` when *text* is empty or ``None``.
        """
        if not text:
            return ""

        # Images must go before links: the link rule would otherwise match the
        # "[alt](url)" part of "![alt](url)" and leave a stray "!alt" behind.
        text = re.sub(r'!\[.*?\]\(.+?\)', '', text)

        # Block-level markers. re.MULTILINE makes '^' match at the start of
        # every line, not only at the start of the whole string. They are
        # removed before inline emphasis so that a '*' bullet cannot be paired
        # with a later emphasis asterisk on the same line.
        text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)

        # Inline markup: double markers first so '**x**' is not half-consumed
        # by the single-marker rules.
        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
        text = re.sub(r'__(.+?)__', r'\1', text)
        text = re.sub(r'\*(.+?)\*', r'\1', text)
        text = re.sub(r'_(.+?)_', r'\1', text)
        text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
        text = re.sub(r'`(.+?)`', r'\1', text)
        text = re.sub(r'<[^>]+>', '', text)

        return TextProcessor._WHITESPACE_RE.sub(' ', text.strip())

    @staticmethod
    def normalize_punctuation(text: str) -> str:
        """Convert Chinese and typographic punctuation to ASCII punctuation.

        The mapping is defined by ``_PUNCTUATION_TABLE``; characters not in the
        table are left unchanged. The ellipsis character expands to ``...``.

        Returns:
            The converted string, or ``""`` when *text* is empty or ``None``.
        """
        if not text:
            return ""
        # One pass over the string instead of one str.replace per mapping.
        return text.translate(TextProcessor._PUNCTUATION_TABLE)

    @staticmethod
    def normalize_text_for_comparison(text: str) -> str:
        """Normalize *text* so that two strings can be compared loosely.

        Applies, in order: ``strip_markdown_formatting``,
        ``normalize_punctuation`` and ``normalize_text``.
        """
        text = TextProcessor.strip_markdown_formatting(text)
        text = TextProcessor.normalize_punctuation(text)
        text = TextProcessor.normalize_text(text)
        return text

    @staticmethod
    def is_image_reference(text: str) -> bool:
        """Return ``True`` if *text* looks like an image reference or caption.

        A text qualifies when its lower-cased form contains any entry of
        ``_IMAGE_KEYWORDS``, or when it contains a Markdown image
        (``![alt](url)``) or an HTML ``<img>`` tag. This is a substring
        heuristic, so words that merely contain a keyword also match.

        Returns:
            ``False`` for empty or ``None`` input.
        """
        if not text:
            return False

        lowered = text.lower()  # lower-case once, not once per keyword
        if any(keyword in lowered for keyword in TextProcessor._IMAGE_KEYWORDS):
            return True

        return bool(
            TextProcessor._MARKDOWN_IMAGE_RE.search(text)
            or TextProcessor._HTML_IMG_RE.search(text)
        )
|