text_processor.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. import re
  2. from typing import List
  3. class TextProcessor:
  4. """文本标准化和预处理"""
  5. @staticmethod
  6. def normalize_text(text: str) -> str:
  7. """标准化文本:去除多余空格、回车等无效字符"""
  8. if not text:
  9. return ""
  10. text = re.sub(r'\s+', ' ', text.strip())
  11. text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
  12. return text
  13. @staticmethod
  14. def strip_markdown_formatting(text: str) -> str:
  15. """移除Markdown格式标记,只保留纯文本内容"""
  16. if not text:
  17. return ""
  18. text = re.sub(r'^#+\s*', '', text)
  19. text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
  20. text = re.sub(r'__(.+?)__', r'\1', text)
  21. text = re.sub(r'\*(.+?)\*', r'\1', text)
  22. text = re.sub(r'_(.+?)_', r'\1', text)
  23. text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
  24. text = re.sub(r'!\[.*?\]\(.+?\)', '', text)
  25. text = re.sub(r'`(.+?)`', r'\1', text)
  26. text = re.sub(r'<[^>]+>', '', text)
  27. text = re.sub(r'^\s*[-*+]\s+', '', text)
  28. text = re.sub(r'^\s*\d+\.\s+', '', text)
  29. text = re.sub(r'^\s*>\s+', '', text)
  30. text = re.sub(r'\s+', ' ', text.strip())
  31. return text
  32. @staticmethod
  33. def normalize_punctuation(text: str) -> str:
  34. """统一标点符号 - 将中文标点转换为英文标点"""
  35. if not text:
  36. return ""
  37. punctuation_map = {
  38. ':': ':', ';': ';', ',': ',', '。': '.', '!': '!', '?': '?',
  39. '(': '(', ')': ')', '【': '[', '】': ']', '《': '<', '》': '>',
  40. '"': '"', '"': '"', ''': "'", ''': "'", '、': ',', '—': '-',
  41. '…': '...', '~': '~',
  42. }
  43. for cn_punct, en_punct in punctuation_map.items():
  44. text = text.replace(cn_punct, en_punct)
  45. return text
  46. @staticmethod
  47. def normalize_text_for_comparison(text: str) -> str:
  48. """用于比较的文本标准化"""
  49. text = TextProcessor.strip_markdown_formatting(text)
  50. text = TextProcessor.normalize_punctuation(text)
  51. text = TextProcessor.normalize_text(text)
  52. return text
  53. @staticmethod
  54. def is_image_reference(text: str) -> bool:
  55. """判断是否为图片引用或描述"""
  56. image_keywords = [
  57. '图', '图片', '图像', 'image', 'figure', 'fig',
  58. '照片', '截图', '示意图', '流程图', '结构图'
  59. ]
  60. for keyword in image_keywords:
  61. if keyword in text.lower():
  62. return True
  63. if re.search(r'!\[.*?\]\(.*?\)', text):
  64. return True
  65. if re.search(r'<img[^>]*>', text, re.IGNORECASE):
  66. return True
  67. return False