_tokenizer.py

from __future__ import annotations

import contextlib
import re
from dataclasses import dataclass
from typing import Iterator, NoReturn

from .specifiers import Specifier


@dataclass
class Token:
    name: str
    text: str
    position: int


class ParserSyntaxError(Exception):
    """The provided source text could not be parsed correctly."""

    def __init__(
        self,
        message: str,
        *,
        source: str,
        span: tuple[int, int],
    ) -> None:
        self.span = span
        self.message = message
        self.source = source

        super().__init__()

    def __str__(self) -> str:
        marker = " " * self.span[0] + "~" * (self.span[1] - self.span[0]) + "^"
        return "\n    ".join([self.message, self.source, marker])
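

# For illustration (added; not part of the original module): with the message
# "Expected closing bracket", the source "name[extra", and span (4, 10), the
# rendered error reads roughly as
#
#    Expected closing bracket
#        name[extra
#            ~~~~~~^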


DEFAULT_RULES: dict[str, str | re.Pattern[str]] = {
    "LEFT_PARENTHESIS": r"\(",
    "RIGHT_PARENTHESIS": r"\)",
    "LEFT_BRACKET": r"\[",
    "RIGHT_BRACKET": r"\]",
    "SEMICOLON": r";",
    "COMMA": r",",
    "QUOTED_STRING": re.compile(
        r"""
            (
                ('[^']*')
                |
                ("[^"]*")
            )
        """,
        re.VERBOSE,
    ),
    "OP": r"(===|==|~=|!=|<=|>=|<|>)",
    "BOOLOP": r"\b(or|and)\b",
    "IN": r"\bin\b",
    "NOT": r"\bnot\b",
    "VARIABLE": re.compile(
        r"""
            \b(
                python_version
                |python_full_version
                |os[._]name
                |sys[._]platform
                |platform_(release|system)
                |platform[._](version|machine|python_implementation)
                |python_implementation
                |implementation_(name|version)
                |extras?
                |dependency_groups
            )\b
        """,
        re.VERBOSE,
    ),
    "SPECIFIER": re.compile(
        Specifier._operator_regex_str + Specifier._version_regex_str,
        re.VERBOSE | re.IGNORECASE,
    ),
    "AT": r"\@",
    "URL": r"[^ \t]+",
    "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b",
    "VERSION_PREFIX_TRAIL": r"\.\*",
    "VERSION_LOCAL_LABEL_TRAIL": r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*",
    "WS": r"[ \t]+",
    "END": r"$",
}
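

# Rough illustration (added; not part of the original module) of how these rules
# carve up a marker such as:
#
#     python_version < "3.9" and extra == "tests"
#
# VARIABLE matches "python_version" and "extra", OP matches "<" and "==",
# QUOTED_STRING matches '"3.9"' and '"tests"', BOOLOP matches "and", and WS the
# spaces in between. The Tokenizer below applies whichever rule the parser asks
# for at the current position; the rules have no fixed precedence of their own.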


class Tokenizer:
    """Context-sensitive token parsing.

    Provides methods to examine the input stream to check whether the next token
    matches.
    """

    def __init__(
        self,
        source: str,
        *,
        rules: dict[str, str | re.Pattern[str]],
    ) -> None:
        self.source = source
        self.rules: dict[str, re.Pattern[str]] = {
            name: re.compile(pattern) for name, pattern in rules.items()
        }
        self.next_token: Token | None = None
        self.position = 0

    def consume(self, name: str) -> None:
        """Move beyond provided token name, if at current position."""
        if self.check(name):
            self.read()

    def check(self, name: str, *, peek: bool = False) -> bool:
        """Check whether the next token has the provided name.

        By default, if the check succeeds, the token *must* be read before
        another check. If `peek` is set to `True`, the token is not loaded and
        would need to be checked again.
        """
        assert self.next_token is None, (
            f"Cannot check for {name!r}, already have {self.next_token!r}"
        )
        assert name in self.rules, f"Unknown token name: {name!r}"

        expression = self.rules[name]

        match = expression.match(self.source, self.position)
        if match is None:
            return False
        if not peek:
            self.next_token = Token(name, match[0], self.position)
        return True
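
    # Illustrative sketch (added; not in the original module) of the check/read
    # contract:
    #
    #     tokenizer = Tokenizer("extra", rules=DEFAULT_RULES)
    #     tokenizer.check("IDENTIFIER")       # True; the token is now cached
    #     tokenizer.read()                    # Token(name='IDENTIFIER', text='extra', position=0)
    #     tokenizer.check("END", peek=True)   # True; nothing cached, may be checked again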

    def expect(self, name: str, *, expected: str) -> Token:
        """Expect a certain token name next, failing with a syntax error otherwise.

        The matching token is read and returned.
        """
        if not self.check(name):
            raise self.raise_syntax_error(f"Expected {expected}")
        return self.read()

    def read(self) -> Token:
        """Consume the next token and return it."""
        token = self.next_token
        assert token is not None

        self.position += len(token.text)
        self.next_token = None

        return token

    def raise_syntax_error(
        self,
        message: str,
        *,
        span_start: int | None = None,
        span_end: int | None = None,
    ) -> NoReturn:
        """Raise ParserSyntaxError at the given position."""
        span = (
            self.position if span_start is None else span_start,
            self.position if span_end is None else span_end,
        )
        raise ParserSyntaxError(
            message,
            source=self.source,
            span=span,
        )
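
    # Note (added for clarity; not in the original module): span_start lets callers
    # point the error at an earlier position, e.g. the opening bracket recorded by
    # enclosing_tokens() below, rather than at the current position.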

    @contextlib.contextmanager
    def enclosing_tokens(
        self, open_token: str, close_token: str, *, around: str
    ) -> Iterator[None]:
        """Parse an optionally enclosed block.

        If ``open_token`` is present at the current position it is consumed before
        yielding, and a matching ``close_token`` is then required once the enclosed
        block has been parsed; a missing closing token raises a syntax error
        pointing back at the opening one.
        """
        if self.check(open_token):
            open_position = self.position
            self.read()
        else:
            open_position = None

        yield

        if open_position is None:
            return

        if not self.check(close_token):
            self.raise_syntax_error(
                f"Expected matching {close_token} for {open_token}, after {around}",
                span_start=open_position,
            )

        self.read()
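

# A minimal usage sketch (added for illustration; not part of the original module).
# It walks the start of a dependency specifier with the default rules; because of
# the relative import above, run it as a module within its package rather than as
# a standalone script.
if __name__ == "__main__":
    tokenizer = Tokenizer("requests[security] >= 2.8.1", rules=DEFAULT_RULES)

    name = tokenizer.expect("IDENTIFIER", expected="package name at the start")
    print(name)  # Token(name='IDENTIFIER', text='requests', position=0)

    # The bracketed extras must be closed; a missing "]" raises ParserSyntaxError
    # whose span points back at the opening bracket.
    with tokenizer.enclosing_tokens("LEFT_BRACKET", "RIGHT_BRACKET", around="extras"):
        tokenizer.consume("WS")
        extra = tokenizer.expect("IDENTIFIER", expected="extra name")
        tokenizer.consume("WS")
    print(extra)  # Token(name='IDENTIFIER', text='security', position=9)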