qwen_tokenizer.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import base64
  15. import importlib.util
  16. import os
  17. import unicodedata
  18. from typing import Collection, Dict, List, Set, Tuple, Union
  19. from .tokenizer_utils import PretrainedTokenizer
  20. from .tokenizer_utils_base import AddedToken
  21. __all__ = ["QWenTokenizer"]
  22. VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
  23. PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
  24. ENDOFTEXT = "<|endoftext|>"
  25. IMSTART = "<|im_start|>"
  26. IMEND = "<|im_end|>"
  27. # as the default behavior is changed to allow special tokens in
  28. # regular texts, the surface forms of special tokens need to be
  29. # as different as possible to minimize the impact
  30. EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
  31. SPECIAL_TOKENS = (
  32. ENDOFTEXT,
  33. IMSTART,
  34. IMEND,
  35. ) + EXTRAS
  36. tiktoken = None
  37. def is_tiktoken_available():
  38. return importlib.util.find_spec("tiktoken") is not None
  39. def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
  40. with open(tiktoken_bpe_file, "rb") as f:
  41. contents = f.read()
  42. return {
  43. base64.b64decode(token): int(rank)
  44. for token, rank in (line.split() for line in contents.splitlines() if line)
  45. }
  46. class QWenTokenizer(PretrainedTokenizer):
  47. """QWen tokenizer."""
  48. model_input_names = ["input_ids", "attention_mask", "position_ids"]
  49. resource_files_names = VOCAB_FILES_NAMES
  50. def __init__(
  51. self,
  52. vocab_file,
  53. errors="replace",
  54. padding_side="left",
  55. **kwargs,
  56. ):
  57. super().__init__(**kwargs)
  58. if not is_tiktoken_available():
  59. raise ValueError(
  60. "tiktoken is not installed, please install it use: pip install tiktoken"
  61. )
  62. import tiktoken as tk
  63. tiktoken = tk
  64. self.errors = errors # how to handle errors in decoding
  65. self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
  66. self.special_tokens = {
  67. token: index
  68. for index, token in enumerate(
  69. SPECIAL_TOKENS, start=len(self.mergeable_ranks)
  70. )
  71. }
  72. enc = tiktoken.Encoding(
  73. "Qwen",
  74. pat_str=PAT_STR,
  75. mergeable_ranks=self.mergeable_ranks,
  76. special_tokens=self.special_tokens,
  77. )
  78. assert (
  79. len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
  80. ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
  81. self.decoder = {
  82. v: k for k, v in self.mergeable_ranks.items()
  83. } # type: dict[int, bytes|str]
  84. self.decoder.update({v: k for k, v in self.special_tokens.items()})
  85. self.tokenizer = enc # type: tiktoken.Encoding
  86. self.eod_id = self.tokenizer.eot_token
  87. self.im_start_id = self.special_tokens[IMSTART]
  88. self.im_end_id = self.special_tokens[IMEND]
  89. if "pad_token_id" in kwargs:
  90. self.pad_token_id = kwargs["pad_token_id"]
  91. if "eos_token_id" in kwargs:
  92. self.eos_token_id = kwargs["eos_token_id"]
  93. def __len__(self) -> int:
  94. return self.tokenizer.n_vocab
  95. def get_vocab(self) -> Dict[bytes, int]:
  96. return self.mergeable_ranks
  97. def convert_tokens_to_ids(
  98. self, tokens: Union[bytes, str, List[Union[bytes, str]]]
  99. ) -> List[int]:
  100. ids = []
  101. if isinstance(tokens, (str, bytes)):
  102. if tokens in self.special_tokens:
  103. return self.special_tokens[tokens]
  104. else:
  105. return self.mergeable_ranks.get(tokens)
  106. for token in tokens:
  107. if token in self.special_tokens:
  108. ids.append(self.special_tokens[token])
  109. else:
  110. ids.append(self.mergeable_ranks.get(token))
  111. return ids
  112. def _update_tiktoken(self, tokens: List[str], special_tokens: bool = False) -> int:
  113. if special_tokens:
  114. added_tokens = []
  115. for token in tokens:
  116. if token in self.special_tokens:
  117. continue
  118. token_id = len(self.mergeable_ranks) + len(self.special_tokens)
  119. self.special_tokens[token] = token_id
  120. self.decoder[token_id] = token
  121. added_tokens.append(token)
  122. import tiktoken
  123. self.tokenizer = tiktoken.Encoding(
  124. "Qwen",
  125. pat_str=PAT_STR,
  126. mergeable_ranks=self.mergeable_ranks,
  127. special_tokens=self.special_tokens,
  128. )
  129. return len(added_tokens)
  130. else:
  131. raise ValueError("Adding regular tokens is not supported")
  132. def _add_tokens(
  133. self,
  134. new_tokens: Union[List[str], List[AddedToken]],
  135. special_tokens: bool = False,
  136. ) -> int:
  137. if not special_tokens and new_tokens:
  138. raise ValueError("Adding regular tokens is not supported")
  139. new_tokens_str = []
  140. for token in new_tokens:
  141. surface_form = token.content if isinstance(token, AddedToken) else token
  142. new_tokens_str.append(surface_form)
  143. return self._update_tiktoken(new_tokens_str, special_tokens)
  144. def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
  145. """
  146. Save only the vocabulary of the tokenizer (vocabulary).
  147. Returns:
  148. `Tuple(str)`: Paths to the files saved.
  149. """
  150. file_path = os.path.join(save_directory, "qwen.tiktoken")
  151. with open(file_path, "w", encoding="utf8") as w:
  152. for k, v in self.mergeable_ranks.items():
  153. line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
  154. w.write(line)
  155. return (file_path,)
  156. def tokenize(
  157. self,
  158. text: str,
  159. allowed_special: Union[Set, str] = "all",
  160. disallowed_special: Union[Collection, str] = (),
  161. **kwargs,
  162. ) -> List[Union[bytes, str]]:
  163. """
  164. Converts a string in a sequence of tokens.
  165. Args:
  166. text (`str`):
  167. The sequence to be encoded.
  168. allowed_special (`Literal["all"]` or `set`):
  169. The surface forms of the tokens to be encoded as special tokens in regular texts.
  170. Default to "all".
  171. disallowed_special (`Literal["all"]` or `Collection`):
  172. The surface forms of the tokens that should not be in regular texts and trigger errors.
  173. Default to an empty tuple.
  174. kwargs (additional keyword arguments, *optional*):
  175. Will be passed to the underlying model specific encode method.
  176. Returns:
  177. `List[bytes|str]`: The list of tokens.
  178. """
  179. tokens = []
  180. text = unicodedata.normalize("NFC", text)
  181. # this implementation takes a detour: text -> token id -> token surface forms
  182. for t in self.tokenizer.encode(
  183. text, allowed_special=allowed_special, disallowed_special=disallowed_special
  184. ):
  185. tokens.append(self.decoder[t])
  186. return tokens
  187. def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
  188. """
  189. Converts a sequence of tokens in a single string.
  190. """
  191. text = ""
  192. temp = b""
  193. for t in tokens:
  194. if isinstance(t, str):
  195. if temp:
  196. text += temp.decode("utf-8", errors=self.errors)
  197. temp = b""
  198. text += t
  199. elif isinstance(t, bytes):
  200. temp += t
  201. else:
  202. raise TypeError("token should only be of type types or str")
  203. if temp:
  204. text += temp.decode("utf-8", errors=self.errors)
  205. return text
  206. @property
  207. def vocab_size(self):
  208. return self.tokenizer.n_vocab
  209. def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
  210. """Converts an id to a token, special tokens included"""
  211. if index in self.decoder:
  212. return self.decoder[index]
  213. raise ValueError("unknown ids")
  214. def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
  215. """Converts a token to an id using the vocab, special tokens included"""
  216. if token in self.special_tokens:
  217. return self.special_tokens[token]
  218. if token in self.mergeable_ranks:
  219. return self.mergeable_ranks[token]
  220. raise ValueError("unknown token")
  221. def _tokenize(self, text: str, **kwargs):
  222. """
  223. Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
  224. vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
  225. Do NOT take care of added tokens.
  226. """
  227. raise NotImplementedError
  228. def _decode(
  229. self,
  230. token_ids: Union[int, List[int]],
  231. skip_special_tokens: bool = False,
  232. errors: str = None,
  233. **kwargs,
  234. ) -> str:
  235. if isinstance(token_ids, int):
  236. token_ids = [token_ids]
  237. if skip_special_tokens:
  238. token_ids = [i for i in token_ids if i < self.eod_id]
  239. return self.tokenizer.decode(token_ids, errors=errors or self.errors)