llama_tokenizer.py

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from shutil import copyfile
from typing import List, Optional, Tuple

from paddlex.inference.models.common.tokenizer.tokenizer_utils import (
    PretrainedTokenizer,
)


class LlamaTokenizer(PretrainedTokenizer):
    model_input_names = ["input_ids", "attention_mask", "position_ids"]
    resource_files_names = {
        "vocab_file": "sentencepiece.bpe.model",
    }
    pretrained_resource_files_map = {
        "vocab_file": {
            "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
            "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
            "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
            "facebook/llama-13b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
            "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
            "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
        },
    }
    pretrained_init_configuration = {
        "__internal_testing__/micro-random-llama": {},
        "__internal_testing__/tiny-random-llama": {},
        "facebook/llama-7b": {},
        "facebook/llama-13b": {},
        "facebook/llama-30b": {},
        "facebook/llama-65b": {},
    }
    padding_side = "left"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        add_bos_token=True,
        add_eos_token=False,
        sp_model_kwargs=None,
        decode_with_prefix_space=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        super().__init__(
            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.decode_with_prefix_space = decode_with_prefix_space
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", True))

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def __len__(self):
        """
        Returns the full vocabulary size: the sentencepiece vocab size plus the
        number of added tokens whose ids fall outside the sentencepiece model.
        """
        added_size = 0
        for id in self.added_tokens_decoder:
            if id >= self.sp_model.get_piece_size():
                added_size += 1
        return self.vocab_size + added_size

    @property
    def bos_token_id(self) -> Optional[int]:
        return self.sp_model.bos_id()

    @property
    def eos_token_id(self) -> Optional[int]:
        return self.sp_model.eos_id()

    def get_spm_processor(self, from_slow=True):
        import sentencepiece as spm
        from sentencepiece import sentencepiece_model_pb2 as model_pb2

        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        # Otherwise, patch the serialized model so the normalizer does not add a
        # dummy prefix (leading whitespace), then load the patched proto.
        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.id_to_piece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) to a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(
        self, save_directory, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple[str]`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(
                f"Vocabulary path ({save_directory}) should be a directory"
            )
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + self.resource_files_names["vocab_file"],
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # Output layout: [bos] + token_ids_0 (+ token_ids_1) (+ [eos]),
        # depending on add_bos_token / add_eos_token.
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []
        output = bos_token_ids + token_ids_0
        if token_ids_1 is not None:
            output = output + token_ids_1
        if self.add_eos_token:
            output = output + [self.eos_token_id]
        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Llama does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
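

# Usage sketch: a minimal, hedged example of driving this tokenizer end to end.
# It assumes a local SentencePiece model file (the path below is hypothetical)
# and that `tokenize` / `convert_tokens_to_ids` are inherited from
# PretrainedTokenizer, which this class does not override.
if __name__ == "__main__":
    # Hypothetical path; point this at a real sentencepiece.bpe.model file.
    tokenizer = LlamaTokenizer(vocab_file="sentencepiece.bpe.model")

    text = "Hello, world!"
    tokens = tokenizer.tokenize(text)  # SentencePiece pieces (model-dependent)
    ids = tokenizer.convert_tokens_to_ids(tokens)

    # Prepend <s> (and optionally append </s>) per add_bos_token / add_eos_token.
    input_ids = tokenizer.build_inputs_with_special_tokens(ids)

    print(tokens)
    print(input_ids)
    print(tokenizer.convert_tokens_to_string(tokens))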