qwen2_5_tokenizer.py

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List

from .qwen2_tokenizer import Qwen2Tokenizer
from .tokenizer_utils_base import AddedToken, TextInput


class MIXQwen2_5_Tokenizer(Qwen2Tokenizer):
    def __init__(self, *args, **kwargs):
        super(MIXQwen2_5_Tokenizer, self).__init__(*args, **kwargs)

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop(
            "split_special_tokens", self.split_special_tokens
        )

        # Map special-token strings to their AddedToken objects so that their
        # lstrip/rstrip flags can be applied when splitting below.
        all_special_tokens_extended = dict(
            (str(t), t)
            for t in self.all_special_tokens_extended
            if isinstance(t, AddedToken)
        )

        # Add special tokens
        for t in self.added_tokens_decoder:
            token = self.added_tokens_decoder[t]
            if isinstance(token, AddedToken) and token.special:
                all_special_tokens_extended[str(token)] = token
                if str(token) not in self.all_special_tokens:
                    self.all_special_tokens.append(str(token))
                if str(token) not in self.unique_no_split_tokens:
                    self.unique_no_split_tokens.append(str(token))

        # Rebuild the trie so newly registered special tokens are matched when splitting.
        self._create_trie(self.unique_no_split_tokens)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        # TODO: should this be in the base class?
        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase
            escaped_special_toks = [
                re.escape(s_tok)
                for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(
                pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text
            )

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = set(
                self.unique_no_split_tokens
            )  # don't split on any of the added tokens
            tokens = self.tokens_trie.split(text)
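        # At this point `tokens` interleaves plain text with unsplit special tokens,
        # e.g. ["This is something", "<|im_start|>", "  else"] (illustrative values).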
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = all_special_tokens_extended.get(token, None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here

        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))
        return tokenized_text
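

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal, hypothetical example
# of exercising the tokenizer, assuming the Qwen2.5 tokenizer files can be
# loaded through the inherited `from_pretrained` API. The checkpoint name and
# the chat-style special tokens below are illustrative only.
if __name__ == "__main__":
    # Assumed checkpoint name; replace with a local path or name that resolves
    # in your environment.
    tokenizer = MIXQwen2_5_Tokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

    sample = "<|im_start|>user\nHello world<|im_end|>"

    # Default behaviour: special tokens are kept intact as single tokens.
    print(tokenizer.tokenize(sample))

    # With split_special_tokens=True they are tokenized like ordinary text.
    print(tokenizer.tokenize(sample, split_special_tokens=True))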