# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List

from .qwen2_tokenizer import Qwen2Tokenizer
from .tokenizer_utils_base import AddedToken, TextInput


class MIXQwen2_5_Tokenizer(Qwen2Tokenizer):
    def __init__(self, *args, **kwargs):
        super(MIXQwen2_5_Tokenizer, self).__init__(*args, **kwargs)
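
    # This override re-registers special tokens from ``added_tokens_decoder``
    # and rebuilds the no-split trie on every call, so tokens added after
    # initialization are still treated as unsplittable units.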
    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or into sub-words for
        sub-word-based vocabularies (BPE/SentencePiece/WordPiece). Takes care
        of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization`
                preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
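
        # Map each special token's string form to its AddedToken object so the
        # lstrip/rstrip flags stay available for the whitespace pass below.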
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        # Add special tokens
        for t in self.added_tokens_decoder:
            token = self.added_tokens_decoder[t]
            if isinstance(token, AddedToken) and token.special:
                all_special_tokens_extended[str(token)] = token
                if str(token) not in self.all_special_tokens:
                    self.all_special_tokens.append(str(token))
                if str(token) not in self.unique_no_split_tokens:
                    self.unique_no_split_tokens.append(str(token))
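
        # Rebuild the no-split trie so the tokens registered above are matched
        # as whole units by ``tokens_trie.split`` below.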
        self._create_trie(self.unique_no_split_tokens)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        # TODO: should this be in the base class?
        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase
            escaped_special_toks = [
                re.escape(s_tok)
                for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            # don't split on any of the added tokens
            no_split_token = set(self.unique_no_split_tokens)
            tokens = self.tokens_trie.split(text)
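            # e.g. "This is something<|im_end|> else"
            #   -> ["This is something", "<|im_end|>", " else"]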

        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = all_special_tokens_extended.get(token, None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive, but we strip the *left* of the
                        # next segment: tok_extended.rstrip means the special token
                        # eats all whitespace on its right.
                        tokens[i + 1] = right.lstrip()
                    # Strip whitespace on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # opposite here
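
        # Special tokens pass through verbatim below; every other segment goes
        # through the underlying BPE ``_tokenize``.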
        tokenized_text = []
        for token in tokens:
            # Skip any empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))
        return tokenized_text
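

# Usage sketch (illustrative only; this module uses relative imports, so import
# the class through its package rather than running this file directly. The
# checkpoint name below is an assumption, not pinned by this file):
#
#     tokenizer = MIXQwen2_5_Tokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
#     tokens = tokenizer.tokenize("<|im_start|>user\nHello world<|im_end|>")
#     # special tokens such as <|im_start|>/<|im_end|> come back as single
#     # tokens; the text between them is BPE-split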