clip_tokenizer.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import json
  15. import logging
  16. import os
  17. import shutil
  18. import unicodedata
  19. from functools import lru_cache
  20. from typing import List, Optional
  21. from .tokenizer_utils import (
  22. PretrainedTokenizer,
  23. _is_control,
  24. _is_punctuation,
  25. _is_whitespace,
  26. )
  27. from .tokenizer_utils_base import AddedToken
# Public API of this module: only the CLIP tokenizer class is exported by
# `from <module> import *`; the helper functions and BasicTokenizer are not.
__all__ = ["CLIPTokenizer"]
  29. @lru_cache()
  30. def bytes_to_unicode():
  31. """
  32. Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
  33. characters the bpe code barfs on.
  34. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
  35. if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
  36. decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
  37. tables between utf-8 bytes and unicode strings.
  38. """
  39. bs = (
  40. list(range(ord("!"), ord("~") + 1))
  41. + list(range(ord("¡"), ord("¬") + 1))
  42. + list(range(ord("®"), ord("ÿ") + 1))
  43. )
  44. cs = bs[:]
  45. n = 0
  46. for b in range(2**8):
  47. if b not in bs:
  48. bs.append(b)
  49. cs.append(2**8 + n)
  50. n += 1
  51. cs = [chr(n) for n in cs]
  52. return dict(zip(bs, cs))
  53. def get_pairs(word):
  54. """
  55. Return set of symbol pairs in a word.
  56. Word is represented as tuple of symbols (symbols being variable-length strings).
  57. """
  58. pairs = set()
  59. prev_char = word[0]
  60. for char in word[1:]:
  61. pairs.add((prev_char, char))
  62. prev_char = char
  63. return pairs
  64. def whitespace_clean(text, re):
  65. text = re.sub(r"\s+", " ", text)
  66. text = text.strip()
  67. return text
  68. def whitespace_tokenize(text):
  69. """Runs basic whitespace cleaning and splitting on a piece of text."""
  70. text = text.strip()
  71. if not text:
  72. return []
  73. tokens = text.split()
  74. return tokens
  75. # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
  76. class BasicTokenizer(object):
  77. """
  78. Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
  79. Args:
  80. do_lower_case (`bool`, *optional*, defaults to `True`):
  81. Whether or not to lowercase the input when tokenizing.
  82. never_split (`Iterable`, *optional*):
  83. Collection of tokens which will never be split during tokenization. Only has an effect when
  84. `do_basic_tokenize=True`
  85. tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
  86. Whether or not to tokenize Chinese characters.
  87. This should likely be deactivated for Japanese (see this
  88. [issue](https://github.com/huggingface/transformers/issues/328)).
  89. strip_accents (`bool`, *optional*):
  90. Whether or not to strip all accents. If this option is not specified, then it will be determined by the
  91. value for `lowercase` (as in the original BERT).
  92. do_split_on_punc (`bool`, *optional*, defaults to `True`):
  93. In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
  94. the full context of the words, such as contractions.
  95. """
  96. def __init__(
  97. self,
  98. do_lower_case=True,
  99. never_split=None,
  100. tokenize_chinese_chars=True,
  101. strip_accents=None,
  102. do_split_on_punc=True,
  103. ):
  104. if never_split is None:
  105. never_split = []
  106. self.do_lower_case = do_lower_case
  107. self.never_split = set(never_split)
  108. self.tokenize_chinese_chars = tokenize_chinese_chars
  109. self.strip_accents = strip_accents
  110. self.do_split_on_punc = do_split_on_punc
  111. def tokenize(self, text, never_split=None):
  112. """
  113. Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
  114. Args:
  115. never_split (`List[str]`, *optional*)
  116. Kept for backward compatibility purposes. Now implemented directly at the base class level (see
  117. [`PreTrainedTokenizer.tokenize`]) List of token not to split.
  118. """
  119. # union() returns a new set by concatenating the two sets.
  120. never_split = (
  121. self.never_split.union(set(never_split))
  122. if never_split
  123. else self.never_split
  124. )
  125. text = self._clean_text(text)
  126. # This was added on November 1st, 2018 for the multilingual and Chinese
  127. # models. This is also applied to the English models now, but it doesn't
  128. # matter since the English models were not trained on any Chinese data
  129. # and generally don't have any Chinese data in them (there are Chinese
  130. # characters in the vocabulary because Wikipedia does have some Chinese
  131. # words in the English Wikipedia.).
  132. if self.tokenize_chinese_chars:
  133. text = self._tokenize_chinese_chars(text)
  134. # prevents treating the same character with different unicode codepoints as different characters
  135. unicode_normalized_text = unicodedata.normalize("NFC", text)
  136. orig_tokens = whitespace_tokenize(unicode_normalized_text)
  137. split_tokens = []
  138. for token in orig_tokens:
  139. if token not in never_split:
  140. if self.do_lower_case:
  141. token = token.lower()
  142. if self.strip_accents is not False:
  143. token = self._run_strip_accents(token)
  144. elif self.strip_accents:
  145. token = self._run_strip_accents(token)
  146. split_tokens.extend(self._run_split_on_punc(token, never_split))
  147. output_tokens = whitespace_tokenize(" ".join(split_tokens))
  148. return output_tokens
  149. def _run_strip_accents(self, text):
  150. """Strips accents from a piece of text."""
  151. text = unicodedata.normalize("NFD", text)
  152. output = []
  153. for char in text:
  154. cat = unicodedata.category(char)
  155. if cat == "Mn":
  156. continue
  157. output.append(char)
  158. return "".join(output)
  159. def _run_split_on_punc(self, text, never_split=None):
  160. """Splits punctuation on a piece of text."""
  161. if not self.do_split_on_punc or (
  162. never_split is not None and text in never_split
  163. ):
  164. return [text]
  165. chars = list(text)
  166. i = 0
  167. start_new_word = True
  168. output = []
  169. while i < len(chars):
  170. char = chars[i]
  171. if _is_punctuation(char):
  172. output.append([char])
  173. start_new_word = True
  174. else:
  175. if start_new_word:
  176. output.append([])
  177. start_new_word = False
  178. output[-1].append(char)
  179. i += 1
  180. return ["".join(x) for x in output]
  181. def _tokenize_chinese_chars(self, text):
  182. """Adds whitespace around any CJK character."""
  183. output = []
  184. for char in text:
  185. cp = ord(char)
  186. if self._is_chinese_char(cp):
  187. output.append(" ")
  188. output.append(char)
  189. output.append(" ")
  190. else:
  191. output.append(char)
  192. return "".join(output)
  193. def _is_chinese_char(self, cp):
  194. """Checks whether CP is the codepoint of a CJK character."""
  195. # This defines a "chinese character" as anything in the CJK Unicode block:
  196. # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
  197. #
  198. # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
  199. # despite its name. The modern Korean Hangul alphabet is a different block,
  200. # as is Japanese Hiragana and Katakana. Those alphabets are used to write
  201. # space-separated words, so they are not treated specially and handled
  202. # like the all of the other languages.
  203. if (
  204. (cp >= 0x4E00 and cp <= 0x9FFF)
  205. or (cp >= 0x3400 and cp <= 0x4DBF) #
  206. or (cp >= 0x20000 and cp <= 0x2A6DF) #
  207. or (cp >= 0x2A700 and cp <= 0x2B73F) #
  208. or (cp >= 0x2B740 and cp <= 0x2B81F) #
  209. or (cp >= 0x2B820 and cp <= 0x2CEAF) #
  210. or (cp >= 0xF900 and cp <= 0xFAFF)
  211. or (cp >= 0x2F800 and cp <= 0x2FA1F) #
  212. ): #
  213. return True
  214. return False
  215. def _clean_text(self, text):
  216. """Performs invalid character removal and whitespace cleanup on text."""
  217. output = []
  218. for char in text:
  219. cp = ord(char)
  220. if cp == 0 or cp == 0xFFFD or _is_control(char):
  221. continue
  222. if _is_whitespace(char):
  223. output.append(" ")
  224. else:
  225. output.append(char)
  226. return "".join(output)
  227. class CLIPTokenizer(PretrainedTokenizer):
  228. r"""
  229. Construct a CLIP tokenizer based on byte-level Byte-Pair-Encoding.
  230. This tokenizer inherits from :class:`~paddlenlp.transformers.gpt.tokenizer.GPTTokenizer`.
  231. For more information regarding those methods, please refer to this superclass.
  232. Args:
  233. vocab_file (str):
  234. Path to the vocabulary file.
  235. The vocab file contains a mapping from vocabulary strings to indices.
  236. merges_file (str):
  237. Path to the merge file.
  238. The merge file is used to split the input sentence into "subword" units.
  239. The vocab file is then used to encode those units as intices.
  240. errors (str):
  241. Paradigm to follow when decoding bytes to UTF-8.
  242. Defaults to `'replace'`.
  243. max_len (int, optional):
  244. The maximum value of the input sequence length.
  245. Defaults to `77`.
  246. bos_token (str, optional):
  247. The beginning of sequence token that was used during pretraining. Can be
  248. used a sequence classifier token.
  249. Defaults to `"<|startoftext|>"`.
  250. eos_token (str, optional):
  251. A special token representing the end of a sequence that was used during pretraining.
  252. Defaults to `"<|endoftext|>"`.
  253. unk_token (str, optional):
  254. A special token representing the *unknown (out-of-vocabulary)* token.
  255. An unknown token is set to be `unk_token` inorder to be converted to an ID.
  256. Defaults to `"<|endoftext|>"`.
  257. pad_token (str, optional):
  258. A special token used to make arrays of tokens the same size for batching purposes.
  259. Defaults to `"<|endoftext|>"`.
  260. Examples:
  261. .. code-block::
  262. from paddlenlp.transformers import AutoTokenizer
  263. tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-base-patch32')
  264. print(tokenizer('He was a puppeteer'))
  265. '''
  266. {'input_ids': [49406, 797, 739, 320, 7116, 38820, 528, 49407]}
  267. '''
  268. """
  269. # merges and vocab same as GPT2
  270. resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
  271. pretrained_resource_files_map = {"vocab_file": {}, "merges_file": {}}
  272. pretrained_init_configuration = {}
  273. model_input_names = [
  274. "input_ids",
  275. "attention_mask",
  276. ]
  277. def __init__(
  278. self,
  279. vocab_file,
  280. merges_file,
  281. errors="replace",
  282. max_len=77,
  283. bos_token="<|startoftext|>",
  284. eos_token="<|endoftext|>",
  285. unk_token="<|endoftext|>",
  286. pad_token="<|endoftext|>",
  287. **kwargs
  288. ):
  289. from paddle.utils import try_import
  290. bos_token = (
  291. AddedToken(bos_token, lstrip=False, rstrip=False)
  292. if isinstance(bos_token, str)
  293. else bos_token
  294. )
  295. eos_token = (
  296. AddedToken(eos_token, lstrip=False, rstrip=False)
  297. if isinstance(eos_token, str)
  298. else eos_token
  299. )
  300. unk_token = (
  301. AddedToken(unk_token, lstrip=False, rstrip=False)
  302. if isinstance(unk_token, str)
  303. else unk_token
  304. )
  305. pad_token = (
  306. AddedToken(pad_token, lstrip=False, rstrip=False)
  307. if isinstance(pad_token, str)
  308. else pad_token
  309. )
  310. self._build_special_tokens_map_extended(
  311. bos_token=bos_token,
  312. eos_token=eos_token,
  313. unk_token=unk_token,
  314. pad_token=pad_token,
  315. )
  316. try:
  317. import ftfy
  318. self.fix_text = ftfy.fix_text
  319. except ImportError:
  320. logging.info(
  321. "ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy."
  322. )
  323. self.nlp = BasicTokenizer(
  324. strip_accents=False, do_split_on_punc=False, do_lower_case=True
  325. )
  326. self.fix_text = None
  327. self.re = try_import("regex")
  328. self._vocab_file = vocab_file
  329. self._merges_file = merges_file
  330. self.max_len = max_len if max_len is not None else int(1e12)
  331. with open(vocab_file, encoding="utf-8") as vocab_handle:
  332. self.encoder = json.load(vocab_handle)
  333. self.decoder = {v: k for k, v in self.encoder.items()}
  334. self.errors = errors # how to handle errors in decoding
  335. self.byte_encoder = bytes_to_unicode()
  336. self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
  337. with open(merges_file, encoding="utf-8") as merges_handle:
  338. bpe_merges = (
  339. merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
  340. )
  341. bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
  342. self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
  343. self.cache = {
  344. "<|startoftext|>": "<|startoftext|>",
  345. "<|endoftext|>": "<|endoftext|>",
  346. }
  347. self.pat = self.re.compile(
  348. r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
  349. self.re.IGNORECASE,
  350. )
  351. @property
  352. def vocab_size(self):
  353. """
  354. Returns the size of vocabulary.
  355. Returns:
  356. int: The sum of size of vocabulary and the size of speical tokens.
  357. """
  358. return len(self.encoder)
  359. def get_vocab(self):
  360. return dict(self.encoder, **self.added_tokens_encoder)
  361. def build_inputs_with_special_tokens(
  362. self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  363. ) -> List[int]:
  364. """
  365. Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
  366. adding special tokens. A CLIP sequence has the following format:
  367. - single sequence: `<|startoftext|> X <|endoftext|>`
  368. Pairs of sequences are not the expected use case, but they will be handled without a separator.
  369. Args:
  370. token_ids_0 (`List[int]`):
  371. List of IDs to which the special tokens will be added.
  372. token_ids_1 (`List[int]`, *optional*):
  373. Optional second list of IDs for sequence pairs.
  374. Returns:
  375. `List[int]`: List of input IDs with the appropriate special tokens.
  376. """
  377. bos_token = [self.bos_token_id]
  378. eos_token = [self.eos_token_id]
  379. if token_ids_1 is None:
  380. return bos_token + token_ids_0 + eos_token
  381. return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token
  382. def build_offset_mapping_with_special_tokens(
  383. self, offset_mapping_0, offset_mapping_1=None
  384. ):
  385. """
  386. Build offset map from a pair of offset map by concatenating and adding offsets of special tokens.
  387. Should be overridden in a subclass if the model has a special way of building those.
  388. Args:
  389. offset_mapping_0 (List[tuple]):
  390. List of char offsets to which the special tokens will be added.
  391. offset_mapping_1 (List[tuple], optional):
  392. Optional second list of char offsets for offset mapping pairs.
  393. Returns:
  394. List[tuple]: List of char offsets with the appropriate offsets of special tokens.
  395. """
  396. if offset_mapping_1 is None:
  397. return [(0, 0)] + offset_mapping_0 + [(0, 0)]
  398. return (
  399. [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
  400. )
  401. def get_special_tokens_mask(
  402. self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
  403. ):
  404. """
  405. Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
  406. special tokens using the tokenizer `prepare_for_model` method.
  407. Args:
  408. token_ids_0 (`List[int]`):
  409. List of IDs.
  410. token_ids_1 (`List[int]`, *optional*):
  411. Optional second list of IDs for sequence pairs.
  412. already_has_special_tokens (`bool`, *optional*, defaults to `False`):
  413. Whether or not the token list is already formatted with special tokens for the model.
  414. Returns:
  415. `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
  416. """
  417. if already_has_special_tokens:
  418. return super().get_special_tokens_mask(
  419. token_ids_0=token_ids_0,
  420. token_ids_1=token_ids_1,
  421. already_has_special_tokens=True,
  422. )
  423. if token_ids_1 is None:
  424. return [1] + ([0] * len(token_ids_0)) + [1]
  425. return (
  426. [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1]
  427. )
  428. def create_token_type_ids_from_sequences(
  429. self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  430. ) -> List[int]:
  431. """
  432. Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
  433. zeros is returned.
  434. Args:
  435. token_ids_0 (`List[int]`):
  436. List of IDs.
  437. token_ids_1 (`List[int]`, *optional*):
  438. Optional second list of IDs for sequence pairs.
  439. Returns:
  440. `List[int]`: List of zeros.
  441. """
  442. bos_token = [self.bos_token_id]
  443. eos_token = [self.eos_token_id]
  444. if token_ids_1 is None:
  445. return len(bos_token + token_ids_0 + eos_token) * [0]
  446. return len(
  447. bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token
  448. ) * [0]
  449. def bpe(self, token):
  450. if token in self.cache:
  451. return self.cache[token]
  452. word = tuple(token[:-1]) + (token[-1] + "</w>",)
  453. pairs = get_pairs(word)
  454. if not pairs:
  455. return token + "</w>"
  456. while True:
  457. bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
  458. if bigram not in self.bpe_ranks:
  459. break
  460. first, second = bigram
  461. new_word = []
  462. i = 0
  463. while i < len(word):
  464. try:
  465. j = word.index(first, i)
  466. except ValueError:
  467. new_word.extend(word[i:])
  468. break
  469. else:
  470. new_word.extend(word[i:j])
  471. i = j
  472. if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
  473. new_word.append(first + second)
  474. i += 2
  475. else:
  476. new_word.append(word[i])
  477. i += 1
  478. new_word = tuple(new_word)
  479. word = new_word
  480. if len(word) == 1:
  481. break
  482. else:
  483. pairs = get_pairs(word)
  484. word = " ".join(word)
  485. self.cache[token] = word
  486. return word
  487. def _tokenize(self, text):
  488. """Tokenize a string."""
  489. bpe_tokens = []
  490. if self.fix_text is None:
  491. text = " ".join(self.nlp.tokenize(text))
  492. else:
  493. text = whitespace_clean(self.fix_text(text), self.re).lower()
  494. for token in self.re.findall(self.pat, text):
  495. token = "".join(
  496. self.byte_encoder[b] for b in token.encode("utf-8")
  497. ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
  498. bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
  499. return bpe_tokens
  500. def _convert_token_to_id(self, token):
  501. """Converts a token (str) in an id using the vocab."""
  502. return self.encoder.get(token, self.encoder.get(self.unk_token))
  503. def _convert_id_to_token(self, index):
  504. """Converts an index (integer) in a token (str) using the vocab."""
  505. return self.decoder.get(index)
  506. def convert_tokens_to_string(self, tokens):
  507. """Converts a sequence of tokens (string) in a single string."""
  508. text = "".join(tokens)
  509. byte_array = bytearray([self.byte_decoder[c] for c in text])
  510. text = (
  511. byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
  512. )
  513. return text
  514. def save_resources(self, save_directory):
  515. """
  516. Saves `SentencePiece <https://github.com/google/sentencepiece>`__ file
  517. (ends with '.spm') under `save_directory`.
  518. Args:
  519. save_directory (str): Directory to save files into.
  520. """
  521. for name, file_name in self.resource_files_names.items():
  522. source_path = getattr(self, "_%s" % name)
  523. save_path = os.path.join(save_directory, file_name)
  524. if os.path.abspath(source_path) != os.path.abspath(save_path):
  525. shutil.copyfile(source_path, save_path)