# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import io
import json
import os
import warnings

import numpy as np


class Vocab(object):
    """
    The class used to convert between tokens and ids. It also includes some
    store/load functions.

    Args:
        counter (collections.Counter, optional): A Counter instance that
            describes the tokens and their frequencies. Its keys will be
            indexed according to the order of frequency sorting to construct
            the mapping relationship. If None, `token_to_idx` must be provided
            as the mapping relationship. Default: None.
        max_size (int, optional): Max size of vocab, not including special
            tokens. Default: None.
        min_freq (int, optional): Ignore tokens whose frequencies are less
            than `min_freq`. Default: 1.
        token_to_idx (dict, optional): A dict that specifies the mapping
            relationship between tokens and indices to be used. If provided,
            adjust the tokens and indices mapping according to it. If None,
            `counter` must be provided. Default: None.
        unk_token (str, optional): Special token for the unknown token. If not
            needed, it also could be None. Default: None.
        pad_token (str, optional): Special token for the padding token. If not
            needed, it also could be None. Default: None.
        bos_token (str, optional): Special token for the bos token. If not
            needed, it also could be None. Default: None.
        eos_token (str, optional): Special token for the eos token. If not
            needed, it also could be None. Default: None.
        kwargs (dict): Keyword arguments ending with `_token`. They can be
            used to specify further special tokens that will be exposed as
            attributes of the vocabulary and associated with an index.
    """

    def __init__(
        self,
        counter=None,
        max_size=None,
        min_freq=1,
        token_to_idx=None,
        unk_token=None,
        pad_token=None,
        bos_token=None,
        eos_token=None,
        **kwargs
    ):
        # Handle special tokens
        combs = (
            ("unk_token", unk_token),
            ("pad_token", pad_token),
            ("bos_token", bos_token),
            ("eos_token", eos_token),
        )
        for name, value in combs:
            kwargs[name] = value

        special_tokens = []
        special_iter = kwargs.keys()
        # sort alphabetically
        special_iter = sorted(special_iter)
        for special_token_name in special_iter:
            # Test if the kwarg specifies a special token
            if not special_token_name.endswith("_token"):
                raise ValueError(
                    "{} is invalid. Only keyword arguments "
                    "that end in '_token' are supported "
                    "to declare special tokens.".format(special_token_name)
                )
            special_token = kwargs[special_token_name]
            if special_token is not None and special_token not in special_tokens:
                special_tokens.append(special_token)

        if counter is None:
            # Use token_to_idx as the dict to import a pretrained vocabulary.
            assert token_to_idx, "token_to_idx should not be None when counter is None"
            for special_token in special_tokens:
                assert special_token in token_to_idx, "{} is not in token_to_idx".format(
                    special_token
                )
            self._token_to_idx = token_to_idx
            self._idx_to_token = {idx: token for token, idx in token_to_idx.items()}
            if unk_token:
                unk_index = self._token_to_idx[unk_token]
                self._token_to_idx = collections.defaultdict(lambda: unk_index)
                self._token_to_idx.update(token_to_idx)
        else:
            self._idx_to_token = {
                idx: special_token for idx, special_token in enumerate(special_tokens)
            }
            # The default_factory is set below, once unk_token is known.
            self._token_to_idx = collections.defaultdict()
            self._token_to_idx.update(
                (token, idx) for idx, token in self._idx_to_token.items()
            )
            self._index_counter_keys(counter, special_tokens, max_size, min_freq)
            if token_to_idx:
                self._sort_index_according_to_user_specification(token_to_idx)
            if unk_token:
                self._token_to_idx.default_factory = lambda: self._token_to_idx[
                    unk_token
                ]

        # Expose the special tokens as attributes of the vocab instance.
        self._identifiers_to_tokens = kwargs
        for identifier, token in kwargs.items():
            if identifier.startswith("_"):
                raise ValueError(
                    "It is not allowed to use identifiers starting with an "
                    "underscore. In Python, identifier names beginning with "
                    "an underscore are internal."
                )
            if hasattr(self, identifier):
                raise ValueError(
                    "vocab.{} already exists. "
                    "Please choose a different identifier for token {}".format(
                        identifier, token
                    )
                )
            setattr(self, identifier, token)

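    # A quick construction sketch (toy tokens, not from a real vocab file):
    # building from a Counter indexes the special tokens first, in
    # alphabetical order of their kwarg names, then the remaining tokens by
    # descending frequency.
    #
    #     counter = collections.Counter(["apple", "apple", "pear"])
    #     vocab = Vocab(counter, unk_token="[UNK]", pad_token="[PAD]")
    #     # vocab["[PAD]"] == 0, vocab["[UNK]"] == 1 ("pad" sorts before "unk")
    #     # vocab["apple"] == 2, vocab["pear"] == 3
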
    def _index_counter_keys(self, counter, special_tokens, max_size, min_freq):
        # Sort by frequency, then alphabetically. The second sort is stable,
        # so tokens with equal frequency keep their alphabetical order.
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        # Frequencies of special tokens are not counted when building the
        # vocabulary in frequency order.
        special_tokens = set(special_tokens)
        max_size = None if max_size is None else max_size + len(special_tokens)
        for token, freq in token_freqs:
            if freq < min_freq or len(self._idx_to_token) == max_size:
                break
            if token not in special_tokens:
                self._idx_to_token[max(list(self._idx_to_token.keys()) + [-1]) + 1] = token
                self._token_to_idx[token] = max(self._idx_to_token.keys())

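    # Cutoff sketch for the loop above (hypothetical counts):
    #
    #     counter = collections.Counter({"a": 5, "b": 2, "c": 1})
    #     Vocab(counter, min_freq=2)  # drops "c", whose frequency 1 < 2
    #     Vocab(counter, max_size=2)  # keeps only the two most frequent tokens
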
    def _sort_index_according_to_user_specification(self, token_to_idx):
        # Sanity checks
        if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()):
            raise ValueError(
                "User-specified token_to_idx mapping can only contain "
                "tokens that will be part of the vocabulary."
            )
        if len(set(token_to_idx.values())) != len(token_to_idx):
            raise ValueError("User-specified indices must not contain duplicates.")
        if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(
            self.token_to_idx
        ):
            raise ValueError(
                "User-specified indices must not be < 0 or >= the number of tokens "
                "that will be in the vocabulary. The current vocab contains {} "
                "tokens.".format(len(self.token_to_idx))
            )

        # Update index ordering: swap each user-specified token with whatever
        # token currently occupies the requested index.
        for token, new_idx in token_to_idx.items():
            old_idx = self.token_to_idx[token]
            ousted_token = self.idx_to_token[new_idx]

            self.token_to_idx[token] = new_idx
            self.token_to_idx[ousted_token] = old_idx
            self.idx_to_token[old_idx] = ousted_token
            self.idx_to_token[new_idx] = token

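    # Swap sketch (toy vocab): requesting {"b": 0} moves "b" to index 0 and
    # gives the ousted token "b"'s old slot, so both dicts stay consistent.
    #
    #     vocab = Vocab(collections.Counter(["a", "a", "b"]), token_to_idx={"b": 0})
    #     # vocab["b"] == 0 and vocab["a"] == 1 after the swap
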
    def to_tokens(self, indices):
        """
        Maps the input indices to a token list.

        Args:
            indices (int|list[int]|tuple[int]|numpy.ndarray): The input index
                or indices for mapping. Must be an `int` or a 1D
                `list[int]`|`tuple[int]`|`numpy.ndarray`.

        Returns:
            str|list[str]: Obtained token(s). If `indices` is an integer, it
            will return a str. If `indices` is a list/tuple of integers, it
            will return a list of str.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                tokens = vocab.to_tokens([0, 1, 2, 3])
                print(tokens)
                # ['[PAD]', '[UNK]', '一斤三', '意面屋']
        """
        to_reduce = False
        if not isinstance(indices, (list, tuple, np.ndarray)):
            indices = [indices]
            to_reduce = True
        if isinstance(indices, (list, tuple)):
            indices = np.asarray(indices)

        if isinstance(indices, np.ndarray) and len(indices.shape) > 1:
            raise ValueError(
                "Token indices is invalid. Expected 1D array, but received {}D array.".format(
                    len(indices.shape)
                )
            )

        tokens = []
        for idx in indices:
            if not isinstance(idx, (int, np.integer)):
                warnings.warn(
                    "The type of `to_tokens()`'s input `indices` is not `int`, "
                    "so it will be forcibly converted to `int`."
                )
                idx = int(idx)
            try:
                tokens.append(self._idx_to_token[idx])
            except KeyError:
                raise ValueError(
                    "Token index {} in the provided `indices` is invalid.".format(idx)
                )

        return tokens[0] if to_reduce else tokens

    def to_indices(self, tokens):
        """
        Maps the input tokens into indices.

        Args:
            tokens (str|list[str]|tuple[str]): The input token(s) for mapping.

        Returns:
            int|list[int]: Obtained index (or indices). If `tokens` is a str,
            it will return an integer. If `tokens` is a list/tuple of str, it
            will return a list of integers.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                tokens = vocab.to_indices(['[PAD]', '[UNK]', '一斤三', '意面屋'])
                print(tokens)
                # [0, 1, 2, 3]
        """
        return self[tokens]

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return (
                self._token_to_idx[tokens]
                if tokens in self._token_to_idx
                else self._token_to_idx[self.unk_token]
            )
        else:
            return [
                (
                    self._token_to_idx[token]
                    if token in self._token_to_idx
                    else self._token_to_idx[self.unk_token]
                )
                for token in tokens
            ]

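    # Lookup sketch (toy vocab): when an `unk_token` was provided, unknown
    # tokens fall back to its index instead of raising.
    #
    #     vocab = Vocab(collections.Counter(["a", "b"]), unk_token="[UNK]")
    #     vocab["a"]               # index of "a"
    #     vocab["missing"]         # index of "[UNK]", not a KeyError
    #     vocab[["a", "missing"]]  # a list with both indices
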
    def __len__(self):
        return len(self._idx_to_token)

    def __contains__(self, token):
        return token in self._token_to_idx

    def __call__(self, tokens):
        """
        Maps the input tokens into indices. Its function is the same as the
        :meth:`to_indices` method.

        See details at :meth:`to_indices`.
        """
        return self[tokens]

    @property
    def idx_to_token(self):
        # Returns the index-to-token dict
        return self._idx_to_token

    @property
    def token_to_idx(self):
        # Returns the token-to-index dict
        return self._token_to_idx

    def to_json(self, path=None):
        """
        Summarizes some information of the vocab as a JSON string. If `path`
        is given, the JSON string will be saved into a file. Both the JSON
        string and the saved file can be used to reconstruct the
        :class:`Vocab` by calling the :meth:`from_json` method.

        Args:
            path (str, optional): The path to save the JSON string. If None,
                the JSON will not be saved. Default: None.

        Returns:
            str: The JSON string including information of the vocab.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                json_str = vocab.to_json(path='./vocab.json')
        """
        vocab_dict = {}
        vocab_dict["idx_to_token"] = dict(self.idx_to_token)
        vocab_dict["token_to_idx"] = dict(self.token_to_idx)
        vocab_dict["unk_token"] = self.unk_token
        vocab_dict["identifiers_to_tokens"] = self._identifiers_to_tokens
        json_str = json.dumps(vocab_dict)
        if path:
            with io.open(path, "w", encoding="utf-8") as f:
                f.write(json_str)
        return json_str

    @classmethod
    def from_json(cls, json_str):
        """
        Loads the :class:`Vocab` from a JSON string or a JSON file, which is
        produced by calling the :meth:`to_json` method.

        Args:
            json_str (str): JSON string or the file path of a JSON string.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the
            information contained in the JSON string.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                json_str = vocab.to_json(path='./vocab.json')

                vocab1 = Vocab.from_json(json_str)
                vocab2 = Vocab.from_json('./vocab.json')
                print(len(vocab), len(vocab1), len(vocab2))
                # 1256608 1256608 1256608
        """
        if os.path.isfile(json_str):
            with io.open(json_str, "r", encoding="utf-8") as f:
                vocab_dict = json.load(f)
        else:
            vocab_dict = json.loads(json_str)
        token_to_idx = vocab_dict.get("token_to_idx")
        unk_token = vocab_dict.get("unk_token")
        identifiers_to_tokens = vocab_dict.get("identifiers_to_tokens", dict())
        if "unk_token" in identifiers_to_tokens:
            del identifiers_to_tokens["unk_token"]
        vocab = cls(
            counter=None,
            token_to_idx=token_to_idx,
            unk_token=unk_token,
            **identifiers_to_tokens,
        )
        return vocab

    @classmethod
    def from_dict(
        cls,
        token_to_idx,
        unk_token=None,
        pad_token=None,
        bos_token=None,
        eos_token=None,
        **kwargs
    ):
        """
        Builds the :class:`Vocab` from a dict.

        Args:
            token_to_idx (dict): A dict that describes the mapping relationship
                between tokens and indices.
            unk_token (str, optional): The special token for the unknown token.
                If not needed, it also could be None. Default: None.
            pad_token (str, optional): The special token for the padding token.
                If not needed, it also could be None. Default: None.
            bos_token (str, optional): The special token for the bos token. If
                not needed, it also could be None. Default: None.
            eos_token (str, optional): The special token for the eos token. If
                not needed, it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. They can be
                used to specify further special tokens that will be exposed as
                attributes of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given dict
            and special tokens.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                vocab1 = Vocab.from_dict(vocab.token_to_idx)
                print(len(vocab), len(vocab.token_to_idx), len(vocab1))
                # 1256608 1256608 1256608
        """
        vocab = cls(
            counter=None,
            token_to_idx=token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )
        return vocab

    @staticmethod
    def build_vocab(
        iterator,
        max_size=None,
        min_freq=1,
        token_to_idx=None,
        unk_token=None,
        pad_token=None,
        bos_token=None,
        eos_token=None,
        **kwargs
    ):
        """
        Builds the :class:`Vocab` according to the given iterator and other
        information. It first iterates over the `iterator` to construct a
        :class:`collections.Counter`, which is then used to initialize the
        :class:`Vocab`.

        Args:
            iterator (collections.Iterable): Iterator of tokens. Each element
                should be a list of tokens if a word-level vocab is needed.
            max_size (int, optional): The max size of vocab, not including
                special tokens. Default: None.
            min_freq (int, optional): Ignore tokens whose frequencies are less
                than `min_freq`. Default: 1.
            token_to_idx (dict, optional): A dict that specifies the mapping
                relationship between tokens and indices to be used. If
                provided, adjust the tokens and indices mapping according to
                it. If None, counter must be provided. Default: None.
            unk_token (str, optional): The special token for the unknown token
                '<unk>'. If not needed, it also could be None. Default: None.
            pad_token (str, optional): The special token for the padding token
                '<pad>'. If not needed, it also could be None. Default: None.
            bos_token (str, optional): The special token for the bos token
                '<bos>'. If not needed, it also could be None. Default: None.
            eos_token (str, optional): The special token for the eos token
                '<eos>'. If not needed, it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. They can be
                used to specify further special tokens that will be exposed as
                attributes of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given
            iterator and other information.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                vocab1 = Vocab.build_vocab([list(vocab.token_to_idx.keys())])
                print(len(vocab), len(vocab1))
                # 1256608 1256608
        """
        counter = collections.Counter()
        for tokens in iterator:
            counter.update(tokens)
        vocab = Vocab(
            counter,
            max_size=max_size,
            min_freq=min_freq,
            token_to_idx=token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )
        return vocab

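    # Corpus sketch (made-up sentences): each element of the iterator is one
    # tokenized sentence.
    #
    #     corpus = [["hello", "world"], ["hello", "paddle"]]
    #     vocab = Vocab.build_vocab(corpus, unk_token="[UNK]")
    #     # "hello" (frequency 2) is indexed before "paddle" and "world"
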
    @staticmethod
    def load_vocabulary(
        filepath,
        unk_token=None,
        pad_token=None,
        bos_token=None,
        eos_token=None,
        **kwargs
    ):
        """
        Builds the :class:`Vocab` from a file, reserving all tokens by calling
        the :meth:`Vocab.from_dict` method. The file should contain one token
        per line, and the line index will be the index of the corresponding
        token.

        Args:
            filepath (str): The path of the file used to construct the
                vocabulary.
            unk_token (str, optional): Special token for the unknown token. If
                not needed, it also could be None. Default: None.
            pad_token (str, optional): Special token for the padding token. If
                not needed, it also could be None. Default: None.
            bos_token (str, optional): Special token for the bos token. If not
                needed, it also could be None. Default: None.
            eos_token (str, optional): Special token for the eos token. If not
                needed, it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. They can be
                used to specify further special tokens that will be exposed as
                attributes of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given file.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab

                # The vocab file. The sample file can be downloaded first.
                # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')

                print(len(vocab))
                # 1256608
        """
        token_to_idx = {}
        with io.open(filepath, "r", encoding="utf-8") as f:
            for index, line in enumerate(f):
                token = line.rstrip("\n")
                token_to_idx[token] = int(index)
        vocab = Vocab.from_dict(
            token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )
        return vocab

    def save_vocabulary(self, filepath):
        """
        Saves the :class:`Vocab` to a specific file, which can be reloaded by
        calling :meth:`load_vocabulary`.

        Args:
            filepath (str): The path of the file to save the vocabulary to.
        """
        # Write utf-8 explicitly, matching `load_vocabulary`, so the round
        # trip is encoding-safe on all platforms.
        with io.open(filepath, "w", encoding="utf-8") as f:
            for idx in range(len(self._idx_to_token)):
                f.write(self._idx_to_token[idx] + "\n")

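    # Round-trip sketch (hypothetical path): a saved vocabulary can be
    # restored with `load_vocabulary`.
    #
    #     vocab.save_vocabulary("./my_vocab.txt")
    #     vocab2 = Vocab.load_vocabulary("./my_vocab.txt", unk_token="[UNK]")
    #     assert len(vocab) == len(vocab2)
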
    def get_unk_token_id(self):
        return (
            self._token_to_idx[self.unk_token]
            if self.unk_token is not None
            else self.unk_token
        )

    def get_bos_token_id(self):
        return (
            self._token_to_idx[self.bos_token]
            if self.bos_token is not None
            else self.bos_token
        )

    def get_eos_token_id(self):
        return (
            self._token_to_idx[self.eos_token]
            if self.eos_token is not None
            else self.eos_token
        )

    def get_pad_token_id(self):
        return (
            self._token_to_idx[self.pad_token]
            if self.pad_token is not None
            else self.pad_token
        )
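

if __name__ == "__main__":
    # A minimal smoke test with toy data (a sketch; the senta_word_dict.txt
    # file from the docstring examples above is not required here).
    corpus = [["hello", "world"], ["hello", "paddle"]]
    vocab = Vocab.build_vocab(corpus, unk_token="[UNK]", pad_token="[PAD]")

    # Special tokens come first, then tokens in descending frequency order.
    print(dict(vocab.token_to_idx))
    # {'[PAD]': 0, '[UNK]': 1, 'hello': 2, 'paddle': 3, 'world': 4}
    print(vocab(["hello", "oov"]))  # [2, 1]: the unknown token maps to '[UNK]'
    print(vocab.to_tokens([0, 1]))  # ['[PAD]', '[UNK]']
    print(vocab.get_pad_token_id(), vocab.get_unk_token_id())  # 0 1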