# openai_public.py

from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe

ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"

# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
r50k_pat_str = (
    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
)
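
# A hedged illustration (not part of the original file): the pattern above targets a
# regex engine that supports \p{...} classes and possessive quantifiers (tiktoken
# compiles it with a Rust regex engine). The third-party `regex` module in Python
# accepts the same syntax, so the pre-tokenization split can be previewed like this:
#
#     import regex
#     regex.findall(r50k_pat_str, "Hello world!!")  # ['Hello', ' world', '!!']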


def gpt2():
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
    )
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def r50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
    )
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_edit():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def o200k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
    # This regex could be made more efficient. If I were the one working on this encoding,
    # I would have done a few other things differently too, e.g. I think you can allocate
    # tokens more efficiently across languages.
    pat_str = "|".join(
        [
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )
    return {
        "name": "o200k_base",
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def o200k_harmony():
    base_enc = o200k_base()
    name = "o200k_harmony"
    pat_str = base_enc["pat_str"]
    mergeable_ranks = base_enc["mergeable_ranks"]
    special_tokens = {
        **base_enc["special_tokens"],
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
    return {
        "name": name,
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


ENCODING_CONSTRUCTORS = {
    "gpt2": gpt2,
    "r50k_base": r50k_base,
    "p50k_base": p50k_base,
    "p50k_edit": p50k_edit,
    "cl100k_base": cl100k_base,
    "o200k_base": o200k_base,
    "o200k_harmony": o200k_harmony,
}
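
# A minimal usage sketch (not part of the original file; assumes tiktoken is installed):
# each constructor above returns the keyword arguments for tiktoken.Encoding, and
# tiktoken discovers this module through the `tiktoken_ext` plugin namespace, exposing
# the encodings via tiktoken.get_encoding.
#
#     import tiktoken
#     enc = tiktoken.Encoding(**cl100k_base())    # build directly from this module
#     enc = tiktoken.get_encoding("cl100k_base")  # or go through the plugin registry
#     assert enc.decode(enc.encode("hello world")) == "hello world"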