from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe

ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"

# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
r50k_pat_str = (
    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
)
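
# Illustration only, not part of the original module: a quick look at how this
# pre-tokenizer carves text into pieces before BPE merging runs on each piece.
# The sketch assumes the third-party `regex` package, which supports the
# \p{...} classes and possessive quantifiers used above (the stdlib `re`
# module does not).
def _demo_r50k_split(text: str = "Hello world, it's 2024!") -> list[str]:
    import regex

    # Each match becomes one pre-token, e.g.
    # ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']
    return regex.findall(r50k_pat_str, text)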

def gpt2():
    # GPT-2's encoding, reconstructed from the originally released
    # vocab.bpe / encoder.json pair rather than from a .tiktoken rank file.
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
    )
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }

def r50k_base():
    # Effectively the same encoding as gpt2, but loaded from tiktoken's own
    # single-file rank format.
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
    )
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }

def p50k_base():
    # Extends r50k with dedicated tokens for runs of spaces (useful for code),
    # hence the larger explicit_n_vocab.
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }

def p50k_edit():
    # p50k with fill-in-the-middle (FIM) special tokens appended for the
    # edit-style models.
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
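
# Illustration only, not part of the original module: FIM prompts are
# conventionally laid out as prefix, then suffix, then the middle marker, with
# the model generating the missing middle. A sketch assuming the `tiktoken`
# package is installed:
def _demo_fim_prompt(prefix: str, suffix: str) -> list[int]:
    import tiktoken

    enc = tiktoken.get_encoding("p50k_edit")
    prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
    # encode() refuses special tokens in ordinary text unless explicitly allowed.
    return enc.encode(prompt, allowed_special={FIM_PREFIX, FIM_SUFFIX, FIM_MIDDLE})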

def cl100k_base():
    # Used by the GPT-3.5 and GPT-4 model families.
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
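
# Illustration only, not part of the original module: unlike the r50k pattern,
# cl100k caps digit runs at three (`\p{N}{1,3}+`), so long numbers split into
# short chunks. The pattern is copied from the dict above so the sketch runs
# without fetching the rank file; it assumes the third-party `regex` package.
def _demo_cl100k_digit_split(number: str = "1234567") -> list[str]:
    import regex

    pat = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s"""
    # -> ['123', '456', '7']
    return regex.findall(pat, number)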

def o200k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
    # This regex could be made more efficient. If I were the one working on this encoding, I would
    # have done a few other things differently too, e.g. I think you can allocate tokens more
    # efficiently across languages.
    pat_str = "|".join(
        [
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )
    return {
        "name": "o200k_base",
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
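
# Illustration only, not part of the original module: the first two
# alternatives above split camelCase-style runs at case boundaries and fold a
# trailing contraction into the word pre-token (r50k and cl100k split
# contractions off instead). Assumes the third-party `regex` package; call it
# as _demo_o200k_split(o200k_base()["pat_str"]).
def _demo_o200k_split(pat_str: str) -> list[str]:
    import regex

    # -> ['Hello', 'World', ',', " it's", ' GPT', '4']
    return regex.findall(pat_str, "HelloWorld, it's GPT4")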

def o200k_harmony():
    # o200k plus the special tokens for the Harmony chat format (used by the
    # gpt-oss models).
    base_enc = o200k_base()
    name = "o200k_harmony"
    pat_str = base_enc["pat_str"]
    mergeable_ranks = base_enc["mergeable_ranks"]
    special_tokens = {
        **base_enc["special_tokens"],
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
    return {
        "name": name,
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
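
# Illustration only, not part of the original module: with the reserved block
# above, special-token ids run up to 201087, which puts the harmony encoding's
# total vocabulary size at 201088. Note this sketch fetches the o200k rank
# file on first use.
def _demo_harmony_vocab_size() -> int:
    enc = o200k_harmony()
    # Highest id across BPE ranks and special tokens, plus one -> 201088.
    highest = max(
        max(enc["mergeable_ranks"].values()),
        max(enc["special_tokens"].values()),
    )
    return highest + 1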

# tiktoken discovers this module through the `tiktoken_ext` namespace package
# and looks up encodings here by name.
ENCODING_CONSTRUCTORS = {
    "gpt2": gpt2,
    "r50k_base": r50k_base,
    "p50k_base": p50k_base,
    "p50k_edit": p50k_edit,
    "cl100k_base": cl100k_base,
    "o200k_base": o200k_base,
    "o200k_harmony": o200k_harmony,
}
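
# A minimal usage sketch, not part of the original module, assuming the
# `tiktoken` package is installed: get_encoding(name) calls the matching
# constructor above and builds a tiktoken.Encoding from the returned dict.
if __name__ == "__main__":
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode("hello world")
    assert enc.decode(tokens) == "hello world"
    print(tokens)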