| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- from __future__ import annotations
- from .core import Encoding
- from .registry import get_encoding
# TODO: these will likely be replaced by an API endpoint
# Maps a model-name *prefix* to its encoding. Consumed by
# `encoding_name_for_model`, which iterates in insertion order and returns the
# first prefix match — so keep more-specific prefixes before shorter ones that
# could shadow them.
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1-": "o200k_base",
    "o3-": "o200k_base",
    "o4-mini-": "o200k_base",
    # chat
    "gpt-5-": "o200k_base",
    "gpt-4.5-": "o200k_base",
    "gpt-4.1-": "o200k_base",
    "chatgpt-4o-": "o200k_base",
    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    "gpt-oss-": "o200k_harmony",
    # fine-tuned
    "ft:gpt-4o": "o200k_base",
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}
# Maps an *exact* model name to its encoding. Checked before the prefix table
# in `encoding_name_for_model`, so an exact entry always wins over any prefix.
MODEL_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1": "o200k_base",
    "o3": "o200k_base",
    "o4-mini": "o200k_base",
    # chat
    "gpt-5": "o200k_base",
    "gpt-4.1": "o200k_base",
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",  # Common shorthand
    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
    # base
    "davinci-002": "cl100k_base",
    "babbage-002": "cl100k_base",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
    # DEPRECATED MODELS
    # text (DEPRECATED)
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code (DEPRECATED)
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit (DEPRECATED)
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # old embeddings (DEPRECATED)
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
    "gpt-2": "gpt2",  # Maintains consistency with gpt-4
}
def encoding_name_for_model(model_name: str) -> str:
    """Returns the name of the encoding used by a model.

    Exact model names are checked first, then known model-name prefixes.

    Raises a KeyError if the model name is not recognised.
    """
    # Exact matches always win over prefix matches.
    try:
        return MODEL_TO_ENCODING[model_name]
    except KeyError:
        pass

    # Prefix matching avoids needing library updates for every model version release
    # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
    for prefix, candidate in MODEL_PREFIX_TO_ENCODING.items():
        if model_name.startswith(prefix):
            return candidate

    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None
def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.

    Thin convenience wrapper: resolves the encoding name via
    `encoding_name_for_model`, then loads it from the registry.

    Raises a KeyError if the model name is not recognised.
    """
    return get_encoding(encoding_name_for_model(model_name))
|