- # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
- from __future__ import annotations
- import logging
- from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
- from typing_extensions import Literal, overload, assert_never
- import httpx
- from ... import _legacy_response
- from ..._types import (
- Body,
- Omit,
- Query,
- Headers,
- NotGiven,
- FileTypes,
- SequenceNotStr,
- omit,
- not_given,
- )
- from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
- from ..._compat import cached_property
- from ..._resource import SyncAPIResource, AsyncAPIResource
- from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
- from ..._streaming import Stream, AsyncStream
- from ...types.audio import transcription_create_params
- from ..._base_client import make_request_options
- from ...types.audio_model import AudioModel
- from ...types.audio.transcription import Transcription
- from ...types.audio_response_format import AudioResponseFormat
- from ...types.audio.transcription_include import TranscriptionInclude
- from ...types.audio.transcription_verbose import TranscriptionVerbose
- from ...types.audio.transcription_diarized import TranscriptionDiarized
- from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
- from ...types.audio.transcription_create_response import TranscriptionCreateResponse
- __all__ = ["Transcriptions", "AsyncTranscriptions"]
- log: logging.Logger = logging.getLogger("openai.audio.transcriptions")
- class Transcriptions(SyncAPIResource):
- @cached_property
- def with_raw_response(self) -> TranscriptionsWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
- For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
- """
- return TranscriptionsWithRawResponse(self)
- @cached_property
- def with_streaming_response(self) -> TranscriptionsWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- For more information, see https://www.github.com/openai/openai-python#with_streaming_response
- """
- return TranscriptionsWithStreamingResponse(self)
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[Literal["json"], Omit] = omit,
- stream: Optional[Literal[False]] | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Transcription:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- Whisper V2 model).
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- the only supported format is `json`.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- response_format: Literal["verbose_json"],
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionVerbose: ...
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- response_format: Literal["text", "srt", "vtt"],
- include: List[TranscriptionInclude] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> str: ...
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- response_format: Literal["diarized_json"],
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionDiarized: ...
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- stream: Literal[True],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Stream[TranscriptionStreamEvent]:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
- Whisper V2 model), and `gpt-4o-transcribe-diarize`.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
- seconds.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
- `known_speaker_references[]`. Each entry should be a short identifier (for
- example `customer` or `agent`). Up to 4 speakers are supported.
- known_speaker_references: Optional list of audio samples (as
- [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
- that contain known speaker references matching `known_speaker_names[]`. Each
- sample must be between 2 and 10 seconds, and can use any of the same input audio
- formats supported by `file`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`, the only supported format is `json`. For
- `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
- `diarized_json`, with `diarized_json` required to receive speaker annotations.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency. This option is not available for
- `gpt-4o-transcribe-diarize`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @overload
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- stream: bool,
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
- Whisper V2 model), and `gpt-4o-transcribe-diarize`.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
- seconds.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
- `known_speaker_references[]`. Each entry should be a short identifier (for
- example `customer` or `agent`). Up to 4 speakers are supported.
- known_speaker_references: Optional list of audio samples (as
- [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
- that contain known speaker references matching `known_speaker_names[]`. Each
- sample must be between 2 and 10 seconds, and can use any of the same input audio
- formats supported by `file`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`, the only supported format is `json`. For
- `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
- `diarized_json`, with `diarized_json` required to receive speaker annotations.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency. This option is not available for
- `gpt-4o-transcribe-diarize`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @required_args(["file", "model"], ["file", "model", "stream"])
- def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
- body = deepcopy_minimal(
- {
- "file": file,
- "model": model,
- "chunking_strategy": chunking_strategy,
- "include": include,
- "known_speaker_names": known_speaker_names,
- "known_speaker_references": known_speaker_references,
- "language": language,
- "prompt": prompt,
- "response_format": response_format,
- "stream": stream,
- "temperature": temperature,
- "timestamp_granularities": timestamp_granularities,
- }
- )
- files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
- # It should be noted that the actual Content-Type header that will be
- # sent to the server will contain a `boundary` parameter, e.g.
- # multipart/form-data; boundary=---abc--
- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
- return self._post( # type: ignore[return-value]
- "/audio/transcriptions",
- body=maybe_transform(
- body,
- transcription_create_params.TranscriptionCreateParamsStreaming
- if stream
- else transcription_create_params.TranscriptionCreateParamsNonStreaming,
- ),
- files=files,
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=_get_response_format_type(response_format),
- stream=stream or False,
- stream_cls=Stream[TranscriptionStreamEvent],
- )
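- # Usage sketch (not part of the generated module): a minimal, hedged example of calling the
- # synchronous resource defined above. It assumes a configured `OpenAI` client and a local
- # audio file named "speech.mp3"; both are illustrative placeholders, not fixtures of this file.
- #
- #   from openai import OpenAI
- #
- #   client = OpenAI()
- #
- #   # Default response_format ("json") parses into a `Transcription` object.
- #   with open("speech.mp3", "rb") as audio_file:
- #       transcript = client.audio.transcriptions.create(model="gpt-4o-transcribe", file=audio_file)
- #   print(transcript.text)
- #
- #   # With stream=True the call returns a Stream[TranscriptionStreamEvent] to iterate over.
- #   with open("speech.mp3", "rb") as audio_file:
- #       for event in client.audio.transcriptions.create(
- #           model="gpt-4o-transcribe", file=audio_file, stream=True
- #       ):
- #           print(event.type)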
- class AsyncTranscriptions(AsyncAPIResource):
- @cached_property
- def with_raw_response(self) -> AsyncTranscriptionsWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
- For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
- """
- return AsyncTranscriptionsWithRawResponse(self)
- @cached_property
- def with_streaming_response(self) -> AsyncTranscriptionsWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- For more information, see https://www.github.com/openai/openai-python#with_streaming_response
- """
- return AsyncTranscriptionsWithStreamingResponse(self)
- @overload
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[Literal["json"], Omit] = omit,
- stream: Optional[Literal[False]] | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionCreateResponse:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
- Whisper V2 model), and `gpt-4o-transcribe-diarize`.
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
- seconds.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
- `known_speaker_references[]`. Each entry should be a short identifier (for
- example `customer` or `agent`). Up to 4 speakers are supported.
- known_speaker_references: Optional list of audio samples (as
- [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
- that contain known speaker references matching `known_speaker_names[]`. Each
- sample must be between 2 and 10 seconds, and can use any of the same input audio
- formats supported by `file`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`, the only supported format is `json`. For
- `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
- `diarized_json`, with `diarized_json` required to receive speaker annotations.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency. This option is not available for
- `gpt-4o-transcribe-diarize`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @overload
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- response_format: Literal["verbose_json"],
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionVerbose: ...
- @overload
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- response_format: Literal["text", "srt", "vtt"],
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> str: ...
- @overload
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- stream: Literal[True],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> AsyncStream[TranscriptionStreamEvent]:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
- Whisper V2 model), and `gpt-4o-transcribe-diarize`.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
- seconds.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
- `known_speaker_references[]`. Each entry should be a short identifier (for
- example `customer` or `agent`). Up to 4 speakers are supported.
- known_speaker_references: Optional list of audio samples (as
- [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
- that contain known speaker references matching `known_speaker_names[]`. Each
- sample must be between 2 and 10 seconds, and can use any of the same input audio
- formats supported by `file`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`, the only supported format is `json`. For
- `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
- `diarized_json`, with `diarized_json` required to receive speaker annotations.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency. This option is not available for
- `gpt-4o-transcribe-diarize`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @overload
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- stream: bool,
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
- """
- Transcribes audio into the input language.
- Args:
- file:
- The audio file object (not file name) to transcribe, in one of these formats:
- flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
- model: ID of the model to use. The options are `gpt-4o-transcribe`,
- `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
- Whisper V2 model), and `gpt-4o-transcribe-diarize`.
- stream: If set to true, the model response data will be streamed to the client as it is
- generated using
- [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
- See the
- [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
- for more information.
- Note: Streaming is not supported for the `whisper-1` model and will be ignored.
- chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
- first normalizes loudness and then uses voice activity detection (VAD) to choose
- boundaries. `server_vad` object can be provided to tweak VAD detection
- parameters manually. If unset, the audio is transcribed as a single block.
- Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
- seconds.
- include: Additional information to include in the transcription response. `logprobs` will
- return the log probabilities of the tokens in the response to understand the
- model's confidence in the transcription. `logprobs` only works with
- response_format set to `json` and only with the models `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
- `known_speaker_references[]`. Each entry should be a short identifier (for
- example `customer` or `agent`). Up to 4 speakers are supported.
- known_speaker_references: Optional list of audio samples (as
- [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
- that contain known speaker references matching `known_speaker_names[]`. Each
- sample must be between 2 and 10 seconds, and can use any of the same input audio
- formats supported by `file`.
- language: The language of the input audio. Supplying the input language in
- [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
- format will improve accuracy and latency.
- prompt: An optional text to guide the model's style or continue a previous audio
- segment. The
- [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- should match the audio language. This field is not supported when using
- `gpt-4o-transcribe-diarize`.
- response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
- `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
- `gpt-4o-mini-transcribe`, the only supported format is `json`. For
- `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
- `diarized_json`, with `diarized_json` required to receive speaker annotations.
- temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
- output more random, while lower values like 0.2 will make it more focused and
- deterministic. If set to 0, the model will use
- [log probability](https://en.wikipedia.org/wiki/Log_probability) to
- automatically increase the temperature until certain thresholds are hit.
- timestamp_granularities: The timestamp granularities to populate for this transcription.
- `response_format` must be set to `verbose_json` to use timestamp granularities.
- Either or both of these options are supported: `word`, or `segment`. Note: There
- is no additional latency for segment timestamps, but generating word timestamps
- incurs additional latency. This option is not available for
- `gpt-4o-transcribe-diarize`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
- @required_args(["file", "model"], ["file", "model", "stream"])
- async def create(
- self,
- *,
- file: FileTypes,
- model: Union[str, AudioModel],
- chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
- include: List[TranscriptionInclude] | Omit = omit,
- known_speaker_names: SequenceNotStr[str] | Omit = omit,
- known_speaker_references: SequenceNotStr[str] | Omit = omit,
- language: str | Omit = omit,
- prompt: str | Omit = omit,
- response_format: Union[AudioResponseFormat, Omit] = omit,
- stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
- temperature: float | Omit = omit,
- timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
- body = deepcopy_minimal(
- {
- "file": file,
- "model": model,
- "chunking_strategy": chunking_strategy,
- "include": include,
- "known_speaker_names": known_speaker_names,
- "known_speaker_references": known_speaker_references,
- "language": language,
- "prompt": prompt,
- "response_format": response_format,
- "stream": stream,
- "temperature": temperature,
- "timestamp_granularities": timestamp_granularities,
- }
- )
- files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
- # It should be noted that the actual Content-Type header that will be
- # sent to the server will contain a `boundary` parameter, e.g.
- # multipart/form-data; boundary=---abc--
- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
- return await self._post(
- "/audio/transcriptions",
- body=await async_maybe_transform(
- body,
- transcription_create_params.TranscriptionCreateParamsStreaming
- if stream
- else transcription_create_params.TranscriptionCreateParamsNonStreaming,
- ),
- files=files,
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=_get_response_format_type(response_format),
- stream=stream or False,
- stream_cls=AsyncStream[TranscriptionStreamEvent],
- )
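- # Usage sketch (not part of the generated module): the asynchronous counterpart, assuming an
- # `AsyncOpenAI` client and the same illustrative "speech.mp3" placeholder file.
- #
- #   import asyncio
- #
- #   from openai import AsyncOpenAI
- #
- #   async def main() -> None:
- #       client = AsyncOpenAI()
- #       with open("speech.mp3", "rb") as audio_file:
- #           transcript = await client.audio.transcriptions.create(
- #               model="gpt-4o-transcribe", file=audio_file
- #           )
- #       print(transcript.text)
- #
- #   asyncio.run(main())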
- class TranscriptionsWithRawResponse:
- def __init__(self, transcriptions: Transcriptions) -> None:
- self._transcriptions = transcriptions
- self.create = _legacy_response.to_raw_response_wrapper(
- transcriptions.create,
- )
- class AsyncTranscriptionsWithRawResponse:
- def __init__(self, transcriptions: AsyncTranscriptions) -> None:
- self._transcriptions = transcriptions
- self.create = _legacy_response.async_to_raw_response_wrapper(
- transcriptions.create,
- )
- class TranscriptionsWithStreamingResponse:
- def __init__(self, transcriptions: Transcriptions) -> None:
- self._transcriptions = transcriptions
- self.create = to_streamed_response_wrapper(
- transcriptions.create,
- )
- class AsyncTranscriptionsWithStreamingResponse:
- def __init__(self, transcriptions: AsyncTranscriptions) -> None:
- self._transcriptions = transcriptions
- self.create = async_to_streamed_response_wrapper(
- transcriptions.create,
- )
- def _get_response_format_type(
- response_format: AudioResponseFormat | Omit,
- ) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
- if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
- return Transcription
- if response_format == "json":
- return Transcription
- elif response_format == "verbose_json":
- return TranscriptionVerbose
- elif response_format == "diarized_json":
- return TranscriptionDiarized
- elif response_format == "srt" or response_format == "text" or response_format == "vtt":
- return str
- elif TYPE_CHECKING: # type: ignore[unreachable]
- assert_never(response_format)
- else:
- log.warning("Unexpected audio response format: %s", response_format)
- return Transcription
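- # Usage sketch (not part of the generated module): the wrapper classes above re-expose `create`
- # with alternative response handling, as described in the `with_raw_response` and
- # `with_streaming_response` docstrings. A hedged example, assuming a configured `OpenAI` client
- # named `client` and an already opened `audio_file` handle:
- #
- #   # Raw response: returns the HTTP response object; call `.parse()` to get the parsed model.
- #   raw = client.audio.transcriptions.with_raw_response.create(model="whisper-1", file=audio_file)
- #   print(raw.headers.get("x-request-id"))
- #   transcript = raw.parse()
- #
- #   # Streaming response: the body is not read eagerly; use it as a context manager.
- #   with client.audio.transcriptions.with_streaming_response.create(
- #       model="whisper-1", file=audio_file
- #   ) as response:
- #       print(response.headers.get("x-request-id"))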