transcriptions.py

# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, List, Union, Mapping, Optional, cast
from typing_extensions import Literal, overload, assert_never

import httpx

from ... import _legacy_response
from ..._types import (
    Body,
    Omit,
    Query,
    Headers,
    NotGiven,
    FileTypes,
    SequenceNotStr,
    omit,
    not_given,
)
from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform
from ..._compat import cached_property
from ..._resource import SyncAPIResource, AsyncAPIResource
from ..._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
from ..._streaming import Stream, AsyncStream
from ...types.audio import transcription_create_params
from ..._base_client import make_request_options
from ...types.audio_model import AudioModel
from ...types.audio.transcription import Transcription
from ...types.audio_response_format import AudioResponseFormat
from ...types.audio.transcription_include import TranscriptionInclude
from ...types.audio.transcription_verbose import TranscriptionVerbose
from ...types.audio.transcription_diarized import TranscriptionDiarized
from ...types.audio.transcription_stream_event import TranscriptionStreamEvent
from ...types.audio.transcription_create_response import TranscriptionCreateResponse

__all__ = ["Transcriptions", "AsyncTranscriptions"]

log: logging.Logger = logging.getLogger("openai.audio.transcriptions")


class Transcriptions(SyncAPIResource):
    @cached_property
    def with_raw_response(self) -> TranscriptionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return TranscriptionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> TranscriptionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return TranscriptionsWithStreamingResponse(self)

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[Literal["json"], Omit] = omit,
        stream: Optional[Literal[False]] | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Transcription:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
              Whisper V2 model).
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
              the only supported format is `json`.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        response_format: Literal["verbose_json"],
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionVerbose: ...

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        response_format: Literal["text", "srt", "vtt"],
        include: List[TranscriptionInclude] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> str: ...

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        response_format: Literal["diarized_json"],
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionDiarized: ...

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        stream: Literal[True],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Stream[TranscriptionStreamEvent]:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
              seconds.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
              `known_speaker_references[]`. Each entry should be a short identifier (for
              example `customer` or `agent`). Up to 4 speakers are supported.
          known_speaker_references: Optional list of audio samples (as
              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
              that contain known speaker references matching `known_speaker_names[]`. Each
              sample must be between 2 and 10 seconds, and can use any of the same input audio
              formats supported by `file`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
              `diarized_json`, with `diarized_json` required to receive speaker annotations.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency. This option is not available for
              `gpt-4o-transcribe-diarize`.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        stream: bool,
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionCreateResponse | Stream[TranscriptionStreamEvent]:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
              seconds.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
              `known_speaker_references[]`. Each entry should be a short identifier (for
              example `customer` or `agent`). Up to 4 speakers are supported.
          known_speaker_references: Optional list of audio samples (as
              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
              that contain known speaker references matching `known_speaker_names[]`. Each
              sample must be between 2 and 10 seconds, and can use any of the same input audio
              formats supported by `file`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
              `diarized_json`, with `diarized_json` required to receive speaker annotations.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency. This option is not available for
              `gpt-4o-transcribe-diarize`.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @required_args(["file", "model"], ["file", "model", "stream"])
    def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]:
        body = deepcopy_minimal(
            {
                "file": file,
                "model": model,
                "chunking_strategy": chunking_strategy,
                "include": include,
                "known_speaker_names": known_speaker_names,
                "known_speaker_references": known_speaker_references,
                "language": language,
                "prompt": prompt,
                "response_format": response_format,
                "stream": stream,
                "temperature": temperature,
                "timestamp_granularities": timestamp_granularities,
            }
        )
        files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
        # It should be noted that the actual Content-Type header that will be
        # sent to the server will contain a `boundary` parameter, e.g.
        # multipart/form-data; boundary=---abc--
        extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
        return self._post(  # type: ignore[return-value]
            "/audio/transcriptions",
            body=maybe_transform(
                body,
                transcription_create_params.TranscriptionCreateParamsStreaming
                if stream
                else transcription_create_params.TranscriptionCreateParamsNonStreaming,
            ),
            files=files,
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=_get_response_format_type(response_format),
            stream=stream or False,
            stream_cls=Stream[TranscriptionStreamEvent],
        )
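

# Illustrative usage sketch (not part of the generated API surface): how a caller
# might reach the synchronous `Transcriptions.create` defined above through the
# public client. The `OpenAI` client construction and the "speech.mp3" path are
# assumptions made for the example.
def _example_sync_transcription() -> None:
    from openai import OpenAI  # imported locally to avoid a circular import at module load time

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    with open("speech.mp3", "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="verbose_json",
            timestamp_granularities=["segment"],
        )
    # With `verbose_json`, the response is parsed into TranscriptionVerbose
    # (see `_get_response_format_type` at the bottom of this module).
    print(transcription.text)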


class AsyncTranscriptions(AsyncAPIResource):
    @cached_property
    def with_raw_response(self) -> AsyncTranscriptionsWithRawResponse:
        """
        This property can be used as a prefix for any HTTP method call to return
        the raw response object instead of the parsed content.

        For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
        """
        return AsyncTranscriptionsWithRawResponse(self)

    @cached_property
    def with_streaming_response(self) -> AsyncTranscriptionsWithStreamingResponse:
        """
        An alternative to `.with_raw_response` that doesn't eagerly read the response body.

        For more information, see https://www.github.com/openai/openai-python#with_streaming_response
        """
        return AsyncTranscriptionsWithStreamingResponse(self)

    @overload
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[Literal["json"], Omit] = omit,
        stream: Optional[Literal[False]] | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionCreateResponse:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
              seconds.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
              `known_speaker_references[]`. Each entry should be a short identifier (for
              example `customer` or `agent`). Up to 4 speakers are supported.
          known_speaker_references: Optional list of audio samples (as
              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
              that contain known speaker references matching `known_speaker_names[]`. Each
              sample must be between 2 and 10 seconds, and can use any of the same input audio
              formats supported by `file`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
              `diarized_json`, with `diarized_json` required to receive speaker annotations.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency. This option is not available for
              `gpt-4o-transcribe-diarize`.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        response_format: Literal["verbose_json"],
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionVerbose: ...

    @overload
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        response_format: Literal["text", "srt", "vtt"],
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> str: ...

    @overload
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        stream: Literal[True],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> AsyncStream[TranscriptionStreamEvent]:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
              seconds.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
              `known_speaker_references[]`. Each entry should be a short identifier (for
              example `customer` or `agent`). Up to 4 speakers are supported.
          known_speaker_references: Optional list of audio samples (as
              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
              that contain known speaker references matching `known_speaker_names[]`. Each
              sample must be between 2 and 10 seconds, and can use any of the same input audio
              formats supported by `file`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
              `diarized_json`, with `diarized_json` required to receive speaker annotations.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency. This option is not available for
              `gpt-4o-transcribe-diarize`.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @overload
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        stream: bool,
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> TranscriptionCreateResponse | AsyncStream[TranscriptionStreamEvent]:
        """
        Transcribes audio into the input language.

        Args:
          file:
              The audio file object (not file name) to transcribe, in one of these formats:
              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          model: ID of the model to use. The options are `gpt-4o-transcribe`,
              `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
              Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
              for more information.
              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
              first normalizes loudness and then uses voice activity detection (VAD) to choose
              boundaries. A `server_vad` object can be provided to tweak VAD detection
              parameters manually. If unset, the audio is transcribed as a single block.
              Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
              seconds.
          include: Additional information to include in the transcription response. `logprobs` will
              return the log probabilities of the tokens in the response to understand the
              model's confidence in the transcription. `logprobs` only works with
              response_format set to `json` and only with the models `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in
              `known_speaker_references[]`. Each entry should be a short identifier (for
              example `customer` or `agent`). Up to 4 speakers are supported.
          known_speaker_references: Optional list of audio samples (as
              [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
              that contain known speaker references matching `known_speaker_names[]`. Each
              sample must be between 2 and 10 seconds, and can use any of the same input audio
              formats supported by `file`.
          language: The language of the input audio. Supplying the input language in
              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
              format will improve accuracy and latency.
          prompt: An optional text to guide the model's style or continue a previous audio
              segment. The
              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
              should match the audio language. This field is not supported when using
              `gpt-4o-transcribe-diarize`.
          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
              `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
              `gpt-4o-mini-transcribe`, the only supported format is `json`. For
              `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
              `diarized_json`, with `diarized_json` required to receive speaker annotations.
          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
              output more random, while lower values like 0.2 will make it more focused and
              deterministic. If set to 0, the model will use
              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
              automatically increase the temperature until certain thresholds are hit.
          timestamp_granularities: The timestamp granularities to populate for this transcription.
              `response_format` must be set to `verbose_json` to use timestamp granularities.
              Either or both of these options are supported: `word`, or `segment`. Note: There
              is no additional latency for segment timestamps, but generating word timestamps
              incurs additional latency. This option is not available for
              `gpt-4o-transcribe-diarize`.
          extra_headers: Send extra headers
          extra_query: Add additional query parameters to the request
          extra_body: Add additional JSON properties to the request
          timeout: Override the client-level default timeout for this request, in seconds
        """
        ...

    @required_args(["file", "model"], ["file", "model", "stream"])
    async def create(
        self,
        *,
        file: FileTypes,
        model: Union[str, AudioModel],
        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit,
        include: List[TranscriptionInclude] | Omit = omit,
        known_speaker_names: SequenceNotStr[str] | Omit = omit,
        known_speaker_references: SequenceNotStr[str] | Omit = omit,
        language: str | Omit = omit,
        prompt: str | Omit = omit,
        response_format: Union[AudioResponseFormat, Omit] = omit,
        stream: Optional[Literal[False]] | Literal[True] | Omit = omit,
        temperature: float | Omit = omit,
        timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Headers | None = None,
        extra_query: Query | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = not_given,
    ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]:
        body = deepcopy_minimal(
            {
                "file": file,
                "model": model,
                "chunking_strategy": chunking_strategy,
                "include": include,
                "known_speaker_names": known_speaker_names,
                "known_speaker_references": known_speaker_references,
                "language": language,
                "prompt": prompt,
                "response_format": response_format,
                "stream": stream,
                "temperature": temperature,
                "timestamp_granularities": timestamp_granularities,
            }
        )
        files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
        # It should be noted that the actual Content-Type header that will be
        # sent to the server will contain a `boundary` parameter, e.g.
        # multipart/form-data; boundary=---abc--
        extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
        return await self._post(
            "/audio/transcriptions",
            body=await async_maybe_transform(
                body,
                transcription_create_params.TranscriptionCreateParamsStreaming
                if stream
                else transcription_create_params.TranscriptionCreateParamsNonStreaming,
            ),
            files=files,
            options=make_request_options(
                extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
            ),
            cast_to=_get_response_format_type(response_format),
            stream=stream or False,
            stream_cls=AsyncStream[TranscriptionStreamEvent],
        )
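

# Illustrative usage sketch (not part of the generated API surface): consuming
# streamed transcription events from `AsyncTranscriptions.create` defined above.
# The `AsyncOpenAI` client, model choice, and file path are assumptions for the
# example; streaming is documented above as unsupported for `whisper-1`, so a
# gpt-4o transcribe model is used instead.
async def _example_async_streaming_transcription() -> None:
    from openai import AsyncOpenAI  # imported locally to avoid a circular import at module load time

    client = AsyncOpenAI()
    with open("speech.mp3", "rb") as audio_file:
        stream = await client.audio.transcriptions.create(
            file=audio_file,
            model="gpt-4o-mini-transcribe",
            stream=True,
        )
        async for event in stream:
            # Each event is a TranscriptionStreamEvent: incremental text deltas
            # followed by a final completed transcript.
            print(event.type)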


class TranscriptionsWithRawResponse:
    def __init__(self, transcriptions: Transcriptions) -> None:
        self._transcriptions = transcriptions

        self.create = _legacy_response.to_raw_response_wrapper(
            transcriptions.create,
        )


class AsyncTranscriptionsWithRawResponse:
    def __init__(self, transcriptions: AsyncTranscriptions) -> None:
        self._transcriptions = transcriptions

        self.create = _legacy_response.async_to_raw_response_wrapper(
            transcriptions.create,
        )


class TranscriptionsWithStreamingResponse:
    def __init__(self, transcriptions: Transcriptions) -> None:
        self._transcriptions = transcriptions

        self.create = to_streamed_response_wrapper(
            transcriptions.create,
        )


class AsyncTranscriptionsWithStreamingResponse:
    def __init__(self, transcriptions: AsyncTranscriptions) -> None:
        self._transcriptions = transcriptions

        self.create = async_to_streamed_response_wrapper(
            transcriptions.create,
        )


def _get_response_format_type(
    response_format: AudioResponseFormat | Omit,
) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]:
    if isinstance(response_format, Omit) or response_format is None:  # pyright: ignore[reportUnnecessaryComparison]
        return Transcription

    if response_format == "json":
        return Transcription
    elif response_format == "verbose_json":
        return TranscriptionVerbose
    elif response_format == "diarized_json":
        return TranscriptionDiarized
    elif response_format == "srt" or response_format == "text" or response_format == "vtt":
        return str
    elif TYPE_CHECKING:  # type: ignore[unreachable]
        assert_never(response_format)
    else:
        log.warn("Unexpected audio response format: %s", response_format)
        return Transcription
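

# Illustrative usage sketch (not part of the generated API surface): accessing raw
# response data through the `with_raw_response` view wired up above. The client,
# model, and file path are assumptions for the example.
def _example_raw_response_access() -> None:
    from openai import OpenAI  # imported locally to avoid a circular import at module load time

    client = OpenAI()
    with open("speech.mp3", "rb") as audio_file:
        response = client.audio.transcriptions.with_raw_response.create(
            file=audio_file,
            model="whisper-1",
        )
    print(response.headers.get("x-request-id"))  # raw HTTP headers are available here
    transcription = response.parse()  # parse into the usual Transcription model
    print(transcription.text)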