Commit 1b6f4b0

Adding support for Speech to text (#340)
* Adding support for Speech to text
* lint fixes
* remove unsupported fields from response
* add test for transcriptions
* Bump package version
* remove task from response
* replace audio url
* change file

Co-authored-by: Sahil Yadav <[email protected]>
1 parent 724a192 commit 1b6f4b0

8 files changed: +761 additions, −8 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.20"
+version = "1.5.21"
 authors = ["Together AI <[email protected]>"]
 description = "Python client for Together's Cloud Platform!"
 readme = "README.md"

src/together/client.py

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,7 @@ class AsyncTogether:
     models: resources.AsyncModels
     fine_tuning: resources.AsyncFineTuning
     rerank: resources.AsyncRerank
+    audio: resources.AsyncAudio
     code_interpreter: CodeInterpreter
     batches: resources.AsyncBatches
     # client options
@@ -167,6 +168,7 @@ def __init__(
         self.models = resources.AsyncModels(self.client)
         self.fine_tuning = resources.AsyncFineTuning(self.client)
         self.rerank = resources.AsyncRerank(self.client)
+        self.audio = resources.AsyncAudio(self.client)
         self.code_interpreter = CodeInterpreter(self.client)
         self.batches = resources.AsyncBatches(self.client)
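
With AsyncTogether now wiring up resources.AsyncAudio, transcription becomes reachable as client.audio.transcriptions. A minimal sketch of the new async surface; the file name sample.mp3 and the env-var setup are illustrative assumptions, not part of the commit:

import asyncio

from together import AsyncTogether


async def main() -> None:
    # Reads TOGETHER_API_KEY from the environment (hypothetical setup)
    client = AsyncTogether()
    # The audio attribute added by this commit exposes the transcriptions resource
    result = await client.audio.transcriptions.create(file="sample.mp3")
    # .text assumed per the OpenAI-compatible json response schema
    print(result.text)


asyncio.run(main())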

src/together/resources/audio/__init__.py

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,8 @@
 from functools import cached_property
 
 from together.resources.audio.speech import AsyncSpeech, Speech
+from together.resources.audio.transcriptions import AsyncTranscriptions, Transcriptions
+from together.resources.audio.translations import AsyncTranslations, Translations
 from together.types import (
     TogetherClient,
 )
@@ -14,6 +16,14 @@ def __init__(self, client: TogetherClient) -> None:
     def speech(self) -> Speech:
         return Speech(self._client)
 
+    @cached_property
+    def transcriptions(self) -> Transcriptions:
+        return Transcriptions(self._client)
+
+    @cached_property
+    def translations(self) -> Translations:
+        return Translations(self._client)
+
 
 class AsyncAudio:
     def __init__(self, client: TogetherClient) -> None:
@@ -22,3 +32,11 @@ def __init__(self, client: TogetherClient) -> None:
     @cached_property
     def speech(self) -> AsyncSpeech:
         return AsyncSpeech(self._client)
+
+    @cached_property
+    def transcriptions(self) -> AsyncTranscriptions:
+        return AsyncTranscriptions(self._client)
+
+    @cached_property
+    def translations(self) -> AsyncTranslations:
+        return AsyncTranslations(self._client)
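
Because Audio and AsyncAudio build their sub-resources via cached_property, the first attribute access constructs the resource and subsequent accesses reuse the same instance. A short sketch of the synchronous access pattern (client setup is illustrative):

from together import Together

client = Together()

# First access constructs the Transcriptions resource;
# cached_property then returns the same object on every later access
first = client.audio.transcriptions
assert first is client.audio.transcriptions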

src/together/resources/audio/transcriptions.py

Lines changed: 266 additions & 0 deletions

@@ -0,0 +1,266 @@
from __future__ import annotations

from pathlib import Path
from typing import Any, BinaryIO, Dict, Optional, Tuple, Union

from together.abstract import api_requestor
from together.types import (
    AudioTimestampGranularities,
    AudioTranscriptionResponse,
    AudioTranscriptionResponseFormat,
    AudioTranscriptionVerboseResponse,
    TogetherClient,
    TogetherRequest,
)


class Transcriptions:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[
            Union[str, AudioTimestampGranularities]
        ] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Transcribes audio into the input language.

        Args:
            file: The audio to transcribe, in one of these formats: flac, mp3,
                mp4, mpeg, mpga, m4a, ogg, wav, or webm. Can be a file path
                (str/Path), a file object (BinaryIO), or a URL (str).
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input
                language in ISO-639-1 format improves accuracy and latency.
            prompt: Optional text to guide the model's style or continue a
                previous audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, one of:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values
                like 0.8 make the output more random, while lower values like 0.2
                make it more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for
                this transcription. response_format must be set to verbose_json
                to use timestamp granularities. Either or both of these options
                are supported: word, segment.

        Returns:
            The transcribed text in the requested format.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Handle file input - could be a path, URL, or file object
        files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {}
        params_data: Dict[str, Any] = {}
        opened_file = False

        if isinstance(file, (str, Path)):
            if isinstance(file, str) and file.startswith(("http://", "https://")):
                # URL string - send as a plain multipart field
                files_data["file"] = (None, file)
            else:
                # Local file path - opened here, so closed here after the request
                files_data["file"] = open(Path(file), "rb")
                opened_file = True
        else:
            # Caller-supplied file object - the caller keeps ownership
            files_data["file"] = file

        # Build request parameters
        params_data.update(
            {
                "model": model,
                "response_format": (
                    response_format.value
                    if hasattr(response_format, "value")
                    else response_format
                ),
                "temperature": temperature,
            }
        )

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities.value
                if hasattr(timestamp_granularities, "value")
                else timestamp_granularities
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        try:
            response, _, _ = requestor.request(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the file only if it was opened from a local path above;
            # URL tuples and caller-supplied objects are left alone
            if opened_file:
                file_obj = files_data["file"]
                if hasattr(file_obj, "close"):
                    file_obj.close()

        # Parse response based on format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        else:
            return AudioTranscriptionResponse(**response.data)


class AsyncTranscriptions:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    async def create(
        self,
        *,
        file: Union[str, BinaryIO, Path],
        model: str = "openai/whisper-large-v3",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
        response_format: Union[str, AudioTranscriptionResponseFormat] = "json",
        temperature: float = 0.0,
        timestamp_granularities: Optional[
            Union[str, AudioTimestampGranularities]
        ] = None,
        **kwargs: Any,
    ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
        """
        Transcribes audio into the input language (async version).

        Args:
            file: The audio to transcribe, in one of these formats: flac, mp3,
                mp4, mpeg, mpga, m4a, ogg, wav, or webm. Can be a file path
                (str/Path), a file object (BinaryIO), or a URL (str).
            model: ID of the model to use. Defaults to "openai/whisper-large-v3".
            language: The language of the input audio. Supplying the input
                language in ISO-639-1 format improves accuracy and latency.
            prompt: Optional text to guide the model's style or continue a
                previous audio segment. The prompt should match the audio language.
            response_format: The format of the transcript output, one of:
                json, verbose_json.
            temperature: The sampling temperature, between 0 and 1. Higher values
                like 0.8 make the output more random, while lower values like 0.2
                make it more focused and deterministic.
            timestamp_granularities: The timestamp granularities to populate for
                this transcription. response_format must be set to verbose_json
                to use timestamp granularities. Either or both of these options
                are supported: word, segment.

        Returns:
            The transcribed text in the requested format.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # Handle file input - could be a path, URL, or file object
        files_data: Dict[str, Union[Tuple[None, str], BinaryIO]] = {}
        params_data: Dict[str, Any] = {}
        opened_file = False

        if isinstance(file, (str, Path)):
            if isinstance(file, str) and file.startswith(("http://", "https://")):
                # URL string - send as a plain multipart field
                files_data["file"] = (None, file)
            else:
                # Local file path - opened here, so closed here after the request
                files_data["file"] = open(Path(file), "rb")
                opened_file = True
        else:
            # Caller-supplied file object - the caller keeps ownership
            files_data["file"] = file

        # Build request parameters
        params_data.update(
            {
                "model": model,
                "response_format": (
                    response_format.value
                    if hasattr(response_format, "value")
                    else response_format
                ),
                "temperature": temperature,
            }
        )

        if language is not None:
            params_data["language"] = language

        if prompt is not None:
            params_data["prompt"] = prompt

        if timestamp_granularities is not None:
            params_data["timestamp_granularities"] = (
                timestamp_granularities.value
                if hasattr(timestamp_granularities, "value")
                else timestamp_granularities
            )

        # Add any additional kwargs
        params_data.update(kwargs)

        try:
            response, _, _ = await requestor.arequest(
                options=TogetherRequest(
                    method="POST",
                    url="audio/transcriptions",
                    params=params_data,
                    files=files_data,
                ),
            )
        finally:
            # Close the file only if it was opened from a local path above;
            # URL tuples and caller-supplied objects are left alone
            if opened_file:
                file_obj = files_data["file"]
                if hasattr(file_obj, "close"):
                    file_obj.close()

        # Parse response based on format
        if (
            response_format == "verbose_json"
            or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
        ):
            return AudioTranscriptionVerboseResponse(**response.data)
        else:
            return AudioTranscriptionResponse(**response.data)
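
As a quick orientation to the new endpoint, here is a minimal synchronous sketch. The file name sample.mp3 and the example URL are illustrative placeholders, not assets from this commit; .text is assumed per the OpenAI-compatible json schema:

from together import Together

client = Together()

# Local file path: the SDK opens the handle itself and closes it after the request
resp = client.audio.transcriptions.create(
    file="sample.mp3",
    language="en",
)
print(resp.text)

# Remote URL: sent as a plain multipart field instead of file bytes;
# verbose_json is required to request timestamp granularities
verbose = client.audio.transcriptions.create(
    file="https://example.com/sample.mp3",
    response_format="verbose_json",
    timestamp_granularities="word",
)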
