
Commit b8feb14

Add support for audio transcriptions and translations (Whisper)
1 parent baad602 commit b8feb14

15 files changed: +613 -15 lines

OpenAI_API/Audio/AudioRequest.cs

+56
@@ -0,0 +1,56 @@
using System;
using System.Collections.Generic;
using System.Text;
using Newtonsoft.Json;
using static OpenAI_API.Audio.TextToSpeechRequest;

namespace OpenAI_API.Audio
{
	public class AudioRequest
	{
		/// <summary>
		/// The model to use for this request. Currently only <see cref="OpenAI_API.Models.Model.Whisper1"/> is supported.
		/// </summary>
		[JsonProperty("model")]
		public string Model { get; set; } = OpenAI_API.Models.Model.DefaultTranscriptionModel;

		/// <summary>
		/// An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language for transcriptions, or English for translations.
		/// </summary>
		[JsonProperty("prompt", DefaultValueHandling = DefaultValueHandling.Ignore)]
		public string Prompt { get; set; } = null;

		/// <summary>
		/// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
		/// </summary>
		[JsonProperty("language", DefaultValueHandling = DefaultValueHandling.Ignore)]
		public string Language { get; set; } = null;

		/// <summary>
		/// The format of the transcript output, should be one of the options in <see cref="AudioRequest.ResponseFormats"/>. See <seealso href="https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format"/>
		/// </summary>
		[JsonProperty("response_format", DefaultValueHandling = DefaultValueHandling.Ignore)]
		public string ResponseFormat { get; set; } = null;

		/// <summary>
		/// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
		/// </summary>
		[JsonProperty("temperature", DefaultValueHandling = DefaultValueHandling.Ignore)]
		public double Temperature { get; set; } = 0;

		/// <summary>
		/// The format of the transcript output. See <seealso href="https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format"/>
		/// </summary>
		public static class ResponseFormats
		{
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
			public const string JSON = "json";
			public const string Text = "text";
			public const string SRT = "srt";
			public const string VerboseJson = "verbose_json";
			public const string VTT = "vtt";
#pragma warning restore CS1591 // Missing XML comment for publicly visible type or member
		}
	}
}
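For orientation, a hedged usage sketch (not part of the diff): one way the new options object might be filled in, using only the properties and constants defined above. The multipart upload itself is handled by the endpoint classes added elsewhere in this commit.

// Illustrative only: configure transcription options for a Whisper request.
var requestArgs = new OpenAI_API.Audio.AudioRequest()
{
	Model = OpenAI_API.Models.Model.DefaultTranscriptionModel, // the library's default transcription model
	Language = "de",                                           // ISO-639-1 code of the input audio
	Prompt = "Transcript of a technical podcast.",             // optional style / continuation hint
	ResponseFormat = OpenAI_API.Audio.AudioRequest.ResponseFormats.VerboseJson,
	Temperature = 0                                            // 0 lets the model auto-tune the temperature
};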

OpenAI_API/Audio/AudioResult.cs

+32
@@ -0,0 +1,32 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace OpenAI_API.Audio
{
	/// <summary>
	/// Represents a verbose_json output from the OpenAI Transcribe or Translate endpoints.
	/// </summary>
	public class AudioResultVerbose : ApiResultBase
	{
		public double duration { get; set; }
		public string language { get; set; }
		public List<Segment> segments { get; set; }
		public string task { get; set; }
		public string text { get; set; }

		public class Segment
		{
			public double avg_logprob { get; set; }
			public double compression_ratio { get; set; }
			public double end { get; set; }
			public int id { get; set; }
			public double no_speech_prob { get; set; }
			public int seek { get; set; }
			public double start { get; set; }
			public double temperature { get; set; }
			public string text { get; set; }
			public List<int> tokens { get; set; }
		}
	}
}
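To make the verbose result shape concrete, here is a small hedged fragment (not part of the commit); the PrintSegments helper and the result parameter are illustrative names only.

// Illustrative only: walk the segments of a verbose_json transcription result.
static void PrintSegments(OpenAI_API.Audio.AudioResultVerbose result)
{
	System.Console.WriteLine($"Detected language: {result.language}, duration: {result.duration:F1}s");
	foreach (var segment in result.segments)
	{
		// Each segment also exposes avg_logprob, no_speech_prob, compression_ratio, etc.
		System.Console.WriteLine($"[{segment.start:F2}-{segment.end:F2}] {segment.text}");
	}
}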

OpenAI_API/Audio/ITextToSpeechEndpoint.cs

+2-2
@@ -30,7 +30,7 @@ public interface ITextToSpeechEndpoint
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>. See <seealso href="https://platform.openai.com/docs/guides/text-to-speech/supported-output-formats"/></param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variates, <see cref="Model.TTS_Speed"/> is optimized for real time text to speech use cases and <see cref="Model.TTS_HD"/> is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null);
+	Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, double? speed = null, string responseFormat = null, Model model = null);

 	/// <summary>
 	/// Calls the API to create speech from text, and saves the audio file to disk.

@@ -50,7 +50,7 @@ public interface ITextToSpeechEndpoint
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>. See <seealso href="https://platform.openai.com/docs/guides/text-to-speech/supported-output-formats"/></param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variates, <see cref="Model.TTS_Speed"/> is optimized for real time text to speech use cases and <see cref="Model.TTS_HD"/> is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	Task<FileInfo> SaveSpeechToFileAsync(string input, string localPath, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null);
+	Task<FileInfo> SaveSpeechToFileAsync(string input, string localPath, string voice = null, double? speed = null, string responseFormat = null, Model model = null);

 }
OpenAI_API/Audio/ITranscriptionEndpoint.cs

+81
@@ -0,0 +1,81 @@
using System.IO;
using System.Threading.Tasks;

namespace OpenAI_API.Audio
{
	/// <summary>
	/// Transcribe audio into text, with optional translation into English.
	/// </summary>
	public interface ITranscriptionEndpoint
	{
		/// <summary>
		/// This allows you to set default parameters for every request, for example to set a default language. For every request, if you do not have a parameter set on the request but do have it set here as a default, the request will automatically pick up the default value.
		/// </summary>
		AudioRequest DefaultRequestArgs { get; set; }

		/// <summary>
		/// Gets the transcription of the audio stream, in the specified format
		/// </summary>
		/// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
		/// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and Json formats, try <see cref="GetTranscriptionTextAsync(Stream, string, string, double?)"/> or <see cref="GetTranscriptionDetailsAsync(Stream, string, string, double?)"/> instead.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>A string of the transcribed text</returns>
		Task<string> GetAsFormatAsync(Stream audioStream, string filename, string responseFormat, string language = null, string prompt = null, double? temperature = null);

		/// <summary>
		/// Gets the transcription of the audio file, in the specified format
		/// </summary>
		/// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and Json formats, try <see cref="GetTranscriptionTextAsync(Stream, string, string, double?)"/> or <see cref="GetTranscriptionDetailsAsync(Stream, string, string, double?)"/> instead.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>A string of the transcribed text</returns>
		Task<string> GetAsFormatAsync(string audioFilePath, string responseFormat, string language = null, string prompt = null, double? temperature = null);

		/// <summary>
		/// Gets the transcription of the audio stream, with full metadata
		/// </summary>
		/// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>The transcription result with full metadata</returns>
		Task<AudioResultVerbose> GetWithDetailsAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null);

		/// <summary>
		/// Gets the transcription of the audio file, with full metadata
		/// </summary>
		/// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>The transcription result with full metadata</returns>
		Task<AudioResultVerbose> GetWithDetailsAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null);

		/// <summary>
		/// Gets the transcription of the audio stream as a text string
		/// </summary>
		/// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>A string of the transcribed text</returns>
		Task<string> GetTextAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null);

		/// <summary>
		/// Gets the transcription of the audio file as a text string
		/// </summary>
		/// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
		/// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
		/// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
		/// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
		/// <returns>A string of the transcribed text</returns>
		Task<string> GetTextAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null);
	}
}
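A hedged end-to-end sketch of the new interface (not part of the diff). The api.Transcriptions property name is an assumption for wherever the concrete endpoint gets wired onto OpenAIAPI, which this excerpt does not show.

using System.IO;
using System.Threading.Tasks;
using OpenAI_API;
using OpenAI_API.Audio;

public static class TranscriptionExample
{
	// Hypothetical sketch only: "api.Transcriptions" is an assumed property exposing ITranscriptionEndpoint.
	public static async Task RunAsync(OpenAIAPI api)
	{
		// Plain transcribed text from a local file.
		string text = await api.Transcriptions.GetTextAsync("meeting.mp3", language: "en");

		// Subtitles from an in-memory stream; the filename only needs the correct extension.
		using (Stream audio = File.OpenRead("meeting.mp3"))
		{
			string srt = await api.Transcriptions.GetAsFormatAsync(
				audio, "meeting.mp3", AudioRequest.ResponseFormats.SRT);
		}

		// Full verbose_json metadata, including per-segment timings.
		AudioResultVerbose details = await api.Transcriptions.GetWithDetailsAsync("meeting.mp3");
		System.Console.WriteLine($"{details.language}: {details.text}");
	}
}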

OpenAI_API/Audio/TextToSpeechEndpoint.cs

+3-3
@@ -24,7 +24,7 @@ public class TextToSpeechEndpoint : EndpointBase, ITextToSpeechEndpoint
 	public TextToSpeechRequest DefaultTTSRequestArgs { get; set; } = new TextToSpeechRequest();

 	/// <summary>
-	/// Constructor of the api endpoint. Rather than instantiating this yourself, access it through an instance of <see cref="OpenAIAPI"/> as <see cref="OpenAIAPI.Completions"/>.
+	/// Constructor of the api endpoint. Rather than instantiating this yourself, access it through an instance of <see cref="OpenAIAPI"/> as <see cref="OpenAIAPI.TextToSpeech"/>.
 	/// </summary>
 	/// <param name="api">Pass in the instance of the api</param>
 	internal TextToSpeechEndpoint(OpenAIAPI api) : base(api) { }

@@ -48,7 +48,7 @@ public async Task<Stream> GetSpeechAsStreamAsync(TextToSpeechRequest request)
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>. See <seealso href="https://platform.openai.com/docs/guides/text-to-speech/supported-output-formats"/></param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variates, <see cref="Model.TTS_Speed"/> is optimized for real time text to speech use cases and <see cref="Model.TTS_HD"/> is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	public async Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null)
+	public async Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, double? speed = null, string responseFormat = null, Model model = null)
 	{
 		var request = new TextToSpeechRequest()
 		{

@@ -87,7 +87,7 @@ public async Task<FileInfo> SaveSpeechToFileAsync(TextToSpeechRequest request, s
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>. See <seealso href="https://platform.openai.com/docs/guides/text-to-speech/supported-output-formats"/></param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variates, <see cref="Model.TTS_Speed"/> is optimized for real time text to speech use cases and <see cref="Model.TTS_HD"/> is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	public async Task<FileInfo> SaveSpeechToFileAsync(string input, string localPath, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null)
+	public async Task<FileInfo> SaveSpeechToFileAsync(string input, string localPath, string voice = null, double? speed = null, string responseFormat = null, Model model = null)
 	{
 		var request = new TextToSpeechRequest()
 		{

OpenAI_API/Audio/TextToSpeechRequest.cs

+1-1
@@ -35,7 +35,7 @@ public class TextToSpeechRequest
 	/// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
 	/// </summary>
 	[JsonProperty("speed", DefaultValueHandling = DefaultValueHandling.Ignore)]
-	public decimal? Speed { get; set; } = null;
+	public double? Speed { get; set; } = null;

 	/// <summary>
 	/// Supported voices are alloy, echo, fable, onyx, nova, and shimmer. Previews of the voices are available in the Text to speech guide. See <seealso href="https://platform.openai.com/docs/guides/text-to-speech/voice-options"/>.
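With Speed now a double? across the request object, the interface, and the endpoint, callers can pass ordinary double literals. A minimal hedged sketch (not part of the diff), assuming an already-constructed OpenAIAPI instance; the OpenAIAPI.TextToSpeech property is referenced by the doc-comment change above.

using System.Threading.Tasks;
using OpenAI_API;

public static class SpeechSpeedExample
{
	// Sketch of the updated signature: speed is double? with a valid range of 0.25 to 4.0 (default 1.0).
	public static async Task SaveSlowSpeechAsync(OpenAIAPI api)
	{
		await api.TextToSpeech.SaveSpeechToFileAsync(
			input: "Hello from the Whisper and text-to-speech update.",
			localPath: "hello.mp3",
			speed: 0.8); // a plain double literal; no decimal suffix (0.8m) needed anymore
	}
}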

0 commit comments
