From 9020979da9bc075d2076caddf67a5040d7821f64 Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Thu, 25 Jul 2024 12:27:29 +0200 Subject: [PATCH 1/6] - Initial WebSockets implementation. --- .../ElevenLabsClientSettings.cs | 120 ++++++++---- .../Common/ElevenLabsBaseEndPoint.cs | 81 ++++---- ElevenLabs-DotNet/ElevenLabsClient.cs | 73 +++++-- ElevenLabs-DotNet/TextToSpeech/Alignment.cs | 22 +++ .../TextToSpeech/GenerationConfig.cs | 20 ++ .../TextToSpeechWebSocketEndpoint.cs | 183 ++++++++++++++++++ ...extToSpeechWebSocketFirstMessageRequest.cs | 36 ++++ ...TextToSpeechWebSocketLastMessageRequest.cs | 21 ++ .../TextToSpeechWebSocketRequest.cs | 50 +++++ .../TextToSpeechWebSocketResponse.cs | 49 +++++ 10 files changed, 561 insertions(+), 94 deletions(-) create mode 100644 ElevenLabs-DotNet/TextToSpeech/Alignment.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/GenerationConfig.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketFirstMessageRequest.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketLastMessageRequest.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs create mode 100644 ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketResponse.cs diff --git a/ElevenLabs-DotNet/Authentication/ElevenLabsClientSettings.cs b/ElevenLabs-DotNet/Authentication/ElevenLabsClientSettings.cs index c876e39..16d1749 100644 --- a/ElevenLabs-DotNet/Authentication/ElevenLabsClientSettings.cs +++ b/ElevenLabs-DotNet/Authentication/ElevenLabsClientSettings.cs @@ -2,62 +2,98 @@ using System; -namespace ElevenLabs +namespace ElevenLabs; + +public sealed class ElevenLabsClientSettings { - public sealed class ElevenLabsClientSettings + internal const string HttpProtocol = "http://"; + internal const string HttpsProtocol = "https://"; + internal const string WsProtocol = "ws://"; + internal const string WssProtocol = "wss://"; + 
internal const string DefaultApiVersion = "v1"; + internal const string ElevenLabsDomain = "api.elevenlabs.io"; + + /// + /// Creates a new instance of for use with ElevenLabs API. + /// + public ElevenLabsClientSettings() + { + Domain = ElevenLabsDomain; + ApiVersion = DefaultApiVersion; + Protocol = HttpsProtocol; + WebSocketProtocol = WssProtocol; + BaseRequest = $"/{ApiVersion}/"; + BaseRequestUrlFormat = $"{Protocol}{Domain}{BaseRequest}{{0}}"; + BaseRequestWebSocketUrlFormat = $"{WebSocketProtocol}{Domain}{BaseRequest}{{0}}"; + } + + /// + /// Creates a new instance of for use with ElevenLabs API. + /// + /// Base api domain. Starts with https or wss. + /// The version of the ElevenLabs api you want to use. + public ElevenLabsClientSettings(string domain, string apiVersion = DefaultApiVersion) { - internal const string Https = "https://"; - internal const string DefaultApiVersion = "v1"; - internal const string ElevenLabsDomain = "api.elevenlabs.io"; - - /// - /// Creates a new instance of for use with ElevenLabs API. - /// - public ElevenLabsClientSettings() + if (string.IsNullOrWhiteSpace(domain)) { - Domain = ElevenLabsDomain; - ApiVersion = "v1"; - BaseRequest = $"/{ApiVersion}/"; - BaseRequestUrlFormat = $"{Https}{Domain}{BaseRequest}{{0}}"; + domain = ElevenLabsDomain; } - /// - /// Creates a new instance of for use with ElevenLabs API. - /// - /// Base api domain. - /// The version of the ElevenLabs api you want to use. - public ElevenLabsClientSettings(string domain, string apiVersion = DefaultApiVersion) + if (!domain.Contains('.') && + !domain.Contains(':')) { - if (string.IsNullOrWhiteSpace(domain)) - { - domain = ElevenLabsDomain; - } + throw new ArgumentException( + $"You're attempting to pass a \"resourceName\" parameter to \"{nameof(domain)}\". 
Please specify \"resourceName:\" for this parameter in constructor."); + } - if (!domain.Contains('.') && - !domain.Contains(':')) + // extract anything before the :// to split the domain and protocol + var splitDomain = domain.Split("://", StringSplitOptions.RemoveEmptyEntries); + if (splitDomain.Length == 2) + { + Protocol = splitDomain[0]; + // if the protocol is not https or http, throw an exception + if (Protocol != HttpsProtocol && + Protocol != HttpProtocol) { - throw new ArgumentException($"You're attempting to pass a \"resourceName\" parameter to \"{nameof(domain)}\". Please specify \"resourceName:\" for this parameter in constructor."); + throw new ArgumentException( + $"The protocol \"{Protocol}\" is not supported. Please use \"{HttpsProtocol}\" or \"{HttpProtocol}\"."); } - if (string.IsNullOrWhiteSpace(apiVersion)) - { - apiVersion = DefaultApiVersion; - } + WebSocketProtocol = Protocol == HttpsProtocol ? WssProtocol : WsProtocol; + Domain = splitDomain[1]; + } + else + { + Protocol = HttpsProtocol; + WebSocketProtocol = WssProtocol; + Domain = domain; + } - Domain = domain.Contains("http") ? 
domain : $"{Https}{domain}"; - ApiVersion = apiVersion; - BaseRequest = $"/{ApiVersion}/"; - BaseRequestUrlFormat = $"{Domain}{BaseRequest}{{0}}"; + if (string.IsNullOrWhiteSpace(apiVersion)) + { + apiVersion = DefaultApiVersion; } - public string Domain { get; } + Domain = domain; + ApiVersion = apiVersion; + BaseRequest = $"/{ApiVersion}/"; + BaseRequestUrlFormat = $"{Protocol}{Domain}{BaseRequest}{{0}}"; + BaseRequestWebSocketUrlFormat = $"{WebSocketProtocol}{Domain}{BaseRequest}{{0}}"; + } + + public string Protocol { get; } - public string ApiVersion { get; } + public string WebSocketProtocol { get; } - public string BaseRequest { get; } + public string Domain { get; } - public string BaseRequestUrlFormat { get; } + public string ApiVersion { get; } - public static ElevenLabsClientSettings Default { get; } = new(); - } -} + public string BaseRequest { get; } + + public string BaseRequestUrlFormat { get; } + + public string BaseRequestWebSocketUrlFormat { get; } + + public static ElevenLabsClientSettings Default { get; } = new(); +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/Common/ElevenLabsBaseEndPoint.cs b/ElevenLabs-DotNet/Common/ElevenLabsBaseEndPoint.cs index 94807c3..00fa4f6 100644 --- a/ElevenLabs-DotNet/Common/ElevenLabsBaseEndPoint.cs +++ b/ElevenLabs-DotNet/Common/ElevenLabsBaseEndPoint.cs @@ -3,47 +3,58 @@ using System.Collections.Generic; using System.Linq; -namespace ElevenLabs +namespace ElevenLabs; + +public abstract class ElevenLabsBaseEndPoint { - public abstract class ElevenLabsBaseEndPoint + internal ElevenLabsBaseEndPoint(ElevenLabsClient client) => this.client = client; + + // ReSharper disable once InconsistentNaming + protected readonly ElevenLabsClient client; + + /// + /// The root endpoint address. + /// + protected abstract string Root { get; } + + /// + /// Gets the full formatted url for the API endpoint. + /// + /// The endpoint url. + /// Optional, parameters to add to the endpoint. 
+ protected string GetUrl(string endpoint = "", Dictionary queryParameters = null) { - internal ElevenLabsBaseEndPoint(ElevenLabsClient client) => this.client = client; - - // ReSharper disable once InconsistentNaming - protected readonly ElevenLabsClient client; - - /// - /// The root endpoint address. - /// - protected abstract string Root { get; } - - /// - /// Gets the full formatted url for the API endpoint. - /// - /// The endpoint url. - /// Optional, parameters to add to the endpoint. - protected string GetUrl(string endpoint = "", Dictionary queryParameters = null) - { - var result = string.Format(client.ElevenLabsClientSettings.BaseRequestUrlFormat, $"{Root}{endpoint}"); - - if (queryParameters is { Count: not 0 }) - { - result += $"?{string.Join('&', queryParameters.Select(parameter => $"{parameter.Key}={parameter.Value}"))}"; - } + var result = string.Format(client.ElevenLabsClientSettings.BaseRequestUrlFormat, $"{Root}{endpoint}"); - return result; + if (queryParameters is { Count: not 0 }) + { + result += $"?{string.Join('&', queryParameters.Select(parameter => $"{parameter.Key}={parameter.Value}"))}"; } - private bool enableDebug; + return result; + } + + protected string GetWebSocketUrl(string endpoint = "", Dictionary queryParameters = null) + { + var result = string.Format(client.ElevenLabsClientSettings.BaseRequestWebSocketUrlFormat, $"{Root}{endpoint}"); - /// - /// Enables or disables the logging of all http responses of header and body information for this endpoint.
- /// WARNING! Enabling this in your production build, could potentially leak sensitive information! - ///
- public bool EnableDebug + if (queryParameters is { Count: not 0 }) { - get => enableDebug || client.EnableDebug; - set => enableDebug = value; + result += $"?{string.Join('&', queryParameters.Select(parameter => $"{parameter.Key}={parameter.Value}"))}"; } + + return result; + } + + private bool enableDebug; + + /// + /// Enables or disables the logging of all http responses of header and body information for this endpoint.
+ /// WARNING! Enabling this in your production build, could potentially leak sensitive information! + ///
+ public bool EnableDebug + { + get => enableDebug || client.EnableDebug; + set => enableDebug = value; } -} +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/ElevenLabsClient.cs b/ElevenLabs-DotNet/ElevenLabsClient.cs index 8efcedc..3a97963 100644 --- a/ElevenLabs-DotNet/ElevenLabsClient.cs +++ b/ElevenLabs-DotNet/ElevenLabsClient.cs @@ -10,6 +10,7 @@ using ElevenLabs.Voices; using System; using System.Net.Http; +using System.Net.WebSockets; using System.Security.Authentication; using System.Text.Json; using System.Text.Json.Serialization; @@ -19,31 +20,41 @@ namespace ElevenLabs public sealed class ElevenLabsClient : IDisposable { /// - /// Creates a new client for the Eleven Labs API, handling auth and allowing for access to various API endpoints. + /// Creates a new client for the Eleven Labs API, handling auth and allowing for access to various API endpoints. /// - /// The API authentication information to use for API calls, - /// or to attempt to use the , - /// potentially loading from environment vars or from a config file. + /// + /// The API authentication information to use for API calls, + /// or to attempt to use the , + /// potentially loading from environment vars or from a config file. /// /// - /// Optional, for specifying a proxy domain. + /// Optional, for specifying a proxy domain. /// - /// Optional, . + /// Optional, . + /// Optional, . /// Raised when authentication details are missing or invalid. - /// implements to manage the lifecycle of the resources it uses, including . + /// + /// implements + /// + /// to manage the lifecycle of the resources it uses, including + /// + /// . /// - /// When you initialize , it will create an internal instance if one is not provided. - /// This internal HttpClient is disposed of when ElevenLabsClient is disposed of. - /// If you provide an external HttpClient instance to ElevenLabsClient, you are responsible for managing its disposal. 
+ /// When you initialize , it will create an internal instance + /// if one is not provided. + /// This internal HttpClient is disposed of when ElevenLabsClient is disposed of. + /// If you provide an external HttpClient instance to ElevenLabsClient, you are responsible for managing its disposal. /// - public ElevenLabsClient(ElevenLabsAuthentication authentication = null, ElevenLabsClientSettings settings = null, HttpClient httpClient = null) + public ElevenLabsClient(ElevenLabsAuthentication authentication = null, + ElevenLabsClientSettings settings = null, HttpClient httpClient = null, ClientWebSocket webSocketClient = null) { ElevenLabsAuthentication = authentication ?? ElevenLabsAuthentication.Default; ElevenLabsClientSettings = settings ?? ElevenLabsClientSettings.Default; if (string.IsNullOrWhiteSpace(ElevenLabsAuthentication?.ApiKey)) { - throw new AuthenticationException("You must provide API authentication. Please refer to https://github.com/RageAgainstThePixel/ElevenLabs-DotNet#authentication for details."); + throw new AuthenticationException( + "You must provide API authentication. 
Please refer to https://github.com/RageAgainstThePixel/ElevenLabs-DotNet#authentication for details."); } if (httpClient == null) @@ -58,16 +69,30 @@ public ElevenLabsClient(ElevenLabsAuthentication authentication = null, ElevenLa isCustomClient = true; } + if (webSocketClient == null) + { + webSocketClient = new ClientWebSocket(); + } + else + { + isCustomWebSocketClient = true; + } + Client = httpClient; Client.DefaultRequestHeaders.Add("User-Agent", "ElevenLabs-DotNet"); Client.DefaultRequestHeaders.Add("xi-api-key", ElevenLabsAuthentication.ApiKey); + WebSocketClient = webSocketClient; + WebSocketClient.Options.SetRequestHeader("User-Agent", "ElevenLabs-DotNet"); + WebSocketClient.Options.SetRequestHeader("xi-api-key", ElevenLabsAuthentication.ApiKey); + UserEndpoint = new UserEndpoint(this); VoicesEndpoint = new VoicesEndpoint(this); SharedVoicesEndpoint = new SharedVoicesEndpoint(this); ModelsEndpoint = new ModelsEndpoint(this); HistoryEndpoint = new HistoryEndpoint(this); TextToSpeechEndpoint = new TextToSpeechEndpoint(this); + TextToSpeechWebSocketEndpoint = new TextToSpeechWebSocketEndpoint(this); VoiceGenerationEndpoint = new VoiceGenerationEndpoint(this); SoundGenerationEndpoint = new SoundGenerationEndpoint(this); DubbingEndpoint = new DubbingEndpoint(this); @@ -96,6 +121,11 @@ private void Dispose(bool disposing) { Client?.Dispose(); } + + if (!isCustomWebSocketClient) + { + WebSocketClient?.Dispose(); + } isDisposed = true; } @@ -105,13 +135,20 @@ private void Dispose(bool disposing) private bool isCustomClient; + private bool isCustomWebSocketClient; + /// - /// to use when making calls to the API. + /// to use when making calls to the API. /// internal HttpClient Client { get; } /// - /// The to use when making calls to the API. + /// to use when making calls to the API. + /// + internal ClientWebSocket WebSocketClient { get; } + + /// + /// The to use when making calls to the API. 
/// internal static JsonSerializerOptions JsonSerializationOptions { get; } = new() { @@ -119,12 +156,12 @@ private void Dispose(bool disposing) }; /// - /// Enables or disables debugging for all endpoints. + /// Enables or disables debugging for all endpoints. /// public bool EnableDebug { get; set; } /// - /// The API authentication information to use for API calls + /// The API authentication information to use for API calls /// public ElevenLabsAuthentication ElevenLabsAuthentication { get; } @@ -141,6 +178,8 @@ private void Dispose(bool disposing) public HistoryEndpoint HistoryEndpoint { get; } public TextToSpeechEndpoint TextToSpeechEndpoint { get; } + + public TextToSpeechWebSocketEndpoint TextToSpeechWebSocketEndpoint { get; } public VoiceGenerationEndpoint VoiceGenerationEndpoint { get; } @@ -148,4 +187,4 @@ private void Dispose(bool disposing) public DubbingEndpoint DubbingEndpoint { get; } } -} +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/Alignment.cs b/ElevenLabs-DotNet/TextToSpeech/Alignment.cs new file mode 100644 index 0000000..037965e --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/Alignment.cs @@ -0,0 +1,22 @@ +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public class Alignment +{ + [JsonPropertyName("char_start_times_ms")] + public int[] CharStartTimesMs { get; } + + [JsonPropertyName("chars_durations_ms")] + public int[] CharsDurationsMs { get; } + + [JsonPropertyName("chars")] + public string[] Chars { get; } + + public Alignment(int[] charStartTimesMs, int[] charsDurationsMs, string[] chars) + { + CharStartTimesMs = charStartTimesMs; + CharsDurationsMs = charsDurationsMs; + Chars = chars; + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/GenerationConfig.cs b/ElevenLabs-DotNet/TextToSpeech/GenerationConfig.cs new file mode 100644 index 0000000..f827742 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/GenerationConfig.cs @@ -0,0 +1,20 @@ +// 
Licensed under the MIT License. See LICENSE in the project root for license information. + +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public sealed class GenerationConfig +{ + [JsonPropertyName("chunk_length_schedule")] + public int[] ChunkLengthSchedule { get; } + + public GenerationConfig() : this([120, 160, 250, 290]) + { + } + + public GenerationConfig(int[] chunkLengthSchedule) + { + ChunkLengthSchedule = chunkLengthSchedule; + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs new file mode 100644 index 0000000..aef6616 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs @@ -0,0 +1,183 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using ElevenLabs.Models; +using ElevenLabs.Voices; +using System; +using System.Collections.Generic; +using System.Net.WebSockets; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; + +namespace ElevenLabs.TextToSpeech; + +/// +/// Access to convert text to synthesized speech. +/// +public sealed class TextToSpeechWebSocketEndpoint : ElevenLabsBaseEndPoint +{ + private const string ModelIdParameter = "model_id"; + private const string EnableLoggingParameter = "enable_logging"; + private const string EnableSsmlParsingParameter = "enable_ssml_parsing"; + private const string OptimizeStreamingLatencyParameter = "optimize_streaming_latency"; + private const string OutputFormatParameter = "output_format"; + + public TextToSpeechWebSocketEndpoint(ElevenLabsClient client) : base(client) + { + } + + protected override string Root => "text-to-speech"; + + /// + /// Converts text into speech using a voice of your choice and returns audio. + /// + /// + /// to use. + /// + /// + /// Callback for streamed audio as it comes in.
+ /// Returns partial VoiceClip. + /// + /// + /// Optional, VoiceSettings that will override the default settings in the Voice. + /// + /// Optional, GenerationConfig. + /// + /// Optional, Model to use. Defaults to Model.MonoLingualV1. + /// + /// + /// Output format of the generated audio.
+ /// Defaults to OutputFormat.MP3_44100_128. + /// + /// Optional, enable logging. + /// Optional, enable SSML parsing. + /// + /// Optional, You can turn on latency optimizations at some cost of quality. + /// The best possible final latency varies by model.
+ /// Possible values:
+ /// 0 - default mode (no latency optimizations)
+ /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
+ /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
+ /// 3 - max latency optimizations
+ /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings + /// (best latency, but can mispronounce eg numbers and dates). + /// + /// Optional, . + /// . + public async Task StartTextToSpeechAsync(Voice voice, Func partialClipCallback, + VoiceSettings voiceSettings = null, GenerationConfig generationConfig = null, Model model = null, + OutputFormat outputFormat = OutputFormat.MP3_44100_128, bool? enableLogging = null, + bool? enableSsmlParsing = null, int? optimizeStreamingLatency = null, + CancellationToken cancellationToken = default) + { + if (voice == null || + string.IsNullOrWhiteSpace(voice.Id)) + { + throw new ArgumentNullException(nameof(voice)); + } + + if (partialClipCallback == null) + { + throw new ArgumentNullException(nameof(partialClipCallback)); + } + + var parameters = new Dictionary + { + { ModelIdParameter, model?.Id ?? Model.MonoLingualV1.Id }, + { OutputFormatParameter, outputFormat.ToString().ToLower() } + }; + + if (enableLogging.HasValue) + { + parameters.Add(EnableLoggingParameter, enableLogging.ToString()); + } + + if (enableSsmlParsing.HasValue) + { + parameters.Add(EnableSsmlParsingParameter, enableSsmlParsing.ToString()); + } + + if (optimizeStreamingLatency.HasValue) + { + parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.ToString()); + } + + await client.WebSocketClient.ConnectAsync( + new Uri(GetWebSocketUrl($"/{voice.Id}/stream-input", parameters)), cancellationToken); + + // start receiving messages in a separate task + _ = Task.Run(async () => await ReceiveMessagesAsync(partialClipCallback, voice, cancellationToken), + cancellationToken); + + TextToSpeechWebSocketFirstMessageRequest firstMessageRequest = new(voiceSettings, generationConfig); + await client.WebSocketClient.SendAsync(firstMessageRequest.ToArraySegment(), WebSocketMessageType.Text, true, + cancellationToken); + } + + public async Task SendTextToSpeechAsync(string text, bool? 
flush = null, bool tryTriggerGeneration = false, + CancellationToken cancellationToken = default) + { + TextToSpeechWebSocketRequest request = new(text, flush, tryTriggerGeneration); + await client.WebSocketClient.SendAsync(request.ToArraySegment(), WebSocketMessageType.Text, true, + cancellationToken); + } + + public async Task EndTextToSpeechAsync(CancellationToken cancellationToken = default) + { + TextToSpeechWebSocketLastMessageRequest lastMessageRequest = new(); + await client.WebSocketClient.SendAsync(lastMessageRequest.ToArraySegment(), WebSocketMessageType.Text, true, + cancellationToken); + } + + private async Task ReceiveMessagesAsync(Func partialClipCallback, Voice voice, + CancellationToken cancellationToken) + { + byte[] buffer = new byte[8192]; + StringBuilder message = new(); + + while (client.WebSocketClient.State == WebSocketState.Open) + { + WebSocketReceiveResult receiveResult = await client.WebSocketClient.ReceiveAsync( + new ArraySegment(buffer), cancellationToken); + + if (receiveResult.MessageType == WebSocketMessageType.Close) + { + await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, string.Empty, + cancellationToken); + break; + } + + string jsonString = Encoding.UTF8.GetString(buffer, 0, receiveResult.Count); + message.Append(jsonString); + + if (!receiveResult.EndOfMessage) + { + continue; + } + + TextToSpeechWebSocketResponse response = JsonSerializer.Deserialize( + message.ToString(), ElevenLabsClient.JsonSerializationOptions); + + if (response == null) + { + throw new ArgumentException("Failed to parse response!"); + } + + message.Clear(); + + if (!string.IsNullOrWhiteSpace(response.Audio)) + { + string text = response.Alignment is { Chars: not null } + ? 
string.Concat(response.Alignment.Chars) + : null; + VoiceClip voiceClip = new(string.Empty, text, voice, response.AudioBytes); + await partialClipCallback(voiceClip).ConfigureAwait(false); + } + else + { + await partialClipCallback(null).ConfigureAwait(false); + } + } + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketFirstMessageRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketFirstMessageRequest.cs new file mode 100644 index 0000000..4e828a9 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketFirstMessageRequest.cs @@ -0,0 +1,36 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System; +using System.Text; +using System.Text.Json; +using ElevenLabs.Voices; +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public sealed class TextToSpeechWebSocketFirstMessageRequest +{ + public TextToSpeechWebSocketFirstMessageRequest( + VoiceSettings voiceSettings = null, + GenerationConfig generationConfig = null) + { + VoiceSettings = voiceSettings; + GenerationConfig = generationConfig; + } + + [JsonPropertyName("text"), JsonInclude] + public string Text { get; } = " "; + + [JsonPropertyName("voice_settings")] + public VoiceSettings VoiceSettings { get; } + + [JsonPropertyName("generation_config")] + public GenerationConfig GenerationConfig { get; } + + public ArraySegment ToArraySegment() + { + string json = JsonSerializer.Serialize(this); + byte[] bytes = Encoding.UTF8.GetBytes(json); + return new ArraySegment(bytes); + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketLastMessageRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketLastMessageRequest.cs new file mode 100644 index 0000000..4eca5b7 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketLastMessageRequest.cs @@ -0,0 +1,21 @@ +// Licensed under the MIT License. 
See LICENSE in the project root for license information. + +using System; +using System.Text; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public sealed class TextToSpeechWebSocketLastMessageRequest +{ + [JsonPropertyName("text"), JsonInclude] + public string Text { get; } = ""; + + public ArraySegment ToArraySegment() + { + string json = JsonSerializer.Serialize(this); + byte[] bytes = Encoding.UTF8.GetBytes(json); + return new ArraySegment(bytes); + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs new file mode 100644 index 0000000..9c59cf3 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs @@ -0,0 +1,50 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System; +using System.Text; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public sealed class TextToSpeechWebSocketRequest +{ + public TextToSpeechWebSocketRequest(string text, bool? flush = null, bool tryTriggerGeneration = false) + { + if (string.IsNullOrWhiteSpace(text)) + { + throw new ArgumentNullException(nameof(text)); + } + + // if the last character of the text is not a space, append one + Text = text[^1] != ' ' ? text + ' ' : text; + TryTriggerGeneration = tryTriggerGeneration; + Flush = flush; + } + + /// + /// The text to be converted to speech. The last character of the text must be a space. + /// + [JsonPropertyName("text"), JsonInclude] + public string Text { get; } + + /// + /// Use this to attempt to immediately trigger the generation of audio. Most users shouldn't use this. + /// + [JsonPropertyName("try_trigger_generation")] + public bool TryTriggerGeneration { get; } + + /// + /// Flush forces the generation of audio. 
Set this value to true when you have finished sending text, + /// but want to keep the websocket connection open. + /// + [JsonPropertyName("flush"), JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public bool? Flush { get; } + + public ArraySegment ToArraySegment() + { + string json = JsonSerializer.Serialize(this); + byte[] bytes = Encoding.UTF8.GetBytes(json); + return new ArraySegment(bytes); + } +} \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketResponse.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketResponse.cs new file mode 100644 index 0000000..7ad6494 --- /dev/null +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketResponse.cs @@ -0,0 +1,49 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System; +using System.Text.Json.Serialization; + +namespace ElevenLabs.TextToSpeech; + +public sealed class TextToSpeechWebSocketResponse +{ + /// + /// A generated partial audio chunk, encoded using the selected output_format, + /// by default this is MP3 encoded as a base64 string. + /// + [JsonPropertyName("audio")] + public string Audio { get; } + + /// + /// A generated partial audio chunk, encoded using the selected output_format, + /// + [JsonIgnore] + public byte[] AudioBytes { get; } + + /// + /// Indicates if the generation is complete. If set to True, audio will be null. + /// + [JsonPropertyName("isFinal")] + public bool? IsFinal { get; } + + /// + /// Alignment information for the generated audio given the input normalized text sequence. + /// + [JsonPropertyName("normalizedAlignment")] + public Alignment NormalizedAlignment { get; } + + /// + /// Alignment information for the generated audio given the original text sequence. + /// + [JsonPropertyName("alignment")] + public Alignment Alignment { get; } + + public TextToSpeechWebSocketResponse(string audio, bool? 
isFinal, Alignment normalizedAlignment, Alignment alignment) + { + Audio = audio; + IsFinal = isFinal; + NormalizedAlignment = normalizedAlignment; + Alignment = alignment; + AudioBytes = audio != null ? Convert.FromBase64String(audio) : null; + } +} \ No newline at end of file From 94c252989c6c0f7082104795201d4910005100b6 Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Fri, 26 Jul 2024 11:34:04 +0200 Subject: [PATCH 2/6] - adding the turbo 2.5 model --- ElevenLabs-DotNet/Models/Model.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ElevenLabs-DotNet/Models/Model.cs b/ElevenLabs-DotNet/Models/Model.cs index a1bf3c7..658c474 100644 --- a/ElevenLabs-DotNet/Models/Model.cs +++ b/ElevenLabs-DotNet/Models/Model.cs @@ -93,6 +93,9 @@ public Model(string id) /// /// Our state-of-the-art speech to speech model suitable for scenarios where you need maximum control over the content and prosody of your generations. /// + [JsonIgnore] + public static Model TurboV25 { get; } = new("eleven_turbo_v2_5"); + [JsonIgnore] public static Model EnglishSpeechToSpeechV2 { get; } = new("eleven_english_sts_v2"); From dd0de4fcaacd5383458c342e00e85bc2ddf2e7a9 Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Wed, 31 Jul 2024 11:08:39 +0200 Subject: [PATCH 3/6] - added several checks and documentation, made the custom WebSocket version into a spawner function for re-using it --- ElevenLabs-DotNet/ElevenLabsClient.cs | 37 +++-- .../TextToSpeechWebSocketEndpoint.cs | 132 ++++++++++++------ .../TextToSpeechWebSocketRequest.cs | 15 +- 3 files changed, 120 insertions(+), 64 deletions(-) diff --git a/ElevenLabs-DotNet/ElevenLabsClient.cs b/ElevenLabs-DotNet/ElevenLabsClient.cs index 3a97963..b3c24e7 100644 --- a/ElevenLabs-DotNet/ElevenLabsClient.cs +++ b/ElevenLabs-DotNet/ElevenLabsClient.cs @@ -31,7 +31,7 @@ public sealed class ElevenLabsClient : IDisposable /// Optional, for specifying a proxy domain. /// /// Optional, . - /// Optional, . + /// Optional, to create custom versions of . 
/// Raised when authentication details are missing or invalid. /// /// implements @@ -46,7 +46,8 @@ public sealed class ElevenLabsClient : IDisposable /// If you provide an external HttpClient instance to ElevenLabsClient, you are responsible for managing its disposal. /// public ElevenLabsClient(ElevenLabsAuthentication authentication = null, - ElevenLabsClientSettings settings = null, HttpClient httpClient = null, ClientWebSocket webSocketClient = null) + ElevenLabsClientSettings settings = null, HttpClient httpClient = null, + Func clientWebSocketSpawner = null) { ElevenLabsAuthentication = authentication ?? ElevenLabsAuthentication.Default; ElevenLabsClientSettings = settings ?? ElevenLabsClientSettings.Default; @@ -69,20 +70,12 @@ public ElevenLabsClient(ElevenLabsAuthentication authentication = null, isCustomClient = true; } - if (webSocketClient == null) - { - webSocketClient = new ClientWebSocket(); - } - else - { - isCustomWebSocketClient = true; - } - Client = httpClient; Client.DefaultRequestHeaders.Add("User-Agent", "ElevenLabs-DotNet"); Client.DefaultRequestHeaders.Add("xi-api-key", ElevenLabsAuthentication.ApiKey); - WebSocketClient = webSocketClient; + this.clientWebSocketSpawner = clientWebSocketSpawner; + WebSocketClient = clientWebSocketSpawner == null ? new ClientWebSocket() : clientWebSocketSpawner(); WebSocketClient.Options.SetRequestHeader("User-Agent", "ElevenLabs-DotNet"); WebSocketClient.Options.SetRequestHeader("xi-api-key", ElevenLabsAuthentication.ApiKey); @@ -98,6 +91,14 @@ public ElevenLabsClient(ElevenLabsAuthentication authentication = null, DubbingEndpoint = new DubbingEndpoint(this); } + public void ReinitializeWebSocketClient() + { + WebSocketClient.Dispose(); + WebSocketClient = clientWebSocketSpawner == null ? 
new ClientWebSocket() : clientWebSocketSpawner(); + WebSocketClient.Options.SetRequestHeader("User-Agent", "ElevenLabs-DotNet"); + WebSocketClient.Options.SetRequestHeader("xi-api-key", ElevenLabsAuthentication.ApiKey); + } + ~ElevenLabsClient() { Dispose(false); @@ -121,12 +122,8 @@ private void Dispose(bool disposing) { Client?.Dispose(); } - - if (!isCustomWebSocketClient) - { - WebSocketClient?.Dispose(); - } + WebSocketClient?.Dispose(); isDisposed = true; } } @@ -135,7 +132,7 @@ private void Dispose(bool disposing) private bool isCustomClient; - private bool isCustomWebSocketClient; + private Func clientWebSocketSpawner; /// /// to use when making calls to the API. @@ -145,7 +142,7 @@ private void Dispose(bool disposing) /// /// to use when making calls to the API. /// - internal ClientWebSocket WebSocketClient { get; } + internal ClientWebSocket WebSocketClient { get; private set; } /// /// The to use when making calls to the API. @@ -178,7 +175,7 @@ private void Dispose(bool disposing) public HistoryEndpoint HistoryEndpoint { get; } public TextToSpeechEndpoint TextToSpeechEndpoint { get; } - + public TextToSpeechWebSocketEndpoint TextToSpeechWebSocketEndpoint { get; } public VoiceGenerationEndpoint VoiceGenerationEndpoint { get; } diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs index aef6616..409eae0 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs @@ -13,7 +13,7 @@ namespace ElevenLabs.TextToSpeech; /// -/// Access to convert text to synthesized speech. +/// Access to convert text to synthesized speech using a WebSocket connection. /// public sealed class TextToSpeechWebSocketEndpoint : ElevenLabsBaseEndPoint { @@ -64,7 +64,8 @@ public TextToSpeechWebSocketEndpoint(ElevenLabsClient client) : base(client) /// (best latency, but can mispronounce eg numbers and dates). 
/// /// Optional, . - /// . + /// Raised when is null or empty. + /// Raised when is null. public async Task StartTextToSpeechAsync(Voice voice, Func partialClipCallback, VoiceSettings voiceSettings = null, GenerationConfig generationConfig = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, bool? enableLogging = null, @@ -115,69 +116,114 @@ await client.WebSocketClient.SendAsync(firstMessageRequest.ToArraySegment(), Web cancellationToken); } + /// + /// Sends text to the WebSocket for speech synthesis. + /// + /// Text input to synthesize speech for. Needs to end with a space and cannot be null or empty. + /// + /// Forces the generation of audio. Set this value to true when you have finished sending text, but + /// want to keep the websocket connection open. + /// + /// + /// Use this to attempt to immediately trigger the generation of audio. Most users + /// shouldn't use this. + /// + /// Optional, . + /// Raised when the WebSocket is not open. + /// Raised when is null or empty. public async Task SendTextToSpeechAsync(string text, bool? flush = null, bool tryTriggerGeneration = false, CancellationToken cancellationToken = default) { + if (client.WebSocketClient.State != WebSocketState.Open) + { + throw new InvalidOperationException("WebSocket is not open!"); + } + + if (string.IsNullOrWhiteSpace(text)) + { + throw new ArgumentNullException($"{nameof(text)} cannot be null or empty!"); + } + TextToSpeechWebSocketRequest request = new(text, flush, tryTriggerGeneration); await client.WebSocketClient.SendAsync(request.ToArraySegment(), WebSocketMessageType.Text, true, cancellationToken); } + /// + /// Closes the text to speech WebSocket connection. + /// + /// Optional, . + /// Raised when the WebSocket is not open. 
public async Task EndTextToSpeechAsync(CancellationToken cancellationToken = default) { + if (client.WebSocketClient.State != WebSocketState.Open) + { + throw new InvalidOperationException("WebSocket is not open!"); + } + TextToSpeechWebSocketLastMessageRequest lastMessageRequest = new(); await client.WebSocketClient.SendAsync(lastMessageRequest.ToArraySegment(), WebSocketMessageType.Text, true, cancellationToken); + await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, string.Empty, cancellationToken); } private async Task ReceiveMessagesAsync(Func partialClipCallback, Voice voice, CancellationToken cancellationToken) { - byte[] buffer = new byte[8192]; - StringBuilder message = new(); - - while (client.WebSocketClient.State == WebSocketState.Open) + try { - WebSocketReceiveResult receiveResult = await client.WebSocketClient.ReceiveAsync( - new ArraySegment(buffer), cancellationToken); - - if (receiveResult.MessageType == WebSocketMessageType.Close) - { - await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, string.Empty, - cancellationToken); - break; - } - - string jsonString = Encoding.UTF8.GetString(buffer, 0, receiveResult.Count); - message.Append(jsonString); - - if (!receiveResult.EndOfMessage) - { - continue; - } - - TextToSpeechWebSocketResponse response = JsonSerializer.Deserialize( - message.ToString(), ElevenLabsClient.JsonSerializationOptions); + byte[] buffer = new byte[8192]; + StringBuilder message = new(); - if (response == null) + while (client.WebSocketClient.State == WebSocketState.Open) { - throw new ArgumentException("Failed to parse response!"); - } - - message.Clear(); - - if (!string.IsNullOrWhiteSpace(response.Audio)) - { - string text = response.Alignment is { Chars: not null } - ? 
string.Concat(response.Alignment.Chars) - : null; - VoiceClip voiceClip = new(string.Empty, text, voice, response.AudioBytes); - await partialClipCallback(voiceClip).ConfigureAwait(false); - } - else - { - await partialClipCallback(null).ConfigureAwait(false); + WebSocketReceiveResult receiveResult = await client.WebSocketClient.ReceiveAsync( + new ArraySegment(buffer), cancellationToken); + + Console.WriteLine($"{receiveResult.MessageType} - {receiveResult.Count}"); + + if (receiveResult.MessageType == WebSocketMessageType.Close) + { + await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, string.Empty, + cancellationToken); + break; + } + + string jsonString = Encoding.UTF8.GetString(buffer, 0, receiveResult.Count); + message.Append(jsonString); + + if (!receiveResult.EndOfMessage) + { + continue; + } + + TextToSpeechWebSocketResponse response = JsonSerializer.Deserialize( + message.ToString(), ElevenLabsClient.JsonSerializationOptions); + + if (response == null) + { + throw new ArgumentException("Failed to parse response!"); + } + + message.Clear(); + + if (!string.IsNullOrWhiteSpace(response.Audio)) + { + string text = response.Alignment is { Chars: not null } + ? 
string.Concat(response.Alignment.Chars) + : null; + VoiceClip voiceClip = new(string.Empty, text, voice, response.AudioBytes); + await partialClipCallback(voiceClip); + } + else + { + await partialClipCallback(null); + } } } + finally + { + client.ReinitializeWebSocketClient(); + } } } \ No newline at end of file diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs index 9c59cf3..379f17b 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs @@ -9,6 +9,19 @@ namespace ElevenLabs.TextToSpeech; public sealed class TextToSpeechWebSocketRequest { + /// + /// Text needs to end with a space and cannot be null or empty. + /// + /// The text to be converted to speech. Needs to end with a space, cannot be null or empty. + /// + /// Forces the generation of audio. Set this value to true when you have finished sending text, but + /// want to keep the websocket connection open. + /// + /// + /// Use this to attempt to immediately trigger the generation of audio. Most users + /// shouldn't use this. + /// + /// Thrown when is null or empty. public TextToSpeechWebSocketRequest(string text, bool? flush = null, bool tryTriggerGeneration = false) { if (string.IsNullOrWhiteSpace(text)) @@ -40,7 +53,7 @@ public TextToSpeechWebSocketRequest(string text, bool? flush = null, bool tryTri /// [JsonPropertyName("flush"), JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public bool? 
Flush { get; } - + public ArraySegment ToArraySegment() { string json = JsonSerializer.Serialize(this); From fed407fc407f9d2d5363114c636527be3d7c6121 Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Wed, 31 Jul 2024 11:13:47 +0200 Subject: [PATCH 4/6] - removing debug output --- ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs index 409eae0..929f662 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs @@ -180,8 +180,6 @@ private async Task ReceiveMessagesAsync(Func partialClipCallbac WebSocketReceiveResult receiveResult = await client.WebSocketClient.ReceiveAsync( new ArraySegment(buffer), cancellationToken); - Console.WriteLine($"{receiveResult.MessageType} - {receiveResult.Count}"); - if (receiveResult.MessageType == WebSocketMessageType.Close) { await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, string.Empty, From b44389f6e107c6cab47f61ff59af730fb609b7b8 Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Tue, 22 Oct 2024 15:18:57 +0200 Subject: [PATCH 5/6] Rebasing on main. --- .../TextToSpeech/TextToSpeechWebSocketEndpoint.cs | 6 +++++- .../TextToSpeech/TextToSpeechWebSocketRequest.cs | 5 ----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs index 929f662..9fb38f5 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs @@ -85,7 +85,7 @@ public async Task StartTextToSpeechAsync(Voice voice, Func part var parameters = new Dictionary { - { ModelIdParameter, model?.Id ?? 
Model.MonoLingualV1.Id }, + { ModelIdParameter, model?.Id ?? Model.EnglishV1.Id }, { OutputFormatParameter, outputFormat.ToString().ToLower() } }; @@ -219,6 +219,10 @@ await client.WebSocketClient.CloseAsync(WebSocketCloseStatus.NormalClosure, stri } } } + catch (Exception e) + { + Console.WriteLine(e); + } finally { client.ReinitializeWebSocketClient(); diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs index 379f17b..df47eb9 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketRequest.cs @@ -24,11 +24,6 @@ public sealed class TextToSpeechWebSocketRequest /// Thrown when is null or empty. public TextToSpeechWebSocketRequest(string text, bool? flush = null, bool tryTriggerGeneration = false) { - if (string.IsNullOrWhiteSpace(text)) - { - throw new ArgumentNullException(nameof(text)); - } - // if the last character of the text is not a space, append one Text = text[^1] != ' ' ? text + ' ' : text; TryTriggerGeneration = tryTriggerGeneration; From 5d4d0d0c48c984191197becdc899f48f0db172cc Mon Sep 17 00:00:00 2001 From: Nico Nonne Date: Tue, 22 Oct 2024 16:01:54 +0200 Subject: [PATCH 6/6] Add the inactivity timeout parameter. Remove the keep-alive messages (as they don't work and stop the connection). And allow the sending of white space messages for testing. 
--- .../TextToSpeechWebSocketEndpoint.cs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs index 9fb38f5..91abc47 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechWebSocketEndpoint.cs @@ -22,6 +22,7 @@ public sealed class TextToSpeechWebSocketEndpoint : ElevenLabsBaseEndPoint private const string EnableSsmlParsingParameter = "enable_ssml_parsing"; private const string OptimizeStreamingLatencyParameter = "optimize_streaming_latency"; private const string OutputFormatParameter = "output_format"; + private const string InactivityTimeoutParameter = "inactivity_timeout"; public TextToSpeechWebSocketEndpoint(ElevenLabsClient client) : base(client) { @@ -63,13 +64,17 @@ public TextToSpeechWebSocketEndpoint(ElevenLabsClient client) : base(client) /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings /// (best latency, but can mispronounce eg numbers and dates). /// + /// + /// The number of seconds that the connection can be inactive before it is automatically closed. + /// Defaults to 20 seconds, with a maximum allowed value of 180 seconds. + /// /// Optional, . /// Raised when is null or empty. /// Raised when is null. public async Task StartTextToSpeechAsync(Voice voice, Func partialClipCallback, VoiceSettings voiceSettings = null, GenerationConfig generationConfig = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, bool? enableLogging = null, - bool? enableSsmlParsing = null, int? optimizeStreamingLatency = null, + bool? enableSsmlParsing = null, int? optimizeStreamingLatency = null, int? 
inactivityTimeout = null, CancellationToken cancellationToken = default) { if (voice == null || @@ -104,6 +109,11 @@ public async Task StartTextToSpeechAsync(Voice voice, Func part parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.ToString()); } + if (inactivityTimeout.HasValue) + { + parameters.Add(InactivityTimeoutParameter, inactivityTimeout.ToString()); + } + await client.WebSocketClient.ConnectAsync( new Uri(GetWebSocketUrl($"/{voice.Id}/stream-input", parameters)), cancellationToken); @@ -139,11 +149,6 @@ public async Task SendTextToSpeechAsync(string text, bool? flush = null, bool tr throw new InvalidOperationException("WebSocket is not open!"); } - if (string.IsNullOrWhiteSpace(text)) - { - throw new ArgumentNullException($"{nameof(text)} cannot be null or empty!"); - } - TextToSpeechWebSocketRequest request = new(text, flush, tryTriggerGeneration); await client.WebSocketClient.SendAsync(request.ToArraySegment(), WebSocketMessageType.Text, true, cancellationToken);