From 854a6b06b3a3d95f9311ce630d420d6d48b847dc Mon Sep 17 00:00:00 2001 From: Stephen Hodgson Date: Sat, 14 Sep 2024 17:23:01 -0400 Subject: [PATCH] ElevenLabs-DotNet 3.1.0 (#64) - Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object - Added text encoding options to TextToSpeechRequest - Added previous text input parameter to TextToSpeechRequest --- ElevenLabs-DotNet/ElevenLabs-DotNet.csproj | 6 +- .../TextToSpeech/TextToSpeechEndpoint.cs | 43 +++++---- .../TextToSpeech/TextToSpeechRequest.cs | 87 ++++++++++++++++++- 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj b/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj index cdfde1a..08a3542 100644 --- a/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj +++ b/ElevenLabs-DotNet/ElevenLabs-DotNet.csproj @@ -25,8 +25,12 @@ All copyrights, trademarks, logos, and assets are the property of their respecti false true true - 3.0.3 + 3.1.0 +Version 3.1.0 +- Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object + - Added text encoding options to TextToSpeechRequest + - Added previous text input parameter to TextToSpeechRequest Version 3.0.3 - Fix DubbingRequest.DropBackgroundAudio flag not properly being set - Added DubbingRequest.UseProfanityFilter flag diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs index e036fa1..bfe7098 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs @@ -8,6 +8,7 @@ using System.IO; using System.Linq; using System.Net.Http; +using System.Text; using System.Text.Json; using System.Threading; using System.Threading.Tasks; @@ -55,7 +56,7 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { } /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
/// 3 - max latency optimizations
/// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings - /// (best latency, but can mispronounce eg numbers and dates). + /// (best latency, but can mispronounce e.g. numbers and dates). /// /// /// Optional, Callback to enable streaming audio as it comes in.
@@ -65,30 +66,34 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { } /// . public async Task TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Func partialClipCallback = null, CancellationToken cancellationToken = default) { - if (text.Length > 5000) - { - throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters"); - } - - if (voice == null || - string.IsNullOrWhiteSpace(voice.Id)) - { - throw new ArgumentNullException(nameof(voice)); - } - var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken); - using var payload = JsonSerializer.Serialize(new TextToSpeechRequest(text, model, defaultVoiceSettings)).ToJsonStringContent(); + return await TextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, defaultVoiceSettings, outputFormat, optimizeStreamingLatency, model), partialClipCallback, cancellationToken).ConfigureAwait(false); + } + + /// + /// Converts text into speech using a voice of your choice and returns audio. + /// + /// . + /// + /// Optional, Callback to enable streaming audio as it comes in.
+ /// Returns partial . + /// + /// Optional, . + /// . + public async Task TextToSpeechAsync(TextToSpeechRequest request, Func partialClipCallback = null, CancellationToken cancellationToken = default) + { + using var payload = JsonSerializer.Serialize(request, ElevenLabsClient.JsonSerializationOptions).ToJsonStringContent(); var parameters = new Dictionary { - { OutputFormatParameter, outputFormat.ToString().ToLower() } + { OutputFormatParameter, request.OutputFormat.ToString().ToLower() } }; - if (optimizeStreamingLatency.HasValue) + if (request.OptimizeStreamingLatency.HasValue) { - parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.ToString()); + parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString()); } - using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters)); + using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{request.Voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters)); postRequest.Content = payload; var requestOption = partialClipCallback == null ? 
HttpCompletionOption.ResponseContentRead @@ -116,7 +121,7 @@ public async Task TextToSpeechAsync(string text, Voice voice, VoiceSe { try { - await partialClipCallback(new VoiceClip(clipId, text, voice, new ReadOnlyMemory(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false); + await partialClipCallback(new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false); } catch (Exception e) { @@ -127,7 +132,7 @@ public async Task TextToSpeechAsync(string text, Voice voice, VoiceSe totalBytesRead += bytesRead; } - return new VoiceClip(clipId, text, voice, new ReadOnlyMemory(memoryStream.GetBuffer(), 0, totalBytesRead)); + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(memoryStream.GetBuffer(), 0, totalBytesRead)); } } } diff --git a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs index 42e6acb..1ca2a29 100644 --- a/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs +++ b/ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs @@ -3,22 +3,90 @@ using ElevenLabs.Models; using ElevenLabs.Voices; using System; +using System.Text; using System.Text.Json.Serialization; namespace ElevenLabs.TextToSpeech { public sealed class TextToSpeechRequest { - public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings) + public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings) : + this(null, text, voiceSettings: voiceSettings, model: model) + { + } + + /// + /// Constructor. + /// + /// + /// to use. + /// + /// + /// Text input to synthesize speech for. Maximum 5000 characters. + /// + /// to use for . + /// + /// Optional, that will override the default settings in . + /// + /// + /// Optional, to use. Defaults to . + /// + /// + /// Output format of the generated audio.
+ /// Defaults to <see cref="OutputFormat.MP3_44100_128"/> + /// + /// + /// Optional, You can turn on latency optimizations at some cost of quality. + /// The best possible final latency varies by model.
+ /// Possible values:
+ /// 0 - default mode (no latency optimizations)
+ /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)
+ /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)
+ /// 3 - max latency optimizations
+ /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings + /// (best latency, but can mispronounce e.g. numbers and dates). + /// + /// + /// + /// + public TextToSpeechRequest( + Voice voice, + string text, + Encoding encoding = null, + VoiceSettings voiceSettings = null, + OutputFormat outputFormat = OutputFormat.MP3_44100_128, + int? optimizeStreamingLatency = null, + Model model = null, + string previousText = null) { if (string.IsNullOrWhiteSpace(text)) { throw new ArgumentNullException(nameof(text)); } + if (text.Length > 5000) + { + throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters"); + } + + if (voice == null || + string.IsNullOrWhiteSpace(voice.Id)) + { + throw new ArgumentNullException(nameof(voice)); + } + + if (encoding?.Equals(Encoding.UTF8) == false) + { + text = Encoding.UTF8.GetString(encoding.GetBytes(text)); + } + Text = text; - Model = model ?? Models.Model.EnglishV1; - VoiceSettings = voiceSettings ?? throw new ArgumentNullException(nameof(voiceSettings)); + Model = model ?? Models.Model.MultiLingualV2; + Voice = voice; + VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings)); + PreviousText = previousText; + OutputFormat = outputFormat; + OptimizeStreamingLatency = optimizeStreamingLatency; } [JsonPropertyName("text")] @@ -27,7 +95,20 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings [JsonPropertyName("model_id")] public string Model { get; } + [JsonIgnore] + public Voice Voice { get; } + [JsonPropertyName("voice_settings")] public VoiceSettings VoiceSettings { get; } + + [JsonPropertyName("previous_text")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)] + public string PreviousText { get; } + + [JsonIgnore] + public OutputFormat OutputFormat { get; } + + [JsonIgnore] + public int? OptimizeStreamingLatency { get; } } }