RageAgainstThePixel · StephenHodgson · Sep 14, 2024 · Sep 14, 2024
@@ -25,8 +25,12 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
     <SignAssembly>false</SignAssembly>
     <IncludeSymbols>true</IncludeSymbols>
     <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
-    <Version>3.0.3</Version>
+    <Version>3.1.0</Version>
     <PackageReleaseNotes>
+Version 3.1.0
+- Refactored TextToSpeechEndpoint to accept a TextToSpeechRequest object
+  - Added text encoding options to TextToSpeechRequest
+  - Added previous text input parameter to TextToSpeechRequest
 Version 3.0.3
 - Fix DubbingRequest.DropBackgroundAudio flag not properly being set
 - Added DubbingRequest.UseProfanityFilter flag

@@ -8,6 +8,7 @@
 using System.IO;
 using System.Linq;
 using System.Net.Http;
+using System.Text;
 using System.Text.Json;
 using System.Threading;
 using System.Threading.Tasks;
@@ -55,7 +56,7 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { }
         /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
         /// 3 - max latency optimizations<br/>
         /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
-        /// (best latency, but can mispronounce eg numbers and dates).
+        /// (best latency, but can mispronounce e.g. numbers and dates).
         /// </param>
         /// <param name="partialClipCallback">
         /// Optional, Callback to enable streaming audio as it comes in.<br/>
@@ -65,30 +66,34 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { }
         /// <returns><see cref="VoiceClip"/>.</returns>
         public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Func<VoiceClip, Task> partialClipCallback = null, CancellationToken cancellationToken = default)
         {
-            if (text.Length > 5000)
-            {
-                throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
-            }
-
+            // The voice guard stays in this overload: voice.Settings is dereferenced below, before
+            // TextToSpeechRequest can validate the voice. Without it a null voice surfaces as a
+            // NullReferenceException, and a blank voice id is only rejected after the default
+            // voice settings have already been requested.
             if (voice == null ||
                 string.IsNullOrWhiteSpace(voice.Id))
             {
                 throw new ArgumentNullException(nameof(voice));
             }
 
-            var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);
-            using var payload = JsonSerializer.Serialize(new TextToSpeechRequest(text, model, defaultVoiceSettings)).ToJsonStringContent();
+            // Text validation (null/whitespace and the 5000 character limit) is done by the TextToSpeechRequest constructor.
+            var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken).ConfigureAwait(false);
+            return await TextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, defaultVoiceSettings, outputFormat, optimizeStreamingLatency, model), partialClipCallback, cancellationToken).ConfigureAwait(false);
+        }
+
+        /// <summary>
+        /// Converts text into speech using a voice of your choice and returns audio.
+        /// </summary>
+        /// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
+        /// <param name="partialClipCallback">
+        /// Optional, Callback to enable streaming audio as it comes in.<br/>
+        /// Returns partial <see cref="VoiceClip"/>.
+        /// </param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="VoiceClip"/>.</returns>
+        public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func<VoiceClip, Task> partialClipCallback = null, CancellationToken cancellationToken = default)
+        {
+            using var payload = JsonSerializer.Serialize(request, ElevenLabsClient.JsonSerializationOptions).ToJsonStringContent();
             var parameters = new Dictionary<string, string>
             {
-                { OutputFormatParameter, outputFormat.ToString().ToLower() }
+                { OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
             };
 
-            if (optimizeStreamingLatency.HasValue)
+            if (request.OptimizeStreamingLatency.HasValue)
             {
-                parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.ToString());
+                parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
             }
 
-            using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters));
+            using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{request.Voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters));
             postRequest.Content = payload;
             var requestOption = partialClipCallback == null
                 ? HttpCompletionOption.ResponseContentRead
@@ -116,7 +121,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
                 {
                     try
                     {
-                        await partialClipCallback(new VoiceClip(clipId, text, voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false);
+                        await partialClipCallback(new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false);
                     }
                     catch (Exception e)
                     {
@@ -127,7 +132,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
                 totalBytesRead += bytesRead;
             }
 
-            return new VoiceClip(clipId, text, voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead));
+            return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead));
         }
     }
 }
@@ -3,22 +3,90 @@
 using ElevenLabs.Models;
 using ElevenLabs.Voices;
 using System;
+using System.Text;
 using System.Text.Json.Serialization;
 
 namespace ElevenLabs.TextToSpeech
 {
     public sealed class TextToSpeechRequest
     {
+        /// <summary>
+        /// Legacy constructor kept for source compatibility.
+        /// </summary>
+        /// <remarks>
+        /// This overload has no voice to forward, so it passes <c>null</c> as the voice to the
+        /// primary constructor, which rejects a null voice. As a result this constructor always throws.
+        /// </remarks>
+        /// <param name="text">Text input to synthesize speech for.</param>
+        /// <param name="model">Optional, <see cref="Models.Model"/> to use.</param>
+        /// <param name="voiceSettings"><see cref="VoiceSettings"/> to use.</param>
+        /// <exception cref="ArgumentNullException">Always thrown, unless text validation fails first.</exception>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="text"/> exceeds 5000 characters.</exception>
+        [Obsolete("This constructor cannot supply a Voice and always throws. Use the constructor that accepts a Voice instead.")]
-        public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings)
+        public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings) :
+            this(null, text, voiceSettings: voiceSettings, model: model)
+        {
+        }
+
+        /// <summary>
+        /// Creates a text to speech request for the specified <paramref name="voice"/>.
+        /// </summary>
+        /// <param name="voice">
+        /// <see cref="Voice"/> to use. Must not be null and must have a non-empty id.
+        /// </param>
+        /// <param name="text">
+        /// Text input to synthesize speech for. Maximum 5000 characters.
+        /// </param>
+        /// <param name="encoding">
+        /// Optional, <see cref="Encoding"/> of <paramref name="text"/>.
+        /// When supplied and not UTF-8, the text is transcoded to UTF-8.
+        /// </param>
+        /// <param name="voiceSettings">
+        /// Optional, <see cref="VoiceSettings"/> that will override the default settings in <see cref="Voice.Settings"/>.
+        /// </param>
+        /// <param name="model">
+        /// Optional, <see cref="Models.Model"/> to use. Defaults to <see cref="Models.Model.MultiLingualV2"/>.
+        /// </param>
+        /// <param name="outputFormat">
+        /// Output format of the generated audio.<br/>
+        /// Defaults to <see cref="OutputFormat.MP3_44100_128"/>
+        /// </param>
+        /// <param name="optimizeStreamingLatency">
+        /// Optional, You can turn on latency optimizations at some cost of quality.
+        /// The best possible final latency varies by model.<br/>
+        /// Possible values:<br/>
+        /// 0 - default mode (no latency optimizations)<br/>
+        /// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)<br/>
+        /// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
+        /// 3 - max latency optimizations<br/>
+        /// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
+        /// (best latency, but can mispronounce e.g. numbers and dates).
+        /// </param>
+        /// <param name="previousText">
+        /// Optional, previous text input. Serialized as <c>previous_text</c> and omitted from the payload when null.
+        /// </param>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="text"/> is null or whitespace, <paramref name="voice"/> is null or has no id,
+        /// or neither <paramref name="voiceSettings"/> nor <see cref="Voice.Settings"/> is set.
+        /// </exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="text"/> exceeds 5000 characters.
+        /// </exception>
+        public TextToSpeechRequest(
+            Voice voice,
+            string text,
+            Encoding encoding = null,
+            VoiceSettings voiceSettings = null,
+            OutputFormat outputFormat = OutputFormat.MP3_44100_128,
+            int? optimizeStreamingLatency = null,
+            Model model = null,
+            string previousText = null)
         {
             if (string.IsNullOrWhiteSpace(text))
             {
                 throw new ArgumentNullException(nameof(text));
             }
 
+            if (text.Length > 5000)
+            {
+                throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
+            }
+
+            if (voice == null ||
+                string.IsNullOrWhiteSpace(voice.Id))
+            {
+                throw new ArgumentNullException(nameof(voice));
+            }
+
+            if (encoding != null && !encoding.Equals(Encoding.UTF8))
+            {
+                // Transcode the source bytes to UTF-8 before decoding them as UTF-8. Decoding the
+                // source-encoded bytes directly as UTF-8 corrupts the text for encodings that are
+                // not byte-compatible with UTF-8 (e.g. UTF-16 or UTF-32).
+                var utf8Bytes = Encoding.Convert(encoding, Encoding.UTF8, encoding.GetBytes(text));
+                text = Encoding.UTF8.GetString(utf8Bytes);
+            }
+
             Text = text;
-            Model = model ?? Models.Model.EnglishV1;
-            VoiceSettings = voiceSettings ?? throw new ArgumentNullException(nameof(voiceSettings));
+            Model = model ?? Models.Model.MultiLingualV2;
+            Voice = voice;
+            // Explicit settings win, then the voice's own settings; with neither, the request is rejected.
+            VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings));
+            PreviousText = previousText;
+            OutputFormat = outputFormat;
+            OptimizeStreamingLatency = optimizeStreamingLatency;
         }
 
         [JsonPropertyName("text")]
@@ -27,7 +95,20 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings
         [JsonPropertyName("model_id")]
         public string Model { get; }
 
+        /// <summary>
+        /// <see cref="Voices.Voice"/> the request targets. Not serialized into the JSON payload.
+        /// </summary>
+        [JsonIgnore]
+        public Voice Voice { get; }
+
+        /// <summary>
+        /// <see cref="Voices.VoiceSettings"/> for the request, serialized as <c>voice_settings</c>.
+        /// </summary>
         [JsonPropertyName("voice_settings")]
         public VoiceSettings VoiceSettings { get; }
+
+        /// <summary>
+        /// Previous text input, serialized as <c>previous_text</c>. Omitted from the payload when null.
+        /// </summary>
+        [JsonPropertyName("previous_text")]
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
+        public string PreviousText { get; }
+
+        /// <summary>
+        /// Output format of the generated audio. Not serialized into the JSON payload.
+        /// </summary>
+        [JsonIgnore]
+        public OutputFormat OutputFormat { get; }
+
+        /// <summary>
+        /// Optional streaming latency optimization level. Not serialized into the JSON payload.
+        /// </summary>
+        [JsonIgnore]
+        public int? OptimizeStreamingLatency { get; }
     }
 }