Skip to content

Commit

Permalink
ElevenLabs-DotNet 3.1.0 (#64)
Browse files Browse the repository at this point in the history
- Refactored TextToSpeechEndpoint to accept a TextToSpeechRequest object
  - Added text encoding options to TextToSpeechRequest
  - Added previous text input parameter to TextToSpeechRequest
  • Loading branch information
StephenHodgson authored Sep 14, 2024
1 parent 519611e commit 854a6b0
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 23 deletions.
6 changes: 5 additions & 1 deletion ElevenLabs-DotNet/ElevenLabs-DotNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
<SignAssembly>false</SignAssembly>
<IncludeSymbols>true</IncludeSymbols>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Version>3.0.3</Version>
<Version>3.1.0</Version>
<PackageReleaseNotes>
Version 3.1.0
- Refactored TextToSpeechEndpoint endpoint to accept TextToSpeechRequest object
- Added text encoding options to TextToSpeechRequest
- Added previous text input parameter to TextToSpeechRequest
Version 3.0.3
- Fix DubbingRequest.DropBackgroundAudio flag not properly being set
- Added DubbingRequest.UseProfanityFilter flag
Expand Down
43 changes: 24 additions & 19 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -55,7 +56,7 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { }
/// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
/// 3 - max latency optimizations<br/>
/// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
/// (best latency, but can mispronounce eg numbers and dates).
/// (best latency, but can mispronounce e.g. numbers and dates).
/// </param>
/// <param name="partialClipCallback">
/// Optional, Callback to enable streaming audio as it comes in.<br/>
Expand All @@ -65,30 +66,34 @@ public TextToSpeechEndpoint(ElevenLabsClient client) : base(client) { }
/// <returns><see cref="VoiceClip"/>.</returns>
public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Func<VoiceClip, Task> partialClipCallback = null, CancellationToken cancellationToken = default)
{
    // Fail fast on bad text before any network call is made.
    // A null/blank text would otherwise surface as a NullReferenceException on text.Length.
    if (string.IsNullOrWhiteSpace(text))
    {
        throw new ArgumentNullException(nameof(text));
    }

    if (text.Length > 5000)
    {
        throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
    }

    // voice.Settings is dereferenced below, so the voice has to be validated here
    // rather than relying on the request constructor to do it later.
    if (voice == null ||
        string.IsNullOrWhiteSpace(voice.Id))
    {
        throw new ArgumentNullException(nameof(voice));
    }

    // Resolution order: explicit settings, then the voice's own settings, then the account defaults.
    var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken).ConfigureAwait(false);

    // Serialization and the HTTP call are owned by the request-based overload;
    // no payload is built here.
    var request = new TextToSpeechRequest(voice, text, Encoding.UTF8, defaultVoiceSettings, outputFormat, optimizeStreamingLatency, model);
    return await TextToSpeechAsync(request, partialClipCallback, cancellationToken).ConfigureAwait(false);
}

/// <summary>
/// Converts text into speech using a voice of your choice and returns audio.
/// </summary>
/// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
/// <param name="partialClipCallback">
/// Optional, Callback to enable streaming audio as it comes in.<br/>
/// Returns partial <see cref="VoiceClip"/>.
/// </param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="VoiceClip"/>.</returns>
public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func<VoiceClip, Task> partialClipCallback = null, CancellationToken cancellationToken = default)
{
using var payload = JsonSerializer.Serialize(request, ElevenLabsClient.JsonSerializationOptions).ToJsonStringContent();
var parameters = new Dictionary<string, string>
{
{ OutputFormatParameter, outputFormat.ToString().ToLower() }
{ OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
};

if (optimizeStreamingLatency.HasValue)
if (request.OptimizeStreamingLatency.HasValue)
{
parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.ToString());
parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
}

using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters));
using var postRequest = new HttpRequestMessage(HttpMethod.Post, GetUrl($"/{request.Voice.Id}{(partialClipCallback == null ? string.Empty : "/stream")}", parameters));
postRequest.Content = payload;
var requestOption = partialClipCallback == null
? HttpCompletionOption.ResponseContentRead
Expand Down Expand Up @@ -116,7 +121,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
{
try
{
await partialClipCallback(new VoiceClip(clipId, text, voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false);
await partialClipCallback(new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), totalBytesRead, bytesRead))).ConfigureAwait(false);
}
catch (Exception e)
{
Expand All @@ -127,7 +132,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
totalBytesRead += bytesRead;
}

return new VoiceClip(clipId, text, voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead));
return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead));
}
}
}
87 changes: 84 additions & 3 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,90 @@
using ElevenLabs.Models;
using ElevenLabs.Voices;
using System;
using System.Text;
using System.Text.Json.Serialization;

namespace ElevenLabs.TextToSpeech
{
public sealed class TextToSpeechRequest
{
/// <summary>
/// Legacy constructor that takes no <see cref="Voice"/>.
/// </summary>
/// <param name="text">Text input to synthesize speech for.</param>
/// <param name="model">Optional, <see cref="Model"/> to use.</param>
/// <param name="voiceSettings"><see cref="VoiceSettings"/> to use.</param>
/// <remarks>
/// NOTE(review): this overload forwards a null voice to the primary constructor,
/// which throws <see cref="ArgumentNullException"/> for a null voice, so as written
/// it cannot complete successfully. Prefer the overload that accepts a <see cref="Voice"/>.
/// </remarks>
public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings)
    : this(null, text, voiceSettings: voiceSettings, model: model)
{
}

/// <summary>
/// Creates a text to speech request for the given <paramref name="voice"/> and <paramref name="text"/>.
/// </summary>
/// <param name="voice">
/// <see cref="Voice"/> to use. Must not be null and must have a non-empty id.
/// </param>
/// <param name="text">
/// Text input to synthesize speech for. Must not be null or whitespace. Maximum 5000 characters.
/// </param>
/// <param name="encoding">
/// Optional, <see cref="Encoding"/> to use for <paramref name="text"/>.
/// When it is set and is not <see cref="Encoding.UTF8"/>, the text is converted to that
/// encoding's bytes and decoded back as UTF-8.
/// </param>
/// <param name="voiceSettings">
/// Optional, <see cref="VoiceSettings"/> that will override the default settings in <see cref="Voice.Settings"/>.
/// </param>
/// <param name="outputFormat">
/// Output format of the generated audio.<br/>
/// Defaults to <see cref="OutputFormat.MP3_44100_128"/>
/// </param>
/// <param name="optimizeStreamingLatency">
/// Optional, You can turn on latency optimizations at some cost of quality.
/// The best possible final latency varies by model.<br/>
/// Possible values:<br/>
/// 0 - default mode (no latency optimizations)<br/>
/// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)<br/>
/// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
/// 3 - max latency optimizations<br/>
/// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
/// (best latency, but can mispronounce e.g. numbers and dates).
/// </param>
/// <param name="model">
/// Optional, <see cref="Model"/> to use. Defaults to <see cref="Models.Model.MultiLingualV2"/>.
/// </param>
/// <param name="previousText">
/// Optional, previous text input, serialized as <c>previous_text</c> and omitted when null.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="text"/> is null or whitespace, <paramref name="voice"/> is null or has no id,
/// or neither <paramref name="voiceSettings"/> nor <see cref="Voice.Settings"/> is available.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="text"/> is longer than 5000 characters.
/// </exception>
public TextToSpeechRequest(
    Voice voice,
    string text,
    Encoding encoding = null,
    VoiceSettings voiceSettings = null,
    OutputFormat outputFormat = OutputFormat.MP3_44100_128,
    int? optimizeStreamingLatency = null,
    Model model = null,
    string previousText = null)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        throw new ArgumentNullException(nameof(text));
    }

    if (text.Length > 5000)
    {
        throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
    }

    if (voice == null ||
        string.IsNullOrWhiteSpace(voice.Id))
    {
        throw new ArgumentNullException(nameof(voice));
    }

    if (encoding?.Equals(Encoding.UTF8) == false)
    {
        // NOTE(review): this decodes bytes produced by another encoding as UTF-8, which
        // looks like it can alter non-ASCII text — confirm this is the intended conversion.
        text = Encoding.UTF8.GetString(encoding.GetBytes(text));
    }

    Text = text;
    Model = model ?? Models.Model.MultiLingualV2;
    Voice = voice;
    // Explicit settings win; otherwise fall back to the voice's own settings before giving up.
    VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings));
    PreviousText = previousText;
    OutputFormat = outputFormat;
    OptimizeStreamingLatency = optimizeStreamingLatency;
}

[JsonPropertyName("text")]
Expand All @@ -27,7 +95,20 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings
/// <summary>
/// Model identifier, serialized as <c>model_id</c>.
/// </summary>
[JsonPropertyName("model_id")]
public string Model { get; }

/// <summary>
/// <see cref="Voice"/> the request targets. Excluded from the JSON body;
/// the endpoint uses its id to build the request URL.
/// </summary>
[JsonIgnore]
public Voice Voice { get; }

/// <summary>
/// <see cref="VoiceSettings"/> applied to this request, serialized as <c>voice_settings</c>.
/// </summary>
[JsonPropertyName("voice_settings")]
public VoiceSettings VoiceSettings { get; }

/// <summary>
/// Previous text input, serialized as <c>previous_text</c>. Omitted from the JSON body when null.
/// </summary>
[JsonPropertyName("previous_text")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public string PreviousText { get; }

/// <summary>
/// Output format of the generated audio. Excluded from the JSON body;
/// the endpoint sends it as a query parameter.
/// </summary>
[JsonIgnore]
public OutputFormat OutputFormat { get; }

/// <summary>
/// Optional latency optimization level. Excluded from the JSON body;
/// the endpoint sends it as a query parameter when it has a value.
/// </summary>
[JsonIgnore]
public int? OptimizeStreamingLatency { get; }
}
}

0 comments on commit 854a6b0

Please sign in to comment.