Skip to content

Commit f6c6ea7

Browse files
committed
Load/Run pipeline models, Text preprocessor
1 parent 221748c commit f6c6ea7

File tree

4 files changed

+711
-7
lines changed

4 files changed

+711
-7
lines changed

TensorStack.Audio.Windows/AudioInput.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ namespace TensorStack.Audio
1212
/// </summary>
1313
public class AudioInput : AudioInputBase
1414
{
15-
private readonly string _sourceFile;
15+
private string _sourceFile;
1616

1717
/// <summary>
1818
/// Initializes a new instance of the <see cref="AudioInput"/> class.
@@ -21,6 +21,13 @@ public class AudioInput : AudioInputBase
2121
/// <summary>
/// Initializes a new instance of the <see cref="AudioInput"/> class by decoding an audio file.
/// </summary>
/// <param name="filename">The audio file to load.</param>
/// <param name="audioCodec">The audio codec used for decoding.</param>
/// <param name="sampleRate">The target sample rate in Hz.</param>
/// <param name="channels">The target channel count.</param>
public AudioInput(string filename, string audioCodec = "pcm_s16le", int sampleRate = 16000, int channels = 1)
    : this(filename, AudioManager.LoadTensor(filename, audioCodec, sampleRate, channels)) { }
2323

/// <summary>
/// Initializes a new instance of the <see cref="AudioInput"/> class from an in-memory tensor.
/// </summary>
/// <param name="audioTensor">The audio tensor.</param>
public AudioInput(AudioTensor audioTensor)
    : base(audioTensor) { }
30+
2431
/// <summary>
2532
/// Initializes a new instance of the <see cref="AudioInput"/> class.
2633
/// </summary>
@@ -44,6 +51,9 @@ protected AudioInput(string filename, AudioTensor audioTensor)
4451
/// <param name="filename">The filename.</param>
4552
/// <summary>
/// Saves the audio to the specified file.
/// </summary>
/// <param name="filename">The destination filename.</param>
public override void Save(string filename)
{
    // The first file this audio is written to becomes its source file.
    if (string.IsNullOrEmpty(_sourceFile))
    {
        _sourceFile = filename;
    }
    AudioManager.SaveAudio(filename, this);
}
4959

@@ -55,6 +65,9 @@ public override void Save(string filename)
5565
/// <param name="cancellationToken">The cancellation token.</param>
5666
/// <summary>
/// Saves the audio to the specified file asynchronously.
/// </summary>
/// <param name="filename">The destination filename.</param>
/// <param name="cancellationToken">The cancellation token.</param>
public override async Task SaveAsync(string filename, CancellationToken cancellationToken = default)
{
    // The first file this audio is written to becomes its source file.
    if (string.IsNullOrEmpty(_sourceFile))
    {
        _sourceFile = filename;
    }
    // NOTE(review): "SaveAudioAync" is misspelled in the AudioManager API ("Async") — confirm upstream.
    await AudioManager.SaveAudioAync(filename, this, cancellationToken);
}
6073

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
# Supertonic TTS

https://github.com/supertone-inc/supertonic


```csharp
// [model] https://huggingface.co/TensorStack/Supertonic-onnx

var provider = Provider.GetProvider(GraphOptimizationLevel.ORT_ENABLE_ALL);
var modelPath = "M:\\Models\\Supertonic-onnx";
var pipeline = SupertonicPipeline.Create(modelPath, provider);
var options = new SupertonicOptions
{
    TextInput = "On a quiet morning in the old town, a clockmaker named Ellis unlocked his tiny shop",
    VoiceStyle = "Female1"
};

var generateResult = await pipeline.RunAsync(options);
AudioManager.SaveAudio("Output.wav", generateResult);
```
Lines changed: 275 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,304 @@
11
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Runtime.InteropServices;
25
using System.Threading;
36
using System.Threading.Tasks;
7+
using TensorStack.Common;
48
using TensorStack.Common.Pipeline;
59
using TensorStack.Common.Tensor;
610
using TensorStack.TextGeneration.Common;
711

812
namespace TensorStack.TextGeneration.Pipelines.Supertonic
913
{
/// <summary>
/// Supertonic TTS Pipeline.
/// </summary>
public class SupertonicPipeline : IPipeline<AudioTensor, SupertonicOptions, GenerateProgress>
{
    private readonly Random _random;

    /// <summary>
    /// Initializes a new instance of the <see cref="SupertonicPipeline"/> class.
    /// </summary>
    /// <param name="configuration">The configuration.</param>
    public SupertonicPipeline(SupertonicConfig configuration)
    {
        _random = new Random();
        Configuration = configuration;
        Processor = new SupertonicProcessor(configuration.IndexerPath, configuration.VoiceStylePath);
        Prediction = new ModelSession(configuration.PredictorConfig);
        Encoder = new ModelSession(configuration.EncoderConfig);
        Estimator = new ModelSession(configuration.EstimatorConfig);
        Decoder = new ModelSession(configuration.DecoderConfig);
    }

    public SupertonicConfig Configuration { get; init; }
    public SupertonicProcessor Processor { get; init; }
    public ModelSession Prediction { get; init; }
    public ModelSession Encoder { get; init; }
    public ModelSession Estimator { get; init; }
    public ModelSession Decoder { get; init; }

    /// <summary>
    /// Gets the available voice style names.
    /// </summary>
    public IEnumerable<string> VoiceStyles => Processor.VoiceStyles;


    /// <summary>
    /// Loads the pipeline.
    /// </summary>
    /// <param name="cancellationToken">The cancellation token.</param>
    public Task LoadAsync(CancellationToken cancellationToken = default)
    {
        // SupertonicPipeline models are lazy loaded on first run
        return Task.CompletedTask;
    }


    /// <summary>
    /// Unloads the pipeline.
    /// </summary>
    /// <param name="cancellationToken">The cancellation token.</param>
    public async Task UnloadAsync(CancellationToken cancellationToken = default)
    {
        await Task.WhenAll
        (
            Prediction.UnloadAsync(),
            Encoder.UnloadAsync(),
            Estimator.UnloadAsync(),
            Decoder.UnloadAsync()
        );
    }


    /// <summary>
    /// Run as an asynchronous operation.
    /// </summary>
    /// <param name="options">The options.</param>
    /// <param name="progressCallback">The progress callback (currently never reported to).</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    public async Task<AudioTensor> RunAsync(SupertonicOptions options, IProgress<GenerateProgress> progressCallback = null, CancellationToken cancellationToken = default)
    {
        var totalDuration = 0.0f;
        var audioBuffer = new List<float>();
        var voiceStyle = Processor.GetVoiceStyle(options.VoiceStyle);
        var silenceLength = (int)(options.SilenceDuration * Configuration.SampleRate);
        var silenceBuffer = new float[silenceLength];

        // Process each text chunk, joining the generated audio segments with silence.
        foreach (var textIds in Processor.GetTextIds(options.TextInput))
        {
            // FIX: honor cancellation between chunks and flow the token into inference
            // (the original call dropped both).
            cancellationToken.ThrowIfCancellationRequested();
            var result = await RunInferenceAsync(textIds, voiceStyle, options.Steps, options.Speed, cancellationToken);
            if (audioBuffer.Count == 0)
            {
                audioBuffer.AddRange(result.Audio.Memory.Span);
                totalDuration = result.Duration;
            }
            else
            {
                audioBuffer.AddRange(silenceBuffer);
                audioBuffer.AddRange(result.Audio.Memory.Span);
                totalDuration += result.Duration + options.SilenceDuration;
            }
        }

        // Trim (or zero-pad) the concatenated buffer to the predicted total duration.
        var audioSpan = CollectionsMarshal.AsSpan(audioBuffer);
        var audioLength = (int)(Configuration.SampleRate * totalDuration);
        var audioTensor = new Tensor<float>([1, audioLength]);
        audioSpan[..Math.Min(audioLength, audioSpan.Length)].CopyTo(audioTensor.Memory.Span);
        return audioTensor.AsAudioTensor(Configuration.SampleRate);
    }


    /// <summary>
    /// Run inference as an asynchronous operation: duration prediction, text encoding,
    /// iterative vector estimation, then decoding to audio.
    /// </summary>
    /// <param name="textIds">The text ids.</param>
    /// <param name="style">The style.</param>
    /// <param name="totalStep">The total step.</param>
    /// <param name="speed">The speed.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    private async Task<InferenceResult> RunInferenceAsync(Tensor<long> textIds, VoiceStyle style, int totalStep, float speed = 1.05f, CancellationToken cancellationToken = default)
    {
        var predictionResult = await PredictAsync(textIds, style.Dropout, cancellationToken);
        // Higher speed shortens the predicted duration proportionally.
        var duration = predictionResult.Memory.Span[0] / speed;
        var encoderResult = await EncodeAsync(textIds, style.Global, cancellationToken);
        var latents = PrepareLatents(duration);
        for (int step = 0; step < totalStep; step++)
        {
            latents = await EstimateAsync(latents, encoderResult, style.Global, step, totalStep, cancellationToken);
        }
        var decoderResult = await DecodeAsync(latents, cancellationToken);
        return new InferenceResult(decoderResult, duration);
    }


    /// <summary>
    /// Run duration prediction model
    /// </summary>
    /// <param name="textIds">The text ids.</param>
    /// <param name="styleDropout">The style dropout.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    private async Task<Tensor<float>> PredictAsync(Tensor<long> textIds, Tensor<float> styleDropout, CancellationToken cancellationToken = default)
    {
        var metadata = await Prediction.LoadAsync();
        var textMask = new Tensor<float>([1, 1, textIds.Dimensions[1]], 1f);
        using (var parameters = new ModelParameters(metadata, cancellationToken))
        {
            parameters.AddInput(textIds);
            parameters.AddInput(styleDropout);
            parameters.AddInput(textMask);
            parameters.AddOutput([1]);
            using (var result = await Prediction.RunInferenceAsync(parameters))
            {
                return result[0].ToTensor();
            }
        }
    }

    /// <summary>
    /// Run text encoder model
    /// </summary>
    /// <param name="textIds">The text ids.</param>
    /// <param name="styleGlobal">The style global.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    private async Task<Tensor<float>> EncodeAsync(Tensor<long> textIds, Tensor<float> styleGlobal, CancellationToken cancellationToken = default)
    {
        var metadata = await Encoder.LoadAsync();
        var textMask = new Tensor<float>([1, 1, textIds.Dimensions[1]], 1f);
        using (var parameters = new ModelParameters(metadata, cancellationToken))
        {
            parameters.AddInput(textIds);
            parameters.AddInput(styleGlobal);
            parameters.AddInput(textMask);
            parameters.AddOutput([1, Configuration.TextEmbedSize, textIds.Dimensions[1]]);
            using (var result = await Encoder.RunInferenceAsync(parameters))
            {
                return result[0].ToTensor();
            }
        }
    }

    /// <summary>
    /// Run vector estimate model
    /// </summary>
    /// <param name="latents">The latents.</param>
    /// <param name="textEmbeds">The text embeds.</param>
    /// <param name="styleGlobal">The style global.</param>
    /// <param name="step">The step.</param>
    /// <param name="steps">The steps.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    private async Task<Tensor<float>> EstimateAsync(Tensor<float> latents, Tensor<float> textEmbeds, Tensor<float> styleGlobal, int step, int steps, CancellationToken cancellationToken = default)
    {
        var metadata = await Estimator.LoadAsync();
        var textMask = new Tensor<float>([1, 1, textEmbeds.Dimensions[2]], 1f);
        var latentMask = new Tensor<float>([1, 1, latents.Dimensions[2]], 1f);
        using (var parameters = new ModelParameters(metadata, cancellationToken))
        {
            parameters.AddInput(latents);
            parameters.AddInput(textEmbeds);
            parameters.AddInput(styleGlobal);
            parameters.AddInput(latentMask);
            parameters.AddInput(textMask);
            parameters.AddScalarInput(step);
            parameters.AddScalarInput(steps);
            parameters.AddOutput(latents.Dimensions);
            using (var vectorEstResult = await Estimator.RunInferenceAsync(parameters))
            {
                return vectorEstResult[0].ToTensor();
            }
        }
    }


    /// <summary>
    /// Run decoder model
    /// </summary>
    /// <param name="latents">The latents.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    private async Task<Tensor<float>> DecodeAsync(Tensor<float> latents, CancellationToken cancellationToken = default)
    {
        var metadata = await Decoder.LoadAsync();
        // Output sample count scales with latent length by the configured factor.
        var bufferSize = Configuration.ScaleFactor * latents.Dimensions[2];
        using (var parameters = new ModelParameters(metadata, cancellationToken))
        {
            parameters.AddInput(latents);
            parameters.AddOutput([1, bufferSize]);
            using (var result = await Decoder.RunInferenceAsync(parameters))
            {
                return result[0].ToTensor();
            }
        }
    }


    /// <summary>
    /// Prepares random noise latents sized for the predicted audio duration.
    /// </summary>
    /// <param name="duration">The duration in seconds.</param>
    private Tensor<float> PrepareLatents(float duration)
    {
        var audioLength = duration * Configuration.SampleRate;
        var chunkSize = Configuration.BaseChunkSize * Configuration.ChunkCompressFactor;
        // Ceiling division: enough latent frames to cover the requested samples.
        var latentLen = (int)((audioLength + chunkSize - 1) / chunkSize);
        var latentDim = Configuration.LatentDim * Configuration.ChunkCompressFactor;
        var latents = _random.NextTensor([1, latentDim, latentLen]);
        return latents;
    }


    /// <summary>
    /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
    /// </summary>
    public void Dispose()
    {
        Prediction.Dispose();
        Encoder.Dispose();
        Estimator.Dispose();
        Decoder.Dispose();
    }


    /// <summary>
    /// Creates the SupertonicPipeline
    /// </summary>
    /// <param name="modelPath">The model path.</param>
    /// <param name="provider">The provider.</param>
    /// <returns>SupertonicPipeline.</returns>
    public static SupertonicPipeline Create(string modelPath, ExecutionProvider provider)
    {
        var config = new SupertonicConfig
        {
            LatentDim = 24,
            SampleRate = 44100,
            ScaleFactor = 3072,
            BaseChunkSize = 512,
            TextEmbedSize = 256,
            ChunkCompressFactor = 6,
            VoiceStylePath = Path.Combine(modelPath, "voice_styles"),
            IndexerPath = Path.Combine(modelPath, "unicode_indexer.json"),
            PredictorConfig = new ModelConfig
            {
                ExecutionProvider = provider,
                Path = Path.Combine(modelPath, "duration_predictor.onnx")
            },
            EncoderConfig = new ModelConfig
            {
                ExecutionProvider = provider,
                Path = Path.Combine(modelPath, "text_encoder.onnx")
            },
            EstimatorConfig = new ModelConfig
            {
                ExecutionProvider = provider,
                Path = Path.Combine(modelPath, "vector_estimator.onnx")
            },
            DecoderConfig = new ModelConfig
            {
                ExecutionProvider = provider,
                Path = Path.Combine(modelPath, "vocoder.onnx"),
            }
        };
        return new SupertonicPipeline(config);
    }

    // Audio for a single text chunk plus its predicted duration in seconds.
    private record InferenceResult(Tensor<float> Audio, float Duration);
}
35304
}

0 commit comments

Comments
 (0)