Please note the benchmarks above: you can actually get 5 RTS on a CPU; most likely the model was just warming up during the first run.
As for the problems listed with the model
no handling of numbers, those are just omitted
there is no text normalization middleware packaged with the models
the model just produces audio from text
it was not included by design
issues with longer sentences: inference just stops (might be related to the warning that a sentence has more than 140 chars) or gets worse toward the end of longer sentences
this is also by design
the model accepts sentences, and it can also work with batches
see these examples:
import torch
import torchaudio
# Example 1: synthesize the text one sentence at a time and write each
# sentence to its own WAV file (test_00.wav, test_01.wav, ...).
language = 'ru'
speaker = 'kseniya_16khz'
device = torch.device('cpu')

# The hub entry point returns the model together with its symbol set, its
# sample rate, a bundled example text and the apply_tts helper.
model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=speaker)
model = model.to(device)  # gpu or cpu

# Replace the bundled example with a longer, multi-sentence text.
example_text = "нав+ерное, существ+уют друг+ие рец+епты, но я их не зн+аю. +или он+и мне не помог+ают. х+очешь моег+о сов+ета - пож+алуйста: сад+ись раб+отать. сл+ава б+огу, так+им л+юдям, как мы с тоб+ой, для раб+оты ничег+о не н+ужно кр+оме бум+аги и карандаш+а."

# Feed the model one sentence per call (a batch of size one).
for i, text in enumerate(example_text.split('. ')):
    audio = apply_tts(texts=[text],
                      model=model,
                      sample_rate=sample_rate,
                      symbols=symbols,
                      device=device)
    # Save with the sample rate reported by the hub loader rather than a
    # hard-coded 16000, so the file header stays correct if `speaker` is
    # switched to a voice with a different rate.
    # unsqueeze(0) adds a leading dimension to audio[0] before saving.
    # NOTE(review): presumably audio[0] is a 1-D waveform and this makes it
    # (channels, samples) as torchaudio.save expects — confirm.
    torchaudio.save(f'test_{i:02d}.wav',
                    audio[0].unsqueeze(0),
                    sample_rate=sample_rate,
                    bits_per_sample=16)
import torch
import torchaudio
# Example 2: split the text into sentences and synthesize all of them with a
# single batched apply_tts call.
device = torch.device('cpu')
speaker = 'kseniya_16khz'
language = 'ru'

# force_reload=True is passed through to torch.hub.load unchanged.
model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=speaker,
    force_reload=True,
)
model = model.to(device)  # gpu or cpu

# Replace the bundled example with a list of sentences, split on '. '.
example_text = "нав+ерное, существ+уют друг+ие рец+епты, но я их не зн+аю. +или он+и мне не помог+ают. х+очешь моег+о сов+ета - пож+алуйста: сад+ись раб+отать. сл+ава б+огу, так+им л+юдям, как мы с тоб+ой, для раб+оты ничег+о не н+ужно кр+оме бум+аги и карандаш+а.".split('. ')
print(example_text)

# The whole list of sentences goes in as one batch.
audio = apply_tts(
    texts=example_text,
    model=model,
    sample_rate=sample_rate,
    symbols=symbols,
    device=device,
)