This is the code I am trying to execute on a second PC (on the first one I didn't have any problems):
import os
import sys
import torch
import time
import IPython
# For some reason TTS installation does not work on Colab, so make the
# local checkout importable instead.
sys.path.append('TTS_repo')
from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
def interpolate_vocoder_input(scale_factor, spec):
    """Interpolation to tolerate the sampling rate difference
    btw tts model and vocoder.

    Args:
        scale_factor: per-axis scale factors for the two axes of ``spec``,
            forwarded unchanged to ``torch.nn.functional.interpolate``.
        spec: 2-D array-like spectrogram; it is converted with
            ``torch.tensor``.

    Returns:
        The bilinearly interpolated spectrogram as a 3-D tensor: two leading
        singleton axes are added for the interpolation call and only the
        first one is squeezed away afterwards.
    """
    print(" > before interpolation :", spec.shape)
    # interpolate(mode='bilinear') needs a 4-D input, so add two leading axes.
    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)
    spec = torch.nn.functional.interpolate(
        spec, scale_factor=scale_factor, mode='bilinear'
    ).squeeze(0)
    print(" > after interpolation :", spec.shape)
    return spec
def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    """Synthesize ``text``, print timing statistics and play the audio inline.

    Besides its arguments, this function reads the module-level names
    ``speaker_id``, ``VOCODER_CONFIG``, ``ap_vocoder``, ``scale_factor`` and
    ``vocoder_model``.

    Args:
        model: TTS model passed to ``synthesis``.
        text: sentence to synthesize.
        CONFIG: TTS config passed to ``synthesis``; its
            ``enable_eos_bos_chars`` attribute is read here.
        use_cuda: forwarded to ``synthesis``; when true, the vocoder output
            is moved to the CPU before conversion to NumPy.
        ap: audio processor of the TTS model (used for ``denormalize`` and
            ``sample_rate``).
        use_gl: when true, keep the waveform returned by ``synthesis``;
            otherwise run ``vocoder_model`` on the predicted mel spectrogram.
        figures: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``mel_postnet_spec`` is the denormalized spectrogram.
    """
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)
    print(mel_postnet_spec.shape)
    mel_postnet_spec = ap.denormalize(mel_postnet_spec.T).T
    if use_gl:
        # NOTE(review): the waveform returned by synthesis() is assumed to be
        # at the TTS audio processor's rate -- confirm against synthesis().
        output_sr = ap.sample_rate
    else:
        # The vocoder generates audio at its own configured sample rate.
        output_sr = VOCODER_CONFIG.audio['sample_rate']
        vocoder_input = ap_vocoder.normalize(mel_postnet_spec.T)
        if scale_factor[1] != 1:
            # The two configs use different sample rates: rescale the
            # spectrogram before handing it to the vocoder.
            vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
        else:
            vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)
        waveform = vocoder_model.inference(vocoder_input)
        if use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    # Real-time factor = processing time / duration of the generated audio,
    # so the duration must use the rate the waveform was generated at.
    rtf = (time.time() - t_1) / (len(waveform) / output_sr)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=output_sr))
    return alignment, mel_postnet_spec, stop_tokens, waveform
# Runtime settings
use_cuda = False

# Model paths
TTS_MODEL = "tts_model.pth.tar"
TTS_CONFIG = "config.json"
VOCODER_MODEL = "vocoder_model.pth.tar"
VOCODER_CONFIG = "config_vocoder.json"

# From here on TTS_CONFIG / VOCODER_CONFIG hold the loaded config objects,
# not the file paths.
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)
VOCODER_CONFIG.audio['stats_path'] = 'scale_stats_vocoder.npy'

# Load the audio processors
ap = AudioProcessor(**TTS_CONFIG.audio)
ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])

# Scale factor for sampling rate difference
scale_factor = [1, VOCODER_CONFIG['audio']['sample_rate'] / ap.sample_rate]
print(f"scale_factor: {scale_factor}")
# LOAD TTS MODEL
# Multi speaker (disabled: no speaker id, empty speaker list)
speaker_id = None
speakers = []

# Build the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# Load model state onto the CPU.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# Restore the weights
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# Set model stepsize when the checkpoint stores one
if 'r' in cp:
    model.decoder.set_r(cp['r'])
from TTS.vocoder.utils.generic_utils import setup_generator

# LOAD VOCODER MODEL
vocoder_model = setup_generator(VOCODER_CONFIG)
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0
if use_cuda:
    vocoder_model.cuda()
vocoder_model.eval()
# Synthesize one sentence with the neural vocoder and save it as a WAV file.
sentence = "rápido corren los carros cargados de fierro del ferrocarril"
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)

# The waveform comes from the vocoder (use_gl=False), so write it at the
# vocoder's configured sample rate instead of a hard-coded value; otherwise
# the saved file plays at the wrong speed whenever the two differ.
sr = VOCODER_CONFIG.audio['sample_rate']
from scipy.io.wavfile import write
write("ejemplo.wav", sr, wav)