Mozilla TTS - how to save wav?

Hi,

I’m trying to run Mozilla TTS. I used DDC_TTS_and_MultiBand_MelGAN_TF_Example and it worked exactly like in the example, but it only shows the audio player in my Jupyter notebook. I would like to save the file, or at least play it from my terminal. I tried to do it like this:

def tts(model, text, CONFIG, p):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
                                                                             truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
                                                                             backend='tf')
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    waveform = waveform.numpy()[0, 0]
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    scipy.io.wavfile.write('test.wav', CONFIG.audio['sample_rate'], waveform)
    return alignment, mel_postnet_spec, stop_tokens, waveform

sentence =  "Holly molly, it works!"
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)

But it saved a file that is 1 minute long, and what I hear is a slow-motion voice. How should I do this properly?

Have you tried right-clicking on the audio element? That will let you save the audio you just heard.

Hah, that’s not the point. I want to convert text to speech in real time. I need to play that sound from my Python app, not just from a Jupyter notebook.

I figured out that the order in which the modules are imported is important. Here is my solution:

import os
import torch
import time
import IPython

from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint
from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.utils.synthesis import synthesis

class Speech:
    # model paths
    TTS_MODEL = "tts_model.pkl"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pkl"
    VOCODER_CONFIG = "config_vocoder.json"
    model = None
    ap = None
    vocoder_model = None
    use_cuda = False
    speaker_id = None

    def tts(self, text):
        model=self.model
        CONFIG = self.TTS_CONFIG
        ap = self.ap
        t_1 = time.time()
        # run the TTS model to get the mel spectrogram, alignment and stop tokens
        waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, self.use_cuda, ap, self.speaker_id, style_wav=None,
                                                                                 truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
                                                                                 backend='tf')
        # run the vocoder on the predicted mel spectrogram to get the final waveform
        waveform = self.vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.numpy()[0, 0]
        rtf = (time.time() - t_1) / (len(waveform) / self.ap.sample_rate)
        tps = (time.time() - t_1) / len(waveform)
        print(waveform.shape)
        print(" > Run-time: {}".format(time.time() - t_1))
        print(" > Real-time factor: {}".format(rtf))
        print(" > Time per step: {}".format(tps))
        IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))  
        return alignment, mel_postnet_spec, stop_tokens, waveform, ap.sample_rate

    def init(self):
        # runtime settings
        self.use_cuda = False

        # load configs
        self.TTS_CONFIG = load_config(self.TTS_CONFIG)
        self.VOCODER_CONFIG = load_config(self.VOCODER_CONFIG)

        self.ap = AudioProcessor(**self.TTS_CONFIG.audio)    

        # LOAD TTS MODEL
        # multi speaker 
        self.speaker_id = None
        speakers = []

        # load the model
        num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
        self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)
        self.model.build_inference()
        self.model = load_checkpoint(self.model, self.TTS_MODEL)
        self.model.decoder.set_max_decoder_steps(1000)
        self.initAudio()

    def initAudio(self):
        from TTS.vocoder.tf.utils.generic_utils import setup_generator
        from TTS.vocoder.tf.utils.io import load_checkpoint

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
        self.vocoder_model.build_inference()
        self.vocoder_model = load_checkpoint(self.vocoder_model, self.VOCODER_MODEL)
        self.vocoder_model.inference_padding = 0

        self.ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])     

And this is how I use it:

from Speech import *
import scipy.io.wavfile
from playsound import playsound
import os

speak = Speech()
speak.init()

sentence = "Holly shit, it's working!"
while sentence != '0':
    sentence = input('What should I say? ')
    if sentence == '0':
        break  # don't try to synthesize the exit command itself
    align, spec, stop_tokens, wav, sample_rate = speak.tts(sentence)
    # write the waveform with the sample rate returned by tts(), then play it
    scipy.io.wavfile.write('test.wav', sample_rate, wav)
    playsound('test.wav')

if os.path.exists('test.wav'):
    os.remove('test.wav')
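
If you want to skip the temporary wav file, you can also play the NumPy waveform directly from Python. This is just an untested sketch on my side, and it assumes the third-party sounddevice package is installed:

import sounddevice as sd

# play the float waveform returned by speak.tts() without writing it to disk
sd.play(wav, samplerate=sample_rate)
sd.wait()  # block until playback has finished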

I hope it helps someone.

Calling it directly as you’ve done is one option. The alternative is to run the server and then call its API from your Python app.

This adds a little overhead but has the advantage of decoupling the two processes.
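
For example, once the server is running you can fetch the wav over HTTP. A minimal sketch, assuming the default /api/tts endpoint on port 5002 (check the server README for the exact URL on your version):

import requests

def tts_via_server(text, out_path='test.wav', url='http://localhost:5002/api/tts'):
    # ask the running TTS server to synthesize `text`; the response body is a wav file
    response = requests.get(url, params={'text': text})
    response.raise_for_status()
    with open(out_path, 'wb') as f:
        f.write(response.content)
    return out_path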
