Hey @erogol + TTS community,
I’m trying to train a custom WaveRNN vocoder for a Tacotron1 model that I’ve already trained on in-house data, and I’m running into issues. Specifically, I cannot generate the spectrograms from the Tacotron model that WaveRNN needs as training data.
Setup
WaveRNN commit: 7c52874
TTS commit: b1657d70
Python: 3.6.8
PyTorch: 1.1.0
Ubuntu: 18.04
CUDA: 9.1
GPU: GeForce GTX 1060
Dataset: in-house, approx 15 hours
Relevant Info
The trained Tacotron model works: I can use the model file (best_model.pth.tar) together with the config file used in training (config.json) to synthesize speech on my GPU.
Problem Description
I can’t generate training data from Tacotron for WaveRNN. Working from a modified version of the notebook in @erogol’s branch of WaveRNN, I get a dimension-mismatch error from PyTorch. Here is the output I see at the terminal:
> Setting up Audio Processor...
| > sample_rate:22050
| > num_mels:80
| > min_level_db:-100
| > frame_shift_ms:12.5
| > frame_length_ms:50
| > ref_level_db:20
| > num_freq:1025
| > power:1.5
| > preemphasis:0.98
| > griffin_lim_iters:60
| > signal_norm:True
| > symmetric_norm:False
| > mel_fmin:0
| > mel_fmax:8000.0
| > max_norm:1.0
| > clip_norm:True
| > do_trim_silence:True
| > n_fft:2048
| > hop_length:275
| > win_length:1102
## LOADING DATASET ##
## LOADING TACOTRON MODEL ##
> Using model: Tacotron
The loaded model was trained for 157358 iterations
Using CUDA
## BEGINNING TRAIN LOOP ##
0%|          | 0/656 [00:00<?, ?it/s]
text_input.shape = torch.Size([32, 6])
text_lengths.shape = torch.Size([32])
mel_input.shape = torch.Size([32, 105, 1025])
Traceback (most recent call last):
  File "extract-tts-spectrogram.py", line 113, in <module>
    mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)
  File "/home/josh/git/TTS/models/tacotron.py", line 51, in forward
    encoder_outputs, mel_specs, mask)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/josh/git/TTS/layers/tacotron.py", line 435, in forward
    output, stop_token, attention = self.decode(inputs, mask)
  File "/home/josh/git/TTS/layers/tacotron.py", line 372, in decode
    processed_memory = self.prenet(self.memory_input)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/josh/git/TTS/layers/common_layers.py", line 79, in forward
    x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/josh/git/TTS/layers/common_layers.py", line 24, in forward
    return self.linear_layer(x)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/modules/linear.py", line 92, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/josh/git/TTS/venv/lib/python3.6/site-packages/torch-1.1.0-py3.6-linux-x86_64.egg/torch/nn/functional.py", line 1406, in linear
    ret = torch.addmm(bias, input, weight.t())
RuntimeError: size mismatch, m1: [32 x 1025], m2: [400 x 256] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:268
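Doing the arithmetic on that error: the prenet’s first Linear maps 400 -> 256 features, and 400 is exactly num_mels × 5 (I’m inferring that r, or memory_size, is 5 in my config; I haven’t confirmed that in the code), while the 1025 features it actually receives equal num_freq, i.e. a linear-spectrogram frame. So it looks like linear-spectrogram frames are reaching the decoder where mel frames are expected. A minimal sanity check of the numbers:

# Sanity check of the size-mismatch arithmetic; r = 5 is an ASSUMPTION
# inferred from 400 / 80, not something I've verified in the code.
num_mels = 80    # from the AudioProcessor printout above
num_freq = 1025  # from the AudioProcessor printout above
r = 5            # assumed: 80 * 5 = 400
assert num_mels * r == 400  # m2: [400 x 256] -> what the prenet expects
assert num_freq == 1025     # m1: [32 x 1025] -> what it actually receives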
The script that I’m actually using (i.e. the modified version of the Jupyter notebook) is here:
import os
import sys
sys.path.append("/home/josh/git/") # TTS_PATH
import torch
import importlib
import numpy as np
from tqdm import tqdm as tqdm
from torch.utils.data import DataLoader
from TTS.models.tacotron import Tacotron
from TTS.datasets.TTSDataset import MyDataset
from TTS.utils.audio import AudioProcessor
from TTS.utils.visual import plot_spectrogram
from TTS.utils.generic_utils import load_config, setup_model
from TTS.datasets.preprocess import ljspeech
from utils.text.symbols import symbols, phonemes
from utils.generic_utils import sequence_mask
from layers.losses import L1LossMasked
import pickle
DATA_PATH = "/home/josh/Downloads/isaac/all/"
METADATA_FILE = "/home/josh/Desktop/results/train.csv"
MODEL_FILE = "/home/josh/Desktop/results/best_model.pth.tar"
CONFIG_PATH = "/home/josh/Desktop/results/config.json"
VOCODER_CONFIG_PATH = "/home/josh/git/WaveRNN/config.json"
OUT_PATH = "/home/josh/Desktop/results/wavernn/"
DRY_RUN = False  # if True, do not generate output files; only compute loss and visuals.
BATCH_SIZE = 32
use_cuda = torch.cuda.is_available()
C = load_config(CONFIG_PATH)
C_vocoder = load_config(VOCODER_CONFIG_PATH)
ap = AudioProcessor(**C.audio)
def set_filename(wav_path, out_path):
    wav_file = os.path.basename(wav_path)
    file_name = wav_file.split('.')[0]
    os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
    os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
    os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_path = os.path.join(out_path, "wav_gl", file_name)
    return file_name, wavq_path, mel_path, wav_path
print("## LOADING DATASET ##")
dataset = MyDataset(C.r,
                    C.text_cleaner,
                    meta_data=ljspeech(root_path=DATA_PATH,
                                       meta_file=METADATA_FILE),
                    ap=ap,
                    use_phonemes=C.use_phonemes,
                    phoneme_cache_path=C.phoneme_cache_path)
loader = DataLoader(dataset,
                    batch_size=BATCH_SIZE,
                    num_workers=4,
                    collate_fn=dataset.collate_fn,
                    shuffle=False,
                    drop_last=False)
print("## LOADING TACOTRON MODEL ##")
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
num_speakers = 0
model = setup_model(num_chars, num_speakers, C)
checkpoint = torch.load(MODEL_FILE)
model.load_state_dict(checkpoint['model'])
print("The loaded model was trained for ", checkpoint['step'], "iterations")
model.eval()
if use_cuda:
    print("Using CUDA")
    model = model.cuda()
print("## BEGINNING TRAIN LOOP ##")
file_idxs = []
losses = []
postnet_losses = []
criterion = L1LossMasked()
for data in tqdm(loader):
    # setup input data
    # print(data)
    text_input = data[0]
    text_lengths = data[1]
    linear_input = data[2]
    mel_input = data[3]
    mel_lengths = data[4]
    stop_targets = data[5]
    item_idx = data[6]
    # dispatch data to GPU
    if use_cuda:
        text_input = text_input.cuda()
        text_lengths = text_lengths.cuda()
        mel_input = mel_input.cuda()
        mel_lengths = mel_lengths.cuda()
        # linear_input = linear_input.cuda()
        stop_targets = stop_targets.cuda()
    mask = sequence_mask(text_lengths)
    # print(model)
    print("text_input.shape = ", text_input.shape)
    print("text_lengths.shape = ", text_lengths.shape)
    print("mel_input.shape = ", mel_input.shape)
    mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)
    # compute mel specs from linear spec if model is Tacotron
    mel_specs = []
    if C.model == "Tacotron":
        postnet_outputs = postnet_outputs.data.cpu().numpy()
        for b in range(postnet_outputs.shape[0]):
            postnet_output = postnet_outputs[b]
            mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T).cuda())
        postnet_outputs = torch.stack(mel_specs)
    loss = criterion(mel_outputs, mel_input, mel_lengths)
    loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)
    losses.append(loss.item())
    postnet_losses.append(loss_postnet.item())
    if not DRY_RUN:
        for idx in range(text_input.shape[0]):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)
            file_idxs.append(file_name)
            # quantize and save wav
            if type(C_vocoder.mode) is int and C_vocoder.mulaw:
                wavq = ap.mulaw_encode(wav, C_vocoder.mode)
                np.save(wavq_path, wavq, allow_pickle=False)
            elif type(C_vocoder.mode) is int:
                wavq = ap.quantize(wav)
                np.save(wavq_path, wavq, allow_pickle=False)
            # save mel spectrograms from the Tacotron model
            mel = postnet_outputs[idx]
            mel = mel.data.cpu().numpy()
            mel_length = mel_lengths[idx]
            mel = mel[:mel_length, :].T
            np.save(mel_path, mel)
if not DRY_RUN:
    pickle.dump(file_idxs, open(OUT_PATH + "/dataset_ids.pkl", "wb"))
print(np.mean(losses))
print(np.mean(postnet_losses))
idx = 1
mel_example = postnet_outputs[idx].data.cpu().numpy()
plot_spectrogram(mel_example[:mel_lengths[idx], :], ap)
print(mel_example[:mel_lengths[1], :].shape)
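One thing I haven’t ruled out is that my hard-coded data[0] … data[6] indices no longer match the order of collate_fn’s output at this TTS commit; the fact that mel_input comes back with 1025 channels (num_freq, not num_mels) makes me suspect the tensors are shifted. A quick sketch of how I could inspect what the loader actually returns (just a debugging idea, not part of the original notebook):

# Debugging sketch (not from the notebook): print the type and shape of
# every element collate_fn returns, to check my hard-coded indices.
batch = next(iter(loader))
for i, item in enumerate(batch):
    print(i, type(item).__name__, getattr(item, 'shape', None))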
Any pointers on how to troubleshoot this are very welcome.
Thanks!
Josh