Hello everybody,
thank you very much for your amazing work.
This message is to ask if someone can help me understand why the 13 test audios generated by TTS are still 5-second-long beeps, like this one: https://drive.google.com/file/d/1cmMML6C3Cn5euUrdFtPENVvGMR94zrCd/view?usp=sharing.
I am training TTS on a custom-language dataset composed of 500 wavs.
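For reference, this is roughly how I sanity-check the wavs before training (a small sketch using the soundfile package; the directory path and the 10-second threshold are my own placeholders, not values from the repo):

import glob
import soundfile as sf

# Placeholder location of my dataset wavs; adjust to the real path.
wav_paths = glob.glob("/path/data/davide/wavs/*.wav")

for path in wav_paths:
    info = sf.info(path)
    duration = info.frames / info.samplerate
    # conf.json expects 44100 Hz, so flag anything that does not match.
    if info.samplerate != 44100:
        print(f"{path}: unexpected sample rate {info.samplerate}")
    # Very long clips tend to be hard for Tacotron-style attention.
    if duration > 10.0:
        print(f"{path}: {duration:.1f} s, quite long for a single training clip")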
This is what I see in my terminal:
EPOCH: 218/1000
Number of output frames: 1
TRAINING (2020-12-18 10:15:49)
--> STEP: 8/11 -- GLOBAL_STEP: 5125
| > decoder_loss: 0.11445 (0.12108)
| > postnet_loss: 0.12307 (0.14004)
| > stopnet_loss: 0.77172 (0.72628)
| > decoder_coarse_loss: 0.24829 (0.24632)
| > decoder_ddc_loss: 0.00494 (0.00659)
| > ga_loss: 0.00320 (0.00599)
| > loss: 0.49394 (0.52002)
| > align_error: 0.67648 (0.60663)
| > avg_spec_length: 909.1
| > avg_text_length: 33.7
| > step_time: 559.1368
| > loader_time: 0.00
| > current_lr: 0.0001
--> TRAIN PERFORMACE -- EPOCH TIME: 6744.64 sec -- GLOBAL_STEP: 5128
| > avg_decoder_loss: 0.12189
| > avg_postnet_loss: 0.14110
| > avg_stopnet_loss: 0.62447
| > avg_decoder_coarse_loss: 0.25328
| > avg_decoder_ddc_loss: 0.00588
| > avg_ga_loss: 0.00497
| > avg_loss: 0.52712
| > avg_align_error: 0.64611
| > avg_loader_time: 0.00970
| > avg_step_time: 604.00557
EVALUATION
warning: audio amplitude out of range, auto clipped.
| > Synthesizing test sentences
| > Decoder stopped with 'max_decoder_steps
[the line above is printed 13 times, once per test sentence]
warning: audio amplitude out of range, auto clipped.
[the warning above is printed 13 times, once per test sentence]
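If I read the evaluation log correctly, the decoder hits max_decoder_steps on every one of the 13 test sentences, which would explain why every sample comes out at the same fixed length instead of stopping on its own. My rough back-of-the-envelope view of that cap (a sketch based on the usual Tacotron relation between decoder steps, the reduction factor r, hop_length and sample_rate; the max_decoder_steps value below is an assumed placeholder, I have not checked what my install actually uses):

# Assumed placeholder; the real limit lives in the Tacotron2 decoder code.
max_decoder_steps = 500

# Values from my conf.json and the log above.
r = 1              # "Number of output frames: 1" at this stage of gradual training
hop_length = 256
sample_rate = 44100

# Each decoder step emits r spectrogram frames, and each frame advances
# hop_length audio samples, so hitting the step limit caps the waveform at:
cap_seconds = max_decoder_steps * r * hop_length / sample_rate
print(f"decoder step limit caps the output at ~{cap_seconds:.1f} s")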
This is my conf.json:
{
"github_branch": "* master",
"restore_path": "path/checkpoint_2500.pth.tar",
"model": "Tacotron2",
"run_name": "Davide",
"run_description": "tacotron2 with ddc and batch-normalization",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_length_ms": null,
"frame_shift_ms": null,
// Audio processing parameters
"sample_rate": 44100,
"preemphasis": 0.0,
"ref_level_db": 20,
// Silence trimming
"do_trim_silence": true,
"trim_db": 60,
// Griffin-Lim
"power": 1.5,
"griffin_lim_iters": 60,
// MelSpectrogram parameters
"num_mels": 80,
"mel_fmin": 50.0,
"mel_fmax": 8000.0,
"spec_gain": 20.0,
// Normalization parameters
"signal_norm": true,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": null
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
"characters":{
"pad": "_",
"eos": "~",
"bos": "^",
"characters": "ːʒʧʦʃɲʎʤʣABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
"punctuations":"!'(),-.:;? ",
"phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
},
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [],
// TRAINING
"batch_size": 32,
"eval_batch_size":16,
"r": 7,
"gradual_training": [[0, 7, 64], [1, 5, 64], [500, 3, 32], [1300, 2, 32], [2900, 1, 32]],
"loss_masking": true,
"ga_alpha": 10.0,
"apex_amp_level": null,
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10,
"test_sentences_file": "/path/sentences.txt",
// OPTIMIZER
"noam_schedule": false,
"grad_clip": 1.0,
"epochs": 1000,
"lr": 0.0001,
"wd": 0.000001,
"warmup_steps": 4000,
"seq_len_norm": true,
// TACOTRON PRENET
"memory_size": -1,
"prenet_type": "bn",
"prenet_dropout": false,
// TACOTRON ATTENTION
"attention_type": "original",
"attention_heads": 4,
"attention_norm": "sigmoid",
"windowing": false,
"use_forward_attn": false,
"forward_attn_mask": false,
"transition_agent": false,
"location_attn": true,
"bidirectional_decoder": false,
"double_decoder_consistency": true,
"ddc_r": 7,
// STOPNET
"stopnet": true,
"separate_stopnet": true,
// TENSORBOARD and LOGGING
"print_step": 25,
"tb_plot_step": 100,
"print_eval": false,
"save_step": 50,
"checkpoint": true,
"tb_model_param_stats": false,
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false,
"num_loader_workers": 4,
"num_val_loader_workers": 4,
"batch_group_size": 0,
"min_seq_len": 6,
"max_seq_len": 153,
// PATHS
"output_path": "path/output_train",
// PHONEMES
"phoneme_cache_path": "/path/",
"use_phonemes": false,
"phoneme_language": "en-us",
// MULTI-SPEAKER and GST
"use_speaker_embedding": false,
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "path/speakers-Davide.json",
"use_gst": true,
"gst": {
"gst_style_input": null,
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10
},
// DATASETS
"datasets":
[
{
"name": "davide",
"path": "/path/data/davide",
"meta_file_train": "metadata_train.csv",
"meta_file_val": "metadata_val.csv"
}
]
}
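Since this is a custom language, one extra thing I checked is whether every character in my transcripts is covered by the "characters" set above, because, as far as I understand, symbols that are not in the set are silently skipped and that can hurt alignment. A minimal sketch, assuming an LJSpeech-style pipe-separated metadata file (my custom "davide" formatter may use a different column layout, so the column index below is an assumption):

import csv

# Character set copied from the "characters" entry of conf.json.
allowed = set("ːʒʧʦʃɲʎʤʣABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ")

unknown = set()
with open("/path/data/davide/metadata_train.csv", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="|"):
        if len(row) < 2:
            continue
        # Assumption: column 1 holds the transcript, as in LJSpeech metadata.
        unknown |= set(row[1]) - allowed

print("characters found in the metadata but missing from the config:", sorted(unknown))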
Thank you very much for your support.