Audio generated with TTS is a beep

Hello everybody,
thank you very much for your amazing work.

This message is to ask if someone can help me understand why the 13 test audio clips generated with TTS are still 5-second-long beeps, like this one: https://drive.google.com/file/d/1cmMML6C3Cn5euUrdFtPENVvGMR94zrCd/view?usp=sharing.

I am training TTS on a custom language dataset composed of 500 wavs.

This is what I see in my terminal:

EPOCH: 218/1000

Number of output frames: 1

TRAINING (2020-12-18 10:15:49)

–> STEP: 8/11 – GLOBAL_STEP: 5125
| > decoder_loss: 0.11445 (0.12108)
| > postnet_loss: 0.12307 (0.14004)
| > stopnet_loss: 0.77172 (0.72628)
| > decoder_coarse_loss: 0.24829 (0.24632)
| > decoder_ddc_loss: 0.00494 (0.00659)
| > ga_loss: 0.00320 (0.00599)
| > loss: 0.49394 (0.52002)
| > align_error: 0.67648 (0.60663)
| > avg_spec_length: 909.1
| > avg_text_length: 33.7
| > step_time: 559.1368
| > loader_time: 0.00
| > current_lr: 0.0001

–> TRAIN PERFORMACE – EPOCH TIME: 6744.64 sec – GLOBAL_STEP: 5128
| > avg_decoder_loss: 0.12189
| > avg_postnet_loss: 0.14110
| > avg_stopnet_loss: 0.62447
| > avg_decoder_coarse_loss: 0.25328
| > avg_decoder_ddc_loss: 0.00588
| > avg_ga_loss: 0.00497
| > avg_loss: 0.52712
| > avg_align_error: 0.64611
| > avg_loader_time: 0.00970
| > avg_step_time: 604.00557

EVALUATION

warning: audio amplitude out of range, auto clipped.
| > Synthesizing test sentences
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.

This is my conf.json

{
"github_branch":"* master",
"restore_path":"path/checkpoint_2500.pth.tar",
"model": "Tacotron2",
"run_name": "Davide",
"run_description": "tacotron2 with ddc and batch-normalization",

// AUDIO PARAMETERS
"audio":{
    // stft parameters
    "fft_size": 1024,        
    "win_length": 1024,     
    "hop_length": 256,      
    "frame_length_ms": null, 
    "frame_shift_ms": null,  

    // Audio processing parameters
    "sample_rate": 44100,   
    "preemphasis": 0.0,     
    "ref_level_db": 20,    

    // Silence trimming
    "do_trim_silence": true,
    "trim_db": 60,          

    // Griffin-Lim
    "power": 1.5,         
    "griffin_lim_iters": 60, 

    // MelSpectrogram parameters
    "num_mels": 80,        
    "mel_fmin": 50.0,        
    "mel_fmax": 8000.0,    
    "spec_gain": 20.0,

    // Normalization parameters
    "signal_norm": true,    
    "min_level_db": -100,   
    "symmetric_norm": true, 
    "max_norm": 4.0,      
    "clip_norm": true,   
    "stats_path": null   
},

// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
 "characters":{
     "pad": "_",
     "eos": "~",
     "bos": "^",
     "characters": "ːʒʧʦʃɲʎʤʣABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
     "punctuations":"!'(),-.:;? ",
     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
 },

// DISTRIBUTED TRAINING
"distributed":{
    "backend": "nccl",
    "url": "tcp:\/\/localhost:54321"
},

"reinit_layers": [],    

// TRAINING
"batch_size": 32,       
"eval_batch_size":16,
"r": 7,                
"gradual_training": [[0, 7, 64], [1, 5, 64], [500, 3, 32], [1300, 2, 32], [2900, 1, 32]], 
"loss_masking": true,        
"ga_alpha": 10.0,      
"apex_amp_level": null, 

// VALIDATION
"run_eval": true,         
"test_delay_epochs": 10,  
"test_sentences_file": "/path/sentences.txt",  

// OPTIMIZER
"noam_schedule": false,       
"grad_clip": 1.0,             
"epochs": 1000,              
"lr": 0.0001,                 
"wd": 0.000001,               
"warmup_steps": 4000,         
"seq_len_norm": true,        

// TACOTRON PRENET
"memory_size": -1,            
"prenet_type": "bn",          
"prenet_dropout": false,       

// TACOTRON ATTENTION
"attention_type": "original",  
"attention_heads": 4,        
"attention_norm": "sigmoid",  
"windowing": false,           
"use_forward_attn": false,    
"forward_attn_mask": false,    
"transition_agent": false,    
"location_attn": true,        
"bidirectional_decoder": false,  
"double_decoder_consistency": true,  
"ddc_r": 7,                           

// STOPNET
"stopnet": true,               
"separate_stopnet": true,      

// TENSORBOARD and LOGGING
"print_step": 25,      
"tb_plot_step": 100,   
"print_eval": false,    
"save_step": 50,     
"checkpoint": true,    
"tb_model_param_stats": false,    

// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false,
"num_loader_workers": 4,       
"num_val_loader_workers": 4,   
"batch_group_size": 0,  
"min_seq_len": 6,      
"max_seq_len": 153,    

// PATHS
"output_path": "path/output_train",

// PHONEMES
"phoneme_cache_path": "/path/",  
"use_phonemes": false,           
"phoneme_language": "en-us",     

// MULTI-SPEAKER and GST
"use_speaker_embedding": false,     
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "path/speakers-Davide.json", 
"use_gst": true,       			  
"gst":	{			               
    "gst_style_input": null,       
    "gst_embedding_dim": 512,
    "gst_num_heads": 4,
    "gst_style_tokens": 10
},

// DATASETS
"datasets":   
    [
        {
            "name": "davide",
            "path": "/path/data/davide",
            "meta_file_train": "metadata_train.csv",
            "meta_file_val": "metadata_val.csv"
        }
    ]

}

Thank you very much for your support

  1. What is the quality of your data?

  2. 500 files is not a lot of data.

  3. You switch too fast between r values (
    "gradual_training": [[0, 7, 64], [1, 5, 64], [500, 3, 32], [1300, 2, 32], [2900, 1, 32]] )

  4. r=1 is hard to train so try without it
    "gradual_training": [[0, 7, 64], [1, 5, 64], [10000, 3, 32], [50000, 2, 32]]

Thank you sanjaesc!!!

  1. Our wavs are 44100 Hz mono, and we have clipped any starting and ending silence.

  2. I understand, but our artistic project does not require an extremely clean, natural voice in the end. We hope to obtain something acceptable.

  3. Thank you! We will switch to your suggested parameters and try again.

Thank you very much for your support.

As training proceeds, we will update you on the results.