Audio generated with TTS is a beep

Hello everybody,
thank you very much for your amazing work.

This message is to ask if someone can help me understand why the 13 test audio clips generated with TTS are still 5-second-long beeps, like this one: https://drive.google.com/file/d/1cmMML6C3Cn5euUrdFtPENVvGMR94zrCd/view?usp=sharing.

I am training TTS on a custom language dataset composed of 500 wavs.

This is what I see in my terminal:

EPOCH: 218/1000

Number of output frames: 1

TRAINING (2020-12-18 10:15:49)

–> STEP: 8/11 – GLOBAL_STEP: 5125
| > decoder_loss: 0.11445 (0.12108)
| > postnet_loss: 0.12307 (0.14004)
| > stopnet_loss: 0.77172 (0.72628)
| > decoder_coarse_loss: 0.24829 (0.24632)
| > decoder_ddc_loss: 0.00494 (0.00659)
| > ga_loss: 0.00320 (0.00599)
| > loss: 0.49394 (0.52002)
| > align_error: 0.67648 (0.60663)
| > avg_spec_length: 909.1
| > avg_text_length: 33.7
| > step_time: 559.1368
| > loader_time: 0.00
| > current_lr: 0.0001

–> TRAIN PERFORMACE – EPOCH TIME: 6744.64 sec – GLOBAL_STEP: 5128
| > avg_decoder_loss: 0.12189
| > avg_postnet_loss: 0.14110
| > avg_stopnet_loss: 0.62447
| > avg_decoder_coarse_loss: 0.25328
| > avg_decoder_ddc_loss: 0.00588
| > avg_ga_loss: 0.00497
| > avg_loss: 0.52712
| > avg_align_error: 0.64611
| > avg_loader_time: 0.00970
| > avg_step_time: 604.00557

EVALUATION

warning: audio amplitude out of range, auto clipped.
| > Synthesizing test sentences
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
| > Decoder stopped with 'max_decoder_steps
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.
warning: audio amplitude out of range, auto clipped.

This is my conf.json

{
"github_branch":"* master",
"restore_path":"path/checkpoint_2500.pth.tar",
"model": "Tacotron2",
"run_name": "Davide",
"run_description": "tacotron2 with ddc and batch-normalization",

// AUDIO PARAMETERS
"audio":{
    // stft parameters
    "fft_size": 1024,        
    "win_length": 1024,     
    "hop_length": 256,      
    "frame_length_ms": null, 
    "frame_shift_ms": null,  

    // Audio processing parameters
    "sample_rate": 44100,   
    "preemphasis": 0.0,     
    "ref_level_db": 20,    

    // Silence trimming
    "do_trim_silence": true,
    "trim_db": 60,          

    // Griffin-Lim
    "power": 1.5,         
    "griffin_lim_iters": 60, 

    // MelSpectrogram parameters
    "num_mels": 80,        
    "mel_fmin": 50.0,        
    "mel_fmax": 8000.0,    
    "spec_gain": 20.0,

    // Normalization parameters
    "signal_norm": true,    
    "min_level_db": -100,   
    "symmetric_norm": true, 
    "max_norm": 4.0,      
    "clip_norm": true,   
    "stats_path": null   
},

// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
 "characters":{
     "pad": "_",
     "eos": "~",
     "bos": "^",
     "characters": "ːʒʧʦʃɲʎʤʣABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
     "punctuations":"!'(),-.:;? ",
     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
 },

// DISTRIBUTED TRAINING
"distributed":{
    "backend": "nccl",
    "url": "tcp:\/\/localhost:54321"
},

"reinit_layers": [],    

// TRAINING
"batch_size": 32,       
"eval_batch_size":16,
"r": 7,                
"gradual_training": [[0, 7, 64], [1, 5, 64], [500, 3, 32], [1300, 2, 32], [2900, 1, 32]], 
"loss_masking": true,        
"ga_alpha": 10.0,      
"apex_amp_level": null, 

// VALIDATION
"run_eval": true,         
"test_delay_epochs": 10,  
"test_sentences_file": "/path/sentences.txt",  

// OPTIMIZER
"noam_schedule": false,       
"grad_clip": 1.0,             
"epochs": 1000,              
"lr": 0.0001,                 
"wd": 0.000001,               
"warmup_steps": 4000,         
"seq_len_norm": true,        

// TACOTRON PRENET
"memory_size": -1,            
"prenet_type": "bn",          
"prenet_dropout": false,       

// TACOTRON ATTENTION
"attention_type": "original",  
"attention_heads": 4,        
"attention_norm": "sigmoid",  
"windowing": false,           
"use_forward_attn": false,    
"forward_attn_mask": false,    
"transition_agent": false,    
"location_attn": true,        
"bidirectional_decoder": false,  
"double_decoder_consistency": true,  
"ddc_r": 7,                           

// STOPNET
"stopnet": true,               
"separate_stopnet": true,      

// TENSORBOARD and LOGGING
"print_step": 25,      
"tb_plot_step": 100,   
"print_eval": false,    
"save_step": 50,     
"checkpoint": true,    
"tb_model_param_stats": false,    

// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false,
"num_loader_workers": 4,       
"num_val_loader_workers": 4,   
"batch_group_size": 0,  
"min_seq_len": 6,      
"max_seq_len": 153,    

// PATHS
"output_path": "path/output_train",

// PHONEMES
"phoneme_cache_path": "/path/",  
"use_phonemes": false,           
"phoneme_language": "en-us",     

// MULTI-SPEAKER and GST
"use_speaker_embedding": false,     
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "path/speakers-Davide.json", 
"use_gst": true,       			  
"gst":	{			               
    "gst_style_input": null,       
    "gst_embedding_dim": 512,
    "gst_num_heads": 4,
    "gst_style_tokens": 10
},

// DATASETS
"datasets":   
    [
        {
            "name": "davide",
            "path": "/path/data/davide",
            "meta_file_train": "metadata_train.csv",
            "meta_file_val": "metadata_val.csv"
        }
    ]

}

Thank you very much for your support

  1. What is the quality of your data?

  2. 500 files is not a lot of data.

  3. You switch too fast between r values (
    "gradual_training": [[0, 7, 64], [1, 5, 64], [500, 3, 32], [1300, 2, 32], [2900, 1, 32]] )

  4. r=1 is hard to train so try without it
    "gradual_training": [[0, 7, 64], [1, 5, 64], [10000, 3, 32], [50000, 2, 32]]

Thank you sanjaesc!!!

  1. Our wavs are 44100 Hz mono, and we have clipped any starting and ending silence.

  2. I understand, but our artistic project does not require an extremely clean, natural voice in the end. We hope to obtain something acceptable.

  3. Thank you! We will switch to your suggested parameters and try again.

Thank you very much for your support.

As training proceeds, we will update you on the results.