Hey @lissyx @othiele I tried training with the same training parameters but with augmentation and i got the following error
!python3 DeepSpeech.py --train_files /content/voxforge/voxforge-train.csv --test_files /content/voxforge/voxforge-test.csv --dev_files /content/voxforge/voxforge-dev.csv --epochs 15 --dev_batch_size 64 --train_batch_size 64 --test_batch_size 64 --log_dir /content/loggs --export_dir /content/models/ --train_cudnn True --checkpoint_dir /content/checks/ --alphabet_config_path /content/voxforge/alphabet.txt --export_model_name 'sept4,0.5,1,spec,noise1' --summary_dir /content/tensorsumm/ --augment add[p=0.5,stddev=0.5,domain='spectrogram']
I0904 09:25:33.918955 139703338915712 utils.py:141] NumExpr defaulting to 2
threads.
I Could not find best validating checkpoint.
I Could not find most recent checkpoint.
I Initializing all variables.
I STARTING Optimization
Epoch 0 | Training | Elapsed Time: 0:33:28 | Steps: 1313 | Loss: 148.881187 Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 2048, 2048, 1, 518, 64, 2048]
[[{{node tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3}}]]
[[tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3/_69]]
(1) Internal: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 2048, 2048, 1, 518, 64, 2048]
[[{{node tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "DeepSpeech.py", line 12, in <module>
ds_train.run_script()
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 961, in run_script
absl.app.run(main)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 933, in main
train()
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 601, in train
train_loss, _ = run_set('train', epoch, train_init_op)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 566, in run_set
feed_dict=feed_dict)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 2048, 2048, 1, 518, 64, 2048]
[[node tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
[[tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3/_69]]
(1) Internal: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 2048, 2048, 1, 518, 64, 2048]
[[node tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'tower_0/gradients/tower_0/cudnn_lstm/CudnnRNNV3_grad/CudnnRNNBackpropV3':
File "DeepSpeech.py", line 12, in <module>
ds_train.run_script()
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 961, in run_script
absl.app.run(main)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 933, in main
train()
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 479, in train
gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 321, in get_tower_results
gradients = optimizer.compute_gradients(avg_loss)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py", line 512, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_impl.py", line 158, in gradients
unconnected_gradients)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py", line 679, in _GradientsHelper
lambda: grad_fn(op, *out_grads))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py", line 350, in _MaybeCompile
return grad_fn() # Exit early
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py", line 679, in <lambda>
lambda: grad_fn(op, *out_grads))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/cudnn_rnn_grad.py", line 104, in _cudnn_rnn_backwardv3
direction=op.get_attr("direction")) + (None,)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_cudnn_rnn_ops.py", line 749, in cudnn_rnn_backprop_v3
name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
...which was originally created as op 'tower_0/cudnn_lstm/CudnnRNNV3', defined at:
File "DeepSpeech.py", line 12, in <module>
ds_train.run_script()
[elided 4 identical lines from previous traceback]
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 479, in train
gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 312, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 239, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 190, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/usr/local/lib/python3.6/dist-packages/deepspeech_training/train.py", line 128, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 234, in wrapper
return converted_call(f, options, args, kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 439, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 330, in _call_unconverted
return f(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/contrib/cudnn_rnn/python/layers/cudnn_rnn.py", line 440, in call
training)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/contrib/cudnn_rnn/python/layers/cudnn_rnn.py", line 518, in _forward
seed=self._seed)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py", line 1132, in _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_cudnn_rnn_ops.py", line 2051, in cudnn_rnnv3
time_major=time_major, name=name)
Error in atexit._run_exitfuncs:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/util.py", line 262, in _run_finalizers
Process ForkPoolWorker-1:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
File "/usr/lib/python3.6/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 571, in _terminate_pool
cls._help_stuff_finish(inqueue, task_handler, len(pool))
File "/usr/lib/python3.6/multiprocessing/pool.py", line 556, in _help_stuff_finish
inqueue._rlock.acquire()
KeyboardInterrupt
Process ForkPoolWorker-2:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/usr/lib/python3.6/multiprocessing/queues.py", line 335, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
From the response above you can see I ran a training sucessfuly with no --augment and --epochs 10 , I don’t know why i got this error, what can I change?
Thanks in advance!