#Ubuntu 20.04
#graphics card RTX3090
#tf-gpu, CUDA and cuDNN as installed by the Dockerfile provided in the official documentation.
run python3 DeepSpeech.py
--train_files /root/de/clips/train.csv
--test_files /root/de/clips/test.csv
--dev_files /root/de/clips/dev.csv
--export_dir /root/DeepSpeech/bin/try
--train_batch_size=24
--dev_batch_size=24
--test_batch_size=24
--epochs=33
--dropout_rate=0.25
--learning_rate=0.0001
in a Docker container built from the image tensorflow/tensorflow:1.15.4-gpu-py3.
Problem: It always gets stuck before the training starts. Has anyone run into this problem and solved it? Thanks a lot.
The following is the specific error message:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(1896, 494), b.shape=(494, 2048), m=1896, n=2048, k=494
[[{{node tower_0/MatMul}}]]
[[concat/concat/_99]]
(1) Internal: Blas GEMM launch failed : a.shape=(1896, 494), b.shape=(494, 2048), m=1896, n=2048, k=494
[[{{node tower_0/MatMul}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "DeepSpeech.py", line 12, in <module>
    ds_train.run_script()
  File "/root/DeepSpeech/deepspeech_training/train.py", line 982, in run_script
    absl.app.run(main)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 300, in run
    _run_main(main, args)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "/root/DeepSpeech/deepspeech_training/train.py", line 954, in main
    train()
  File "/root/DeepSpeech/deepspeech_training/train.py", line 607, in train
    train_loss, _ = run_set('train', epoch, train_init_op)
  File "/root/DeepSpeech/deepspeech_training/train.py", line 572, in run_set
    feed_dict=feed_dict)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(1896, 494), b.shape=(494, 2048), m=1896, n=2048, k=494
[[node tower_0/MatMul (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
[[concat/concat/_99]]
(1) Internal: Blas GEMM launch failed : a.shape=(1896, 494), b.shape=(494, 2048), m=1896, n=2048, k=494
[[node tower_0/MatMul (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'tower_0/MatMul':
  File "DeepSpeech.py", line 12, in <module>
    ds_train.run_script()
  File "/root/DeepSpeech/deepspeech_training/train.py", line 982, in run_script
    absl.app.run(main)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 300, in run
    _run_main(main, args)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "/root/DeepSpeech/deepspeech_training/train.py", line 954, in main
    train()
  File "/root/DeepSpeech/deepspeech_training/train.py", line 484, in train
    gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates)
  File "/root/DeepSpeech/deepspeech_training/train.py", line 317, in get_tower_results
    avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
  File "/root/DeepSpeech/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
    logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
  File "/root/DeepSpeech/deepspeech_training/train.py", line 185, in create_model
    layers['layer_1'] = layer_1 = dense('layer_1', batch_x, Config.n_hidden_1, dropout_rate=dropout[0], layer_norm=FLAGS.layer_norm)
  File "/root/DeepSpeech/deepspeech_training/train.py", line 83, in dense
    output = tf.nn.bias_add(tf.matmul(x, weights), bias)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_ops.py", line 2754, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_math_ops.py", line 6136, in mat_mul
    name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()