It trained through 2 complete epochs, but because each epoch took six and a half hours, I had to cancel it: training kept going because the `--epochs 1`
argument was not stopping it after one epoch.
So is there no way to recover a .pb from the checkpoints? Do I need to train again? It is not the end of the world, but I want to know what my options are.
However, I am now getting OOM (out-of-memory) errors when I try to train from scratch with the checkpoints cleared:
I Could not find best validating checkpoint.
I Could not find most recent checkpoint.
I Initializing all variables.
I STARTING Optimization
Epoch 0 | Training | Elapsed Time: 0:00:00 | Steps: 0 | Loss: 0.000000
Traceback (most recent call last):
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1365, in _do_call
return fn(*args)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1350, in _run_fn
target_list, run_metadata)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor of shape [1,19,26,494] and type float
[[{{node tower_0/conv1d/ExpandDims_1}}]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "DeepSpeech.py", line 12, in <module>
ds_train.run_script()
File “/DeepSpeech/training/deepspeech_training/train.py”, line 955, in run_script
absl.app.run(main)
File “/DeepSpeech/venv/lib/python3.6/site-packages/absl/app.py”, line 299, in run
_run_main(main, args)
File “/DeepSpeech/venv/lib/python3.6/site-packages/absl/app.py”, line 250, in _run_main
sys.exit(main(argv))
File “/DeepSpeech/training/deepspeech_training/train.py”, line 927, in main
train()
File “/DeepSpeech/training/deepspeech_training/train.py”, line 595, in train
train_loss, _ = run_set('train', epoch, train_init_op)
File “/DeepSpeech/training/deepspeech_training/train.py”, line 560, in run_set
feed_dict=feed_dict)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 956, in run
run_metadata_ptr)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1180, in _run
feed_dict_tensor, options, run_metadata)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1359, in _do_run
run_metadata)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/client/session.py”, line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor of shape [1,19,26,494] and type float
[[node tower_0/conv1d/ExpandDims_1 (defined at /DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
Original stack trace for ‘tower_0/conv1d/ExpandDims_1’:
File "DeepSpeech.py", line 12, in <module>
ds_train.run_script()
File “/DeepSpeech/training/deepspeech_training/train.py”, line 955, in run_script
absl.app.run(main)
File “/DeepSpeech/venv/lib/python3.6/site-packages/absl/app.py”, line 299, in run
_run_main(main, args)
File “/DeepSpeech/venv/lib/python3.6/site-packages/absl/app.py”, line 250, in _run_main
sys.exit(main(argv))
File “/DeepSpeech/training/deepspeech_training/train.py”, line 927, in main
train()
File “/DeepSpeech/training/deepspeech_training/train.py”, line 473, in train
gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates)
File “/DeepSpeech/training/deepspeech_training/train.py”, line 312, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File “/DeepSpeech/training/deepspeech_training/train.py”, line 239, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File “/DeepSpeech/training/deepspeech_training/train.py”, line 167, in create_model
batch_x = create_overlapping_windows(batch_x)
File “/DeepSpeech/training/deepspeech_training/train.py”, line 69, in create_overlapping_windows
batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding='SAME')
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py”, line 574, in new_func
return func(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py”, line 574, in new_func
return func(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_ops.py”, line 1672, in conv1d
filters = array_ops.expand_dims(filters, 0)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/dispatch.py”, line 180, in wrapper
return target(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py”, line 507, in new_func
return func(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py”, line 265, in expand_dims
return expand_dims_v2(input, axis, name)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/dispatch.py”, line 180, in wrapper
return target(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py”, line 314, in expand_dims_v2
return gen_array_ops.expand_dims(input, axis, name)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_array_ops.py”, line 2465, in expand_dims
“ExpandDims”, input=input, dim=axis, name=name)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py”, line 794, in _apply_op_helper
op_def=op_def)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py”, line 507, in new_func
return func(*args, **kwargs)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py”, line 3357, in create_op
attrs, op_def, compute_device)
File “/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py”, line 3426, in _create_op_internal
op_def=op_def)
File "/DeepSpeech/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()