I’m trying to integrate Optuna with DeepSpeech in order to optimise some of its hyperparameters. I’m sticking to learning rate for now, just to get a feel for how Optuna works, but I’ve hit a roadblock and need some help.
I have a function, hps_train, which performs the training step. It takes the Optuna trial object as its argument and returns the dev loss, which is what I want Optuna to minimise. It is the same function as train() in training/deepspeech_training/train.py, but with a few modifications:
def hps_train(trial):
    """Run one DeepSpeech training pass for an Optuna trial and return its dev loss.

    This mirrors ``train()`` from ``training/deepspeech_training/train.py``; the
    sections marked "Same as train()" are elided and unchanged from it.

    Args:
        trial: Optuna trial used to sample the Adam learning rate.

    Returns:
        The last entry of ``dev_losses`` (presumably the validation loss of the
        final epoch; ``dev_losses`` is filled in an elided section).
    """
    #.
    #.Same as train()
    #.
    # Always build the optimizer from the trial. The previous Horovod branch
    # passed a tensor where a trial is expected and bound the returned
    # (optimizer, variable) tuple to ``optimizer``.
    optimizer, learning_rate_var = hps_create_optimizer(trial)
    if FLAGS.horovod:
        # Effective batch size in synchronous distributed training is scaled by
        # the number of workers. An increase in learning rate compensates for
        # the increased batch size.
        # Rebuilt with the same Adam settings as hps_create_optimizer, but fed
        # the scaled learning rate.
        optimizer = tfv1.train.AdamOptimizer(
            learning_rate=learning_rate_var * hvd.size(),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
        )
        optimizer = hvd.DistributedOptimizer(optimizer)
    reduce_learning_rate_op = learning_rate_var.assign(
        tf.multiply(learning_rate_var, FLAGS.plateau_reduction)
    )
    #.
    #.Same as train()
    #.
    with tfv1.Session(config=Config.session_config) as session:
        #.
        #.Same as train()
        #.
        final_dev_loss = dev_losses[-1]
    log_debug("Session closed.")
    return final_dev_loss
I also have some helper functions:
def hps_create_optimizer(trial):
    """Build an Adam optimizer whose learning rate is suggested by ``trial``.

    Args:
        trial: Optuna trial; the ``adam_lr`` parameter is requested in the
            range [1e-5, 1e-1] with ``log=True``.

    Returns:
        A ``(optimizer, learning_rate_var)`` tuple: the Adam optimizer and the
        non-trainable variable that holds its learning rate.
    """
    sampled_lr = trial.suggest_float("adam_lr", 1e-5, 1e-1, log=True)

    # The learning rate lives in a graph variable (not a Python float) so the
    # caller can build ops on it, e.g. an assign op that lowers it.
    lr_scope = tf.variable_scope("learning_rate", reuse=tf.AUTO_REUSE)
    with lr_scope:
        learning_rate_var = tfv1.get_variable(
            "learning_rate",
            initializer=sampled_lr,
            trainable=False,
        )

    adam_settings = {"beta1": 0.9, "beta2": 0.999, "epsilon": 1e-08}
    adam = tfv1.train.AdamOptimizer(learning_rate=learning_rate_var, **adam_settings)
    return adam, learning_rate_var
def new_trial_callback(study, trial):
    """Point the checkpoint flags at the directory for the next trial.

    Passed to ``study.optimize`` via ``callbacks``; it prepares the directory
    for trial ``trial.number + 1``.

    Args:
        study: The Optuna study; its name is passed to ``setup_dirs``.
        trial: The trial the callback is invoked for.
    """
    upcoming_trial = trial.number + 1
    next_dir = setup_dirs(study.study_name, upcoming_trial)
    # Generic, save and load directories all share the same per-trial path.
    FLAGS.checkpoint_dir = FLAGS.save_checkpoint_dir = FLAGS.load_checkpoint_dir = next_dir
def objective(trial, session=None):
    """Optuna objective: run one training pass and return its dev loss.

    Args:
        trial: Optuna trial forwarded to ``hps_train``.
        session: Unused. Accepted only so existing two-argument calls keep
            working; ``hps_train`` opens its own session.

    Returns:
        The dev loss as a ``float``, or ``None`` when ``FLAGS.train_files`` is
        not set (no training is run in that case).
    """
    if not FLAGS.train_files:
        return None
    # hps_train takes only the trial; passing a session raised a TypeError.
    return float(hps_train(trial))


def objective_tf(trial):
    """Run ``objective`` for ``trial`` inside a fresh TensorFlow graph.

    Args:
        trial: Optuna trial supplied by ``study.optimize``.

    Returns:
        Whatever ``objective`` returns for this trial.
    """
    # Imported here so this function carries its own dependency.
    from deepspeech_training.train import rnn_impl_cudnn_rnn

    # NOTE(review): DeepSpeech's rnn_impl_cudnn_rnn appears to cache its
    # CudnnLSTM layer on the function object (``.cell``) and reuse it on later
    # calls. A layer built in the previous trial's graph would then be applied
    # to tensors of the new graph, which matches the "must be from the same
    # graph" ValueError on cudnn_lstm/opaque_kernel. Clearing the cache forces
    # a new layer per trial. Confirm against the installed train.py.
    rnn_impl_cudnn_rnn.cell = None

    tfv1.reset_default_graph()
    with tfv1.Graph().as_default():
        # The original passed an undefined ``session`` here (NameError).
        return objective(trial)
Putting it all together:
def main(_):
    """Create the learning-rate study and run 25 Optuna trials.

    Args:
        _: Positional argument supplied by the caller; ignored.
    """
    initialize_globals()
    early_training_checks()

    study = optuna.create_study(study_name="lr_study", direction='minimize')

    # Trial 0 gets its checkpoint directory here; new_trial_callback sets the
    # directory for each following trial.
    first_trial_dir = setup_dirs(study.study_name, 0)
    for flag_name in ("checkpoint_dir", "save_checkpoint_dir", "load_checkpoint_dir"):
        setattr(FLAGS, flag_name, first_trial_dir)

    study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
When I run this code, the first run completes normally. However, when it tries to start the second one, I get an error:
$ python training/hparam_search.py --train_files ~/datasets/cv-corpus-1/en/clips/train.csv --dev_files ~/datasets/cv-corpus-1/en/clips/dev.csv --test_files ~/datasets/cv-corpus-1/en/clips/test.csv --train_batch_size 64 --test_batch_size 64 --dev_batch_size 64 --n_hidden 512 --epochs 1 --train_cudnn --use_allow_growth --checkpoint_dir checkpoints
[I 2021-08-30 15:06:16,637] A new study created in memory with name: lr_study
I Could not find best validating checkpoint.
I Could not find most recent checkpoint.
I Initializing all variables.
I STARTING Optimization
Epoch 0 | Training | Elapsed Time: 0:00:17 | Steps: 187 | Loss: 252.374135
Epoch 0 | Validation | Elapsed Time: 0:00:12 | Steps: 109 | Loss: 255.176724 | Dataset: /home/user/datasets/cv-corpus-1/en/clips/dev.csv
I Saved new best validating model with loss 255.176724 to: checkpoints/optuna_trials/lr_study/0/best_dev-187
--------------------------------------------------------------------------------
I FINISHED optimization in 0:00:30.553797
[I 2021-08-30 15:06:50,101] Trial 0 finished with value: 255.1767243551552 and parameters: {'adam_lr': 0.006636434104761772}. Best is trial 0 with value: 255.1767243551552.
[W 2021-08-30 15:06:50,229] Trial 1 failed because of the following error: ValueError('in converted code:\n relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:\n\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call\n training)\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward\n seed=self._seed)\n contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn\n outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)\n python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3\n time_major=time_major, name=name)\n python/framework/op_def_library.py:367 _apply_op_helper\n g = ops._get_graph_from_inputs(_Flatten(keywords.values()))\n python/framework/ops.py:5979 _get_graph_from_inputs\n _assert_same_graph(original_graph_element, graph_element)\n python/framework/ops.py:5914 _assert_same_graph\n (item, original_item))\n\n ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).\n',)
Traceback (most recent call last):
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
Traceback (most recent call last):
File "training/hparam_search.py", line 691, in <module>
absl.app.run(main)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "training/hparam_search.py", line 684, in main
lr_study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/study.py", line 409, in optimize
show_progress_bar=show_progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 76, in _optimize
progress_bar=progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
trial = _run_trial(study, func, catch)
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 264, in _run_trial
raise func_err
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
It looks like the ValueError is complaining that some tensor is not from the same graph as another. But I don’t understand how this can be, since I start each run within a new Graph context, so every tensor should be associated with this new graph.
I’d be grateful for any insights into where I’m going wrong here, or on whether this is even the recommended way to use Optuna. Thanks very much!