| explicit_log_dir: null | |
| exp_dir: ${exp_dir} | |
| name: ${name} | |
| create_wandb_logger: True | |
| wandb_logger_kwargs: | |
| name: ${name} | |
| project: ${project} | |
| entity: ${entity} | |
| resume_if_exists: True | |
| resume_ignore_no_checkpoint: True | |
| create_checkpoint_callback: True | |
| checkpoint_callback_params: | |
| monitor: val_loss | |
| always_save_nemo: False | |
| save_weights_only: True | |
| save_top_k: 1 | |
| save_last: False | |
| mode: min | |
| save_nemo_on_train_end: False | |
| filename: "{step}" | |
| model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} | |
| save_best_model: False # Must stay False; otherwise multiple last.ckpt files are created, because restoring resets the previous best model. | |