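# Page text captured along with this file from a web file viewer; it is not
# part of the configuration and is kept here only as comments:
#   koichi12's picture
#   Add files using upload-large-folder tool
#   dd958be verified
#   raw
#   history blame
#   737 Bytes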
# Experiment-manager settings: log location, W&B logging, resume behaviour and
# checkpointing. The key names match NVIDIA NeMo's `exp_manager` options --
# presumably that is the consumer; confirm against the code that loads this.
#
# `${...}` values are OmegaConf-style interpolations resolved from the composed
# config. NOTE(review): `${exp_dir}` and `${name}` are absolute references, so
# this mapping is presumably mounted under a sub-key of a root config that
# defines `exp_dir`, `name`, `project` and `entity` -- loaded as the root, they
# would refer to themselves. Verify against the composing config.
explicit_log_dir: null
exp_dir: ${exp_dir}
name: ${name}

create_wandb_logger: true
# The indentation below is significant: these keys belong to
# `wandb_logger_kwargs`. At column 0, `name` would duplicate the top-level
# `name` and `wandb_logger_kwargs` itself would parse as null.
wandb_logger_kwargs:
  name: ${name}
  project: ${project}
  entity: ${entity}

resume_if_exists: true
resume_ignore_no_checkpoint: true

create_checkpoint_callback: true
# Options for the checkpoint callback; every key down to `save_best_model` is
# nested under this mapping.
checkpoint_callback_params:
  # Metric watched by the callback, and the direction that counts as better.
  monitor: val_loss
  always_save_nemo: false
  save_weights_only: true
  save_top_k: 1
  save_last: false
  mode: min
  save_nemo_on_train_end: false
  # Quoted: a plain scalar starting with `{` would be read as a flow mapping.
  filename: '{step}'
  # `multiply` is a custom resolver; it must be registered by the consumer
  # before this value is resolved. Yields tensor-parallel * pipeline-parallel.
  model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  # Keep this false: otherwise multiple last.ckpt files are created, because
  # restoring a run resets the previously recorded best model.
  save_best_model: false