File size: 737 Bytes
dd958be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Experiment settings: log location, W&B logging, resume behaviour and
# checkpointing. Every ${...} value is an interpolation whose target is
# defined outside this section.
# NOTE(review): key names look like NeMo exp_manager options - confirm
# against the consumer before renaming or removing anything.
explicit_log_dir: null
exp_dir: ${exp_dir}
name: ${name}

# Weights & Biases logger and the keyword arguments handed to it.
create_wandb_logger: true
wandb_logger_kwargs:
  name: ${name}
  project: ${project}
  entity: ${entity}

# Resume flags.
resume_if_exists: true
resume_ignore_no_checkpoint: true

# Checkpoint callback and its parameters.
create_checkpoint_callback: true
checkpoint_callback_params:
  monitor: val_loss
  always_save_nemo: false
  save_weights_only: true
  save_top_k: 1
  save_last: false
  mode: min
  save_nemo_on_train_end: false
  filename: "{step}"
  # Product of the tensor- and pipeline-parallel sizes taken from `model`.
  # NOTE(review): relies on a custom `multiply` resolver being registered
  # by the consumer - confirm.
  model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  # Keep this false: otherwise multiple last.ckpt files are created,
  # because a restore resets the previously recorded best model.
  save_best_model: false