# @package _global_
# The header above places this file's keys at the config root. The absolute
# interpolations used below (${trainer.devices}, ${datamodule.batch_size},
# ${train.global_batch_size}, ${trainer.max_steps}) address keys from the root,
# so the sections in this file must live there for them to resolve.

# Hydra defaults list: each "override /<group>: <option>" entry replaces the
# option selected for that config group. The sections further down are merged
# on top of whatever those selected configs provide.
defaults:
  - override /trainer: default
  - override /model: null
  - override /datamodule: openwebtext
  - override /optimizer: adamw-apex
  - override /scheduler: linear-warmup
  - override /callbacks: [default, norm-monitor]
  - override /metrics: [perplexity, num-tokens]
  - override /logger: wandb

task:
  _target_: src.tasks.seq.SequenceLMModel

seed: 1111

trainer:
  accelerator: gpu
  devices: 8
  num_nodes: 1
  # Derived so that devices * datamodule.batch_size * num_nodes * accumulation
  # reaches train.global_batch_size. With the values in this file that is
  # 512 / (8 * 16 * 1) = 4.
  # `div_up` and `eval` are custom resolvers that are not defined in this file;
  # presumably `div_up` is a ceiling division - confirm where it is registered.
  accumulate_grad_batches: ${div_up:${train.global_batch_size}, ${eval:${trainer.devices} * ${datamodule.batch_size} * ${trainer.num_nodes}}}
  max_steps: 400000
  # Scaled by the sibling accumulate_grad_batches (relative interpolation).
  # NOTE(review): presumably so validation runs every 1000 optimizer steps
  # rather than every 1000 micro-batches - confirm against the trainer in use.
  val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}}
  check_val_every_n_epoch: null
  precision: 16
  gradient_clip_val: 1.0
  strategy: null

datamodule:
  # Multiplied by trainer.devices and trainer.num_nodes in
  # trainer.accumulate_grad_batches, i.e. this is the per-device batch size.
  batch_size: 16
  # Relative interpolation: same value as the sibling batch_size.
  batch_size_eval: ${.batch_size}
  max_length: 1024
  fault_tolerant: True
  # True when more than one device is configured.
  # NOTE(review): trainer.num_nodes is not consulted, so a multi-node run with
  # one device per node evaluates to False - confirm this is intended.
  ddp: ${eval:"${trainer.devices} > 1"}

# NOTE(review): the nesting of optimizer / optimizer_param_grouping / scheduler /
# loss_fn under `train` was reconstructed from the layout of the file - confirm
# against the packages of the selected optimizer and scheduler configs.
train:
  # Runs `nvidia-smi` for GPU index 0 through a shell when this value is
  # resolved, divides the reported memory.total figure by 1000 and rounds it.
  # Resolution fails on a machine where that command is unavailable.
  gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"}
  global_batch_size: 512
  optimizer:
    lr: 6e-4
    weight_decay: 0.1
  optimizer_param_grouping:
    bias_weight_decay: False
    normalization_weight_decay: False
  scheduler:
    # 1% of trainer.max_steps (0.01 * 400000 = 4000 with the value above).
    # NOTE(review): the product involves a float literal, so `eval` may yield
    # 4000.0 rather than 4000 - confirm the scheduler accepts a float.
    num_warmup_steps: ${eval:0.01 * ${trainer.max_steps}}
    num_training_steps: ${trainer.max_steps}
  loss_fn:
    _target_: flash_attn.losses.cross_entropy.CrossEntropyLoss
    inplace_backward: True

eval:
  log_on_step: True

callbacks:
  # Checkpoint selected on val/loss (lower is better): keeps the 3 best plus
  # the last one, triggered every 1000 training steps.
  model_checkpoint:
    monitor: val/loss
    mode: min
    save_top_k: 3
    save_last: True
    every_n_train_steps: 1000
    dirpath: ${work_dir}/checkpoints/${oc.select:name,''}
    filename: step_{step}
    auto_insert_metric_name: False
  # Second checkpoint callback, triggered every 50000 training steps and
  # written to the same directory as model_checkpoint (relative interpolation).
  # NOTE(review): save_top_k: -1 presumably means "keep every checkpoint", as
  # in Lightning's ModelCheckpoint - confirm for ModelCheckpointMine.
  model_checkpoint_progress:
    _target_: src.callbacks.model_checkpoint.ModelCheckpointMine
    fault_tolerant: True
    every_n_train_steps: 50000
    save_last: False
    save_top_k: -1
    dirpath: ${..model_checkpoint.dirpath}
    filename: progress_step_{step}
    auto_insert_metric_name: False
  early_stopping: null
|
|