{
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "steps_per_print": 1,
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 1e-4,
      "betas": [0.9, 0.95],
      "weight_decay": 0.05
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0.0,
      "warmup_max_lr": 1e-4,
      "warmup_num_steps": 4000
    }
  },
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": 50000000
  },
  "activation_checkpointing": {
    "partition_activations": true,
    "contiguous_memory_optimization": true,
    "cpu_checkpointing": true,
    "number_checkpoints": 2
  }
}