| |
| |
| |
|
|
| { |
| |
| "vocab_file": "./20B_checkpoints/20B_tokenizer.json", |
| "save": "./20B_checkpoints", |
| "load": "./20B_checkpoints", |
|
|
| |
| "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", |
|
|
| |
| |
| "pipe_parallel_size": 4, |
| "model_parallel_size": 2, |
|
|
| |
| "num_layers": 44, |
| "hidden_size": 6144, |
| "num_attention_heads": 64, |
| "seq_length": 2048, |
| "max_position_embeddings": 2048, |
| "norm": "layernorm", |
| "pos_emb": "rotary", |
| "rotary_pct": 0.25, |
| "no_weight_tying": true, |
| "gpt_j_residual": true, |
| "output_layer_parallelism": "column", |
| "scaled_upper_triang_masked_softmax_fusion": true, |
| "bias_gelu_fusion": true, |
| "rope_fusion": false, |
| "layernorm_fusion": false, |
|
|
| |
| "init_method": "small_init", |
| "output_layer_init_method": "wang_init", |
|
|
| |
| "optimizer": { |
| "type": "Adam", |
| "params": { |
| "lr": 0.97e-4, |
| "betas": [0.9, 0.95], |
| "eps": 1.0e-8, |
| } |
| }, |
|
|
| "min_lr": 0.97e-5, |
|
|
| |
| "zero_optimization": { |
| "stage": 1, |
| "allgather_partitions": True, |
| "allgather_bucket_size": 1260000000, |
| "overlap_comm": True, |
| "reduce_scatter": True, |
| "reduce_bucket_size": 1260000000, |
| "contiguous_gradients": True, |
| }, |
|
|
| |
| "train_micro_batch_size_per_gpu": 4, |
| "gradient_accumulation_steps": 32, |
| "data_impl": "mmap", |
| "split": "995,4,1", |
|
|
| |
| "checkpoint_activations": true, |
| "checkpoint_num_layers": 1, |
| "partition_activations": false, |
| "synchronize_each_layer": true, |
|
|
| |
| "gradient_clipping": 1.0, |
| "weight_decay": 0.01, |
| "hidden_dropout": 0, |
| "attention_dropout": 0, |
|
|
| |
| "fp16": { |
| "fp16": true, |
| "enabled": true, |
| "loss_scale": 0, |
| "loss_scale_window": 1000, |
| "initial_scale_power": 12, |
| "hysteresis": 2, |
| "min_loss_scale": 1 |
| }, |
|
|
| |
| "train_iters": 150000, |
| "lr_decay_iters": 150000, |
|
|
| "distributed_backend": "nccl", |
| "lr_decay_style": "cosine", |
| "warmup": 0.01, |
| "checkpoint_factor": 500, |
| "eval_interval": 1000, |
| "eval_iters": 10, |
|
|
| |
| "log_interval": 2, |
| "steps_per_print": 2, |
| "wall_clock_breakdown": false, |
|
|
| |
| "tokenizer_type": "HFTokenizer", |
| "tensorboard-dir": "./tensorboard", |
| "log_dir": "./logs", |
|
|
| } |
|
|