max_seq_length: 4096
restore_from_path: "/path/to/model" # FIXME: Change this `restore_from_path` to the path of the model you want to resume from.
tensor_model_parallel_size: 4 # intra-layer model parallelism
pipeline_model_parallel_size: 2 # inter-layer model parallelism
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
sync_batch_comm: False
megatron_amp_O2: True
encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwritten by loaded GPT model
## Sequence Parallelism
# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
sequence_parallel: False

## Activation Checkpoint
activations_checkpoint_granularity: selective # 'selective' or 'full'
activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective'
# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity
# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
activations_checkpoint_num_layers: null # not used with 'selective'
activations_checkpoint_layers_per_pipeline: null
# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
gradient_as_bucket_view: False
seq_len_interpolation_factor: null # if not null, seq_len_interpolation_factor will match the base model's value
use_flash_attention: True # if not null, will match the base model's value
hidden_dropout: ${dropout}
attention_dropout: ${dropout}
ffn_dropout: ${dropout}
use_loss_mask: True
## Transformer Engine
transformer_engine: True
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration
use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
peft:
  peft_scheme: "none" # ["lora", "none"]
  restore_from_path: null
  lora_tuning:
    target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all'
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
    weight_tying: False
    position_embedding_strategy: null # used only when weight_tying is True
optim:
  name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
  lr: ${lr}
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 20
    constant_steps: 100
    min_lr: ${min_lr}