max_seq_length: 4096
restore_from_path: "/path/to/model"
tensor_model_parallel_size: 4  # intra-layer model parallelism
pipeline_model_parallel_size: 2  # inter-layer model parallelism
resume_from_checkpoint: null  # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
save_nemo_on_validation_end: false  # Saves an inference ready .nemo file every time a checkpoint is saved during training.
sync_batch_comm: false
megatron_amp_O2: true
encoder_seq_length: 4096  # the sequence length of the encoder model, it will be overwritten by loaded GPT model

## Sequence Parallelism
# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
sequence_parallel: false

## Activation Checkpoint
activations_checkpoint_granularity: full  # 'selective' or 'full'
activations_checkpoint_method: uniform  # 'uniform', 'block', not used with 'selective'
# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity
# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
activations_checkpoint_num_layers: 1  # not used with 'selective'
activations_checkpoint_layers_per_pipeline: null  # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
gradient_as_bucket_view: false
seq_len_interpolation_factor: null  # if not None, seq_len_interpolation_factor will match the base model's value
use_flash_attention: true  # if not None, will match the base model's value
hidden_dropout: ${dropout}
attention_dropout: ${dropout}
ffn_dropout: ${dropout}
use_loss_mask: true

## Transformer Engine
transformer_engine: true
fp8: false  # enables fp8 in TransformerLayer forward
fp8_e4m3: false  # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: false  # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0  # scaling margin
fp8_interval: 1  # scaling update interval
fp8_amax_history_len: 1  # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max  # 'most_recent' or 'max'. Algorithm for computing amax from history
reduce_amax: true  # Perform reduction to sync amax tensors across GPUs after every iteration
use_emha: false  # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.

# NOTE(review): nesting below reconstructed from a flattened source; matches the
# standard NeMo PEFT config schema — confirm against megatron_gpt_peft config.
peft:
  peft_scheme: "none"  # ["lora", "none"]
  restore_from_path: null

  lora_tuning:
    target_modules: ['attention_qkv']  # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all'
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: 'xavier'  # IGNORED if linear_adapter is used, options: xavier, zero or normal
    row_init_method: 'zero'  # IGNORED if linear_adapter is used, options: xavier, zero or normal
    layer_selection: null  # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
    weight_tying: false
    position_embedding_strategy: null  # used only when weight_tying is True

optim:
  name: fused_adam  # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
  lr: ${lr}
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 20
    constant_steps: 100
    min_lr: ${min_lr}