max_seq_length: 4096
restore_from_path: "/path/to/model"
tensor_model_parallel_size: 4 # intra-layer model parallelism
pipeline_model_parallel_size: 2 # inter-layer model parallelism
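# With the values above, each model replica spans tensor_model_parallel_size * pipeline_model_parallel_size = 4 * 2 = 8 GPUs; any remaining GPUs in the job form the data-parallel dimension.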
resume_from_checkpoint: null # Path to a checkpoint file from which to continue training; restores the full state, including epoch, step, LR schedulers, apex, etc.
save_nemo_on_validation_end: False # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
sync_batch_comm: False
megatron_amp_O2: True
encoder_seq_length: 4096 # sequence length of the encoder model; it will be overwritten by the loaded GPT model
## Sequence Parallelism
# Makes tensor parallelism more memory efficient for LLMs (20B+) by partitioning layer norm and dropout activations along the sequence dimension
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
sequence_parallel: False
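# Example (illustrative): sequence parallelism only takes effect together with tensor
# parallelism (tensor_model_parallel_size > 1), so enabling it for the 4-way TP setup
# above would simply be:
#   sequence_parallel: True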
## Activation Checkpoint
activations_checkpoint_granularity: full # 'selective' or 'full'
activations_checkpoint_method: uniform # 'uniform' or 'block'; not used with 'selective'
# 'uniform' divides the transformer layers into evenly sized chunks and checkpoints the input activation of each chunk at the specified granularity
# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
activations_checkpoint_num_layers: 1 # not used with 'selective'
activations_checkpoint_layers_per_pipeline: null
# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
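# Example (illustrative): 'selective' recomputes only the memory-heavy attention
# activations instead of whole layers; with it, the method and num_layers knobs above
# are ignored, so a selective setup might look like:
#   activations_checkpoint_granularity: selective
#   activations_checkpoint_method: null
#   activations_checkpoint_num_layers: null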
gradient_as_bucket_view: False
seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value
use_flash_attention: True # if not None, will match the base model's value
hidden_dropout: ${dropout}
attention_dropout: ${dropout}
ffn_dropout: ${dropout}
use_loss_mask: True
## Transformer Engine
transformer_engine: True
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration
use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
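# Example (illustrative): FP8 training through Transformer Engine needs GPUs with FP8
# support (e.g. Hopper); a hybrid-format setup might look like (values are examples):
#   fp8: True
#   fp8_hybrid: True              # E4M3 for forward tensors, E5M2 for gradients
#   fp8_amax_history_len: 1024
#   fp8_amax_compute_algo: max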
peft:
  peft_scheme: "none" # ["lora", "none"]
  restore_from_path: null
  lora_tuning:
    target_modules: ['attention_qkv'] # options: 'attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all'
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: 'xavier' # IGNORED if linear_adapter is used; options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used; options: xavier, zero or normal
    layer_selection: null # selects which layers to add LoRA adapters to, e.g. [1,12] adds LoRA to layers 1 (lowest) and 12; null applies adapters to all layers
    weight_tying: False
    position_embedding_strategy: null # used only when weight_tying is True
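# Example (illustrative): switching from full-parameter fine-tuning to LoRA on both the
# attention and MLP projections might look like (dropout value is an example):
#   peft:
#     peft_scheme: "lora"
#     lora_tuning:
#       target_modules: ['all']   # qkv, dense, fc1 and fc2
#       adapter_dim: 32
#       adapter_dropout: 0.05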
optim:
  name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
  lr: ${lr}
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 20
    constant_steps: 100
    min_lr: ${min_lr}
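# Example (illustrative): to shard optimizer state across data-parallel ranks for memory
# savings, swap in the distributed optimizer mentioned in the comment above (requires a
# suitable Apex build):
#   optim:
#     name: distributed_fused_adam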