max_seq_length: 4096
restore_from_path: "/path/to/model" # FIXME: Change this `restore_from_path` to the path of the model you want to resume from.
tensor_model_parallel_size: 4 # intra-layer model parallelism
pipeline_model_parallel_size: 2 # inter-layer model parallelism
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training.
sync_batch_comm: False
megatron_amp_O2: True
encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwritten by loaded GPT model
## Sequence Parallelism
# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
sequence_parallel: False

## Activation Checkpoint
activations_checkpoint_granularity: selective # 'selective' or 'full'
activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective'
# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity
# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
activations_checkpoint_num_layers: null # not used with 'selective'
activations_checkpoint_layers_per_pipeline: null
# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
gradient_as_bucket_view: False
seq_len_interpolation_factor: null # if not null, seq_len_interpolation_factor will match the base model's value
use_flash_attention: True # if not null, will match the base model's value
hidden_dropout: ${dropout}
attention_dropout: ${dropout}
ffn_dropout: ${dropout}
use_loss_mask: True
## Transformer Engine
transformer_engine: True
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration
use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
peft:
  peft_scheme: "none" # ["lora", "none"]
  restore_from_path: null
  lora_tuning:
    target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all'
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
    layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
    weight_tying: False
    position_embedding_strategy: null # used only when weight_tying is True
optim:
  name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
  lr: ${lr}
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.98
  sched:
    name: CosineAnnealing
    warmup_steps: 20
    constant_steps: 100
    min_lr: ${min_lr}