name: gemma-7b-sql-nemo
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    filename: megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}
    model_parallel_size: 4
    save_best_model: false
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: true
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  peft:
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
      - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
  data:
    chat: false
    chat_prompt_tokens:
      system_turn_start: "\0"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\n"
      end_of_name: "\n"
    sample: false
    num_workers: 0
    dataloader_type: single
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16
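
# ---------------------------------------------------------------------------
# Usage sketch (comments only; not part of the config, names below are
# assumptions unless stated). With tensor_model_parallel_size=4 and
# pipeline_model_parallel_size=1 on 1 node x 8 GPUs, the data-parallel size
# is 8 / (4 * 1) = 2, so global_batch_size=128 at micro_batch_size=1 implies
# 64 gradient-accumulation steps per optimizer step.
#
# A minimal way to load and sanity-check this file with OmegaConf (the config
# library that Hydra and NeMo build on), assuming it is saved as
# gemma-7b-sql-nemo.yaml:
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("gemma-7b-sql-nemo.yaml")
#   tp = cfg.model.tensor_model_parallel_size
#   pp = cfg.model.pipeline_model_parallel_size
#   dp = (cfg.trainer.num_nodes * cfg.trainer.devices) // (tp * pp)
#   # global batch must divide evenly across data-parallel ranks
#   assert cfg.model.data.train_ds.global_batch_size % dp == 0
#   # checkpoint sharding must match the model's parallelism (TP * PP)
#   assert cfg.exp_manager.checkpoint_callback_params.model_parallel_size == tp * pp
#
# The layout (trainer.sft, model.data.train_ds, ...) matches NeMo-Aligner's
# SFT config, whose Hydra entry point is examples/nlp/gpt/train_gpt_sft.py in
# that repo (script path assumed from its layout at the time of writing):
#
#   python examples/nlp/gpt/train_gpt_sft.py \
#       --config-path=/workspace/configs \
#       --config-name=gemma-7b-sql-nemo
# ---------------------------------------------------------------------------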