name: gemma-7b-sql-nemo
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    filename: megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}
    model_parallel_size: 4
    save_best_model: false
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: true
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  peft:
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
      - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
  data:
    chat: false
    chat_prompt_tokens:
      system_turn_start: "\0"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\n"
      end_of_name: "\n"
    sample: false
    num_workers: 0
    dataloader_type: single
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16
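
# ---------------------------------------------------------------------------
# Usage sketch (comments only; not part of the config, names below are
# assumptions unless stated). With tensor_model_parallel_size=4 and
# pipeline_model_parallel_size=1 on 1 node x 8 GPUs, the data-parallel size
# is 8 / (4 * 1) = 2, so global_batch_size=128 at micro_batch_size=1 implies
# 64 gradient-accumulation steps per optimizer step.
#
# A minimal way to load and sanity-check this file with OmegaConf (the config
# library that Hydra and NeMo build on), assuming it is saved as
# gemma-7b-sql-nemo.yaml:
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("gemma-7b-sql-nemo.yaml")
#   tp = cfg.model.tensor_model_parallel_size
#   pp = cfg.model.pipeline_model_parallel_size
#   dp = (cfg.trainer.num_nodes * cfg.trainer.devices) // (tp * pp)
#   # global batch must divide evenly across data-parallel ranks
#   assert cfg.model.data.train_ds.global_batch_size % dp == 0
#   # checkpoint sharding must match the model's parallelism (TP * PP)
#   assert cfg.exp_manager.checkpoint_callback_params.model_parallel_size == tp * pp
#
# The layout (trainer.sft, model.data.train_ds, ...) matches NeMo-Aligner's
# SFT config, whose Hydra entry point is examples/nlp/gpt/train_gpt_sft.py in
# that repo (script path assumed from its layout at the time of writing):
#
#   python examples/nlp/gpt/train_gpt_sft.py \
#       --config-path=/workspace/configs \
#       --config-name=gemma-7b-sql-nemo
# ---------------------------------------------------------------------------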