batch_size: 64
decoder_assistance_role: reflect
decoder_base_conv_format: default
decoder_base_conv_format_kwargs: null
dropout: 0.0
eval_batch_size: 64
eval_data:
  heldout:
  - single_token_mc
  - ../../llama31_8b_data/eval_synthsys/heldout.pkl
  non_heldout:
  - single_token_mc
  - ../../llama31_8b_data/eval_synthsys/non_heldout.pkl
eval_interval: 100
eval_name_mapping:
  test_templ_mc: test_mc
  train_templ_mc: train_mc
eval_num_datapoints: 6400
eval_num_steps: null
eval_patch_regions:
- user
fsdp_config:
  fsdp_activation_checkpointing: true
  fsdp_cpu_offload: false
  replica_group_size: 0
  sharding_group_size: 0
  sharding_strategy: 1
gradient_accumulation_steps: null
gradient_clipping: false
gradient_clipping_threshold: 1.0
hf_model_id: meta-llama/Llama-3.1-8B-Instruct
layer_mapping:
  '0': 15
layer_to_read: 21
layer_to_write: 1
log_interval: 100
lora_config:
  lora_alpha: 32
  lora_bias: none
  lora_dropout: 0.05
  lora_r: 16
  lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj
  - lm_head
  lora_task_type: CAUSAL_LM
lr: 0.0001
micro_batch_size: 8
min_lr_ratio: 0.1
new_context_tokens: null
num_steps: 5000
pretrain_data_config: null
read_layer_module_keys:
- layer: 15
  module: residual
save_final_checkpoint: true
save_interval: 500
save_path: latentqa/llama31_8b_experiments/15_to_0
seed: 7236
train_data_path: ../../llama31_8b_data/train.pkl
train_patch_regions:
- user
use_fsdp: false
use_peft: true
use_wandb: true
valid_data_path: null
wandb_group_name: llama31_8b_experiments
wandb_project: latentqa
wandb_run_name: 15_to_0
warmup_steps: 0
weight_decay: 0.01
write_layer_module_keys:
- layer: 0
  module: residual
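# ---------------------------------------------------------------------------
# This config drives the 15_to_0 run: LoRA fine-tuning of
# meta-llama/Llama-3.1-8B-Instruct that reads the residual stream at layer 15
# (read_layer_module_keys) and writes it into layer 0 (write_layer_module_keys).
#
# A minimal loading sketch, assuming the file is plain YAML parsed with
# PyYAML; the filename "15_to_0.yaml" is an assumption for illustration and
# the repo's actual training entry point may differ:
#
#   import yaml
#
#   with open("15_to_0.yaml") as f:
#       cfg = yaml.safe_load(f)  # nested sections come back as dicts/lists
#
#   assert cfg["lora_config"]["lora_r"] == 16
#   assert cfg["read_layer_module_keys"][0] == {"layer": 15, "module": "residual"}
# ---------------------------------------------------------------------------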