| model_config_path: /home/athuser/luxi-files/kotodama/configs/model.yaml | |
| model_config_section: proxy | |
| attn_res: true | |
| attn_res_boundaries: | |
| - 0 | |
| - 3 | |
| - 7 | |
| - 12 | |
| - 21 | |
| - 25 | |
| dataset: pipeline/formatted/tokenized/train | |
| eval_dataset: pipeline/formatted/tokenized/eval | |
| packed: true | |
| max_seq_len: 4096 | |
| batch_size: 4 | |
| gradient_accumulation: 1 | |
| max_steps: -1 | |
| bf16: true | |
| max_grad_norm: 1.0 | |
| muon_momentum: 0.95 | |
| muon_weight_decay: 0.01 | |
| muon_ns_iterations: 5 | |
| muon_ns_coefficients: gram_ns | |
| adamw_betas: | |
| - 0.9 | |
| - 0.95 | |
| adamw_weight_decay: 0.1 | |
| warmup_ratio: 0.05 | |
| wsd_decay_start: 1.0 | |
| wsd_decay_type: sqrt | |
| logging_steps: 10 | |
| grad_analysis_every: 10 | |
| weight_drift_every: 10 | |
| geo_tier1_every: 10 | |
| sample_every: 50 | |
| save_every: 25 | |
| eval_steps: 25 | |
| checkpoint_keep: 3 | |
| async_save: true | |
| checkpoint_compress: true | |
| checkpoint_shm_dir: /dev/shm/luxia-sft-ckpts | |
| wandb_project: kotodama-sft-sweep | |
| wandb_entity: aethera | |
| num_workers: 4 | |
| pretrained_checkpoint: /home/athuser/luxi-files/kotodama/checkpoints/fullcorpus-ddv1/step_00081252.pt.zst | |
| muon_lr: 0.003 | |
| adamw_lr: 0.00030000000000000003 | |
| num_epochs: 2 | |
| wandb_run_name: sweep-fullcorpus-lr3e-03-ep2 | |
| output_dir: outputs/sweep/sweep-fullcorpus-lr3e-03-ep2 | |
| checkpoint_dir: outputs/sweep/sweep-fullcorpus-lr3e-03-ep2/checkpoints | |