Training config
Browse files- fullcorpus-ddv1.yaml +61 -0
fullcorpus-ddv1.yaml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Full corpus DD-v1 108M: single-phase shuffled ~170B tokens
|
| 2 |
+
# Purpose: test AttnRes geometry under extreme overtrain (~1600x tokens/param)
|
| 3 |
+
# Fresh start from NCA checkpoint, cosine decay, single epoch
|
| 4 |
+
|
| 5 |
+
# Model
|
| 6 |
+
model_size: proxy
|
| 7 |
+
|
| 8 |
+
# AttnRes DD-v1
|
| 9 |
+
attn_res: true
|
| 10 |
+
attn_res_boundaries: "0,3,7,12,21,25"
|
| 11 |
+
|
| 12 |
+
# Data — assembled binary on NFS (mmap; the read bandwidth training needs is negligible)
|
| 13 |
+
data_path: /models/kotodama-data/assembled/train.bin
|
| 14 |
+
sequence_length: 4096
|
| 15 |
+
micro_batch_size: 16
|
| 16 |
+
|
| 17 |
+
# Training — 170.4B tokens, single epoch (no dedup, all 13 sources)
|
| 18 |
+
total_tokens: 170_400_000_000
|
| 19 |
+
muon_lr: 0.02
|
| 20 |
+
adamw_lr: 0.0006
|
| 21 |
+
# ~81K steps total, warmup ~6% of training
|
| 22 |
+
warmup_steps: 5000
|
| 23 |
+
decay_start_pct: 0.90
|
| 24 |
+
decay_type: cosine
|
| 25 |
+
gradient_clip: 1.0
|
| 26 |
+
|
| 27 |
+
# Muon
|
| 28 |
+
muon_momentum: 0.95
|
| 29 |
+
muon_weight_decay: 0.01
|
| 30 |
+
muon_ns_iterations: 5
|
| 31 |
+
muon_ns_coefficients: gram_ns
|
| 32 |
+
|
| 33 |
+
# NCA resume — co-trained NCA+AttnRes DD-v1 checkpoint (seed-17, 852M tokens)
|
| 34 |
+
resume_nca: checkpoints/nca-attnres-ddv1/step_00006500.pt
|
| 35 |
+
|
| 36 |
+
# Optimizations
|
| 37 |
+
compile: true
|
| 38 |
+
attn_impl: auto
|
| 39 |
+
fp8: true
|
| 40 |
+
use_liger: false
|
| 41 |
+
|
| 42 |
+
# Checkpointing — save every ~5B tokens (~2400 steps), keep all for geometric analysis
|
| 43 |
+
checkpoint_dir: checkpoints/fullcorpus-ddv1
|
| 44 |
+
save_every: 2400
|
| 45 |
+
keep_checkpoints: 80
|
| 46 |
+
async_checkpoint: true
|
| 47 |
+
checkpoint_shm_dir: /dev/shm/luxia-base-ckpts
|
| 48 |
+
|
| 49 |
+
# Geometric monitoring
|
| 50 |
+
geo_monitor: true
|
| 51 |
+
geo_monitor_tier1_every: 75
|
| 52 |
+
geo_monitor_tier2_every: 500
|
| 53 |
+
|
| 54 |
+
# Logging
|
| 55 |
+
log_every: 10
|
| 56 |
+
wandb: true
|
| 57 |
+
wandb_project: kotodama-ddv1-fullcorpus
|
| 58 |
+
wandb_run_name: fullcorpus-ddv1-170B
|
| 59 |
+
|
| 60 |
+
# HF upload — push final checkpoint on completion
|
| 61 |
+
hf_upload_repo: aethera-gp/kotodama-fullcorpus-ddv1
|