---
# LoRA SFT fine-tuning configuration (RAG/MCP dataset).
# NOTE(review): this file was recovered from a pipe-wrapped (markdown-table)
# copy that had flattened all indentation. The nesting below is reconstructed
# from key grouping — in particular, `optimizer` and `batch` are assumed to be
# top-level sections rather than children of `train`; confirm against the
# consuming training script.

auto_config: true
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_lora"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

train:
  epochs: 2
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0

optimizer:
  lr: 2.0e-4
  min_lr: 2.0e-5
  weight_decay: 0.0
  betas: [0.9, 0.95]
  eps: 1.0e-8

batch:
  # global_batch = micro_batch * grad_accum (64 = 8 * 8)
  global_batch: 64
  micro_batch: 8
  grad_accum: 8
  auto_probe_batch: true
  probe_safety: 0.94

dataloader:
  num_workers: 4
  pin_memory: true

hardware:
  precision: "bf16"

lora:
  rank: 16
  alpha: 32
  dropout: 0.05
  target_modules:
    - "attn.c_attn"
    - "attn.c_proj"
    - "mlp.fc"
    - "mlp.proj"

eval_prompts:
  - "Explain what retrieval-augmented generation is and why teams use it."
  - "Describe the core parts of MCP for an engineer who has never used it."
  - "Compare RAG and MCP without confusing their roles."
  - "What are good practices for grounding answers with retrieved context?"
  - "How can an MCP server expose retrieval capabilities to an AI host?"