---
auto_config: true
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
hf_dataset_repo: "ASTERIZER/LUNA-RAG-MCP-SFT-10M"
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/sft_v1/final/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_full_sft"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"
model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12
train:
  epochs: 2
  max_tokens: 0
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0
optimizer:
  lr: 8.0e-6
  min_lr: 8.0e-7
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1.0e-8
batch:
  global_batch: 48
  micro_batch: 4
  grad_accum: 12
dataloader:
  num_workers: 4
  pin_memory: true
hardware:
  precision: "bf16"
  compile: false
eval_prompts:
  - "Explain retrieval-augmented generation in practical engineering terms."
  - "What problem does MCP solve for AI applications?"
  - "Compare RAG and MCP clearly without mixing them together."
  - "How should a model use retrieved context without overclaiming?"
  - "Describe how an MCP server can expose tools or retrieval to a host model."