---
# SFT (supervised fine-tuning) configuration for LUNA-100M on the
# RAG/MCP instruction dataset.
#
# NOTE(review): the original file had all structure collapsed onto one line;
# the nesting below (optimizer/batch/dataloader/hardware as top-level stanzas
# rather than children of `train`) is reconstructed — confirm against the
# loader that consumes this config.

auto_config: true

# Hugging Face artifacts (remote model + dataset locations)
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
hf_dataset_repo: "ASTERIZER/LUNA-RAG-MCP-SFT-10M"

# Local filesystem paths
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/sft_v1/final/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_full_sft"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

# Model architecture
model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

# Training schedule
train:
  epochs: 2
  max_tokens: 0  # presumably 0 means "no token cap" — verify in trainer
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0  # gradient-clipping norm

# Optimizer (AdamW-style hyperparameters)
optimizer:
  lr: 8.0e-6
  min_lr: 8.0e-7
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1.0e-8

# Batching — invariant: global_batch == micro_batch * grad_accum (4 * 12 = 48)
batch:
  global_batch: 48
  micro_batch: 4
  grad_accum: 12

dataloader:
  num_workers: 4
  pin_memory: true

hardware:
  precision: "bf16"
  compile: false

# Prompts sampled during periodic evaluation
eval_prompts:
  - "Explain retrieval-augmented generation in practical engineering terms."
  - "What problem does MCP solve for AI applications?"
  - "Compare RAG and MCP clearly without mixing them together."
  - "How should a model use retrieved context without overclaiming?"
  - "Describe how an MCP server can expose tools or retrieval to a host model."