# LUNA-Training / rag_mcp_full_sft_config.yaml
# Uploaded by ASTERIZER via huggingface_hub (commit caa2a1b, verified)
---
# Supervised fine-tuning configuration for the LUNA RAG/MCP run.

# Hub resources pulled automatically when auto_config is true.
auto_config: true
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
hf_dataset_repo: "ASTERIZER/LUNA-RAG-MCP-SFT-10M"

# Local filesystem paths (relative to the repo root).
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/sft_v1/final/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_full_sft"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

# Model architecture — must match the pretrained checkpoint being loaded.
model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

# Training schedule. NOTE(review): max_tokens: 0 presumably means
# "no token cap" — confirm against the trainer's handling of 0.
train:
  epochs: 2
  max_tokens: 0
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0

# AdamW-style optimizer hyperparameters.
optimizer:
  lr: 8.0e-6
  min_lr: 8.0e-7
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1.0e-8

# Batch sizing: global_batch = micro_batch * grad_accum (4 * 12 = 48).
batch:
  global_batch: 48
  micro_batch: 4
  grad_accum: 12

dataloader:
  num_workers: 4
  pin_memory: true

hardware:
  precision: "bf16"
  compile: false

# Prompts used for qualitative generation checks during evaluation.
eval_prompts:
  - "Explain retrieval-augmented generation in practical engineering terms."
  - "What problem does MCP solve for AI applications?"
  - "Compare RAG and MCP clearly without mixing them together."
  - "How should a model use retrieved context without overclaiming?"
  - "Describe how an MCP server can expose tools or retrieval to a host model."