---
# SFT (supervised fine-tuning) configuration for LUNA-100M on the
# RAG/MCP instruction dataset.
#
# NOTE(review): the original file had all structure collapsed onto one line;
# the nesting below (optimizer/batch/dataloader/hardware as top-level stanzas
# rather than children of `train`) is reconstructed — confirm against the
# loader that consumes this config.

auto_config: true

# Hugging Face artifacts (remote model + dataset locations)
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
hf_dataset_repo: "ASTERIZER/LUNA-RAG-MCP-SFT-10M"

# Local filesystem paths
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/sft_v1/final/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_full_sft"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

# Model architecture
model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

# Training schedule
train:
  epochs: 2
  max_tokens: 0  # presumably 0 means "no token cap" — verify in trainer
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0  # gradient-clipping norm

# Optimizer (AdamW-style hyperparameters)
optimizer:
  lr: 8.0e-6
  min_lr: 8.0e-7
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1.0e-8

# Batching — invariant: global_batch == micro_batch * grad_accum (4 * 12 = 48)
batch:
  global_batch: 48
  micro_batch: 4
  grad_accum: 12

dataloader:
  num_workers: 4
  pin_memory: true

hardware:
  precision: "bf16"
  compile: false

# Prompts sampled during periodic evaluation
eval_prompts:
  - "Explain retrieval-augmented generation in practical engineering terms."
  - "What problem does MCP solve for AI applications?"
  - "Compare RAG and MCP clearly without mixing them together."
  - "How should a model use retrieved context without overclaiming?"
  - "Describe how an MCP server can expose tools or retrieval to a host model."