---
auto_config: true
hf_model_repo: "ASTERIZER/LUNA-100M"
hf_model_file: "sft_v1/final/model.pth"
hf_dataset_repo: "ASTERIZER/LUNA-RAG-MCP-SFT-10M"
pretrained_ckpt: "Base/out/input_models/luna_sft_v1/sft_v1/final/model.pth"
train_json: "Base/Datasets/rag_mcp_sft/train.json"
val_json: "Base/Datasets/rag_mcp_sft/val.json"
out_dir: "Base/out/sft/rag_mcp_full_sft"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"
model:
  vocab_size: 50304
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12
train:
  epochs: 2
  max_tokens: 0
  lr_warmup_steps: 100
  save_interval: 250
  log_interval: 10
  eval_interval: 250
  max_norm: 1.0
optimizer:
  lr: 8.0e-6
  min_lr: 8.0e-7
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1.0e-8
batch:
  global_batch: 48
  micro_batch: 4
  grad_accum: 12
dataloader:
  num_workers: 4
  pin_memory: true
hardware:
  precision: "bf16"
  compile: false
eval_prompts:
  - "Explain retrieval-augmented generation in practical engineering terms."
  - "What problem does MCP solve for AI applications?"
  - "Compare RAG and MCP clearly without mixing them together."
  - "How should a model use retrieved context without overclaiming?"
  - "Describe how an MCP server can expose tools or retrieval to a host model."