llimba-3b-instruct / training_config.json
lballore's picture
Initial release of llimba-3b-instruct
c86e92d
raw
history blame
641 Bytes
{
"base_model": "/workspaces/LLiMba/models/cpt-pretrain-qwen2.5-3b",
"dataset": "/workspaces/LLiMba/data/curated/sft/sft_dataset.jsonl",
"mode": "lora",
"rank": 256,
"alpha": 256,
"dropout": 0.05,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
"lr": 2e-05,
"epochs": 2,
"batch_size": 1,
"grad_accum": 16,
"effective_batch": 16,
"max_length": 4096,
"warmup_steps": 50,
"attention": "flash_attention_2",
"eval_split": 0.05,
"train_loss": 0.867611675270807,
"eval_loss": null,
"train_samples": 13683,
"eval_samples": 721
}