{
  "base_model": "/workspaces/LLiMba/models/cpt-pretrain-qwen2.5-3b",
  "dataset": "/workspaces/LLiMba/data/curated/sft/sft_dataset.jsonl",
  "mode": "lora",
  "rank": 256,
  "alpha": 256,
  "dropout": 0.05,
  "target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
  ],
  "lr": 2e-05,
  "epochs": 2,
  "batch_size": 1,
  "grad_accum": 16,
  "effective_batch": 16,
  "max_length": 4096,
  "warmup_steps": 50,
  "attention": "flash_attention_2",
  "eval_split": 0.05,
  "train_loss": 0.867611675270807,
  "eval_loss": null,
  "train_samples": 13683,
  "eval_samples": 721
}