File size: 1,401 Bytes
{
"model": {
"base_model": "Qwen/Qwen3-0.6B",
"teacher_model": "Qwen/Qwen3-4B-Base",
"include_response": false,
"is_do_alignment": false,
"alignment_strategy": "first",
"projector": {
"type": "C2CProjector",
"params": {
"hidden_dim": 1024,
"intermediate_dim": 1024,
"num_layers": 3,
"dropout": 0.1,
"initial_temperature": 1.0,
"final_temperature": 0.001,
"anneal_steps": 1953
}
},
"mapping": "last_aligned"
},
"training": {
"learning_rate": 0.0001,
"weight_decay": 0.01,
"num_epochs": 1,
"max_length": 2048,
"device": "cuda",
"scheduler_type": "linear",
"warmup_ratio": 0.1,
"max_grad_norm": 1.0,
"gradient_accumulation_steps": 4,
"per_device_train_batch_size": 8,
"num_processes": 8,
"freeze": [
"teacher",
"base"
],
"seed": 42
},
"output": {
"output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
"save_steps": 500,
"eval_steps": 100,
"wandb_config": {
"project": "Rosetta",
"mode": "online",
"entity": "nics-efc",
"run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
}
},
"data": {
"type": "OpenHermesChatDataset",
"kwargs": {
"split": "train",
"max_word_count": 2048,
"num_samples": 500000
},
"train_ratio": 0.99
}
}