{
"model": {
"base_model": "Qwen/Qwen3-0.6B",
"teacher_model": "Qwen/Qwen3-4B-Base",
"include_response": false,
"is_do_alignment": false,
"alignment_strategy": "first",
"projector": {
"type": "C2CProjector",
"params": {
"hidden_dim": 1024,
"intermediate_dim": 1024,
"num_layers": 3,
"dropout": 0.1,
"initial_temperature": 1.0,
"final_temperature": 0.001,
"anneal_steps": 1953
}
},
"mapping": "last_aligned"
},
"training": {
"learning_rate": 0.0001,
"weight_decay": 0.01,
"num_epochs": 1,
"max_length": 2048,
"device": "cuda",
"scheduler_type": "linear",
"warmup_ratio": 0.1,
"max_grad_norm": 1.0,
"gradient_accumulation_steps": 4,
"per_device_train_batch_size": 8,
"num_processes": 8,
"freeze": [
"teacher",
"base"
],
"seed": 42
},
"output": {
"output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
"save_steps": 500,
"eval_steps": 100,
"wandb_config": {
"project": "Rosetta",
"mode": "online",
"entity": "nics-efc",
"run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
}
},
"data": {
"type": "OpenHermesChatDataset",
"kwargs": {
"split": "train",
"max_word_count": 2048,
"num_samples": 500000
},
"train_ratio": 0.99
}
}