{
  "model": {
    "base_model": "Qwen/Qwen3-0.6B",
    "teacher_model": "Qwen/Qwen3-4B-Base",
    "include_response": false,
    "is_do_alignment": false,
    "alignment_strategy": "first",
    "projector": {
      "type": "C2CProjector",
      "params": {
        "hidden_dim": 1024,
        "intermediate_dim": 1024,
        "num_layers": 3,
        "dropout": 0.1,
        "initial_temperature": 1.0,
        "final_temperature": 0.001,
        "anneal_steps": 1953
      }
    },
    "mapping": "last_aligned"
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "num_epochs": 1,
    "max_length": 2048,
    "device": "cuda",
    "scheduler_type": "linear",
    "warmup_ratio": 0.1,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 8,
    "num_processes": 8,
    "freeze": [
      "teacher",
      "base"
    ],
    "seed": 42
  },
  "output": {
    "output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
    "save_steps": 500,
    "eval_steps": 100,
    "wandb_config": {
      "project": "Rosetta",
      "mode": "online",
      "entity": "nics-efc",
      "run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
    }
  },
  "data": {
    "type": "OpenHermesChatDataset",
    "kwargs": {
      "split": "train",
      "max_word_count": 2048,
      "num_samples": 500000
    },
    "train_ratio": 0.99
  }
}