{
  "model": {
    "base_model": "Qwen/Qwen3-0.6B",
    "teacher_model": "Qwen/Qwen3-4B-Base",
    "include_response": false,
    "is_do_alignment": false,
    "alignment_strategy": "first",
    "projector": {
      "type": "C2CProjector",
      "params": {
        "hidden_dim": 1024,
        "intermediate_dim": 1024,
        "num_layers": 3,
        "dropout": 0.1,
        "initial_temperature": 1.0,
        "final_temperature": 0.001,
        "anneal_steps": 1953
      }
    },
    "mapping": "last_aligned"
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "num_epochs": 1,
    "max_length": 2048,
    "device": "cuda",
    "scheduler_type": "linear",
    "warmup_ratio": 0.1,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 8,
    "num_processes": 8,
    "freeze": [
      "teacher",
      "base"
    ],
    "seed": 42
  },
  "output": {
    "output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
    "save_steps": 500,
    "eval_steps": 100,
    "wandb_config": {
      "project": "Rosetta",
      "mode": "online",
      "entity": "nics-efc",
      "run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
    }
  },
  "data": {
    "type": "OpenHermesChatDataset",
    "kwargs": {
      "split": "train",
      "max_word_count": 2048,
      "num_samples": 500000
    },
    "train_ratio": 0.99
  }
}