{
    "model": {
        "base_model": "Qwen/Qwen3-0.6B",
        "teacher_model": "Qwen/Qwen3-4B",
        "include_response": false,
        "is_do_alignment": false,
        "alignment_strategy": "first",
        "projector": {
            "type": "C2CProjector",
            "params": {
                "hidden_dim": 1024,
                "intermediate_dim": 1024,
                "num_layers": 3,
                "dropout": 0.1,
                "initial_temperature": 1.0,
                "final_temperature": 0.001,
                "anneal_steps": 1929
            }
        },
        "mapping": "last_aligned"
    },
    "training": {
        "learning_rate": 1e-4,
        "weight_decay": 0.01,
        "num_epochs": 1,
        "max_length": 2048,
        "device": "cuda",
        "scheduler_type": "linear",
        "warmup_ratio": 0.1,
        "max_grad_norm": 1.0,
        "gradient_accumulation_steps": 8,
        "per_device_train_batch_size": 4,
        "num_processes": 8,
        "freeze": ["teacher","base"],
        "seed": 42
    },
    "output": {
        "output_dir": "local/checkpoints/0.6+4B_C2C_general",
        "save_steps": 500,
        "eval_steps": 100,
        "wandb_config": {
            "project": "Rosetta",
            "mode": "online",
            "entity": "nics-efc",
            "run_name": "0.6B+4B_C2C_general_OpenHermes_500k"
        }
    },
    "data": {
        "type": "OpenHermesChatDataset",
        "kwargs": {
            "split": "train",
            "max_word_count": 2048,
            "num_samples": 500000
        },
        "train_ratio": 0.99
    }
}