{
  "model": {
    "base_model": "Qwen/Qwen3-0.6B",
    "teacher_model": "Qwen/Qwen3-4B-Base",
    "include_response": false,
    "is_do_alignment": false,
    "alignment_strategy": "first",
    "projector": {
      "type": "C2CProjector",
      "params": {
        "hidden_dim": 1024,
        "intermediate_dim": 1024,
        "num_layers": 3,
        "dropout": 0.1,
        "initial_temperature": 1.0,
        "final_temperature": 0.001,
        "anneal_steps": 1953
      }
    },
    "mapping": "last_aligned"
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "num_epochs": 1,
    "max_length": 2048,
    "device": "cuda",
    "scheduler_type": "linear",
    "warmup_ratio": 0.1,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 4,
    "per_device_train_batch_size": 8,
    "num_processes": 8,
    "freeze": [
      "teacher",
      "base"
    ],
    "seed": 42
  },
  "output": {
    "output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
    "save_steps": 500,
    "eval_steps": 100,
    "wandb_config": {
      "project": "Rosetta",
      "mode": "online",
      "entity": "nics-efc",
      "run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
    }
  },
  "data": {
    "type": "OpenHermesChatDataset",
    "kwargs": {
      "split": "train",
      "max_word_count": 2048,
      "num_samples": 500000
    },
    "train_ratio": 0.99
  }
}