File size: 4,324 Bytes
b5cc210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
{
  "status": "ok",
  "training_mode": "sft-baseline",
  "completed_models": 3,
  "failed_or_skipped_models": 0,
  "models": [
    {
      "run_id": "qwen-qwen2-5-0-5b-instruct",
      "training_mode": "sft-baseline",
      "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
      "label": "Qwen2.5-0.5B",
      "status": "completed",
      "error": "",
      "sft_backend": "trl_transformers",
      "sft_examples": 2000,
      "sft_train_loss": 0.19233327957964502,
      "sft_runtime": 234.6302,
      "grpo_backend": "",
      "grpo_records": 0,
      "grpo_avg_reward": 0.726,
      "sft_inference_reward": 0.726,
      "sft_valid_rate": 1.0,
      "sft_latency_seconds": 1.839,
      "grpo_inference_reward": 0.726,
      "grpo_valid_rate": 1.0,
      "grpo_latency_seconds": 0.0,
      "train_holdout_gap": 0.0,
      "fallback_detected": false,
      "reward_range_ok": true,
      "reward_range_failures": [],
      "exploit_rate": 0.0,
      "legal_rate": 0.0,
      "candidate_diversity": 0.0,
      "top_candidate_rate": 0.0,
      "reward_components": {},
      "primary_reward_channels": {},
      "artifact_paths": {
        "sft": "/app/checkpoints/sweeps/qwen-qwen2-5-0-5b-instruct/sft_adapter",
        "grpo": ""
      }
    },
    {
      "run_id": "qwen-qwen2-5-1-5b-instruct",
      "training_mode": "sft-baseline",
      "model_id": "Qwen/Qwen2.5-1.5B-Instruct",
      "label": "Qwen2.5-1.5B",
      "status": "completed",
      "error": "",
      "sft_backend": "trl_transformers",
      "sft_examples": 2000,
      "sft_train_loss": 0.11515871361242898,
      "sft_runtime": 483.7085,
      "grpo_backend": "",
      "grpo_records": 0,
      "grpo_avg_reward": 0.726,
      "sft_inference_reward": 0.726,
      "sft_valid_rate": 1.0,
      "sft_latency_seconds": 2.158,
      "grpo_inference_reward": 0.726,
      "grpo_valid_rate": 1.0,
      "grpo_latency_seconds": 0.0,
      "train_holdout_gap": 0.0,
      "fallback_detected": false,
      "reward_range_ok": true,
      "reward_range_failures": [],
      "exploit_rate": 0.0,
      "legal_rate": 0.0,
      "candidate_diversity": 0.0,
      "top_candidate_rate": 0.0,
      "reward_components": {},
      "primary_reward_channels": {},
      "artifact_paths": {
        "sft": "/app/checkpoints/sweeps/qwen-qwen2-5-1-5b-instruct/sft_adapter",
        "grpo": ""
      }
    },
    {
      "run_id": "qwen-qwen2-5-3b-instruct",
      "training_mode": "sft-baseline",
      "model_id": "Qwen/Qwen2.5-3B-Instruct",
      "label": "Qwen2.5-3B",
      "status": "completed",
      "error": "",
      "sft_backend": "trl_transformers",
      "sft_examples": 2000,
      "sft_train_loss": 0.18184852770145518,
      "sft_runtime": 372.1845,
      "grpo_backend": "",
      "grpo_records": 0,
      "grpo_avg_reward": 0.762,
      "sft_inference_reward": 0.762,
      "sft_valid_rate": 1.0,
      "sft_latency_seconds": 2.748,
      "grpo_inference_reward": 0.762,
      "grpo_valid_rate": 1.0,
      "grpo_latency_seconds": 0.0,
      "train_holdout_gap": 0.0,
      "fallback_detected": false,
      "reward_range_ok": true,
      "reward_range_failures": [],
      "exploit_rate": 0.0,
      "legal_rate": 0.0,
      "candidate_diversity": 0.0,
      "top_candidate_rate": 0.0,
      "reward_components": {},
      "primary_reward_channels": {},
      "artifact_paths": {
        "sft": "/app/checkpoints/sweeps/qwen-qwen2-5-3b-instruct/sft_adapter",
        "grpo": ""
      }
    }
  ],
  "charts": {
    "sft_vs_grpo_reward": "outputs/plots/sft_vs_grpo_reward.png",
    "sft_loss_curves": "outputs/plots/sft_loss_curves.png",
    "qwen_model_sft_reward": "outputs/plots/qwen_model_sft_reward.png",
    "qwen_model_sft_loss": "outputs/plots/qwen_model_sft_loss.png",
    "sft_validity_reward": "outputs/plots/sft_validity_reward.png",
    "grpo_reward_curves": "outputs/plots/grpo_reward_curves.png",
    "qwen_model_grpo_reward": "outputs/plots/qwen_model_grpo_reward.png",
    "reward_component_bars": "outputs/plots/reward_component_bars.png",
    "anti_cheat_failure_rates": "outputs/plots/anti_cheat_failure_rates.png",
    "train_holdout_gap": "outputs/plots/train_holdout_gap.png",
    "inference_validity_reward": "outputs/plots/inference_validity_reward.png",
    "inference_latency_validity": "outputs/plots/inference_latency_validity.png"
  }
}