Spaces:
Paused
Paused
HF Job: train_grpo run output
Browse files
run-output/plots/io_log.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
run-output/plots/training_log.csv
CHANGED
|
@@ -1,7 +1 @@
|
|
| 1 |
phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
|
| 2 |
-
phase1_timing,1,1,True,5.127,5.315,4.96,0.9498,1.0,81,2.833
|
| 3 |
-
phase1_timing,2,2,False,3.04,3.303,2.6,0.259,0.3614,96,3.1413
|
| 4 |
-
phase1_timing,3,3,False,2.867,3.016,2.555,0.2083,0.3042,102,3.1255
|
| 5 |
-
phase2_content,1,4,True,3.538,3.837,3.338,0.8697,1.0,77,2.8381
|
| 6 |
-
phase2_content,2,5,False,2.15,2.807,1.587,0.3763,0.5979,90,2.9281
|
| 7 |
-
phase2_content,3,6,False,1.924,2.609,1.375,0.2855,0.5027,76,2.9184
|
|
|
|
| 1 |
phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run-output/plots/training_summary.json
CHANGED
|
@@ -1,21 +1,18 @@
|
|
| 1 |
{
|
| 2 |
"model": "Qwen/Qwen2.5-3B-Instruct",
|
| 3 |
"training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
|
| 4 |
-
"phases": [
|
| 5 |
-
"phase1_timing",
|
| 6 |
-
"phase2_content"
|
| 7 |
-
],
|
| 8 |
"rounds_per_phase": 3,
|
| 9 |
"episodes_per_round": 6,
|
| 10 |
"before": {
|
| 11 |
-
"monthly_engage":
|
| 12 |
-
"monthly_strategic": 0.
|
| 13 |
-
"monthly_competitive": 0.
|
| 14 |
},
|
| 15 |
"after": {
|
| 16 |
-
"monthly_engage": 0.
|
| 17 |
-
"monthly_strategic": 0.
|
| 18 |
-
"monthly_competitive": 0.
|
| 19 |
},
|
| 20 |
"smart_heuristic": {
|
| 21 |
"monthly_engage": 0.7519,
|
|
@@ -23,98 +20,21 @@
|
|
| 23 |
"monthly_competitive": 0.9141
|
| 24 |
},
|
| 25 |
"improvement": {
|
| 26 |
-
"monthly_engage": 0.
|
| 27 |
-
"monthly_strategic": 0.
|
| 28 |
-
"monthly_competitive": 0.
|
| 29 |
},
|
| 30 |
"training_log": {
|
| 31 |
-
"phase": [
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
],
|
| 39 |
-
"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
3,
|
| 43 |
-
1,
|
| 44 |
-
2,
|
| 45 |
-
3
|
| 46 |
-
],
|
| 47 |
-
"global_step": [
|
| 48 |
-
1,
|
| 49 |
-
2,
|
| 50 |
-
3,
|
| 51 |
-
4,
|
| 52 |
-
5,
|
| 53 |
-
6
|
| 54 |
-
],
|
| 55 |
-
"use_hint": [
|
| 56 |
-
true,
|
| 57 |
-
false,
|
| 58 |
-
false,
|
| 59 |
-
true,
|
| 60 |
-
false,
|
| 61 |
-
false
|
| 62 |
-
],
|
| 63 |
-
"avg_episode_reward": [
|
| 64 |
-
5.127,
|
| 65 |
-
3.04,
|
| 66 |
-
2.867,
|
| 67 |
-
3.538,
|
| 68 |
-
2.15,
|
| 69 |
-
1.924
|
| 70 |
-
],
|
| 71 |
-
"max_episode_reward": [
|
| 72 |
-
5.315,
|
| 73 |
-
3.303,
|
| 74 |
-
3.016,
|
| 75 |
-
3.837,
|
| 76 |
-
2.807,
|
| 77 |
-
2.609
|
| 78 |
-
],
|
| 79 |
-
"min_episode_reward": [
|
| 80 |
-
4.96,
|
| 81 |
-
2.6,
|
| 82 |
-
2.555,
|
| 83 |
-
3.338,
|
| 84 |
-
1.587,
|
| 85 |
-
1.375
|
| 86 |
-
],
|
| 87 |
-
"avg_grader": [
|
| 88 |
-
0.9498,
|
| 89 |
-
0.259,
|
| 90 |
-
0.2083,
|
| 91 |
-
0.8697,
|
| 92 |
-
0.3763,
|
| 93 |
-
0.2855
|
| 94 |
-
],
|
| 95 |
-
"max_grader": [
|
| 96 |
-
1.0,
|
| 97 |
-
0.3614,
|
| 98 |
-
0.3042,
|
| 99 |
-
1.0,
|
| 100 |
-
0.5979,
|
| 101 |
-
0.5027
|
| 102 |
-
],
|
| 103 |
-
"n_training_samples": [
|
| 104 |
-
81,
|
| 105 |
-
96,
|
| 106 |
-
102,
|
| 107 |
-
77,
|
| 108 |
-
90,
|
| 109 |
-
76
|
| 110 |
-
],
|
| 111 |
-
"train_loss": [
|
| 112 |
-
2.833,
|
| 113 |
-
3.1413,
|
| 114 |
-
3.1255,
|
| 115 |
-
2.8381,
|
| 116 |
-
2.9281,
|
| 117 |
-
2.9184
|
| 118 |
-
]
|
| 119 |
}
|
| 120 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model": "Qwen/Qwen2.5-3B-Instruct",
|
| 3 |
"training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
|
| 4 |
+
"phases": [],
|
|
|
|
|
|
|
|
|
|
| 5 |
"rounds_per_phase": 3,
|
| 6 |
"episodes_per_round": 6,
|
| 7 |
"before": {
|
| 8 |
+
"monthly_engage": 1.0,
|
| 9 |
+
"monthly_strategic": 0.8357,
|
| 10 |
+
"monthly_competitive": 0.9414
|
| 11 |
},
|
| 12 |
"after": {
|
| 13 |
+
"monthly_engage": 0.999,
|
| 14 |
+
"monthly_strategic": 0.9321439559505211,
|
| 15 |
+
"monthly_competitive": 0.999
|
| 16 |
},
|
| 17 |
"smart_heuristic": {
|
| 18 |
"monthly_engage": 0.7519,
|
|
|
|
| 20 |
"monthly_competitive": 0.9141
|
| 21 |
},
|
| 22 |
"improvement": {
|
| 23 |
+
"monthly_engage": -0.0010000000000000009,
|
| 24 |
+
"monthly_strategic": 0.09644395595052113,
|
| 25 |
+
"monthly_competitive": 0.057599999999999985
|
| 26 |
},
|
| 27 |
"training_log": {
|
| 28 |
+
"phase": [],
|
| 29 |
+
"round": [],
|
| 30 |
+
"global_step": [],
|
| 31 |
+
"use_hint": [],
|
| 32 |
+
"avg_episode_reward": [],
|
| 33 |
+
"max_episode_reward": [],
|
| 34 |
+
"min_episode_reward": [],
|
| 35 |
+
"avg_grader": [],
|
| 36 |
+
"max_grader": [],
|
| 37 |
+
"n_training_samples": [],
|
| 38 |
+
"train_loss": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
}
|
run-output/training/train_grpo.executed.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|