vaibhavkhandare commited on
Commit
e52d302
·
verified ·
1 Parent(s): a402a82

Upload folder using huggingface_hub

Browse files
run-output/plots/io_log.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
run-output/plots/training_log.csv CHANGED
@@ -1,3 +1,7 @@
1
- round,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
2
- 1,3.904,4.514,3.287,0.6202,0.8268,101,2.6723
3
- 2,4.215,4.658,3.566,0.7325,0.8703,102,2.5934
 
 
 
 
 
1
+ phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
2
+ phase1_timing,1,1,True,5.127,5.315,4.96,0.9498,1.0,81,2.833
3
+ phase1_timing,2,2,False,3.04,3.303,2.6,0.259,0.3614,96,3.1413
4
+ phase1_timing,3,3,False,2.867,3.016,2.555,0.2083,0.3042,102,3.1255
5
+ phase2_content,1,4,True,3.538,3.837,3.338,0.8697,1.0,77,2.8381
6
+ phase2_content,2,5,False,2.15,2.807,1.587,0.3763,0.5979,90,2.9281
7
+ phase2_content,3,6,False,1.924,2.609,1.375,0.2855,0.5027,76,2.9184
run-output/plots/training_summary.json CHANGED
@@ -1,60 +1,120 @@
1
  {
2
  "model": "Qwen/Qwen2.5-3B-Instruct",
3
- "training": "LoRA SFT (real weight updates)",
4
- "rounds": 2,
 
 
 
 
5
  "episodes_per_round": 6,
6
  "before": {
7
- "monthly_engage": 1.0,
8
- "monthly_strategic": 0.8426,
9
- "monthly_competitive": 0.9521
10
  },
11
  "after": {
12
- "monthly_engage": 1.0,
13
- "monthly_strategic": 0.8416,
14
- "monthly_competitive": 0.964
15
  },
16
  "smart_heuristic": {
17
- "monthly_engage": 0.7352,
18
- "monthly_strategic": 0.9043,
19
- "monthly_competitive": 0.9066
20
  },
21
  "improvement": {
22
  "monthly_engage": 0.0,
23
- "monthly_strategic": -0.0010000000000000009,
24
- "monthly_competitive": 0.011900000000000022
25
  },
26
  "training_log": {
 
 
 
 
 
 
 
 
27
  "round": [
28
  1,
29
- 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  ],
31
  "avg_episode_reward": [
32
- 3.904,
33
- 4.215
 
 
 
 
34
  ],
35
  "max_episode_reward": [
36
- 4.514,
37
- 4.658
 
 
 
 
38
  ],
39
  "min_episode_reward": [
40
- 3.287,
41
- 3.566
 
 
 
 
42
  ],
43
  "avg_grader": [
44
- 0.6202,
45
- 0.7325
 
 
 
 
46
  ],
47
  "max_grader": [
48
- 0.8268,
49
- 0.8703
 
 
 
 
50
  ],
51
  "n_training_samples": [
52
- 101,
53
- 102
 
 
 
 
54
  ],
55
  "train_loss": [
56
- 2.6723,
57
- 2.5934
 
 
 
 
58
  ]
59
  }
60
  }
 
1
  {
2
  "model": "Qwen/Qwen2.5-3B-Instruct",
3
+ "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
4
+ "phases": [
5
+ "phase1_timing",
6
+ "phase2_content"
7
+ ],
8
+ "rounds_per_phase": 3,
9
  "episodes_per_round": 6,
10
  "before": {
11
+ "monthly_engage": 0.0,
12
+ "monthly_strategic": 0.175,
13
+ "monthly_competitive": 0.035
14
  },
15
  "after": {
16
+ "monthly_engage": 0.0,
17
+ "monthly_strategic": 0.175,
18
+ "monthly_competitive": 0.035
19
  },
20
  "smart_heuristic": {
21
+ "monthly_engage": 0.7519,
22
+ "monthly_strategic": 0.9101,
23
+ "monthly_competitive": 0.9141
24
  },
25
  "improvement": {
26
  "monthly_engage": 0.0,
27
+ "monthly_strategic": 0.0,
28
+ "monthly_competitive": 0.0
29
  },
30
  "training_log": {
31
+ "phase": [
32
+ "phase1_timing",
33
+ "phase1_timing",
34
+ "phase1_timing",
35
+ "phase2_content",
36
+ "phase2_content",
37
+ "phase2_content"
38
+ ],
39
  "round": [
40
  1,
41
+ 2,
42
+ 3,
43
+ 1,
44
+ 2,
45
+ 3
46
+ ],
47
+ "global_step": [
48
+ 1,
49
+ 2,
50
+ 3,
51
+ 4,
52
+ 5,
53
+ 6
54
+ ],
55
+ "use_hint": [
56
+ true,
57
+ false,
58
+ false,
59
+ true,
60
+ false,
61
+ false
62
  ],
63
  "avg_episode_reward": [
64
+ 5.127,
65
+ 3.04,
66
+ 2.867,
67
+ 3.538,
68
+ 2.15,
69
+ 1.924
70
  ],
71
  "max_episode_reward": [
72
+ 5.315,
73
+ 3.303,
74
+ 3.016,
75
+ 3.837,
76
+ 2.807,
77
+ 2.609
78
  ],
79
  "min_episode_reward": [
80
+ 4.96,
81
+ 2.6,
82
+ 2.555,
83
+ 3.338,
84
+ 1.587,
85
+ 1.375
86
  ],
87
  "avg_grader": [
88
+ 0.9498,
89
+ 0.259,
90
+ 0.2083,
91
+ 0.8697,
92
+ 0.3763,
93
+ 0.2855
94
  ],
95
  "max_grader": [
96
+ 1.0,
97
+ 0.3614,
98
+ 0.3042,
99
+ 1.0,
100
+ 0.5979,
101
+ 0.5027
102
  ],
103
  "n_training_samples": [
104
+ 81,
105
+ 96,
106
+ 102,
107
+ 77,
108
+ 90,
109
+ 76
110
  ],
111
  "train_loss": [
112
+ 2.833,
113
+ 3.1413,
114
+ 3.1255,
115
+ 2.8381,
116
+ 2.9281,
117
+ 2.9184
118
  ]
119
  }
120
  }
run-output/training/train_grpo.executed.ipynb CHANGED
The diff for this file is too large to render. See raw diff