Spaces:

ycwhencpp
/

final-iteration

Paused

App Files Files Community

ycwhencpp committed 11 days ago

Commit

17149c8

verified ·

1 Parent(s): 0c87e02

HF Job: train_grpo run output

Browse files

Files changed (4) hide show

run-output/plots/io_log.jsonl +0 -0
run-output/plots/training_log.csv +0 -6
run-output/plots/training_summary.json +21 -101
run-output/training/train_grpo.executed.ipynb +0 -0

run-output/plots/io_log.jsonl CHANGED Viewed

The diff for this file is too large to render. See raw diff

run-output/plots/training_log.csv CHANGED Viewed

@@ -1,7 +1 @@
 phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss
-phase1_timing,1,1,True,5.127,5.315,4.96,0.9498,1.0,81,2.833
-phase1_timing,2,2,False,3.04,3.303,2.6,0.259,0.3614,96,3.1413
-phase1_timing,3,3,False,2.867,3.016,2.555,0.2083,0.3042,102,3.1255
-phase2_content,1,4,True,3.538,3.837,3.338,0.8697,1.0,77,2.8381
-phase2_content,2,5,False,2.15,2.807,1.587,0.3763,0.5979,90,2.9281
-phase2_content,3,6,False,1.924,2.609,1.375,0.2855,0.5027,76,2.9184


1	phase,round,global_step,use_hint,avg_episode_reward,max_episode_reward,min_episode_reward,avg_grader,max_grader,n_training_samples,train_loss

run-output/plots/training_summary.json CHANGED Viewed

@@ -1,21 +1,18 @@
 {
   "model": "Qwen/Qwen2.5-3B-Instruct",
   "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
-  "phases": [
-    "phase1_timing",
-    "phase2_content"
-  ],
   "rounds_per_phase": 3,
   "episodes_per_round": 6,
   "before": {
-    "monthly_engage": 0.0,
-    "monthly_strategic": 0.175,
-    "monthly_competitive": 0.035
   },
   "after": {
-    "monthly_engage": 0.0,
-    "monthly_strategic": 0.175,
-    "monthly_competitive": 0.035
   },
   "smart_heuristic": {
     "monthly_engage": 0.7519,
@@ -23,98 +20,21 @@
     "monthly_competitive": 0.9141
   },
   "improvement": {
-    "monthly_engage": 0.0,
-    "monthly_strategic": 0.0,
-    "monthly_competitive": 0.0
   },
   "training_log": {
-    "phase": [
-      "phase1_timing",
-      "phase1_timing",
-      "phase1_timing",
-      "phase2_content",
-      "phase2_content",
-      "phase2_content"
-    ],
-    "round": [
-      1,
-      2,
-      3,
-      1,
-      2,
-      3
-    ],
-    "global_step": [
-      1,
-      2,
-      3,
-      4,
-      5,
-      6
-    ],
-    "use_hint": [
-      true,
-      false,
-      false,
-      true,
-      false,
-      false
-    ],
-    "avg_episode_reward": [
-      5.127,
-      3.04,
-      2.867,
-      3.538,
-      2.15,
-      1.924
-    ],
-    "max_episode_reward": [
-      5.315,
-      3.303,
-      3.016,
-      3.837,
-      2.807,
-      2.609
-    ],
-    "min_episode_reward": [
-      4.96,
-      2.6,
-      2.555,
-      3.338,
-      1.587,
-      1.375
-    ],
-    "avg_grader": [
-      0.9498,
-      0.259,
-      0.2083,
-      0.8697,
-      0.3763,
-      0.2855
-    ],
-    "max_grader": [
-      1.0,
-      0.3614,
-      0.3042,
-      1.0,
-      0.5979,
-      0.5027
-    ],
-    "n_training_samples": [
-      81,
-      96,
-      102,
-      77,
-      90,
-      76
-    ],
-    "train_loss": [
-      2.833,
-      3.1413,
-      3.1255,
-      2.8381,
-      2.9281,
-      2.9184
-    ]
   }
 }

 {
   "model": "Qwen/Qwen2.5-3B-Instruct",
   "training": "Two-phase LoRA SFT (timing -> content) with hardcoded peak-hours hint on round 1 of each phase",
+  "phases": [],
   "rounds_per_phase": 3,
   "episodes_per_round": 6,
   "before": {
+    "monthly_engage": 1.0,
+    "monthly_strategic": 0.8357,
+    "monthly_competitive": 0.9414
   },
   "after": {
+    "monthly_engage": 0.999,
+    "monthly_strategic": 0.9321439559505211,
+    "monthly_competitive": 0.999
   },
   "smart_heuristic": {
     "monthly_engage": 0.7519,
     "monthly_competitive": 0.9141
   },
   "improvement": {
+    "monthly_engage": -0.0010000000000000009,
+    "monthly_strategic": 0.09644395595052113,
+    "monthly_competitive": 0.057599999999999985
   },
   "training_log": {
+    "phase": [],
+    "round": [],
+    "global_step": [],
+    "use_hint": [],
+    "avg_episode_reward": [],
+    "max_episode_reward": [],
+    "min_episode_reward": [],
+    "avg_grader": [],
+    "max_grader": [],
+    "n_training_samples": [],
+    "train_loss": []
   }
 }

run-output/training/train_grpo.executed.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff