Upload session_state.json
Browse files- session_state.json +10 -14
session_state.json
CHANGED
|
@@ -15,8 +15,9 @@
|
|
| 15 |
"completed_at": "2026-05-14T04:30:00Z"
|
| 16 |
},
|
| 17 |
"2": {
|
| 18 |
-
"status": "
|
| 19 |
-
"
|
|
|
|
| 20 |
"latest_checkpoint": "phase2_ckpt_600352.zip",
|
| 21 |
"latest_timestep": 600352,
|
| 22 |
"target_timestep": 1000352,
|
|
@@ -26,11 +27,7 @@
|
|
| 26 |
"k": 1.2,
|
| 27 |
"base_weight": 0.5
|
| 28 |
},
|
| 29 |
-
"note": "
|
| 30 |
-
"blockers": [
|
| 31 |
-
"HF Job git clone auth failure for private TIL repo",
|
| 32 |
-
"sandbox process died without detection, kept billing empty"
|
| 33 |
-
]
|
| 34 |
},
|
| 35 |
"3": {
|
| 36 |
"status": "PENDING",
|
|
@@ -59,19 +56,18 @@
|
|
| 59 |
"mistake": "No session state persistence on Hub",
|
| 60 |
"cost": "Time lost reconstructing state",
|
| 61 |
"why_wrong": "Relied on ephemeral /app files instead of pushing state to Hub repo after every milestone.",
|
| 62 |
-
"fix": "
|
| 63 |
}
|
| 64 |
],
|
| 65 |
"scripts": {
|
| 66 |
-
"phase1_training": "
|
| 67 |
-
"phase2_training": "
|
| 68 |
"inference": "ae_manager.py in Hub repo (also in e-rong/til-26-ae ae/src/)"
|
| 69 |
},
|
| 70 |
"next_steps": [
|
| 71 |
-
"
|
| 72 |
-
"
|
| 73 |
-
"Resume Phase 2 from phase2_ckpt_600352.zip to 1,000,352 steps",
|
| 74 |
"Run evaluation vs random opponents",
|
| 75 |
"Proceed to Phase 3 curriculum"
|
| 76 |
]
|
| 77 |
-
}
|
|
|
|
| 15 |
"completed_at": "2026-05-14T04:30:00Z"
|
| 16 |
},
|
| 17 |
"2": {
|
| 18 |
+
"status": "RUNNING_IN_HF_JOB",
|
| 19 |
+
"job_id": "6a058adfe48bea4538b9c767",
|
| 20 |
+
"started_from": "phase2_ckpt_600352.zip",
|
| 21 |
"latest_checkpoint": "phase2_ckpt_600352.zip",
|
| 22 |
"latest_timestep": 600352,
|
| 23 |
"target_timestep": 1000352,
|
|
|
|
| 27 |
"k": 1.2,
|
| 28 |
"base_weight": 0.5
|
| 29 |
},
|
| 30 |
+
"note": "Resumed via HF Job at 2026-05-14 08:42 UTC. Using snapshot_download for auth."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
"3": {
|
| 33 |
"status": "PENDING",
|
|
|
|
| 56 |
"mistake": "No session state persistence on Hub",
|
| 57 |
"cost": "Time lost reconstructing state",
|
| 58 |
"why_wrong": "Relied on ephemeral /app files instead of pushing state to Hub repo after every milestone.",
|
| 59 |
+
"fix": "Push session_state.json to Hub after every milestone. Read it at start of every session."
|
| 60 |
}
|
| 61 |
],
|
| 62 |
"scripts": {
|
| 63 |
+
"phase1_training": "completed",
|
| 64 |
+
"phase2_training": "phase2_resume.py in Hub repo",
|
| 65 |
"inference": "ae_manager.py in Hub repo (also in e-rong/til-26-ae ae/src/)"
|
| 66 |
},
|
| 67 |
"next_steps": [
|
| 68 |
+
"Monitor Phase 2 HF Job (6a058adfe48bea4538b9c767) until completion",
|
| 69 |
+
"Verify phase2_final.zip pushed to Hub",
|
|
|
|
| 70 |
"Run evaluation vs random opponents",
|
| 71 |
"Proceed to Phase 3 curriculum"
|
| 72 |
]
|
| 73 |
+
}
|