E-Rong committed on
Commit
67d546f
·
verified ·
1 Parent(s): 06087ac

Upload session_state.json

Browse files
Files changed (1) hide show
  1. session_state.json +10 -14
session_state.json CHANGED
@@ -15,8 +15,9 @@
15
  "completed_at": "2026-05-14T04:30:00Z"
16
  },
17
  "2": {
18
- "status": "IN_PROGRESS",
19
- "started_from": "phase1_final.zip",
 
20
  "latest_checkpoint": "phase2_ckpt_600352.zip",
21
  "latest_timestep": 600352,
22
  "target_timestep": 1000352,
@@ -26,11 +27,7 @@
26
  "k": 1.2,
27
  "base_weight": 0.5
28
  },
29
- "note": "Training crashed during sandbox session. Need to resume from checkpoint.",
30
- "blockers": [
31
- "HF Job git clone auth failure for private TIL repo",
32
- "sandbox process died without detection, kept billing empty"
33
- ]
34
  },
35
  "3": {
36
  "status": "PENDING",
@@ -59,19 +56,18 @@
59
  "mistake": "No session state persistence on Hub",
60
  "cost": "Time lost reconstructing state",
61
  "why_wrong": "Relied on ephemeral /app files instead of pushing state to Hub repo after every milestone.",
62
- "fix": "This file. Push/pull session_state.json from Hub at every session boundary."
63
  }
64
  ],
65
  "scripts": {
66
- "phase1_training": "/app/phase1_script.py (lost in sandbox reset)",
67
- "phase2_training": "phase2_job.py in Hub repo",
68
  "inference": "ae_manager.py in Hub repo (also in e-rong/til-26-ae ae/src/)"
69
  },
70
  "next_steps": [
71
- "Fix HF Job script to use snapshot_download instead of git clone",
72
- "Test with 5-minute smoke job before full submission",
73
- "Resume Phase 2 from phase2_ckpt_600352.zip to 1,000,352 steps",
74
  "Run evaluation vs random opponents",
75
  "Proceed to Phase 3 curriculum"
76
  ]
77
- }
 
15
  "completed_at": "2026-05-14T04:30:00Z"
16
  },
17
  "2": {
18
+ "status": "RUNNING_IN_HF_JOB",
19
+ "job_id": "6a058adfe48bea4538b9c767",
20
+ "started_from": "phase2_ckpt_600352.zip",
21
  "latest_checkpoint": "phase2_ckpt_600352.zip",
22
  "latest_timestep": 600352,
23
  "target_timestep": 1000352,
 
27
  "k": 1.2,
28
  "base_weight": 0.5
29
  },
30
+ "note": "Resumed via HF Job at 2026-05-14 08:42 UTC. Using snapshot_download for auth."
 
 
 
 
31
  },
32
  "3": {
33
  "status": "PENDING",
 
56
  "mistake": "No session state persistence on Hub",
57
  "cost": "Time lost reconstructing state",
58
  "why_wrong": "Relied on ephemeral /app files instead of pushing state to Hub repo after every milestone.",
59
+ "fix": "Push session_state.json to Hub after every milestone. Read it at start of every session."
60
  }
61
  ],
62
  "scripts": {
63
+ "phase1_training": "completed",
64
+ "phase2_training": "phase2_resume.py in Hub repo",
65
  "inference": "ae_manager.py in Hub repo (also in e-rong/til-26-ae ae/src/)"
66
  },
67
  "next_steps": [
68
+ "Monitor Phase 2 HF Job (6a058adfe48bea4538b9c767) until completion",
69
+ "Verify phase2_final.zip pushed to Hub",
 
70
  "Run evaluation vs random opponents",
71
  "Proceed to Phase 3 curriculum"
72
  ]
73
+ }