E-Rong committed on
Commit
69d8b50
·
verified ·
1 Parent(s): 2f3c7cd

Add session_state.json for cross-session persistence

Browse files
Files changed (1) hide show
  1. session_state.json +77 -0
session_state.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "project": "TIL-26-AE Bomberman Agent",
3
+ "repo": "E-Rong/til-26-ae-agent",
4
+ "space": "e-rong/til-26-ae",
5
+ "last_updated": "2026-05-14",
6
+ "current_phase": 2,
7
+ "phases": {
8
+ "1": {
9
+ "status": "COMPLETE",
10
+ "timesteps": 500352,
11
+ "checkpoint": "phase1_final.zip",
12
+ "eval_win_rate": "92.0%",
13
+ "eval_avg_reward": 180.1,
14
+ "eval_survival": "100.0%",
15
+ "completed_at": "2026-05-14T04:30:00Z"
16
+ },
17
+ "2": {
18
+ "status": "IN_PROGRESS",
19
+ "started_from": "phase1_final.zip",
20
+ "latest_checkpoint": "phase2_ckpt_600352.zip",
21
+ "latest_timestep": 600352,
22
+ "target_timestep": 1000352,
23
+ "remaining_steps": 400000,
24
+ "shaping": {
25
+ "method": "visit_count_adaptive",
26
+ "k": 1.2,
27
+ "base_weight": 0.5
28
+ },
29
+ "note": "Training crashed during sandbox session. Need to resume from checkpoint.",
30
+ "blockers": [
31
+ "HF Job git clone auth failure for private TIL repo",
32
+ "sandbox process died without detection; the empty sandbox kept billing"
33
+ ]
34
+ },
35
+ "3": {
36
+ "status": "PENDING",
37
+ "duration": 1000000,
38
+ "opponents": "rule_based_curriculum",
39
+ "teams": 3
40
+ }
41
+ },
42
+ "mistakes_log": [
43
+ {
44
+ "date": "2026-05-14",
45
+ "mistake": "Used sandboxes for 3+ hour training runs",
46
+ "cost": "~$4.87",
47
+ "why_wrong": "Sandboxes are interactive dev environments, not batch compute. They recycle or time out, and keep billing when empty.",
48
+ "fix": "Use HF Jobs for any training >30 minutes"
49
+ },
50
+ {
51
+ "date": "2026-05-14",
52
+ "mistake": "git clone private repo in HF Job without auth",
53
+ "cost": "~$0.10",
54
+ "why_wrong": "HF Jobs have HF_TOKEN env var, but git clone doesn't use it automatically. Need snapshot_download or token-in-URL.",
55
+ "fix": "Use huggingface_hub.snapshot_download() which auto-uses HF_TOKEN"
56
+ },
57
+ {
58
+ "date": "2026-05-14",
59
+ "mistake": "No session state persistence on Hub",
60
+ "cost": "Time lost reconstructing state",
61
+ "why_wrong": "Relied on ephemeral /app files instead of pushing state to Hub repo after every milestone.",
62
+ "fix": "This file. Push/pull session_state.json from Hub at every session boundary."
63
+ }
64
+ ],
65
+ "scripts": {
66
+ "phase1_training": "/app/phase1_script.py (lost in sandbox reset)",
67
+ "phase2_training": "phase2_job.py in Hub repo",
68
+ "inference": "ae_manager.py in Hub repo (also in e-rong/til-26-ae ae/src/)"
69
+ },
70
+ "next_steps": [
71
+ "Fix HF Job script to use snapshot_download instead of git clone",
72
+ "Test with 5-minute smoke job before full submission",
73
+ "Resume Phase 2 from phase2_ckpt_600352.zip to 1,000,352 steps",
74
+ "Run evaluation vs random opponents",
75
+ "Proceed to Phase 3 curriculum"
76
+ ]
77
+ }