bot committed on
Commit
a7a423a
·
1 Parent(s): 40bbdfd

Add checkpoint cleanup script: keeps every 5000th step

Browse files
Files changed (1) hide show
  1. cleanup_checkpoints.py +72 -0
cleanup_checkpoints.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Monitor HF checkpoints repo and delete intermediate checkpoints.
4
+ Keeps every 5000th step checkpoint plus the most recent one, deletes the rest.
5
+ Runs in a loop, checking every 10 minutes.
6
+ """
7
+
8
+ import time
9
+ import re
10
+ import os
11
+ from huggingface_hub import HfApi
12
+
13
# Hub repository that holds the checkpoints; overridable via CHECKPOINT_REPO.
REPO_ID = os.getenv("CHECKPOINT_REPO", "StrongRoboticsLab/pi05-so100-diverse-checkpoints")
# Retention stride: checkpoints at steps 5000, 10000, 15000, ... are kept.
KEEP_EVERY = 5_000
# Pause between two cleanup passes, in seconds.
CHECK_INTERVAL = 600

# Shared Hub client; the token is read from HF_TOKEN (None when unset).
api = HfApi(token=os.getenv("HF_TOKEN"))
18
+
19
+
20
def get_checkpoint_steps():
    """Return the step numbers of the checkpoints found at the repo's top level.

    Lists the non-recursive tree of ``REPO_ID`` and collects the integer from
    every entry whose name starts with ``step_<digits>``.

    Returns:
        list[int]: Step numbers in ascending order. An empty list is returned
        when nothing matches or when listing fails; in the failure case the
        error is printed rather than raised.
    """
    try:
        entries = api.list_repo_tree(REPO_ID, recursive=False)
        steps = []
        # The loop stays inside the try: the listing may be fetched lazily
        # while iterating, so request errors can surface here as well.
        for entry in entries:
            # Tree entries expose their name as ``path``. Reading only
            # ``rfilename`` is not reliable for folder entries (checkpoint
            # directories); a missing attribute would be swallowed by the
            # handler below and make every listing look empty.
            name = getattr(entry, "path", None)
            if name is None:
                name = getattr(entry, "rfilename", "")
            match = re.match(r"step_(\d+)", name)
            if match:
                steps.append(int(match.group(1)))
        return sorted(steps)
    except Exception as e:
        # Best effort: report and let the caller retry on its next pass.
        print(f"Error listing repo: {e}")
        return []
33
+
34
+
35
def cleanup():
    """Prune intermediate checkpoints from the repo.

    A checkpoint survives when its step is a multiple of ``KEEP_EVERY`` or when
    it is the highest step currently listed. Every other checkpoint folder is
    deleted one by one; a failed deletion is printed and does not stop the
    remaining ones. Does nothing when fewer than two checkpoints are listed.
    """
    steps = get_checkpoint_steps()
    if len(steps) < 2:
        return

    latest = max(steps)
    to_keep = [s for s in steps if s == latest or s % KEEP_EVERY == 0]
    to_delete = [s for s in steps if s != latest and s % KEEP_EVERY != 0]

    if not to_delete:
        print(f"Nothing to delete. {len(to_keep)} checkpoints kept.")
        return

    print(f"Keeping {len(to_keep)} checkpoints: {to_keep}")
    print(f"Deleting {len(to_delete)} checkpoints...")

    for step in to_delete:
        # Folder names are rebuilt from the step number, zero-padded to 6 digits.
        folder = f"step_{step:06d}"
        try:
            api.delete_folder(path_in_repo=folder, repo_id=REPO_ID, repo_type="model")
        except Exception as e:
            print(f" Failed to delete {folder}: {e}")
        else:
            print(f" Deleted {folder}")
65
+
66
+
67
if __name__ == "__main__":
    # Script entry point: announce the configuration, then poll forever.
    print(f"Monitoring {REPO_ID}, keeping every {KEEP_EVERY} steps")
    while True:
        # One cleanup pass, followed by a fixed pause before the next one.
        cleanup()
        print(f"Sleeping {CHECK_INTERVAL}s...")
        time.sleep(CHECK_INTERVAL)