#!/usr/bin/env python3
"""
Monitor HF checkpoints repo and delete intermediate checkpoints.

Keeps every 5000th step checkpoint plus the most recent checkpoint, deletes
the rest. Runs in a loop, checking every 10 minutes.
"""
import os
import re
import time
from typing import List, Tuple

from huggingface_hub import HfApi

REPO_ID = os.environ.get("CHECKPOINT_REPO", "StrongRoboticsLab/pi05-so100-diverse-checkpoints")
KEEP_EVERY = 5000  # keep checkpoints at step 5000, 10000, 15000, ...
CHECK_INTERVAL = 600  # seconds between checks

# A checkpoint entry is a top-level repo path named exactly "step_<digits>".
# The whole name must match so that look-alikes such as "step_100_backup" are
# never mistaken for a checkpoint and scheduled for deletion.
_STEP_DIR_RE = re.compile(r"step_(\d+)")

api = HfApi(token=os.environ.get("HF_TOKEN"))


def _list_checkpoints() -> List[Tuple[int, str]]:
    """Return ``(step, path_in_repo)`` pairs for top-level checkpoints, sorted by step.

    The real repo path is kept alongside the parsed step number so that
    deletion targets the name that actually exists in the repo, whatever its
    zero-padding. Any error while listing is printed and reported as an empty
    list, so the monitoring loop keeps running and retries on the next cycle.
    """
    try:
        found = []
        # list_repo_tree is consumed lazily, so iteration must stay inside
        # the try block: request errors surface while looping.
        for entry in api.list_repo_tree(REPO_ID, recursive=False, repo_type="model"):
            # `path` is the attribute documented for both file and folder
            # entries; `rfilename` is not available on folder entries.
            match = _STEP_DIR_RE.fullmatch(entry.path)
            if match:
                found.append((int(match.group(1)), entry.path))
        return sorted(found)
    except Exception as e:  # best-effort daemon: log and retry next cycle
        print(f"Error listing repo: {e}")
        return []


def get_checkpoint_steps() -> List[int]:
    """List all checkpoint step numbers in the repo, in ascending order.

    Returns an empty list when the repo cannot be listed.
    """
    return [step for step, _ in _list_checkpoints()]


def cleanup() -> None:
    """Delete checkpoints that aren't multiples of KEEP_EVERY, except the latest.

    Does nothing when the repo holds at most one checkpoint (or could not be
    listed). A failed deletion is printed and skipped, so one bad folder does
    not block the others.
    """
    checkpoints = _list_checkpoints()
    if len(checkpoints) <= 1:
        return

    latest = checkpoints[-1][0]  # list is sorted by step
    to_keep = []
    to_delete = []
    for step, folder in checkpoints:
        if step == latest or step % KEEP_EVERY == 0:
            to_keep.append(step)
        else:
            to_delete.append(folder)

    if not to_delete:
        print(f"Nothing to delete. {len(to_keep)} checkpoints kept.")
        return

    print(f"Keeping {len(to_keep)} checkpoints: {to_keep}")
    print(f"Deleting {len(to_delete)} checkpoints...")

    for folder in to_delete:
        try:
            api.delete_folder(path_in_repo=folder, repo_id=REPO_ID, repo_type="model")
            print(f"  Deleted {folder}")
        except Exception as e:  # keep going; the folder is retried next cycle
            print(f"  Failed to delete {folder}: {e}")


if __name__ == "__main__":
    print(f"Monitoring {REPO_ID}, keeping every {KEEP_EVERY} steps")
    while True:
        cleanup()
        print(f"Sleeping {CHECK_INTERVAL}s...")
        time.sleep(CHECK_INTERVAL)