pi05-so100-diverse / cleanup_dataset.py
#!/usr/bin/env python3
"""Delete all files from the dataset that aren't in filtered_index.json."""
import json
import shutil
import sys
from pathlib import Path


def main():
    index_path = sys.argv[1] if len(sys.argv) > 1 else "filtered_index.json"
    dataset_dir = sys.argv[2] if len(sys.argv) > 2 else "/ephemeral/community_dataset_v3"

    with open(index_path) as f:
        index = json.load(f)

    # Build set of needed directories (contributor/dataset)
    needed_datasets = set()
    for ep in index["episodes"]:
        needed_datasets.add(ep["dataset"])

    # Walk the dataset dir and find all contributor/dataset dirs
    dataset_root = Path(dataset_dir)
    deleted_bytes = 0
    deleted_dirs = 0
    for contributor_dir in sorted(dataset_root.iterdir()):
        if not contributor_dir.is_dir() or contributor_dir.name.startswith("."):
            continue
        for ds_dir in sorted(contributor_dir.iterdir()):
            if not ds_dir.is_dir():
                continue
            dataset_name = f"{contributor_dir.name}/{ds_dir.name}"
            if dataset_name not in needed_datasets:
                # Get size before deleting
                size = sum(f.stat().st_size for f in ds_dir.rglob("*") if f.is_file())
                shutil.rmtree(ds_dir)
                deleted_bytes += size
                deleted_dirs += 1
                if deleted_dirs % 50 == 0:
                    print(f" Deleted {deleted_dirs} datasets, freed {deleted_bytes / 1024**3:.1f}GB", flush=True)
        # Remove empty contributor dirs
        if contributor_dir.exists() and not any(contributor_dir.iterdir()):
            contributor_dir.rmdir()

    # Also delete the .cache dir
    cache_dir = dataset_root / ".cache"
    if cache_dir.exists():
        cache_size = sum(f.stat().st_size for f in cache_dir.rglob("*") if f.is_file())
        shutil.rmtree(cache_dir)
        deleted_bytes += cache_size
        print(f" Deleted .cache ({cache_size / 1024**3:.1f}GB)")

    print(f"\nDone: deleted {deleted_dirs} unused datasets, freed {deleted_bytes / 1024**3:.1f}GB")

if __name__ == "__main__":
    main()