| |
| """Delete all files from the dataset that aren't in filtered_index.json.""" |
|
|
| import json |
| import os |
| import shutil |
| import sys |
| from collections import defaultdict |
| from pathlib import Path |
|
|
|
|
def _tree_size(root: Path) -> int:
    """Return the total size in bytes of all regular files under *root*."""
    return sum(f.stat().st_size for f in root.rglob("*") if f.is_file())


def main() -> None:
    """Delete dataset directories that are not referenced by the filtered index.

    Command-line arguments (both optional, positional):
      argv[1]: path to the filtered index JSON (default: "filtered_index.json")
      argv[2]: dataset root directory (default: "/ephemeral/community_dataset_v3")

    The index is expected to look like
    ``{"episodes": [{"dataset": "<contributor>/<name>"}, ...]}``.
    Any ``<contributor>/<dataset>`` directory under the root that is not listed
    is removed, contributor directories left empty are pruned, and the shared
    ``.cache`` directory is always deleted.  Progress and freed space are
    printed as the script runs.
    """
    index_path = sys.argv[1] if len(sys.argv) > 1 else "filtered_index.json"
    dataset_dir = sys.argv[2] if len(sys.argv) > 2 else "/ephemeral/community_dataset_v3"

    # Explicit encoding: JSON index files are UTF-8 regardless of locale.
    with open(index_path, encoding="utf-8") as f:
        index = json.load(f)

    # Datasets are identified as "<contributor>/<dataset>" strings in the index.
    needed_datasets = {ep["dataset"] for ep in index["episodes"]}

    dataset_root = Path(dataset_dir)
    deleted_bytes = 0
    deleted_dirs = 0

    for contributor_dir in sorted(dataset_root.iterdir()):
        # Skip stray files and hidden entries (".cache" is handled below).
        if not contributor_dir.is_dir() or contributor_dir.name.startswith("."):
            continue

        for ds_dir in sorted(contributor_dir.iterdir()):
            if not ds_dir.is_dir():
                continue

            dataset_name = f"{contributor_dir.name}/{ds_dir.name}"
            if dataset_name not in needed_datasets:
                # Measure before deleting so freed space can be reported.
                size = _tree_size(ds_dir)
                shutil.rmtree(ds_dir)
                deleted_bytes += size
                deleted_dirs += 1
                if deleted_dirs % 50 == 0:
                    print(f" Deleted {deleted_dirs} datasets, freed {deleted_bytes / 1024**3:.1f}GB", flush=True)

        # Prune contributor directories left empty by the deletions above.
        if contributor_dir.exists() and not any(contributor_dir.iterdir()):
            contributor_dir.rmdir()

    # The cache is derived data; it is always safe to drop after filtering.
    cache_dir = dataset_root / ".cache"
    if cache_dir.exists():
        cache_size = _tree_size(cache_dir)
        shutil.rmtree(cache_dir)
        deleted_bytes += cache_size
        print(f" Deleted .cache ({cache_size / 1024**3:.1f}GB)")

    print(f"\nDone: deleted {deleted_dirs} unused datasets, freed {deleted_bytes / 1024**3:.1f}GB")
|
|
|
|
# Entry-point guard: run the cleanup only when executed as a script,
# never on import.
if __name__ == "__main__":
    main()
|
|