#!/usr/bin/env bash
# Run on a prep instance to: download the training subset, split-tar it,
# upload to HF.
#
# Usage:
#   HF_TOKEN=xxx bash prep_dataset.sh
#
# Required env:
#   HF_TOKEN - Hugging Face token with write access to $REPO_ID.

# -e: abort on error; -u: error on unset vars; pipefail: a failing tar stage
# in the tar|split pipeline must fail the script, not be masked by split.
set -euo pipefail

# Assert the token is set and non-empty before doing any work.
: "${HF_TOKEN:?ERROR: export HF_TOKEN first}"

readonly DATA_DIR="$HOME/data/community_dataset_v3"
readonly TAR_DIR="$HOME/data/tar_chunks"
readonly CHUNK_SIZE="2G"
readonly REPO_ID="StrongRoboticsLab/pi05-so100-diverse"
readonly REPO_DIR="$HOME/pi05-so100-diverse"

echo "=== Step 1: Clone project repo ==="
if [[ ! -d "$REPO_DIR" ]]; then
  git clone "https://huggingface.co/$REPO_ID" "$REPO_DIR"
fi
cd "$REPO_DIR" || { echo "ERROR: cannot cd to $REPO_DIR" >&2; exit 1; }

echo "=== Step 2: Install deps ==="
pip install -q huggingface_hub pandas

echo "=== Step 3: Download training subset ==="
# NOTE(review): passing the token on argv leaks it to `ps`; kept because the
# download_subset.py interface expects --token. Consider env-based auth there.
python download_subset.py \
  --index filtered_index.json \
  --output "$DATA_DIR" \
  --token "$HF_TOKEN"

echo "=== Step 4: Split-tar the dataset ==="
mkdir -p "$TAR_DIR"
echo "Tarring $(du -sh "$DATA_DIR" | cut -f1) into ${CHUNK_SIZE} chunks..."
# Stream tar straight into split so no single huge intermediate archive is
# written to disk; pipefail (set above) surfaces tar failures.
time tar cf - -C "$(dirname "$DATA_DIR")" "$(basename "$DATA_DIR")" \
  | split -b "$CHUNK_SIZE" -d -a 3 - "$TAR_DIR/training_subset.tar."

echo "Chunks created:"
ls -lh "$TAR_DIR"/training_subset.tar.*

echo "=== Step 5: Upload chunks to HF ==="
# Pass config through the environment and use a quoted heredoc so nothing is
# interpolated into the Python source: no quoting/injection problems, and the
# token never appears in the program text (visible via `ps`).
HF_TOKEN="$HF_TOKEN" TAR_DIR="$TAR_DIR" REPO_ID="$REPO_ID" python - <<'PYEOF'
import glob
import os

from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
tar_dir = os.environ["TAR_DIR"]
repo_id = os.environ["REPO_ID"]

api = HfApi(token=token)
chunks = sorted(glob.glob(os.path.join(tar_dir, 'training_subset.tar.*')))
print(f'Uploading {len(chunks)} chunks...')
for i, chunk in enumerate(chunks):
    name = os.path.basename(chunk)
    print(f' [{i+1}/{len(chunks)}] {name}')
    api.upload_file(
        path_or_fileobj=chunk,
        path_in_repo=f'dataset/{name}',
        repo_id=repo_id,
        repo_type='model',
    )
print('All chunks uploaded')
PYEOF

echo "=== Done! ==="
echo "Chunks are at dataset/training_subset.tar.* in the HF repo."