# pi05-so100-diverse / prep_dataset.sh
# (from commit a8eb6e5: "Update lerobot to latest with SO100 rename_map fix")
#!/bin/bash
# Run on a prep instance to: download the training subset, split-tar it, upload to HF.
# Usage: HF_TOKEN=xxx bash prep_dataset.sh
#
# -u catches typoed/unset variables; pipefail makes the tar|split pipeline
# below fail if the tar stage fails (plain -e only sees split's status).
set -euo pipefail

# Fail fast if the HF token is missing. ${HF_TOKEN:-} keeps the check safe
# under set -u; the diagnostic goes to stderr.
if [ -z "${HF_TOKEN:-}" ]; then echo "ERROR: export HF_TOKEN first" >&2; exit 1; fi

# Constants: download target, tar-chunk output dir, chunk size, and HF repo.
readonly DATA_DIR="$HOME/data/community_dataset_v3"
readonly TAR_DIR="$HOME/data/tar_chunks"
readonly CHUNK_SIZE="2G"
readonly REPO_ID="StrongRoboticsLab/pi05-so100-diverse"
echo "=== Step 1: Clone project repo ==="
# Idempotent: skip the clone when the checkout already exists.
# All paths/URL expansions are quoted so a $HOME containing spaces can't word-split.
if [ ! -d "$HOME/pi05-so100-diverse" ]; then
  git clone "https://huggingface.co/$REPO_ID" "$HOME/pi05-so100-diverse"
fi
# Explicit cd check: clearer diagnostic than relying on set -e alone.
cd "$HOME/pi05-so100-diverse" || { echo "ERROR: cannot cd to $HOME/pi05-so100-diverse" >&2; exit 1; }
echo "=== Step 2: Install deps ==="
# Quiet install of the Python packages download_subset.py needs.
pip install -q huggingface_hub pandas

echo "=== Step 3: Download training subset ==="
# Fetch only the episodes listed in filtered_index.json into $DATA_DIR.
python download_subset.py --index filtered_index.json --output "$DATA_DIR" --token "$HF_TOKEN"
echo "=== Step 4: Split-tar the dataset ==="
mkdir -p "$TAR_DIR"
# $DATA_DIR is quoted inside every command substitution (du/dirname/basename)
# so a path containing spaces cannot word-split.
echo "Tarring $(du -sh "$DATA_DIR" | cut -f1) into ${CHUNK_SIZE} chunks..."
# Stream tar directly into split: no intermediate full-size tarball on disk.
# -C + basename keeps archive paths relative; -d -a 3 yields .000/.001/... suffixes.
time tar cf - -C "$(dirname "$DATA_DIR")" "$(basename "$DATA_DIR")" \
  | split -b "$CHUNK_SIZE" -d -a 3 - "$TAR_DIR/training_subset.tar."
echo "Chunks created:"
ls -lh "$TAR_DIR"/training_subset.tar.*
echo "=== Step 5: Upload chunks to HF ==="
# Pass the token and paths to Python via the environment instead of
# interpolating them into the source passed as an argv string:
#  - keeps the secret out of `ps` output (python -c "...token..." is visible to
#    every user on the box),
#  - avoids shell-quoting injection if a path ever contains a quote.
# HF_TOKEN is already exported by the caller; TAR_DIR/REPO_ID are forwarded here.
# The quoted heredoc delimiter ('PYEOF') prevents any shell expansion inside.
TAR_DIR="$TAR_DIR" REPO_ID="$REPO_ID" python - <<'PYEOF'
import glob
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
chunks = sorted(glob.glob(os.path.join(os.environ["TAR_DIR"], "training_subset.tar.*")))
print(f"Uploading {len(chunks)} chunks...")
for i, chunk in enumerate(chunks):
    name = os.path.basename(chunk)
    print(f" [{i+1}/{len(chunks)}] {name}")
    api.upload_file(
        path_or_fileobj=chunk,
        path_in_repo=f"dataset/{name}",
        repo_id=os.environ["REPO_ID"],
        repo_type="model",
    )
print("All chunks uploaded")
PYEOF
echo "=== Done! ==="
echo "Chunks are at dataset/training_subset.tar.* in the HF repo."