#!/bin/bash
# Download the filtered training subset, split-tar it into fixed-size
# chunks, and upload the chunks to the Hugging Face Hub.

set -euo pipefail

# Abort early if no Hub token is available (needed for the upload step).
# ${HF_TOKEN:-} keeps the check safe under `set -u` when the var is unset.
if [ -z "${HF_TOKEN:-}" ]; then echo "ERROR: export HF_TOKEN first"; exit 1; fi
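# Assumption: HF_TOKEN is a token with write access to the target repo,
# e.g. created at https://huggingface.co/settings/tokens and exported
# before running this script:
#
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx   # placeholder value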

DATA_DIR="$HOME/data/community_dataset_v3"       # downloaded subset lands here
TAR_DIR="$HOME/data/tar_chunks"                  # tar chunks are written here
CHUNK_SIZE="2G"                                  # per-chunk size passed to split
REPO_ID="StrongRoboticsLab/pi05-so100-diverse"   # target Hub repo
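
# Rough disk-space check (a sketch, assuming everything lives under $HOME):
# the tar chunks duplicate the downloaded subset on disk, so you want at
# least ~2x the dataset size free before Step 4.
df -h "$HOME"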

echo "=== Step 1: Clone project repo ==="
if [ ! -d "$HOME/pi05-so100-diverse" ]; then
    git clone "https://huggingface.co/$REPO_ID" "$HOME/pi05-so100-diverse"
fi
cd "$HOME/pi05-so100-diverse"
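
# If the repo is private, the anonymous clone above fails. One known
# workaround (assumption: your repo actually requires it) is to embed the
# token in the clone URL:
#
#   git clone "https://user:${HF_TOKEN}@huggingface.co/$REPO_ID" ...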

echo "=== Step 2: Install deps ==="
pip install -q huggingface_hub pandas

echo "=== Step 3: Download training subset ==="
# download_subset.py and filtered_index.json come from the repo cloned in
# Step 1 (we are inside it after the cd above).
python download_subset.py \
    --index filtered_index.json \
    --output "$DATA_DIR" \
    --token "$HF_TOKEN"
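
# Quick sanity check (a sketch; assumes download_subset.py writes regular
# files under $DATA_DIR):
echo "Downloaded $(find "$DATA_DIR" -type f | wc -l) files, $(du -sh "$DATA_DIR" | cut -f1) total"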

echo "=== Step 4: Split-tar the dataset ==="
mkdir -p "$TAR_DIR"
echo "Tarring $(du -sh "$DATA_DIR" | cut -f1) into ${CHUNK_SIZE} chunks..."
# Stream the archive straight into split so the full tar never hits disk;
# -d -a 3 gives numeric suffixes .000, .001, ...
time tar cf - -C "$(dirname "$DATA_DIR")" "$(basename "$DATA_DIR")" \
    | split -b "$CHUNK_SIZE" -d -a 3 - "$TAR_DIR/training_subset.tar."
echo "Chunks created:"
ls -lh "$TAR_DIR"/training_subset.tar.*
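
# To reconstruct on the consumer side, concatenate the chunks in order and
# untar (split's numeric suffixes sort lexically, so a glob is enough):
#
#   cat training_subset.tar.* | tar xf -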

echo "=== Step 5: Upload chunks to HF ==="
# Note: $HF_TOKEN, $TAR_DIR, and $REPO_ID below are expanded by the shell
# before Python runs, because the snippet is a double-quoted string.
python -c "
from huggingface_hub import HfApi
import glob

api = HfApi(token='$HF_TOKEN')
chunks = sorted(glob.glob('$TAR_DIR/training_subset.tar.*'))
print(f'Uploading {len(chunks)} chunks...')
for i, chunk in enumerate(chunks):
    name = chunk.split('/')[-1]
    print(f'  [{i+1}/{len(chunks)}] {name}')
    api.upload_file(
        path_or_fileobj=chunk,
        path_in_repo=f'dataset/{name}',
        repo_id='$REPO_ID',
        repo_type='model',  # chunks live in the model repo, under dataset/
    )
print('All chunks uploaded')
"

echo "=== Done! ==="
echo "Chunks are at dataset/training_subset.tar.* in the HF repo."