#!/usr/bin/env bash
# Run on a prep instance to: download the training subset, split-tar it,
# upload to HF.
#
# Usage:
#   HF_TOKEN=xxx bash prep_dataset.sh
#
# Required env:
#   HF_TOKEN - Hugging Face token with write access to $REPO_ID.

# -e: abort on error; -u: error on unset vars; pipefail: a failing tar stage
# in the tar|split pipeline must fail the script, not be masked by split.
set -euo pipefail

# Assert the token is set and non-empty before doing any work.
: "${HF_TOKEN:?ERROR: export HF_TOKEN first}"

readonly DATA_DIR="$HOME/data/community_dataset_v3"
readonly TAR_DIR="$HOME/data/tar_chunks"
readonly CHUNK_SIZE="2G"
readonly REPO_ID="StrongRoboticsLab/pi05-so100-diverse"
readonly REPO_DIR="$HOME/pi05-so100-diverse"

echo "=== Step 1: Clone project repo ==="
if [[ ! -d "$REPO_DIR" ]]; then
  git clone "https://huggingface.co/$REPO_ID" "$REPO_DIR"
fi
cd "$REPO_DIR" || { echo "ERROR: cannot cd to $REPO_DIR" >&2; exit 1; }

echo "=== Step 2: Install deps ==="
pip install -q huggingface_hub pandas

echo "=== Step 3: Download training subset ==="
# NOTE(review): passing the token on argv leaks it to `ps`; kept because the
# download_subset.py interface expects --token. Consider env-based auth there.
python download_subset.py \
  --index filtered_index.json \
  --output "$DATA_DIR" \
  --token "$HF_TOKEN"

echo "=== Step 4: Split-tar the dataset ==="
mkdir -p "$TAR_DIR"
echo "Tarring $(du -sh "$DATA_DIR" | cut -f1) into ${CHUNK_SIZE} chunks..."
# Stream tar straight into split so no single huge intermediate archive is
# written to disk; pipefail (set above) surfaces tar failures.
time tar cf - -C "$(dirname "$DATA_DIR")" "$(basename "$DATA_DIR")" \
  | split -b "$CHUNK_SIZE" -d -a 3 - "$TAR_DIR/training_subset.tar."

echo "Chunks created:"
ls -lh "$TAR_DIR"/training_subset.tar.*

echo "=== Step 5: Upload chunks to HF ==="
# Pass config through the environment and use a quoted heredoc so nothing is
# interpolated into the Python source: no quoting/injection problems, and the
# token never appears in the program text (visible via `ps`).
HF_TOKEN="$HF_TOKEN" TAR_DIR="$TAR_DIR" REPO_ID="$REPO_ID" python - <<'PYEOF'
import glob
import os

from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
tar_dir = os.environ["TAR_DIR"]
repo_id = os.environ["REPO_ID"]

api = HfApi(token=token)
chunks = sorted(glob.glob(os.path.join(tar_dir, 'training_subset.tar.*')))
print(f'Uploading {len(chunks)} chunks...')
for i, chunk in enumerate(chunks):
    name = os.path.basename(chunk)
    print(f' [{i+1}/{len(chunks)}] {name}')
    api.upload_file(
        path_or_fileobj=chunk,
        path_in_repo=f'dataset/{name}',
        repo_id=repo_id,
        repo_type='model',
    )
print('All chunks uploaded')
PYEOF

echo "=== Done! ==="
echo "Chunks are at dataset/training_subset.tar.* in the HF repo."