#!/usr/bin/env bash
#
# Download the full community_dataset_v3 using hfd (aria2c-based, resolver-only,
# no API rate limit issues).
#
# Required environment:
#   HF_TOKEN     Hugging Face access token, forwarded to hfd via --hf_token.
# Optional environment:
#   DATASET_DIR  Destination directory
#                (default: /ephemeral/community_dataset_v3).
#   HFD_THREADS  Threads per file, passed to hfd as -x (default: 4).
#   HFD_JOBS     Concurrent downloads, passed to hfd as -j (default: 5).
#
# Exits non-zero if HF_TOKEN is missing or if any step (installing hfd,
# running the download) fails.
set -euo pipefail

# ${HF_TOKEN:-} keeps this check working under `set -u` when the variable is
# entirely unset, so the user sees the intended message.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "ERROR: export HF_TOKEN first" >&2
  exit 1
fi

DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
HFD_THREADS="${HFD_THREADS:-4}"
HFD_JOBS="${HFD_JOBS:-5}"

readonly HFD_BIN=/usr/local/bin/hfd
readonly HFD_URL=https://gist.githubusercontent.com/padeoe/697678ab8e528b85a2a7bddafea1fa4f/raw/hfd.sh

# Install hfd if not present. Also reinstall when the file exists but is empty
# or not executable, which is what an interrupted earlier install leaves behind.
if [[ ! -x "$HFD_BIN" || ! -s "$HFD_BIN" ]]; then
  # Fetch into a temp file first: `wget -O` creates its output file even when
  # the transfer fails, so writing directly to $HFD_BIN could leave a broken
  # script at the final path.
  hfd_tmp="$(mktemp)"
  trap 'rm -f -- "$hfd_tmp"' EXIT
  wget -q "$HFD_URL" -O "$hfd_tmp"
  if [[ ! -s "$hfd_tmp" ]]; then
    echo "ERROR: downloaded hfd script is empty" >&2
    exit 1
  fi
  install -m 0755 -- "$hfd_tmp" "$HFD_BIN"
fi

echo "Downloading dataset to $DATASET_DIR..."
echo "Using aria2c with $HFD_THREADS threads per file, $HFD_JOBS concurrent downloads"

# Run hfd by absolute path so the script does not rely on /usr/local/bin being
# on PATH.
# NOTE: the token is passed as a command-line argument, so it is visible in the
# process list while hfd runs.
"$HFD_BIN" HuggingFaceVLA/community_dataset_v3 \
  --dataset \
  --hf_token "$HF_TOKEN" \
  --tool aria2c \
  -x "$HFD_THREADS" \
  -j "$HFD_JOBS" \
  --local-dir "$DATASET_DIR"

echo "Download complete!"