#!/usr/bin/env bash
#
# Download the ScienceQA dataset: public parquet files from a Hugging Face
# dataset repo and/or the original image zip archives from S3.
#
# Usage:
#   bash download_scienceqa_hf.sh [parquet|images|all]
#
# Environment:
#   HF_ENDPOINT / HF_ENDPOINT_OVERRIDE  override the Hugging Face endpoint;
#                                       defaults to https://hf-mirror.com.
set -euo pipefail

MODE="${1:-all}"

readonly REPO_ID="derek-thomas/ScienceQA"
readonly ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
readonly HF_DIR="${ROOT_DIR}/hf"
readonly IMG_DIR="${ROOT_DIR}/images"
readonly CACHE_DIR="${ROOT_DIR}/.hf_cache"
readonly DEFAULT_ENDPOINT="https://hf-mirror.com"

# Endpoint resolution order: explicit HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE,
# then the mirror default.
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Proxies are unset deliberately: downloads must go direct to the endpoint.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"

# Prefer the new `hf` CLI; fall back to the legacy `huggingface-cli`.
# Stored as an array so the two-word command expands safely when invoked.
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi

#######################################
# Print usage information to stdout.
#######################################
print_help() {
  cat <<'EOF'
Usage: bash download_scienceqa_hf.sh [parquet|images|all]

Modes:
  parquet   Download the public Hugging Face parquet files only
  images    Download the original ScienceQA image zip files only
  all       Download both parquet files and images

Output layout:
  /workspace/xiaobin/RL_dataset/data/ScienceQA/hf
  /workspace/xiaobin/RL_dataset/data/ScienceQA/images

Notes:
  - This dataset is public and should not require an HF token.
  - Image URLs are adapted from:
    /workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh
  - Proxies are unset before download.
  - Default HF endpoint: https://hf-mirror.com
  - To override and use the official endpoint:
    HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet
EOF
}

if [[ "${MODE}" == "-h" || "${MODE}" == "--help" || "${MODE}" == "help" ]]; then
  print_help
  exit 0
fi

#######################################
# Abort unless at least one file matches the given glob pattern.
# Arguments: $1 - glob pattern (expanded by compgen, not the caller)
#######################################
verify_glob() {
  local pattern="$1"
  if ! compgen -G "${pattern}" >/dev/null; then
    echo "Missing expected file matching: ${pattern}" >&2
    exit 1
  fi
}

#######################################
# Download parquet splits plus repo metadata, then verify all three splits.
# NOTE: all patterns must follow a single --include flag. The CLI defines
# --include with argparse nargs="*", so a repeated --include flag overwrites
# earlier values and only the last pattern would be kept.
#######################################
download_parquet() {
  "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --cache-dir "${CACHE_DIR}" \
    --local-dir "${HF_DIR}" \
    --include "data/*.parquet" "README.md" "ScienceQA.py"
  verify_glob "${HF_DIR}/data/train-*.parquet"
  verify_glob "${HF_DIR}/data/validation-*.parquet"
  verify_glob "${HF_DIR}/data/test-*.parquet"
}

#######################################
# Download and extract one image split (train/val/test) from S3.
# Skips the download when the extracted directory already exists.
# Arguments: $1 - split name
#######################################
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi
  # -c resumes a partial download into the -O target file.
  wget -c -O "${zip_path}" "${url}"
  unzip -q -o "${zip_path}" -d "${IMG_DIR}"
  rm -f "${zip_path}"
  if [[ ! -d "${split_dir}" ]]; then
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi
}

#######################################
# Download all three image splits. Fails fast if the required external
# tools (wget, unzip) are not installed.
#######################################
download_images() {
  local tool
  for tool in wget unzip; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      echo "Missing required tool: ${tool}" >&2
      exit 1
    fi
  done
  download_one_split train
  download_one_split val
  download_one_split test
}

case "${MODE}" in
  parquet)
    download_parquet
    ;;
  images)
    download_images
    ;;
  all)
    download_parquet
    download_images
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

echo "Download completed."
echo "Parquet dir: ${HF_DIR}"
echo "Image dir:   ${IMG_DIR}"