#!/usr/bin/env bash
#
# Download the ScienceQA dataset: public parquet files from a Hugging Face
# dataset repo and/or the original image zip archives from S3.
#
# Usage:
#   bash download_scienceqa_hf.sh [parquet|images|all]
#
# Environment:
#   HF_ENDPOINT / HF_ENDPOINT_OVERRIDE  override the Hugging Face endpoint;
#                                       defaults to https://hf-mirror.com.
set -euo pipefail

MODE="${1:-all}"

readonly REPO_ID="derek-thomas/ScienceQA"
readonly ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
readonly HF_DIR="${ROOT_DIR}/hf"
readonly IMG_DIR="${ROOT_DIR}/images"
readonly CACHE_DIR="${ROOT_DIR}/.hf_cache"
readonly DEFAULT_ENDPOINT="https://hf-mirror.com"

# Endpoint resolution order: explicit HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE,
# then the mirror default.
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Proxies are unset deliberately: downloads must go direct to the endpoint.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"

# Prefer the new `hf` CLI; fall back to the legacy `huggingface-cli`.
# Stored as an array so the two-word command expands safely when invoked.
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi

#######################################
# Print usage information to stdout.
#######################################
print_help() {
  cat <<'EOF'
Usage: bash download_scienceqa_hf.sh [parquet|images|all]

Modes:
  parquet   Download the public Hugging Face parquet files only
  images    Download the original ScienceQA image zip files only
  all       Download both parquet files and images

Output layout:
  /workspace/xiaobin/RL_dataset/data/ScienceQA/hf
  /workspace/xiaobin/RL_dataset/data/ScienceQA/images

Notes:
  - This dataset is public and should not require an HF token.
  - Image URLs are adapted from:
    /workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh
  - Proxies are unset before download.
  - Default HF endpoint: https://hf-mirror.com
  - To override and use the official endpoint:
    HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet
EOF
}

if [[ "${MODE}" == "-h" || "${MODE}" == "--help" || "${MODE}" == "help" ]]; then
  print_help
  exit 0
fi

#######################################
# Abort unless at least one file matches the given glob pattern.
# Arguments: $1 - glob pattern (expanded by compgen, not the caller)
#######################################
verify_glob() {
  local pattern="$1"
  if ! compgen -G "${pattern}" >/dev/null; then
    echo "Missing expected file matching: ${pattern}" >&2
    exit 1
  fi
}

#######################################
# Download parquet splits plus repo metadata, then verify all three splits.
# NOTE: all patterns must follow a single --include flag. The CLI defines
# --include with argparse nargs="*", so a repeated --include flag overwrites
# earlier values and only the last pattern would be kept.
#######################################
download_parquet() {
  "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --cache-dir "${CACHE_DIR}" \
    --local-dir "${HF_DIR}" \
    --include "data/*.parquet" "README.md" "ScienceQA.py"
  verify_glob "${HF_DIR}/data/train-*.parquet"
  verify_glob "${HF_DIR}/data/validation-*.parquet"
  verify_glob "${HF_DIR}/data/test-*.parquet"
}

#######################################
# Download and extract one image split (train/val/test) from S3.
# Skips the download when the extracted directory already exists.
# Arguments: $1 - split name
#######################################
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi
  # -c resumes a partial download into the -O target file.
  wget -c -O "${zip_path}" "${url}"
  unzip -q -o "${zip_path}" -d "${IMG_DIR}"
  rm -f "${zip_path}"
  if [[ ! -d "${split_dir}" ]]; then
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi
}

#######################################
# Download all three image splits. Fails fast if the required external
# tools (wget, unzip) are not installed.
#######################################
download_images() {
  local tool
  for tool in wget unzip; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      echo "Missing required tool: ${tool}" >&2
      exit 1
    fi
  done
  download_one_split train
  download_one_split val
  download_one_split test
}

case "${MODE}" in
  parquet)
    download_parquet
    ;;
  images)
    download_images
    ;;
  all)
    download_parquet
    download_images
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

echo "Download completed."
echo "Parquet dir: ${HF_DIR}"
echo "Image dir:   ${IMG_DIR}"