#!/usr/bin/env bash
#
# Download the gated Hugging Face dataset ychenNLP/oven through a mirror
# endpoint into TARGET_DIR.
#
# Usage:
#   bash download_oven_hf_mirror.sh [meta|images|all]
#
# Environment:
#   HF_TOKEN              Access token. Never stored in this script; the
#                         Hugging Face CLI reads it from the environment.
#   HF_ENDPOINT           Endpoint used instead of the default mirror.
#   HF_ENDPOINT_OVERRIDE  Endpoint used when HF_ENDPOINT is unset or empty.
#
# Exit status:
#   0  success (or help printed)
#   1  unknown mode, missing CLI, failed download, or missing expected files
#   2  no authentication detected
set -euo pipefail

readonly REPO_ID="ychenNLP/oven"
readonly TARGET_DIR="/workspace/xiaobin/RL_dataset/data"
readonly CACHE_DIR="${TARGET_DIR}/.hf_cache"
readonly ASSETS_DIR="${TARGET_DIR}/.hf_assets"
readonly DEFAULT_ENDPOINT="https://hf-mirror.com"
readonly MIRROR_URL="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# SECURITY: an access token literal used to be committed in this file. It has
# been removed on purpose; treat the old value as leaked and revoke it.
# Credentials now come only from HF_TOKEN or from the CLI's stored login.

# Files that make up each download mode. The same lists drive both the
# download request and the post-download verification.
readonly -a META_FILES=(
  "download_infoseek_jsonl.sh"
  "download_oven_jsonl.sh"
  "ovenid2impath.csv"
)

readonly -a IMAGE_FILES=(
  "shard01.tar"
  "shard02.tar"
  "shard03.tar"
  "shard04.tar"
  "shard05.tar"
  "shard06.tar"
  "shard07.tar"
  "shard08.tar"
  "all_wikipedia_images.tar"
)

# Filled in by detect_cli: the download command and the matching "who am I"
# command of whichever Hugging Face CLI is installed.
HF_BIN=()
HF_WHOAMI=()

#######################################
# Print usage information to stdout.
#######################################
print_help() {
  cat <<'EOF'
Usage:
  bash download_oven_hf_mirror.sh [meta|images|all]

Modes:
  meta    Download metadata files only:
          - download_infoseek_jsonl.sh
          - download_oven_jsonl.sh
          - ovenid2impath.csv
  images  Download image tar files only:
          - shard01.tar ... shard08.tar
          - all_wikipedia_images.tar
  all     Download both metadata and image tar files

Behavior:
  - unsets proxy variables before downloading
  - uses the mirror endpoint: https://hf-mirror.com
  - endpoint can be overridden:
      HF_ENDPOINT=https://huggingface.co bash download_oven_hf_mirror.sh meta
  - stores downloaded files in: /workspace/xiaobin/RL_dataset/data
  - stores Hugging Face cache in: /workspace/xiaobin/RL_dataset/data/.hf_cache

Notes:
  - The dataset is gated. First accept access at:
      https://huggingface.co/datasets/ychenNLP/oven
  - No token is stored in this script. Authenticate with 'hf auth login',
    or export your token before running:
      export HF_TOKEN=hf_xxx
EOF
}

#######################################
# Locate the Hugging Face CLI, preferring `hf` over `huggingface-cli`.
# Globals:  HF_BIN, HF_WHOAMI (written)
# Exits:    1 when neither CLI is on PATH
#######################################
detect_cli() {
  if command -v hf >/dev/null 2>&1; then
    HF_BIN=(hf download)
    HF_WHOAMI=(hf auth whoami)
  elif command -v huggingface-cli >/dev/null 2>&1; then
    HF_BIN=(huggingface-cli download)
    HF_WHOAMI=(huggingface-cli whoami)
  else
    echo "Missing Hugging Face CLI. Install it with:" >&2
    echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
    exit 1
  fi
}

#######################################
# Ensure some form of authentication is available before downloading.
# Globals:  HF_TOKEN (read, exported), HF_WHOAMI (read)
# Exits:    2 when neither HF_TOKEN nor a stored login is detected
#######################################
require_auth() {
  if [[ -n "${HF_TOKEN:-}" ]]; then
    # Hand the token to the CLI through the environment instead of a
    # `--token` argument so it does not show up in the process list.
    export HF_TOKEN
    return 0
  fi

  # Use the whoami command of the CLI that was actually detected.
  # NOTE(review): this assumes whoami exits non-zero when not logged in;
  # confirm for the installed CLI version.
  if "${HF_WHOAMI[@]}" >/dev/null 2>&1; then
    return 0
  fi

  echo "No Hugging Face authentication detected." >&2
  echo "Do this first:" >&2
  echo "  1. Open https://huggingface.co/datasets/ychenNLP/oven and accept access." >&2
  echo "  2. Run: hf auth login" >&2
  echo "     or: export HF_TOKEN=hf_xxx" >&2
  exit 2
}

#######################################
# Run the given command; on failure print troubleshooting hints and exit 1.
# Arguments: the command and its arguments
#######################################
run_download() {
  if ! "$@"; then
    echo >&2
    echo "Download failed." >&2
    echo "Check these items:" >&2
    echo "  - access was approved for https://huggingface.co/datasets/ychenNLP/oven" >&2
    echo "  - HF_TOKEN is valid, or 'hf auth login' succeeded" >&2
    echo "  - the mirror endpoint is reachable: ${MIRROR_URL}" >&2
    exit 1
  fi
}

#######################################
# Check that every named file exists as a regular file under TARGET_DIR.
# Reports all missing files before exiting, not just the first one.
# Arguments: file names relative to TARGET_DIR
# Exits:     1 when at least one file is missing
#######################################
verify_files() {
  local missing=0
  local file

  for file in "$@"; do
    if [[ ! -f "${TARGET_DIR}/${file}" ]]; then
      echo "Missing expected file: ${TARGET_DIR}/${file}" >&2
      missing=1
    fi
  done

  if (( missing != 0 )); then
    echo >&2
    echo "Download did not complete successfully." >&2
    echo "This usually means one of these:" >&2
    echo "  - the mirror endpoint could not be reached" >&2
    echo "  - access to the gated dataset was not approved" >&2
    echo "  - authentication was missing or invalid" >&2
    exit 1
  fi
}

#######################################
# Download the named files from the dataset repo, then verify them.
# File names are passed as positional arguments to the CLI rather than as
# repeated --include flags, so one list drives request and verification.
# Arguments: file names inside the dataset repository
#######################################
download_files() {
  run_download "${HF_BIN[@]}" "${REPO_ID}" "$@" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}"
  verify_files "$@"
}

# Download the metadata files listed in META_FILES.
download_meta() {
  download_files "${META_FILES[@]}"
}

# Download the image archives listed in IMAGE_FILES.
# Only the listed archives are requested; a shard not named in IMAGE_FILES
# is not downloaded.
download_images() {
  download_files "${IMAGE_FILES[@]}"
}

#######################################
# Entry point.
# Arguments: $1 - mode: meta, images, all (default), or -h/--help/help
#######################################
main() {
  local mode="${1:-all}"

  # Handle help and reject bad modes before touching the filesystem,
  # the CLI, or the network.
  case "${mode}" in
    -h | --help | help)
      print_help
      exit 0
      ;;
    meta | images | all) ;;
    *)
      echo "Unknown mode: ${mode}" >&2
      print_help >&2
      exit 1
      ;;
  esac

  # Proxies are cleared so the configured endpoint is contacted directly.
  unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

  export HF_ENDPOINT="${MIRROR_URL}"
  export HF_HUB_CACHE="${CACHE_DIR}"
  export HF_ASSETS_CACHE="${ASSETS_DIR}"

  mkdir -p "${TARGET_DIR}" "${CACHE_DIR}" "${ASSETS_DIR}"

  detect_cli
  require_auth

  case "${mode}" in
    meta)
      download_meta
      ;;
    images)
      download_images
      ;;
    all)
      download_meta
      download_images
      ;;
  esac

  echo "Download completed. Files are under: ${TARGET_DIR}"
}

main "$@"