#!/usr/bin/env bash
#
# Download the gated OVEN dataset (ychenNLP/oven) with the Hugging Face CLI,
# by default through the https://hf-mirror.com endpoint.
#
# Usage:
#   bash download_oven_hf_mirror.sh [meta|images|all|help]
#
# Environment:
#   HF_TOKEN              Access token; passed to the CLI via --token when set.
#   HF_ENDPOINT           Hub endpoint; wins over every other endpoint setting.
#   HF_ENDPOINT_OVERRIDE  Endpoint used only when HF_ENDPOINT is unset/empty.
#   OVEN_DATA_DIR         Download directory (see TARGET_DIR for the default).
set -euo pipefail

# First positional argument selects what to download; everything by default.
MODE="${1:-all}"

readonly REPO_ID="ychenNLP/oven"
# The default keeps the historical location; OVEN_DATA_DIR relocates it.
readonly TARGET_DIR="${OVEN_DATA_DIR:-/workspace/xiaobin/RL_dataset/data}"
readonly CACHE_DIR="${TARGET_DIR}/.hf_cache"
readonly ASSETS_DIR="${TARGET_DIR}/.hf_assets"
readonly DEFAULT_ENDPOINT="https://hf-mirror.com"
# Precedence: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the mirror default.
readonly MIRROR_URL="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Files expected in TARGET_DIR after a "meta" download.
readonly -a META_FILES=(
  "download_infoseek_jsonl.sh"
  "download_oven_jsonl.sh"
  "ovenid2impath.csv"
)
# Files expected in TARGET_DIR after an "images" download.
readonly -a IMAGE_FILES=(
  "shard01.tar"
  "shard02.tar"
  "shard03.tar"
  "shard04.tar"
  "shard05.tar"
  "shard06.tar"
  "shard07.tar"
  "shard08.tar"
  "all_wikipedia_images.tar"
)

#######################################
# Print usage information to stdout.
# Globals:   MIRROR_URL, DEFAULT_ENDPOINT, TARGET_DIR, CACHE_DIR, REPO_ID (read)
# Arguments: none
# Outputs:   help text on stdout (callers redirect it to stderr on misuse)
#######################################
print_help() {
  # Unquoted delimiter on purpose: the text shows the effective settings.
  cat <<EOF
Usage:
  bash download_oven_hf_mirror.sh [meta|images|all]

Modes:
  meta    Download metadata files only:
            - download_infoseek_jsonl.sh
            - download_oven_jsonl.sh
            - ovenid2impath.csv
  images  Download image tar files only:
            - shard01.tar ... shard08.tar
            - all_wikipedia_images.tar
  all     Download both metadata and image tar files

Behavior:
  - unsets proxy variables before downloading
  - uses the endpoint: ${MIRROR_URL} (default: ${DEFAULT_ENDPOINT})
  - endpoint can be overridden:
      HF_ENDPOINT=https://huggingface.co bash download_oven_hf_mirror.sh meta
  - stores downloaded files in: ${TARGET_DIR}
      (override with OVEN_DATA_DIR=/some/dir)
  - stores Hugging Face cache in: ${CACHE_DIR}

Notes:
  - The dataset is gated. First accept access at:
      https://huggingface.co/datasets/${REPO_ID}
  - No access token is stored in this script. Authenticate with one of:
      export HF_TOKEN=hf_xxx
      hf auth login
EOF
}

# Answer help requests before any side effect (directory creation, CLI
# lookup), so that --help works on a machine that is not set up yet.
case "${MODE}" in
  -h | --help | help)
    print_help
    exit 0
    ;;
esac

# Drop every proxy setting so the CLI connects to the endpoint directly.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

# Point the Hugging Face tooling at the chosen endpoint and keep its cache
# and assets next to the downloaded data.
export HF_ENDPOINT="${MIRROR_URL}"
export HF_HUB_CACHE="${CACHE_DIR}"
export HF_ASSETS_CACHE="${ASSETS_DIR}"

mkdir -p "${TARGET_DIR}" "${CACHE_DIR}" "${ASSETS_DIR}"

# Prefer the current "hf" entry point; fall back to the legacy CLI name.
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo " python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi
readonly HF_BIN

# Extra CLI arguments carrying the token. Credentials come from the
# environment only; never commit an access token to this file. The array is
# empty when HF_TOKEN is unset, in which case no --token flag is passed.
TOKEN_ARGS=()
if [[ -n "${HF_TOKEN:-}" ]]; then
  TOKEN_ARGS=(--token "${HF_TOKEN}")
fi
readonly TOKEN_ARGS
|
|
#######################################
# Abort unless some Hugging Face credential is available.
# Accepts either a non-empty HF_TOKEN or a login reported by the installed
# CLI ("hf auth whoami", or "huggingface-cli whoami" when only the legacy
# CLI exists).
# Globals:   HF_TOKEN (read)
# Arguments: none
# Outputs:   instructions on stderr when no credential is found
# Returns:   0 when authenticated; otherwise exits the script with status 2
#######################################
require_auth() {
  local logged_in=0
  local whoami_out=""

  if [[ -n "${HF_TOKEN:-}" ]]; then
    return 0
  fi

  # Probe whichever CLI is installed; assigning inside "if" keeps a failing
  # probe from tripping "set -e".
  if command -v hf >/dev/null 2>&1; then
    if whoami_out="$(hf auth whoami 2>&1)"; then
      logged_in=1
    fi
  elif command -v huggingface-cli >/dev/null 2>&1; then
    if whoami_out="$(huggingface-cli whoami 2>&1)"; then
      logged_in=1
    fi
  fi

  # NOTE(review): some CLI versions appear to print "Not logged in" and still
  # exit 0, so the output is checked in addition to the exit status.
  if [[ "${logged_in}" -eq 1 && "${whoami_out}" != *"Not logged in"* ]]; then
    return 0
  fi

  echo "No Hugging Face authentication detected." >&2
  echo "Do this first:" >&2
  echo " 1. Open https://huggingface.co/datasets/ychenNLP/oven and accept access." >&2
  echo " 2. Run: hf auth login" >&2
  echo " or: export HF_TOKEN=hf_xxx" >&2
  exit 2
}
|
|
#######################################
# Run the given command; on failure print a troubleshooting checklist and
# terminate the script.
# Globals:   MIRROR_URL (read, shown in the checklist)
# Arguments: the command and its arguments, executed as "$@"
# Outputs:   checklist on stderr when the command fails
# Returns:   0 when the command succeeds; otherwise exits with status 1
#######################################
run_download() {
  # Success path first: nothing to report.
  if "$@"; then
    return 0
  fi

  # One printf emits the leading blank line and every checklist entry.
  printf '%s\n' \
    "" \
    "Download failed." \
    "Check these items:" \
    " - access was approved for https://huggingface.co/datasets/ychenNLP/oven" \
    " - HF_TOKEN is valid, or 'hf auth login' succeeded" \
    " - the mirror endpoint is reachable: ${MIRROR_URL}" >&2
  exit 1
}
|
|
#######################################
# Check that every named file exists as a regular file under TARGET_DIR.
# Globals:   TARGET_DIR (read)
# Arguments: file names relative to TARGET_DIR
# Outputs:   one line per missing file plus a summary, all on stderr
# Returns:   0 when nothing is missing; otherwise exits with status 1
#######################################
verify_files() {
  local name
  local absent_count=0

  for name in "$@"; do
    if [[ -f "${TARGET_DIR}/${name}" ]]; then
      continue
    fi
    printf '%s\n' "Missing expected file: ${TARGET_DIR}/${name}" >&2
    absent_count=$((absent_count + 1))
  done

  # Every file is present: nothing more to report.
  if ((absent_count == 0)); then
    return 0
  fi

  printf '%s\n' \
    "" \
    "Download did not complete successfully." \
    "This usually means one of these:" \
    " - the mirror endpoint could not be reached" \
    " - access to the gated dataset was not approved" \
    " - authentication was missing or invalid" >&2
  exit 1
}
|
|
#######################################
# Download the metadata files listed in META_FILES into TARGET_DIR, then
# confirm that each of them exists.
# Globals:   HF_BIN, REPO_ID, TARGET_DIR, META_FILES, TOKEN_ARGS (read)
# Arguments: none
# Returns:   status of verify_files
#######################################
download_meta() {
  local name
  local -a include_args=()

  # META_FILES is the single source of truth for both the --include filters
  # and the post-download verification.
  for name in "${META_FILES[@]}"; do
    include_args+=(--include "${name}")
  done

  # The ${arr[@]+...} form expands to nothing when TOKEN_ARGS is empty; a
  # plain "${TOKEN_ARGS[@]}" is an "unbound variable" error under "set -u"
  # on bash older than 4.4.
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    "${include_args[@]}" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${META_FILES[@]}"
}
|
|
#######################################
# Download the image archives into TARGET_DIR, then confirm that every file
# listed in IMAGE_FILES exists.
# Globals:   HF_BIN, REPO_ID, TARGET_DIR, IMAGE_FILES, TOKEN_ARGS (read)
# Arguments: none
# Returns:   status of verify_files
#######################################
download_images() {
  # "shard*.tar" is handed to the CLI as a pattern (quoted, so the shell
  # does not expand it). The ${arr[@]+...} form expands to nothing when
  # TOKEN_ARGS is empty; a plain "${TOKEN_ARGS[@]}" is an "unbound variable"
  # error under "set -u" on bash older than 4.4.
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    --include "all_wikipedia_images.tar" \
    --include "shard*.tar" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${IMAGE_FILES[@]}"
}
|
|
# Reject an unknown mode before the authentication probe, so a mistyped
# argument produces a usage error instead of an authentication error.
case "${MODE}" in
  meta | images | all) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

# Stops the script with status 2 when no credential is available.
require_auth

# MODE is known to be valid here, so no default arm is needed.
case "${MODE}" in
  meta)
    download_meta
    ;;
  images)
    download_images
    ;;
  all)
    download_meta
    download_images
    ;;
esac

echo "Download completed. Files are under: ${TARGET_DIR}"
|
|