ICL / RL_dataset /download_oven_hf_mirror.sh
Lekr0's picture
Add files using upload-large-folder tool
90afcf2 verified
#!/usr/bin/env bash
#
# Download the gated Hugging Face dataset "ychenNLP/oven" into a fixed local
# directory, by default through the https://hf-mirror.com endpoint.
#
# Usage:
#   bash download_oven_hf_mirror.sh [meta|images|all]
#   bash download_oven_hf_mirror.sh -h|--help|help
#
# Environment:
#   HF_TOKEN              Access token, forwarded to the CLI via --token.
#   HF_ENDPOINT           Endpoint used instead of the default mirror.
#   HF_ENDPOINT_OVERRIDE  Consulted only when HF_ENDPOINT is unset or empty.
set -euo pipefail

# First positional argument selects what to download; defaults to everything.
MODE="${1:-all}"

REPO_ID="ychenNLP/oven"
TARGET_DIR="/workspace/xiaobin/RL_dataset/data"
CACHE_DIR="${TARGET_DIR}/.hf_cache"
ASSETS_DIR="${TARGET_DIR}/.hf_assets"
DEFAULT_ENDPOINT="https://hf-mirror.com"
# Precedence: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the default mirror.
MIRROR_URL="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Files that must exist under TARGET_DIR after a "meta" download.
META_FILES=(
  "download_infoseek_jsonl.sh"
  "download_oven_jsonl.sh"
  "ovenid2impath.csv"
)

# Files that must exist under TARGET_DIR after an "images" download.
IMAGE_FILES=(
  "shard01.tar"
  "shard02.tar"
  "shard03.tar"
  "shard04.tar"
  "shard05.tar"
  "shard06.tar"
  "shard07.tar"
  "shard08.tar"
  "all_wikipedia_images.tar"
)

#######################################
# Print the usage text to stdout.
# The here-document delimiter is quoted, so the text is emitted literally.
#######################################
print_help() {
  cat <<'EOF'
Usage:
bash download_oven_hf_mirror.sh [meta|images|all]
Modes:
meta Download metadata files only:
- download_infoseek_jsonl.sh
- download_oven_jsonl.sh
- ovenid2impath.csv
images Download image tar files only:
- shard01.tar ... shard08.tar
- all_wikipedia_images.tar
all Download both metadata and image tar files
Behavior:
- unsets proxy variables before downloading
- uses the mirror endpoint: https://hf-mirror.com
- endpoint can be overridden:
HF_ENDPOINT=https://huggingface.co bash download_oven_hf_mirror.sh meta
- stores downloaded files in: /workspace/xiaobin/RL_dataset/data
- stores Hugging Face cache in: /workspace/xiaobin/RL_dataset/data/.hf_cache
Notes:
- The dataset is gated. First accept access at:
https://huggingface.co/datasets/ychenNLP/oven
- No token is stored in this script. Authenticate with one of:
hf auth login
export HF_TOKEN=hf_xxx
EOF
}

# Answer help requests before anything with side effects runs, so that help
# works without the CLI installed and without creating directories.
case "${MODE}" in
  -h | --help | help)
    print_help
    exit 0
    ;;
esac

# Proxy settings are cleared so requests go straight to the chosen endpoint.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

export HF_ENDPOINT="${MIRROR_URL}"
export HF_HUB_CACHE="${CACHE_DIR}"
export HF_ASSETS_CACHE="${ASSETS_DIR}"

mkdir -p "${TARGET_DIR}" "${CACHE_DIR}" "${ASSETS_DIR}"

# Prefer the "hf" executable; fall back to "huggingface-cli" when it is absent.
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
else
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo " python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi

# Credentials must never be committed to this file. A token that used to be
# embedded here has to be treated as compromised and revoked by its owner.
# Without HF_TOKEN the CLI is invoked with no --token argument at all.
TOKEN_ARGS=()
if [[ -n "${HF_TOKEN:-}" ]]; then
  TOKEN_ARGS=(--token "${HF_TOKEN}")
fi
#######################################
# Ensure some Hugging Face credential is available before downloading.
# Globals:
#   HF_TOKEN (read) - a non-empty value is accepted without further checks
# Outputs:
#   Writes setup instructions to stderr when no credential is found.
# Returns:
#   0 when HF_TOKEN is set or the installed CLI reports a logged-in user;
#   otherwise the script exits with status 2.
#######################################
require_auth() {
  local whoami_ok=0
  local whoami_out=""

  if [[ -n "${HF_TOKEN:-}" ]]; then
    return 0
  fi

  # Ask whichever CLI is installed. "huggingface-cli" is probed only when
  # "hf" is missing, so a login made with the older executable still counts.
  if command -v hf >/dev/null 2>&1; then
    if whoami_out="$(hf auth whoami 2>&1)"; then
      whoami_ok=1
    fi
  elif command -v huggingface-cli >/dev/null 2>&1; then
    if whoami_out="$(huggingface-cli whoami 2>&1)"; then
      whoami_ok=1
    fi
  fi

  # NOTE(review): some CLI versions appear to print "Not logged in" while
  # still exiting 0 - confirm against the installed version. The text check
  # only ever rejects, so it is harmless if that never happens.
  if ((whoami_ok)) && [[ "${whoami_out}" != *"Not logged in"* ]]; then
    return 0
  fi

  echo "No Hugging Face authentication detected." >&2
  echo "Do this first:" >&2
  echo " 1. Open https://huggingface.co/datasets/ychenNLP/oven and accept access." >&2
  echo " 2. Run: hf auth login" >&2
  echo " or: export HF_TOKEN=hf_xxx" >&2
  exit 2
}
#######################################
# Execute the given command line; on failure, explain likely causes and abort.
# Globals:
#   MIRROR_URL (read) - shown in the troubleshooting hints
# Arguments:
#   The command and its arguments, run exactly as passed.
# Outputs:
#   Troubleshooting hints on stderr when the command fails.
# Returns:
#   0 when the command succeeds; otherwise the script exits with status 1.
#######################################
run_download() {
  if "$@"; then
    return 0
  fi

  # Everything in this group goes to stderr.
  {
    echo
    echo "Download failed."
    echo "Check these items:"
    echo " - access was approved for https://huggingface.co/datasets/ychenNLP/oven"
    echo " - HF_TOKEN is valid, or 'hf auth login' succeeded"
    echo " - the mirror endpoint is reachable: ${MIRROR_URL}"
  } >&2
  exit 1
}
#######################################
# Confirm that every named file exists as a regular file under TARGET_DIR.
# Globals:
#   TARGET_DIR (read) - directory the names are resolved against
# Arguments:
#   File names relative to TARGET_DIR.
# Outputs:
#   One stderr line per absent file, followed by a summary of likely causes.
# Returns:
#   0 when nothing is absent; otherwise the script exits with status 1.
#######################################
verify_files() {
  local name
  local absent_count=0

  # Report every absent file rather than stopping at the first one.
  for name in "$@"; do
    if [[ -f "${TARGET_DIR}/${name}" ]]; then
      continue
    fi
    echo "Missing expected file: ${TARGET_DIR}/${name}" >&2
    absent_count=$((absent_count + 1))
  done

  if [[ "${absent_count}" -eq 0 ]]; then
    return 0
  fi

  {
    echo
    echo "Download did not complete successfully."
    echo "This usually means one of these:"
    echo " - the mirror endpoint could not be reached"
    echo " - access to the gated dataset was not approved"
    echo " - authentication was missing or invalid"
  } >&2
  exit 1
}
#######################################
# Download the three metadata files and check that they arrived.
# Globals:
#   HF_BIN, REPO_ID, TARGET_DIR, TOKEN_ARGS, META_FILES (all read)
# Returns:
#   0 on success; run_download / verify_files exit the script on failure.
#######################################
download_meta() {
  # TOKEN_ARGS may legitimately be empty. The ${arr[@]+...} form expands to
  # nothing in that case instead of tripping "unbound variable" under
  # `set -u` on bash releases older than 4.4.
  # NOTE(review): older argparse-based `huggingface-cli download` may keep
  # only the last of several repeated --include flags - confirm against the
  # installed CLI.
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    --include "download_infoseek_jsonl.sh" \
    --include "download_oven_jsonl.sh" \
    --include "ovenid2impath.csv" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${META_FILES[@]}"
}
#######################################
# Download the image archives and check that the expected ones arrived.
# Globals:
#   HF_BIN, REPO_ID, TARGET_DIR, TOKEN_ARGS, IMAGE_FILES (all read)
# Returns:
#   0 on success; run_download / verify_files exit the script on failure.
#######################################
download_images() {
  # TOKEN_ARGS may legitimately be empty. The ${arr[@]+...} form expands to
  # nothing in that case instead of tripping "unbound variable" under
  # `set -u` on bash releases older than 4.4.
  # The shard pattern is quoted so it reaches the CLI unexpanded.
  # NOTE(review): older argparse-based `huggingface-cli download` may keep
  # only the last of several repeated --include flags - confirm against the
  # installed CLI.
  run_download "${HF_BIN[@]}" "${REPO_ID}" \
    --repo-type dataset \
    --local-dir "${TARGET_DIR}" \
    --include "all_wikipedia_images.tar" \
    --include "shard*.tar" \
    ${TOKEN_ARGS[@]+"${TOKEN_ARGS[@]}"}
  verify_files "${IMAGE_FILES[@]}"
}
# Entry point: validate the requested mode, then authenticate and download.

# Reject an unknown mode before the authentication probe, so a typo yields
# the usage text (exit 1) rather than an unrelated authentication error.
case "${MODE}" in
  meta | images | all) ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    print_help >&2
    exit 1
    ;;
esac

require_auth

case "${MODE}" in
  meta)
    download_meta
    ;;
  images)
    download_images
    ;;
  all)
    # Metadata first, then the image archives.
    download_meta
    download_images
    ;;
esac

echo "Download completed. Files are under: ${TARGET_DIR}"