final / src /data /upload_to_hf.py
tintin1027's picture
Upload folder from local machine
ae510c3 verified
#!/usr/bin/env python3
"""
Upload a local folder to Hugging Face Hub.
Example:
    python src/data/upload_to_hf.py \
        --local-dir data/processed/scPerturb \
        --repo-id your-username/final-thesis-data \
        --repo-type dataset \
        --private \
        --create-repo
Auth / 401:
- Create a token at https://huggingface.co/settings/tokens with write access.
- export HF_TOKEN=hf_... or huggingface-cli login
- Use --create-repo on first push so the Hub repo exists (type must match --repo-type).
"""
from __future__ import annotations
import argparse
from pathlib import Path
from huggingface_hub import HfApi
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the Hub upload script.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so plain ``parse_args()`` calls behave as before. Passing a list
            lets tests and other callers drive the parser without mutating
            ``sys.argv``.

    Returns:
        Namespace with the attributes ``local_dir``, ``repo_id``,
        ``repo_type``, ``private``, ``create_repo``, ``token``,
        ``path_in_repo``, ``commit_message``, ``include`` and ``exclude``.
        ``include`` and ``exclude`` are ``None`` when the flag is absent.

    Raises:
        SystemExit: Raised by argparse for missing/invalid arguments
            (exit status 2) and after printing ``--help``.
    """
    parser = argparse.ArgumentParser(description="Upload a local folder to Hugging Face Hub.")
    parser.add_argument("--local-dir", required=True, help="Local folder to upload.")
    parser.add_argument("--repo-id", required=True, help="Hugging Face repo id, e.g. user/my-repo.")
    parser.add_argument(
        "--repo-type",
        default="dataset",
        choices=["model", "dataset", "space"],
        help="Hub repo type.",
    )
    parser.add_argument("--private", action="store_true", help="Create repo as private.")
    parser.add_argument(
        "--create-repo",
        action="store_true",
        help="Create repo if missing (safe to set true repeatedly).",
    )
    parser.add_argument(
        "--token",
        default=None,
        help="HF token. If omitted, uses HF_TOKEN env var / cached login.",
    )
    parser.add_argument(
        "--path-in-repo",
        default="",
        help="Target path inside repo root. Default uploads to repo root.",
    )
    parser.add_argument(
        "--commit-message",
        default="Upload folder from local machine",
        help="Commit message for upload.",
    )
    parser.add_argument(
        "--include",
        # At least one pattern is required: a bare ``--include`` would produce
        # an empty allow-list, and an allow-list that matches nothing makes the
        # upload silently skip every file (e.g. after an unquoted glob expanded
        # to nothing). Failing at parse time is the safer outcome.
        nargs="+",
        default=None,
        help='Optional allow patterns, e.g. --include "*.h5ad" "*.csv"',
    )
    parser.add_argument(
        "--exclude",
        # An empty ignore-list simply ignores nothing, so zero patterns stay legal.
        nargs="*",
        default=None,
        help='Optional ignore patterns, e.g. --exclude "*.pt" "__pycache__/*"',
    )
    return parser.parse_args(argv)
def _hub_repo_url(repo_id: str, repo_type: str) -> str:
if repo_type == "dataset":
return f"https://huggingface.co/datasets/{repo_id}"
if repo_type == "space":
return f"https://huggingface.co/spaces/{repo_id}"
return f"https://huggingface.co/{repo_id}"
def main() -> None:
    """Run the upload: validate inputs, optionally create the repo, push the folder.

    Steps, in order:
      1. Parse CLI options and resolve ``--local-dir`` to an absolute path.
      2. Check that ``--repo-id`` has the shape ``namespace/name``.
      3. When ``--create-repo`` is set, call ``create_repo`` with
         ``exist_ok=True``.
      4. Call ``upload_folder`` and print a short summary.

    Raises:
        FileNotFoundError: ``--local-dir`` is not an existing directory.
        ValueError: ``--repo-id`` is not exactly ``namespace/name``.
    """
    cli = parse_args()

    source_dir = Path(cli.local_dir).expanduser().resolve()
    # is_dir() is already False for a missing path, so one check covers both cases.
    if not source_dir.is_dir():
        raise FileNotFoundError(f"Local directory not found: {source_dir}")

    # Exactly one slash with non-empty text on both sides.
    owner, _, repo_name = cli.repo_id.partition("/")
    if not owner or not repo_name or "/" in repo_name:
        raise ValueError(
            f"repo_id must be exactly 'namespace/name' (one slash), got {cli.repo_id!r}. "
            "Do not put subpaths like 'user/repo/preupload' here; use --path-in-repo for folders inside the repo."
        )

    hub = HfApi(token=cli.token)

    if cli.create_repo:
        hub.create_repo(
            repo_id=cli.repo_id,
            repo_type=cli.repo_type,
            private=cli.private,
            exist_ok=True,
        )

    commit = hub.upload_folder(
        repo_id=cli.repo_id,
        repo_type=cli.repo_type,
        folder_path=str(source_dir),
        path_in_repo=cli.path_in_repo,
        commit_message=cli.commit_message,
        allow_patterns=cli.include,
        ignore_patterns=cli.exclude,
    )

    summary = (
        "Upload complete.",
        f"Local folder: {source_dir}",
        f"Repo: {_hub_repo_url(cli.repo_id, cli.repo_type)}",
        f"Commit URL: {commit}",
    )
    for line in summary:
        print(line)
# Start the upload only when the file is executed as a script; importing the
# module (e.g. to reuse parse_args) must not trigger any Hub traffic.
if __name__ == "__main__":
    main()