#!/usr/bin/env python3 """ Upload a local folder to Hugging Face Hub. Example: python src/data/upload_to_hf.py \ --local-dir data/processed/scPerturb \ --repo-id your-username/final-thesis-data \ --repo-type dataset \ --private \ --create-repo Auth / 401: - Create a token at https://huggingface.co/settings/tokens with write access. - export HF_TOKEN=hf_... or huggingface-cli login - Use --create-repo on first push so the Hub repo exists (type must match --repo-type). """ from __future__ import annotations import argparse from pathlib import Path from huggingface_hub import HfApi def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Upload a local folder to Hugging Face Hub.") parser.add_argument("--local-dir", required=True, help="Local folder to upload.") parser.add_argument("--repo-id", required=True, help="Hugging Face repo id, e.g. user/my-repo.") parser.add_argument( "--repo-type", default="dataset", choices=["model", "dataset", "space"], help="Hub repo type.", ) parser.add_argument("--private", action="store_true", help="Create repo as private.") parser.add_argument( "--create-repo", action="store_true", help="Create repo if missing (safe to set true repeatedly).", ) parser.add_argument( "--token", default=None, help="HF token. If omitted, uses HF_TOKEN env var / cached login.", ) parser.add_argument( "--path-in-repo", default="", help="Target path inside repo root. Default uploads to repo root.", ) parser.add_argument( "--commit-message", default="Upload folder from local machine", help="Commit message for upload.", ) parser.add_argument( "--include", nargs="*", default=None, help='Optional allow patterns, e.g. --include "*.h5ad" "*.csv"', ) parser.add_argument( "--exclude", nargs="*", default=None, help='Optional ignore patterns, e.g. --exclude "*.pt" "__pycache__/*"', ) return parser.parse_args() def _hub_repo_url(repo_id: str, repo_type: str) -> str: if repo_type == "dataset": return f"https://huggingface.co/datasets/{repo_id}" if repo_type == "space": return f"https://huggingface.co/spaces/{repo_id}" return f"https://huggingface.co/{repo_id}" def main() -> None: args = parse_args() local_dir = Path(args.local_dir).expanduser().resolve() if not local_dir.exists() or not local_dir.is_dir(): raise FileNotFoundError(f"Local directory not found: {local_dir}") parts = args.repo_id.split("/") if len(parts) != 2 or not all(parts): raise ValueError( f"repo_id must be exactly 'namespace/name' (one slash), got {args.repo_id!r}. " "Do not put subpaths like 'user/repo/preupload' here; use --path-in-repo for folders inside the repo." ) api = HfApi(token=args.token) if args.create_repo: api.create_repo( repo_id=args.repo_id, repo_type=args.repo_type, private=args.private, exist_ok=True, ) result = api.upload_folder( repo_id=args.repo_id, repo_type=args.repo_type, folder_path=str(local_dir), path_in_repo=args.path_in_repo, commit_message=args.commit_message, allow_patterns=args.include, ignore_patterns=args.exclude, ) print("Upload complete.") print(f"Local folder: {local_dir}") print(f"Repo: {_hub_repo_url(args.repo_id, args.repo_type)}") print(f"Commit URL: {result}") if __name__ == "__main__": main()