| |
| """ |
| Upload a local folder to Hugging Face Hub. |
| |
| Example: |
| python src/data/upload_to_hf.py \ |
| --local-dir data/processed/scPerturb \ |
| --repo-id your-username/final-thesis-data \ |
| --repo-type dataset \ |
| --private \ |
| --create-repo |
| |
| Auth / 401: |
| - Create a token at https://huggingface.co/settings/tokens with write access. |
| - export HF_TOKEN=hf_... or huggingface-cli login |
| - Use --create-repo on first push so the Hub repo exists (type must match --repo-type). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi |
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the upload script.

    Args:
        argv: Argument strings to parse instead of the process command line.
            ``None`` (the default) lets argparse read ``sys.argv[1:]``, which
            is what the script did before this parameter existed. Passing a
            list allows other code and tests to drive the parser directly.

    Returns:
        Namespace with the attributes ``local_dir``, ``repo_id``,
        ``repo_type``, ``private``, ``create_repo``, ``token``,
        ``path_in_repo``, ``commit_message``, ``include`` and ``exclude``.

    Raises:
        SystemExit: Raised by argparse (after printing usage) when a required
            option is missing or a value is not accepted.
    """
    parser = argparse.ArgumentParser(description="Upload a local folder to Hugging Face Hub.")

    # Required: what to upload and where to put it.
    parser.add_argument("--local-dir", required=True, help="Local folder to upload.")
    parser.add_argument("--repo-id", required=True, help="Hugging Face repo id, e.g. user/my-repo.")
    parser.add_argument(
        "--repo-type",
        default="dataset",
        choices=["model", "dataset", "space"],
        help="Hub repo type.",
    )

    # Repo creation and authentication.
    parser.add_argument("--private", action="store_true", help="Create repo as private.")
    parser.add_argument(
        "--create-repo",
        action="store_true",
        help="Create repo if missing (safe to set true repeatedly).",
    )
    parser.add_argument(
        "--token",
        default=None,
        help="HF token. If omitted, uses HF_TOKEN env var / cached login.",
    )

    # Commit placement and message.
    parser.add_argument(
        "--path-in-repo",
        default="",
        help="Target path inside repo root. Default uploads to repo root.",
    )
    parser.add_argument(
        "--commit-message",
        default="Upload folder from local machine",
        help="Commit message for upload.",
    )

    # File filters; both default to None, meaning "no filter given".
    parser.add_argument(
        "--include",
        nargs="*",
        default=None,
        help='Optional allow patterns, e.g. --include "*.h5ad" "*.csv"',
    )
    parser.add_argument(
        "--exclude",
        nargs="*",
        default=None,
        help='Optional ignore patterns, e.g. --exclude "*.pt" "__pycache__/*"',
    )

    return parser.parse_args(argv)
|
|
|
|
| def _hub_repo_url(repo_id: str, repo_type: str) -> str: |
| if repo_type == "dataset": |
| return f"https://huggingface.co/datasets/{repo_id}" |
| if repo_type == "space": |
| return f"https://huggingface.co/spaces/{repo_id}" |
| return f"https://huggingface.co/{repo_id}" |
|
|
|
|
def main() -> None:
    """Upload the folder named on the command line to the Hugging Face Hub.

    Validates the local directory, the repo id and the include filter,
    optionally creates the repo, uploads the folder as one commit and prints
    a short summary. ``--private`` is only passed to ``create_repo``, so it
    has no effect unless ``--create-repo`` is also given.

    Raises:
        FileNotFoundError: ``--local-dir`` does not resolve to an existing
            directory.
        ValueError: ``--repo-id`` is not exactly ``namespace/name``, or
            ``--include`` was given without any pattern.
    """
    args = parse_args()

    local_dir = Path(args.local_dir).expanduser().resolve()
    # is_dir() is already False for a missing path, so one check covers both
    # "does not exist" and "exists but is not a directory".
    if not local_dir.is_dir():
        raise FileNotFoundError(f"Local directory not found: {local_dir}")

    parts = args.repo_id.split("/")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"repo_id must be exactly 'namespace/name' (one slash), got {args.repo_id!r}. "
            "Do not put subpaths like 'user/repo/preupload' here; use --path-in-repo for folders inside the repo."
        )

    # `--include` is declared with nargs="*", so a bare `--include` (for
    # example from an empty shell variable) yields []. Per huggingface_hub's
    # pattern filtering an empty allow-list matches no files, so the upload
    # would send nothing and still report success. Fail before any network
    # call instead of silently doing nothing.
    if args.include is not None and not args.include:
        raise ValueError(
            "--include was given without any patterns; "
            "pass at least one pattern or omit the option to upload everything."
        )

    api = HfApi(token=args.token)

    if args.create_repo:
        # exist_ok=True makes repeated runs with --create-repo harmless.
        api.create_repo(
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            private=args.private,
            exist_ok=True,
        )

    result = api.upload_folder(
        repo_id=args.repo_id,
        repo_type=args.repo_type,
        folder_path=str(local_dir),
        path_in_repo=args.path_in_repo,
        commit_message=args.commit_message,
        allow_patterns=args.include,
        ignore_patterns=args.exclude,
    )

    print("Upload complete.")
    print(f"Local folder: {local_dir}")
    print(f"Repo: {_hub_repo_url(args.repo_id, args.repo_type)}")
    print(f"Commit URL: {result}")
|
|
|
|
# Start the upload only when this file is executed as a script; importing
# the module must not trigger an upload.
if __name__ == "__main__":
    main()
|
|