Spaces:
Running on Zero
Running on Zero
| """Browse and download files from the BONES SEED Hugging Face dataset repository.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import logging | |
| import os | |
| from dataclasses import dataclass, asdict | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Iterable, Sequence | |
| from urllib.error import HTTPError, URLError | |
| from urllib.request import Request, urlopen | |
| from huggingface_hub import HfApi, get_token, hf_hub_download | |
| from huggingface_hub.errors import HfHubHTTPError | |
# Dataset repository used when the caller does not supply a repo id.
DEFAULT_REPO_ID = "bones-studio/seed"

# Hub repository type passed to listing/download calls by default.
DEFAULT_REPO_TYPE = "dataset"

# Space targeted by default for manifest uploads and log-endpoint checks.
DEFAULT_SPACE_ID = "lablab-ai-amd-developer-hackathon/movimento"

# Module-level logger shared by every function in this file.
LOGGER = logging.getLogger(__name__)
def _resolve_token(token: str | None = None) -> str | None:
    """Pick the Hugging Face token to use for a request.

    Lookup order: the explicit ``token`` argument, then the first non-empty
    environment variable among ``HUGGING_FACE_HUB_TOKEN``, ``HF_TOKEN`` and
    ``HF_API_TOKEN``, and finally whatever ``huggingface_hub.get_token()``
    returns (which may be ``None``).
    """
    LOGGER.info("bones_seed.resolve_token.start")

    # An explicit, non-empty argument always wins.
    if token:
        LOGGER.info("bones_seed.resolve_token.exit source=arg")
        return token

    # Next, take the first environment variable that holds a non-empty value.
    candidate_vars = ("HUGGING_FACE_HUB_TOKEN", "HF_TOKEN", "HF_API_TOKEN")
    populated = [name for name in candidate_vars if os.environ.get(name)]
    if populated:
        chosen = populated[0]
        LOGGER.info("bones_seed.resolve_token.exit source=env var=%s", chosen)
        return os.environ[chosen]

    # Fall back to the token lookup provided by huggingface_hub.
    fallback = get_token()
    LOGGER.info("bones_seed.resolve_token.exit source=cache found=%s", bool(fallback))
    return fallback
@dataclass
class DownloadManifest:
    """Record of one download run, serialized to ``manifest.json``.

    The ``@dataclass`` decorator is required: instances are created with
    keyword arguments and converted with ``dataclasses.asdict``, neither of
    which works on a plain class that only carries annotations.
    """

    # Repository the files were downloaded from.
    repo_id: str
    # Hub repository type that was used for the download.
    repo_type: str
    # Revision/branch that was requested, or None when not specified.
    revision: str | None
    # Directory the files were written to, stored as a string.
    local_dir: str
    # Local paths of the downloaded files, stored as strings.
    files: list[str]
    # Timestamp string recorded when the manifest was created.
    downloaded_at: str
@dataclass
class SpaceLogCheckResult:
    """Outcome of probing a Space's run and build log endpoints.

    The ``@dataclass`` decorator is required: instances are created with
    keyword arguments and converted with ``dataclasses.asdict``, neither of
    which works on a plain class that only carries annotations.
    """

    # Space whose log endpoints were probed.
    space_id: str
    # HTTP status observed for the run-logs endpoint (0 when no response was received).
    run_status_code: int
    # HTTP status observed for the build-logs endpoint (0 when no response was received).
    build_status_code: int
    # True when the run-logs endpoint answered with a 2xx status.
    run_ok: bool
    # True when the build-logs endpoint answered with a 2xx status.
    build_ok: bool
def list_repo_files(
    repo_id: str = DEFAULT_REPO_ID,
    *,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    token: str | None = None,
) -> list[str]:
    """List every file path in a Hugging Face repository, sorted ascending.

    Args:
        repo_id: Repository to inspect.
        repo_type: Hub repository type forwarded to the API call.
        revision: Optional revision or branch; ``None`` leaves it unspecified.
        token: Optional token override; resolved through ``_resolve_token``.

    Returns:
        The sorted list of file paths reported by the Hub.
    """
    LOGGER.info("bones_seed.list_repo_files.start repo_id=%s revision=%s", repo_id, revision)
    client = HfApi(token=_resolve_token(token))
    listing = list(client.list_repo_files(repo_id=repo_id, repo_type=repo_type, revision=revision))
    listing.sort()
    LOGGER.info("bones_seed.list_repo_files.exit count=%s", len(listing))
    return listing
def download_repo_files(
    filenames: Sequence[str],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Fetch the named files from a Hugging Face repository into ``local_dir``.

    Args:
        filenames: Exact file paths inside the repository.
        repo_id: Repository to download from.
        repo_type: Hub repository type forwarded to ``hf_hub_download``.
        revision: Optional revision or branch.
        local_dir: Destination directory; created (with parents) if missing.
        token: Optional token override; resolved through ``_resolve_token``.

    Returns:
        Local paths of the downloaded files, in the order of ``filenames``.
    """
    LOGGER.info("bones_seed.download_repo_files.start repo_id=%s files=%s", repo_id, len(filenames))
    auth_token = _resolve_token(token)
    destination = Path(local_dir)
    destination.mkdir(parents=True, exist_ok=True)

    # One hf_hub_download call per file, performed sequentially in input order.
    fetched: list[Path] = [
        Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=entry,
                repo_type=repo_type,
                revision=revision,
                token=auth_token,
                local_dir=destination,
            )
        )
        for entry in filenames
    ]

    LOGGER.info("bones_seed.download_repo_files.exit downloaded=%s", len(fetched))
    return fetched
def download_by_prefix(
    prefix: str,
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Download every repository file whose path starts with ``prefix``.

    The repository listing is fetched first and filtered with
    ``str.startswith``; the matches are then handed to ``download_repo_files``.

    Raises:
        ValueError: If no file in the listing starts with ``prefix``.
    """
    LOGGER.info("bones_seed.download_by_prefix.start prefix=%s", prefix)

    listing = list_repo_files(repo_id, repo_type=repo_type, revision=revision, token=token)
    matches = list(filter(lambda candidate: candidate.startswith(prefix), listing))
    if len(matches) == 0:
        raise ValueError(f"No files matched prefix '{prefix}' in {repo_id}.")

    fetched = download_repo_files(
        matches,
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=local_dir,
        token=token,
    )
    LOGGER.info("bones_seed.download_by_prefix.exit matched=%s", len(fetched))
    return fetched
def write_manifest(
    local_dir: str | Path,
    files: Iterable[Path],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
) -> Path:
    """Write ``manifest.json`` describing a download into ``local_dir``.

    Args:
        local_dir: Directory that receives ``manifest.json``; created (with
            parents) when it does not exist yet.
        files: Paths of the downloaded files; each is stored as a string.
        repo_id: Repository the files came from.
        repo_type: Hub repository type used for the download.
        revision: Revision or branch that was requested, if any.

    Returns:
        Path of the written manifest file.
    """
    LOGGER.info("bones_seed.write_manifest.start local_dir=%s", local_dir)
    output_dir = Path(local_dir)
    # Make sure the target directory exists so the function also works when it
    # is called on its own, without a preceding download having created it.
    output_dir.mkdir(parents=True, exist_ok=True)

    manifest = DownloadManifest(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=str(output_dir),
        files=[str(path) for path in files],
        # Timezone-aware UTC timestamp in ISO 8601 form.
        downloaded_at=datetime.now(timezone.utc).isoformat(),
    )

    manifest_path = output_dir / "manifest.json"
    payload = json.dumps(asdict(manifest), indent=2, sort_keys=True) + "\n"
    manifest_path.write_text(payload, encoding="utf-8")
    LOGGER.info("bones_seed.write_manifest.exit path=%s", manifest_path)
    return manifest_path
def upload_manifest_to_space(
    manifest_path: str | Path,
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    path_in_repo: str = "data/bones_seed/manifest.json",
    commit_message: str = "Update BONES-SEED ingestion manifest",
    create_pr: bool = True,
) -> str:
    """Upload a manifest file to a path inside a Space repository.

    A direct commit is attempted first. If that fails with an
    ``HfHubHTTPError`` whose text contains ``create_pr=1`` and ``create_pr``
    is true, the upload is retried as a pull request. Any other failure is
    re-raised unchanged.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
        HfHubHTTPError: If the upload fails and the PR fallback does not apply.
    """
    LOGGER.info("bones_seed.upload_manifest_to_space.start space_id=%s", space_id)
    source = Path(manifest_path)
    if not source.exists():
        raise FileNotFoundError(f"Manifest file does not exist: {source}")

    client = HfApi(token=_resolve_token(token))
    # Arguments shared by the direct attempt and the pull-request retry.
    upload_kwargs = {
        "path_or_fileobj": str(source),
        "path_in_repo": path_in_repo,
        "repo_id": space_id,
        "repo_type": "space",
        "commit_message": commit_message,
    }

    try:
        outcome = client.upload_file(create_pr=False, **upload_kwargs)
        LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=direct")
        return outcome
    except HfHubHTTPError as exc:
        # Retry as a PR only when allowed and the error text mentions create_pr=1.
        pr_fallback_applies = create_pr and "create_pr=1" in str(exc)
        if not pr_fallback_applies:
            raise
        outcome = client.upload_file(create_pr=True, **upload_kwargs)
        LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=create_pr")
        return outcome
def _check_logs_endpoint(url: str, token: str | None, timeout_sec: float) -> tuple[int, bool]:
    """Issue a GET against ``url`` and report the status code and success flag.

    Args:
        url: Endpoint to probe.
        token: Optional token override; when a token resolves, it is sent as a
            ``Bearer`` Authorization header.
        timeout_sec: Timeout handed to ``urlopen``.

    Returns:
        ``(status, ok)`` where ``ok`` is true only for a 2xx status. HTTP error
        responses yield ``(code, False)``; failures that produce no HTTP
        response (DNS, refused connection, timeout, reset) yield ``(0, False)``.
    """
    LOGGER.info("bones_seed.check_logs_endpoint.start url=%s", url)
    headers: dict[str, str] = {}
    resolved = _resolve_token(token)
    if resolved:
        headers["Authorization"] = f"Bearer {resolved}"
    request = Request(url=url, headers=headers, method="GET")

    try:
        with urlopen(request, timeout=timeout_sec) as response:
            status = int(getattr(response, "status", 0))
    except HTTPError as exc:
        # Must be handled before the broader OSError clause: HTTPError is a
        # URLError subclass and carries the server's status code.
        LOGGER.warning("bones_seed.check_logs_endpoint.http_error status=%s", exc.code)
        return int(exc.code), False
    except OSError:
        # URLError derives from OSError. Catching the base class also covers
        # socket timeouts and connection resets that urlopen can raise without
        # wrapping them in URLError, so they no longer escape as a traceback.
        LOGGER.warning("bones_seed.check_logs_endpoint.network_error")
        return 0, False

    LOGGER.info("bones_seed.check_logs_endpoint.exit status=%s", status)
    return status, 200 <= status < 300
def verify_space_logs(
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    timeout_sec: float = 10.0,
) -> SpaceLogCheckResult:
    """Probe the run and build log endpoints of a Space.

    Both endpoints are checked (run first, then build) through
    ``_check_logs_endpoint`` and the observed status codes and success flags
    are returned together.
    """
    LOGGER.info("bones_seed.verify_space_logs.start space_id=%s", space_id)
    logs_root = f"https://huggingface.co/api/spaces/{space_id}/logs"

    # Dict comprehension keeps the probe order: "run" before "build".
    probes = {
        kind: _check_logs_endpoint(f"{logs_root}/{kind}", token, timeout_sec)
        for kind in ("run", "build")
    }
    run_code, run_healthy = probes["run"]
    build_code, build_healthy = probes["build"]

    outcome = SpaceLogCheckResult(
        space_id=space_id,
        run_status_code=run_code,
        build_status_code=build_code,
        run_ok=run_healthy,
        build_ok=build_healthy,
    )
    LOGGER.info("bones_seed.verify_space_logs.exit run_ok=%s build_ok=%s", run_healthy, build_healthy)
    return outcome
def build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for this tool.

    Arguments are declared in a table and registered in order, so the
    resulting parser (positionals, options, defaults, help texts) is the same
    as if each ``add_argument`` call were written out individually.
    """
    cli = argparse.ArgumentParser(description="Browse and download BONES SEED dataset files from Hugging Face.")

    # (flags, keyword options) pairs, in registration order.
    argument_table = (
        (
            ("command",),
            {
                "choices": ("list", "download", "prefix", "verify-logs"),
                "help": "List files, download selected files, download files by prefix, or verify Space log endpoints.",
            },
        ),
        (("files",), {"nargs": "*", "help": "Exact file paths inside the dataset repository."}),
        (("--repo-id",), {"default": DEFAULT_REPO_ID, "help": "Hugging Face dataset repository id."}),
        (("--repo-type",), {"default": DEFAULT_REPO_TYPE, "help": "Hugging Face repo type."}),
        (("--revision",), {"default": None, "help": "Optional repository revision or branch."}),
        (("--local-dir",), {"default": "bones_seed", "help": "Directory where files will be stored."}),
        (("--token",), {"default": None, "help": "Hugging Face token override."}),
        (("--prefix",), {"default": None, "help": "File prefix to match when using the prefix command."}),
        (("--manifest",), {"action": "store_true", "help": "Write a manifest.json after download."}),
        (
            ("--space-id",),
            {"default": DEFAULT_SPACE_ID, "help": "Target Space id for manifest publish or logs checks."},
        ),
        (
            ("--space-manifest-path",),
            {
                "default": "data/bones_seed/manifest.json",
                "help": "Path inside target Space repo where manifest will be uploaded.",
            },
        ),
        (
            ("--publish-manifest-to-space",),
            {
                "action": "store_true",
                "help": "Upload generated manifest to the Space repo destination.",
            },
        ),
        (
            ("--space-upload-create-pr",),
            {
                "action": "store_true",
                "help": "Force upload as a PR in target Space repo when direct commits are forbidden.",
            },
        ),
        (
            ("--logs-timeout-sec",),
            {
                "type": float,
                "default": 10.0,
                "help": "Timeout for log endpoint verification requests.",
            },
        ),
    )
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)
    return cli
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code: ``0`` on success, ``2`` when ``verify-logs`` finds
        that either log endpoint is not OK.

    Raises:
        SystemExit: For invalid argument combinations. All such checks run
            before any download starts, so a bad invocation fails fast instead
            of after the files have already been transferred.
    """
    LOGGER.info("bones_seed.main.start")
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command == "list":
        try:
            for name in list_repo_files(
                args.repo_id, repo_type=args.repo_type, revision=args.revision, token=args.token
            ):
                print(name)
        except BrokenPipeError:
            # Output was piped into a consumer that closed early (e.g. `head`).
            LOGGER.info("bones_seed.main.exit broken_pipe")
            return 0
        LOGGER.info("bones_seed.main.exit command=list")
        return 0

    if args.command == "verify-logs":
        result = verify_space_logs(space_id=args.space_id, token=args.token, timeout_sec=args.logs_timeout_sec)
        print(json.dumps(asdict(result), indent=2, sort_keys=True))
        LOGGER.info("bones_seed.main.exit command=verify-logs")
        return 0 if (result.run_ok and result.build_ok) else 2

    # Only "download" and "prefix" remain. Validate every flag combination up
    # front so nothing is downloaded for an invocation that is going to fail.
    if args.command == "download" and not args.files:
        raise SystemExit("download requires at least one file path")
    if args.command == "prefix" and not args.prefix:
        raise SystemExit("prefix requires --prefix")
    if args.publish_manifest_to_space and not args.manifest:
        raise SystemExit("--publish-manifest-to-space requires --manifest")

    if args.command == "download":
        downloaded = download_repo_files(
            args.files,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )
    else:
        downloaded = download_by_prefix(
            args.prefix,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )

    for path in downloaded:
        print(path)

    if args.manifest:
        manifest_path = write_manifest(
            args.local_dir,
            downloaded,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
        )
        print(manifest_path)
        if args.publish_manifest_to_space:
            uploaded = upload_manifest_to_space(
                manifest_path,
                space_id=args.space_id,
                token=args.token,
                path_in_repo=args.space_manifest_path,
                create_pr=args.space_upload_create_pr,
            )
            print(uploaded)

    LOGGER.info("bones_seed.main.exit command=%s", args.command)
    return 0
| if __name__ == "__main__": | |
| raise SystemExit(main()) |