movimento / kimodo /scripts /bones_seed.py
Kimodo Bot
Add core kimodo package modules required by native demo
6d5047c
"""Browse and download files from the BONES SEED Hugging Face dataset repository."""
from __future__ import annotations
import argparse
import json
import logging
import os
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, Sequence
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from huggingface_hub import HfApi, get_token, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
# Repository id used when a caller does not pass ``repo_id`` to the
# list/download helpers or ``--repo-id`` on the command line.
DEFAULT_REPO_ID = "bones-studio/seed"
# Repo type forwarded to the Hub API together with the repository id.
DEFAULT_REPO_TYPE = "dataset"
# Space targeted by default for manifest uploads and log-endpoint checks.
DEFAULT_SPACE_ID = "lablab-ai-amd-developer-hackathon/movimento"
# Module-level logger used for the start/exit trace messages in this module.
LOGGER = logging.getLogger(__name__)
def _resolve_token(token: str | None = None) -> str | None:
    """Pick the Hugging Face token to use for Hub requests.

    Precedence is: the explicit ``token`` argument, then the first non-empty
    environment variable among ``HUGGING_FACE_HUB_TOKEN``, ``HF_TOKEN`` and
    ``HF_API_TOKEN`` (checked in that order), then whatever
    ``huggingface_hub.get_token()`` returns.

    Args:
        token: Caller-supplied token; used as-is when truthy.

    Returns:
        The resolved token, or ``None`` when no source provides one.
    """
    LOGGER.info("bones_seed.resolve_token.start")
    if token:
        LOGGER.info("bones_seed.resolve_token.exit source=arg")
        return token

    candidate_vars = ("HUGGING_FACE_HUB_TOKEN", "HF_TOKEN", "HF_API_TOKEN")
    # First (name, value) pair whose value is set and non-empty, if any.
    env_hit = next(
        (
            (var_name, os.environ.get(var_name))
            for var_name in candidate_vars
            if os.environ.get(var_name)
        ),
        None,
    )
    if env_hit is not None:
        env_name, env_value = env_hit
        LOGGER.info("bones_seed.resolve_token.exit source=env var=%s", env_name)
        return env_value

    fallback = get_token()
    LOGGER.info("bones_seed.resolve_token.exit source=cache found=%s", bool(fallback))
    return fallback
@dataclass(frozen=True)
class DownloadManifest:
    """Immutable record of one download run, serialised to ``manifest.json``."""

    # Repository the files were fetched from.
    repo_id: str
    # Hub repo type that was passed to the download calls.
    repo_type: str
    # Revision that was requested, or None when none was given.
    revision: str | None
    # Directory the manifest refers to, stored as a string.
    local_dir: str
    # Downloaded file paths, stored as strings.
    files: list[str]
    # Timestamp string recorded when the manifest was built.
    downloaded_at: str
@dataclass(frozen=True)
class SpaceLogCheckResult:
    """Immutable outcome of probing a Space's run and build log endpoints."""

    # Space whose endpoints were probed.
    space_id: str
    # Status code recorded for the ``/run`` log endpoint.
    run_status_code: int
    # Status code recorded for the ``/build`` log endpoint.
    build_status_code: int
    # Whether the ``/run`` probe was considered successful.
    run_ok: bool
    # Whether the ``/build`` probe was considered successful.
    build_ok: bool
def list_repo_files(
    repo_id: str = DEFAULT_REPO_ID,
    *,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    token: str | None = None,
) -> list[str]:
    """List every file path in a Hugging Face repository, sorted.

    Args:
        repo_id: Repository to inspect.
        repo_type: Hub repo type forwarded to the API call.
        revision: Optional revision forwarded to the API call.
        token: Optional token override; resolved via ``_resolve_token``.

    Returns:
        The repository's file paths in ascending sorted order.
    """
    LOGGER.info("bones_seed.list_repo_files.start repo_id=%s revision=%s", repo_id, revision)
    client = HfApi(token=_resolve_token(token))
    names = list(
        client.list_repo_files(repo_id=repo_id, repo_type=repo_type, revision=revision)
    )
    names.sort()
    LOGGER.info("bones_seed.list_repo_files.exit count=%s", len(names))
    return names
def download_repo_files(
    filenames: Sequence[str],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Download selected files from a Hugging Face repository.

    Args:
        filenames: Exact file paths inside the repository. A single bare
            string is treated as one filename rather than being iterated
            character by character.
        repo_id: Repository to download from.
        repo_type: Hub repo type forwarded to ``hf_hub_download``.
        revision: Optional revision forwarded to ``hf_hub_download``.
        local_dir: Destination directory; created if it does not exist.
        token: Optional token override; resolved once via ``_resolve_token``.

    Returns:
        The local paths returned by ``hf_hub_download``, in input order.
    """
    if isinstance(filenames, str):
        # A str is itself a Sequence[str]; without this guard each character
        # would be requested as a separate file.
        filenames = [filenames]
    total = len(filenames)
    LOGGER.info("bones_seed.download_repo_files.start repo_id=%s files=%s", repo_id, total)
    resolved_token = _resolve_token(token)
    output_dir = Path(local_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    downloaded: list[Path] = []
    for index, filename in enumerate(filenames, start=1):
        # Each file is downloaded independently and logged as it completes,
        # so partial progress is visible in logs if a later file fails.
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type=repo_type,
            revision=revision,
            token=resolved_token,
            local_dir=output_dir,
        )
        downloaded.append(Path(local_path))
        LOGGER.info(
            "bones_seed.download_repo_files.file index=%s total=%s filename=%s",
            index,
            total,
            filename,
        )
    LOGGER.info("bones_seed.download_repo_files.exit downloaded=%s", len(downloaded))
    return downloaded
def download_by_prefix(
    prefix: str,
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Download every repository file whose path starts with ``prefix``.

    The repository listing is fetched with ``list_repo_files`` and the
    matching names are handed to ``download_repo_files``.

    Args:
        prefix: Leading path fragment that a file name must start with.
        repo_id: Repository to search and download from.
        repo_type: Hub repo type forwarded to both helpers.
        revision: Optional revision forwarded to both helpers.
        local_dir: Destination directory forwarded to ``download_repo_files``.
        token: Optional token override forwarded to both helpers.

    Returns:
        Local paths of the downloaded files.

    Raises:
        ValueError: If no file in the listing starts with ``prefix``.
    """
    LOGGER.info("bones_seed.download_by_prefix.start prefix=%s", prefix)
    listing = list_repo_files(repo_id, repo_type=repo_type, revision=revision, token=token)
    matching = [candidate for candidate in listing if candidate.startswith(prefix)]
    if len(matching) == 0:
        raise ValueError(f"No files matched prefix '{prefix}' in {repo_id}.")
    fetched = download_repo_files(
        matching,
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=local_dir,
        token=token,
    )
    LOGGER.info("bones_seed.download_by_prefix.exit matched=%s", len(fetched))
    return fetched
def write_manifest(
    local_dir: str | Path,
    files: Iterable[Path],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
) -> Path:
    """Write a ``manifest.json`` that records what was downloaded.

    Args:
        local_dir: Directory that receives ``manifest.json``; created
            (including parents) when it does not exist yet.
        files: Paths to record; each is stored as ``str(path)``.
        repo_id: Repository id recorded in the manifest.
        repo_type: Repo type recorded in the manifest.
        revision: Revision recorded in the manifest (may be ``None``).

    Returns:
        Path of the written manifest file.
    """
    LOGGER.info("bones_seed.write_manifest.start local_dir=%s", local_dir)
    output_dir = Path(local_dir)
    # Mirror download_repo_files: do not assume the caller created the
    # directory, otherwise write_text fails with FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest = DownloadManifest(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=str(output_dir),
        files=[str(path) for path in files],
        # Timezone-aware UTC timestamp in ISO 8601 form.
        downloaded_at=datetime.now(timezone.utc).isoformat(),
    )
    manifest_path = output_dir / "manifest.json"
    manifest_path.write_text(
        json.dumps(asdict(manifest), indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    LOGGER.info("bones_seed.write_manifest.exit path=%s", manifest_path)
    return manifest_path
def upload_manifest_to_space(
    manifest_path: str | Path,
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    path_in_repo: str = "data/bones_seed/manifest.json",
    commit_message: str = "Update BONES-SEED ingestion manifest",
    create_pr: bool = True,
) -> str:
    """Upload a manifest file into a Space repository.

    A direct commit is attempted first. If that fails with an
    ``HfHubHTTPError`` whose text contains ``create_pr=1`` and ``create_pr``
    is true, the upload is retried once as a pull request.

    Args:
        manifest_path: Local manifest file to upload.
        space_id: Target Space repository id.
        token: Optional token override; resolved via ``_resolve_token``.
        path_in_repo: Destination path inside the Space repository.
        commit_message: Commit message used for the upload.
        create_pr: Whether the pull-request fallback is allowed.

    Returns:
        The value returned by ``HfApi.upload_file``.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
        HfHubHTTPError: If the direct upload fails and the fallback does not
            apply, or if the fallback upload itself fails.
    """
    LOGGER.info("bones_seed.upload_manifest_to_space.start space_id=%s", space_id)
    manifest = Path(manifest_path)
    if not manifest.exists():
        raise FileNotFoundError(f"Manifest file does not exist: {manifest}")
    api = HfApi(token=_resolve_token(token))

    def _push(as_pull_request: bool) -> str:
        # Single upload attempt; only the create_pr flag differs between tries.
        return api.upload_file(
            path_or_fileobj=str(manifest),
            path_in_repo=path_in_repo,
            repo_id=space_id,
            repo_type="space",
            commit_message=commit_message,
            create_pr=as_pull_request,
        )

    try:
        result = _push(False)
    except HfHubHTTPError as exc:
        fallback_applies = create_pr and "create_pr=1" in str(exc)
        if not fallback_applies:
            raise
        result = _push(True)
        LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=create_pr")
        return result
    LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=direct")
    return result
def _check_logs_endpoint(url: str, token: str | None, timeout_sec: float) -> tuple[int, bool]:
    """Issue one GET request against ``url`` and report whether it succeeded.

    Args:
        url: Endpoint to probe.
        token: Optional token override; when a token is resolved it is sent
            as a ``Bearer`` Authorization header.
        timeout_sec: Timeout passed to ``urlopen``.

    Returns:
        ``(status_code, ok)``. ``ok`` is true only for a 2xx status. An HTTP
        error response yields its status code with ``ok`` false; any
        transport-level failure yields ``(0, False)``.
    """
    LOGGER.info("bones_seed.check_logs_endpoint.start url=%s", url)
    headers: dict[str, str] = {}
    resolved = _resolve_token(token)
    if resolved:
        headers["Authorization"] = f"Bearer {resolved}"
    request = Request(url=url, headers=headers, method="GET")
    try:
        with urlopen(request, timeout=timeout_sec) as response:
            status = int(getattr(response, "status", 0))
    except HTTPError as exc:
        # Must precede the OSError handler: HTTPError is a URLError subclass.
        LOGGER.warning("bones_seed.check_logs_endpoint.http_error status=%s", exc.code)
        return int(exc.code), False
    except OSError:
        # URLError is an OSError, but so are socket timeouts and connection
        # resets raised while waiting for the response, which urllib does not
        # wrap in URLError. Treat all of them as "endpoint unreachable"
        # instead of letting them escape as a traceback.
        LOGGER.warning("bones_seed.check_logs_endpoint.network_error")
        return 0, False
    LOGGER.info("bones_seed.check_logs_endpoint.exit status=%s", status)
    return status, 200 <= status < 300
def verify_space_logs(
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    timeout_sec: float = 10.0,
) -> SpaceLogCheckResult:
    """Probe the run and build log endpoints of a Space.

    Args:
        space_id: Space whose log endpoints are checked.
        token: Optional token override forwarded to each probe.
        timeout_sec: Per-request timeout forwarded to each probe.

    Returns:
        A ``SpaceLogCheckResult`` holding the status code and success flag of
        each probe.
    """
    LOGGER.info("bones_seed.verify_space_logs.start space_id=%s", space_id)
    logs_root = f"https://huggingface.co/api/spaces/{space_id}/logs"
    # Dict comprehension preserves order: "run" is probed before "build".
    outcomes = {
        kind: _check_logs_endpoint(f"{logs_root}/{kind}", token, timeout_sec)
        for kind in ("run", "build")
    }
    run_code, run_success = outcomes["run"]
    build_code, build_success = outcomes["build"]
    summary = SpaceLogCheckResult(
        space_id=space_id,
        run_status_code=run_code,
        build_status_code=build_code,
        run_ok=run_success,
        build_ok=build_success,
    )
    LOGGER.info(
        "bones_seed.verify_space_logs.exit run_ok=%s build_ok=%s",
        run_success,
        build_success,
    )
    return summary
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for this script.

    Returns:
        A parser with one positional ``command``, optional positional
        ``files``, and the repository, manifest, Space and timeout options.
    """
    cli = argparse.ArgumentParser(
        description="Browse and download BONES SEED dataset files from Hugging Face."
    )
    add = cli.add_argument

    # Positionals.
    add(
        "command",
        choices=("list", "download", "prefix", "verify-logs"),
        help="List files, download selected files, download files by prefix, or verify Space log endpoints.",
    )
    add("files", nargs="*", help="Exact file paths inside the dataset repository.")

    # Source repository and download options.
    add("--repo-id", default=DEFAULT_REPO_ID, help="Hugging Face dataset repository id.")
    add("--repo-type", default=DEFAULT_REPO_TYPE, help="Hugging Face repo type.")
    add("--revision", default=None, help="Optional repository revision or branch.")
    add("--local-dir", default="bones_seed", help="Directory where files will be stored.")
    add("--token", default=None, help="Hugging Face token override.")
    add("--prefix", default=None, help="File prefix to match when using the prefix command.")
    add("--manifest", action="store_true", help="Write a manifest.json after download.")

    # Space publishing and log verification options.
    add(
        "--space-id",
        default=DEFAULT_SPACE_ID,
        help="Target Space id for manifest publish or logs checks.",
    )
    add(
        "--space-manifest-path",
        default="data/bones_seed/manifest.json",
        help="Path inside target Space repo where manifest will be uploaded.",
    )
    add(
        "--publish-manifest-to-space",
        action="store_true",
        help="Upload generated manifest to the Space repo destination.",
    )
    add(
        "--space-upload-create-pr",
        action="store_true",
        help="Force upload as a PR in target Space repo when direct commits are forbidden.",
    )
    add(
        "--logs-timeout-sec",
        type=float,
        default=10.0,
        help="Timeout for log endpoint verification requests.",
    )
    return cli
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success. For ``verify-logs``, ``2`` when either log endpoint
        check fails.

    Raises:
        SystemExit: On invalid argument combinations (``download`` without
            files, ``prefix`` without ``--prefix``, or
            ``--publish-manifest-to-space`` without ``--manifest``).
    """
    LOGGER.info("bones_seed.main.start")
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command == "list":
        try:
            for name in list_repo_files(
                args.repo_id,
                repo_type=args.repo_type,
                revision=args.revision,
                token=args.token,
            ):
                print(name)
        except BrokenPipeError:
            # Downstream reader (e.g. ``head``) closed the pipe; not an error.
            LOGGER.info("bones_seed.main.exit broken_pipe")
            return 0
        LOGGER.info("bones_seed.main.exit command=list")
        return 0

    if args.command == "verify-logs":
        result = verify_space_logs(
            space_id=args.space_id,
            token=args.token,
            timeout_sec=args.logs_timeout_sec,
        )
        print(json.dumps(asdict(result), indent=2, sort_keys=True))
        LOGGER.info("bones_seed.main.exit command=verify-logs")
        return 0 if (result.run_ok and result.build_ok) else 2

    # Reject invalid flag combinations before any file is transferred, so the
    # user does not wait for a full download only to hit a usage error.
    if args.publish_manifest_to_space and not args.manifest:
        raise SystemExit("--publish-manifest-to-space requires --manifest")

    if args.command == "download":
        if not args.files:
            raise SystemExit("download requires at least one file path")
        downloaded = download_repo_files(
            args.files,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )
    else:
        # Only "prefix" remains among the parser's allowed commands.
        if not args.prefix:
            raise SystemExit("prefix requires --prefix")
        downloaded = download_by_prefix(
            args.prefix,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )

    for path in downloaded:
        print(path)

    if args.manifest:
        manifest_path = write_manifest(
            args.local_dir,
            downloaded,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
        )
        print(manifest_path)
        if args.publish_manifest_to_space:
            uploaded = upload_manifest_to_space(
                manifest_path,
                space_id=args.space_id,
                token=args.token,
                path_in_repo=args.space_manifest_path,
                create_pr=args.space_upload_create_pr,
            )
            print(uploaded)

    LOGGER.info("bones_seed.main.exit command=%s", args.command)
    return 0
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    raise SystemExit(main())