# Snapshot of revision 6d5047c (file size at capture: 12,981 bytes).
"""Browse and download files from the BONES SEED Hugging Face dataset repository."""

from __future__ import annotations

import argparse
import json
import logging
import os
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, Sequence
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from huggingface_hub import HfApi, get_token, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError


# Default repository id and repo type used by the listing/download helpers.
DEFAULT_REPO_ID = "bones-studio/seed"
DEFAULT_REPO_TYPE = "dataset"
# Default Space used for manifest uploads and log-endpoint checks.
DEFAULT_SPACE_ID = "lablab-ai-amd-developer-hackathon/movimento"

# Module-level logger; messages in this file use "bones_seed.<function>.<event>" names.
LOGGER = logging.getLogger(__name__)


def _resolve_token(token: str | None = None) -> str | None:
    """Pick the Hugging Face token to authenticate with.

    Lookup order:
      1. the explicit ``token`` argument, when non-empty;
      2. the first non-empty environment variable among
         ``HUGGING_FACE_HUB_TOKEN``, ``HF_TOKEN``, ``HF_API_TOKEN``;
      3. whatever ``huggingface_hub.get_token()`` returns (may be ``None``).

    Only the source of the token is logged, never its value.
    """
    LOGGER.info("bones_seed.resolve_token.start")
    if token:
        LOGGER.info("bones_seed.resolve_token.exit source=arg")
        return token

    env_names = ("HUGGING_FACE_HUB_TOKEN", "HF_TOKEN", "HF_API_TOKEN")
    lookups = ((name, os.environ.get(name)) for name in env_names)
    # First (name, value) pair whose value is set and non-empty, else None.
    env_hit = next((pair for pair in lookups if pair[1]), None)
    if env_hit is not None:
        LOGGER.info("bones_seed.resolve_token.exit source=env var=%s", env_hit[0])
        return env_hit[1]

    fallback = get_token()
    LOGGER.info("bones_seed.resolve_token.exit source=cache found=%s", bool(fallback))
    return fallback


@dataclass(frozen=True)
class DownloadManifest:
    """Frozen record of one download run; write_manifest serializes it to manifest.json."""

    # Repository the files were downloaded from, and its repo type.
    repo_id: str
    repo_type: str
    # Requested revision; None when no revision was given.
    revision: str | None
    # Output directory, stored as a string.
    local_dir: str
    # Downloaded file paths, stored as strings.
    files: list[str]
    # Timestamp string; write_manifest fills it with the current UTC time in ISO format.
    downloaded_at: str


@dataclass(frozen=True)
class SpaceLogCheckResult:
    """Frozen outcome of probing a Space's run and build log endpoints (see verify_space_logs)."""

    # Space whose log endpoints were probed.
    space_id: str
    # Status codes reported by _check_logs_endpoint for the run and build
    # endpoints; it reports 0 when no HTTP status was obtained.
    run_status_code: int
    build_status_code: int
    # Success flags reported by _check_logs_endpoint (True for a 2xx status).
    run_ok: bool
    build_ok: bool


def list_repo_files(
    repo_id: str = DEFAULT_REPO_ID,
    *,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    token: str | None = None,
) -> list[str]:
    """List every file path in a Hugging Face repository, sorted.

    Args:
        repo_id: Repository to inspect.
        repo_type: Repo type passed through to ``HfApi.list_repo_files``.
        revision: Optional revision passed through to the Hub API.
        token: Optional token override; resolved through ``_resolve_token``.

    Returns:
        The repository's file paths as a sorted list.
    """
    LOGGER.info("bones_seed.list_repo_files.start repo_id=%s revision=%s", repo_id, revision)
    client = HfApi(token=_resolve_token(token))
    names = list(
        client.list_repo_files(
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
        )
    )
    names.sort()
    LOGGER.info("bones_seed.list_repo_files.exit count=%s", len(names))
    return names


def download_repo_files(
    filenames: Sequence[str],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Fetch the named files from a Hugging Face repository into ``local_dir``.

    Args:
        filenames: Paths inside the repository to download.
        repo_id: Repository to download from.
        repo_type: Repo type passed through to ``hf_hub_download``.
        revision: Optional revision passed through to ``hf_hub_download``.
        local_dir: Destination directory; created (with parents) if missing.
        token: Optional token override; resolved once through ``_resolve_token``.

    Returns:
        One local ``Path`` per requested file, in the order of ``filenames``.
    """
    LOGGER.info("bones_seed.download_repo_files.start repo_id=%s files=%s", repo_id, len(filenames))
    auth = _resolve_token(token)
    target_dir = Path(local_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    def _fetch(name: str) -> Path:
        # One hf_hub_download call per file, each using the same resolved token.
        return Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=name,
                repo_type=repo_type,
                revision=revision,
                token=auth,
                local_dir=target_dir,
            )
        )

    fetched: list[Path] = [_fetch(name) for name in filenames]
    LOGGER.info("bones_seed.download_repo_files.exit downloaded=%s", len(fetched))
    return fetched


def download_by_prefix(
    prefix: str,
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
    local_dir: str | Path = "bones_seed",
    token: str | None = None,
) -> list[Path]:
    """Download every repository file whose path starts with ``prefix``.

    The repository listing comes from ``list_repo_files``; the matching names
    are handed to ``download_repo_files`` with the same repo/revision/token.

    Returns:
        Local paths of the downloaded files.

    Raises:
        ValueError: If no file in the listing starts with ``prefix``.
    """
    LOGGER.info("bones_seed.download_by_prefix.start prefix=%s", prefix)
    listing = list_repo_files(
        repo_id,
        repo_type=repo_type,
        revision=revision,
        token=token,
    )
    matches = [entry for entry in listing if entry.startswith(prefix)]
    if len(matches) == 0:
        raise ValueError(f"No files matched prefix '{prefix}' in {repo_id}.")

    fetched = download_repo_files(
        matches,
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=local_dir,
        token=token,
    )
    LOGGER.info("bones_seed.download_by_prefix.exit matched=%s", len(fetched))
    return fetched


def write_manifest(
    local_dir: str | Path,
    files: Iterable[Path],
    *,
    repo_id: str = DEFAULT_REPO_ID,
    repo_type: str = DEFAULT_REPO_TYPE,
    revision: str | None = None,
) -> Path:
    """Write ``manifest.json`` into ``local_dir`` recording what was downloaded.

    Args:
        local_dir: Directory that receives ``manifest.json``; created (with
            parents) if it does not exist yet.
        files: Downloaded paths; each is stored as a string.
        repo_id: Repository the files came from.
        repo_type: Repo type recorded in the manifest.
        revision: Revision recorded in the manifest (``None`` if unspecified).

    Returns:
        Path of the written manifest file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    LOGGER.info("bones_seed.write_manifest.start local_dir=%s", local_dir)
    output_dir = Path(local_dir)
    # download_repo_files creates its output directory; do the same here so the
    # helper also works when called on its own with a not-yet-existing path
    # instead of failing with FileNotFoundError on write.
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest = DownloadManifest(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        local_dir=str(output_dir),
        files=[str(path) for path in files],
        # Timezone-aware UTC timestamp in ISO format.
        downloaded_at=datetime.now(timezone.utc).isoformat(),
    )
    manifest_path = output_dir / "manifest.json"
    # Sorted keys and a trailing newline keep the file layout stable between runs.
    manifest_path.write_text(json.dumps(asdict(manifest), indent=2, sort_keys=True) + "\n", encoding="utf-8")
    LOGGER.info("bones_seed.write_manifest.exit path=%s", manifest_path)
    return manifest_path


def upload_manifest_to_space(
    manifest_path: str | Path,
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    path_in_repo: str = "data/bones_seed/manifest.json",
    commit_message: str = "Update BONES-SEED ingestion manifest",
    create_pr: bool = True,
) -> str:
    """Upload a manifest file to a path inside a Space repository.

    A direct commit is attempted first. If that raises ``HfHubHTTPError``,
    ``create_pr`` is true and the error text contains ``create_pr=1``, the
    upload is retried once as a pull request. Any other failure is re-raised.

    Args:
        manifest_path: Local manifest file to upload.
        space_id: Target Space repository id.
        token: Optional token override; resolved through ``_resolve_token``.
        path_in_repo: Destination path inside the Space repository.
        commit_message: Commit message used for the upload.
        create_pr: Whether the pull-request retry is allowed.

    Returns:
        Whatever ``HfApi.upload_file`` returns for the successful upload.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
        HfHubHTTPError: If the upload fails and the PR retry does not apply.
    """
    LOGGER.info("bones_seed.upload_manifest_to_space.start space_id=%s", space_id)
    source = Path(manifest_path)
    if not source.exists():
        raise FileNotFoundError(f"Manifest file does not exist: {source}")

    client = HfApi(token=_resolve_token(token))
    # Arguments shared by the direct attempt and the PR retry.
    shared_kwargs = {
        "path_or_fileobj": str(source),
        "path_in_repo": path_in_repo,
        "repo_id": space_id,
        "repo_type": "space",
        "commit_message": commit_message,
    }

    try:
        result = client.upload_file(create_pr=False, **shared_kwargs)
    except HfHubHTTPError as exc:
        retry_as_pr = create_pr and "create_pr=1" in str(exc)
        if not retry_as_pr:
            raise
        result = client.upload_file(create_pr=True, **shared_kwargs)
        LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=create_pr")
        return result
    else:
        LOGGER.info("bones_seed.upload_manifest_to_space.exit mode=direct")
        return result


def _check_logs_endpoint(url: str, token: str | None, timeout_sec: float) -> tuple[int, bool]:
    """Probe one URL with a GET request and report its status.

    Args:
        url: Endpoint to request.
        token: Optional token override; resolved through ``_resolve_token`` and
            sent as a ``Bearer`` Authorization header when one is found.
        timeout_sec: Timeout passed to ``urlopen``.

    Returns:
        ``(status, ok)``: the HTTP status code and whether it is 2xx. HTTP
        error responses yield ``(code, False)``. Failures that produce no
        status (network error, timeout, dropped connection) yield ``(0, False)``.
    """
    LOGGER.info("bones_seed.check_logs_endpoint.start url=%s", url)
    headers = {}
    resolved = _resolve_token(token)
    if resolved:
        headers["Authorization"] = f"Bearer {resolved}"
    request = Request(url=url, headers=headers, method="GET")
    try:
        # Only the status line is inspected; the body is never read, and the
        # context manager closes the response right away.
        with urlopen(request, timeout=timeout_sec) as response:
            status = int(getattr(response, "status", 0))
            LOGGER.info("bones_seed.check_logs_endpoint.exit status=%s", status)
            return status, 200 <= status < 300
    except HTTPError as exc:
        # Must come first: HTTPError is a subclass of URLError (and OSError).
        LOGGER.warning("bones_seed.check_logs_endpoint.http_error status=%s", exc.code)
        return int(exc.code), False
    except OSError:
        # URLError is an OSError, so the original case is still covered.
        # Catching OSError also handles timeouts and connection resets raised
        # while waiting for the response, which urlopen does not wrap in
        # URLError and which previously crashed the probe.
        LOGGER.warning("bones_seed.check_logs_endpoint.network_error")
        return 0, False


def verify_space_logs(
    *,
    space_id: str = DEFAULT_SPACE_ID,
    token: str | None = None,
    timeout_sec: float = 10.0,
) -> SpaceLogCheckResult:
    """Probe the run and build log endpoints of a Space.

    Both endpoints live under
    ``https://huggingface.co/api/spaces/<space_id>/logs`` and are checked with
    ``_check_logs_endpoint``, run first and build second.

    Args:
        space_id: Space whose log endpoints are probed.
        token: Optional token override forwarded to each probe.
        timeout_sec: Per-request timeout forwarded to each probe.

    Returns:
        A ``SpaceLogCheckResult`` with the status code and success flag of each probe.
    """
    LOGGER.info("bones_seed.verify_space_logs.start space_id=%s", space_id)
    logs_root = f"https://huggingface.co/api/spaces/{space_id}/logs"
    # Tuple order fixes the probe order: "run" is requested before "build".
    probes = {
        kind: _check_logs_endpoint(f"{logs_root}/{kind}", token, timeout_sec)
        for kind in ("run", "build")
    }
    run_status, run_ok = probes["run"]
    build_status, build_ok = probes["build"]
    LOGGER.info("bones_seed.verify_space_logs.exit run_ok=%s build_ok=%s", run_ok, build_ok)
    return SpaceLogCheckResult(
        space_id=space_id,
        run_status_code=run_status,
        build_status_code=build_status,
        run_ok=run_ok,
        build_ok=build_ok,
    )


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this tool.

    Arguments are registered in a fixed order (which is also the order shown
    in ``--help``): the ``command`` and ``files`` positionals, the repository
    options, ``--manifest``, the Space options, and ``--logs-timeout-sec``.
    """
    cli = argparse.ArgumentParser(description="Browse and download BONES SEED dataset files from Hugging Face.")

    # Positionals.
    cli.add_argument(
        "command",
        choices=("list", "download", "prefix", "verify-logs"),
        help="List files, download selected files, download files by prefix, or verify Space log endpoints.",
    )
    cli.add_argument("files", nargs="*", help="Exact file paths inside the dataset repository.")

    # Plain value options: (flag, default, help text).
    value_options = (
        ("--repo-id", DEFAULT_REPO_ID, "Hugging Face dataset repository id."),
        ("--repo-type", DEFAULT_REPO_TYPE, "Hugging Face repo type."),
        ("--revision", None, "Optional repository revision or branch."),
        ("--local-dir", "bones_seed", "Directory where files will be stored."),
        ("--token", None, "Hugging Face token override."),
        ("--prefix", None, "File prefix to match when using the prefix command."),
    )
    for flag, fallback, description in value_options:
        cli.add_argument(flag, default=fallback, help=description)

    cli.add_argument("--manifest", action="store_true", help="Write a manifest.json after download.")

    # Space-related options.
    cli.add_argument("--space-id", default=DEFAULT_SPACE_ID, help="Target Space id for manifest publish or logs checks.")
    cli.add_argument(
        "--space-manifest-path",
        default="data/bones_seed/manifest.json",
        help="Path inside target Space repo where manifest will be uploaded.",
    )
    space_switches = (
        ("--publish-manifest-to-space", "Upload generated manifest to the Space repo destination."),
        ("--space-upload-create-pr", "Force upload as a PR in target Space repo when direct commits are forbidden."),
    )
    for flag, description in space_switches:
        cli.add_argument(flag, action="store_true", help=description)

    cli.add_argument(
        "--logs-timeout-sec",
        type=float,
        default=10.0,
        help="Timeout for log endpoint verification requests.",
    )
    return cli


def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success. For ``verify-logs``, ``2`` when either log endpoint
        check is not OK.

    Raises:
        SystemExit: With a message when ``download`` gets no file paths,
            ``prefix`` gets no ``--prefix``, or ``--publish-manifest-to-space``
            is given without ``--manifest``. All three are checked before any
            download starts.
    """
    LOGGER.info("bones_seed.main.start")
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command == "list":
        try:
            for name in list_repo_files(args.repo_id, repo_type=args.repo_type, revision=args.revision, token=args.token):
                print(name)
        except BrokenPipeError:
            # The reader closed the pipe early (e.g. `... | head`); not an error.
            LOGGER.info("bones_seed.main.exit broken_pipe")
            return 0
        LOGGER.info("bones_seed.main.exit command=list")
        return 0

    if args.command == "verify-logs":
        result = verify_space_logs(space_id=args.space_id, token=args.token, timeout_sec=args.logs_timeout_sec)
        print(json.dumps(asdict(result), indent=2, sort_keys=True))
        LOGGER.info("bones_seed.main.exit command=verify-logs")
        return 0 if (result.run_ok and result.build_ok) else 2

    # Validate every option combination up front so a bad invocation fails
    # before any network transfer. The publish/manifest check used to run only
    # after the download had completed. Precedence among the messages is kept:
    # the per-command check still wins over the publish/manifest check.
    if args.command == "download":
        if not args.files:
            raise SystemExit("download requires at least one file path")
    elif not args.prefix:
        raise SystemExit("prefix requires --prefix")
    if args.publish_manifest_to_space and not args.manifest:
        raise SystemExit("--publish-manifest-to-space requires --manifest")

    if args.command == "download":
        downloaded = download_repo_files(
            args.files,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )
    else:
        # The only remaining command is "prefix".
        downloaded = download_by_prefix(
            args.prefix,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
            local_dir=args.local_dir,
            token=args.token,
        )

    for path in downloaded:
        print(path)

    if args.manifest:
        manifest_path = write_manifest(
            args.local_dir,
            downloaded,
            repo_id=args.repo_id,
            repo_type=args.repo_type,
            revision=args.revision,
        )
        print(manifest_path)
        if args.publish_manifest_to_space:
            uploaded = upload_manifest_to_space(
                manifest_path,
                space_id=args.space_id,
                token=args.token,
                path_in_repo=args.space_manifest_path,
                create_pr=args.space_upload_create_pr,
            )
            print(uploaded)

    LOGGER.info("bones_seed.main.exit command=%s", args.command)
    return 0


# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())