bruAristimunha commited on
Commit
5c60b17
·
1 Parent(s): c799eda

Add metadata-stub generator — reuses eegdash API + CSV, renders HF dataset cards

Browse files
Files changed (1) hide show
  1. scripts/push_metadata_stubs.py +556 -0
scripts/push_metadata_stubs.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Generate and push per-dataset metadata stubs to the ``EEGDash`` HF org.
3
+
4
+ Lives inside the Space on purpose: the Space already vendors
5
+ ``dataset_summary.csv`` and hits the same live EEGDash API that
6
+ ``docs/source/conf.py`` uses. No rehosting of EEG data — each repo is a
7
+ Markdown card + a small ``eegdash.json`` pointer.
8
+
9
+ The field-priority rules mirror ``_build_dataset_context`` in the docs
10
+ Sphinx config: CSV row wins when it has a value, otherwise fall back to
11
+ the API response. That keeps the eegdash.org dataset pages and the HF
12
+ stubs in lock-step — edit the CSV (or the API), both re-render the same
13
+ way.
14
+
15
+ Usage::
16
+
17
+ # Dry-run: write preview stubs (README + eegdash.json, at most 3 matching rows) to /tmp/stub_preview/
18
+ python scripts/push_metadata_stubs.py --dataset ds002718 --dry-run
19
+
20
+ # Push a single stub
21
+ python scripts/push_metadata_stubs.py --dataset ds002718
22
+
23
+ # Push every row in the CSV, skipping repos that already exist
24
+ python scripts/push_metadata_stubs.py --all --skip-existing
25
+
26
+ # Sample 10 for a smoke test
27
+ python scripts/push_metadata_stubs.py --all --limit 10
28
+
29
+ Requires ``huggingface-cli login`` (or ``HF_TOKEN`` env var) when pushing.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import ast
36
+ import json
37
+ import logging
38
+ import os
39
+ import sys
40
+ import tempfile
41
+ import time
42
+ import urllib.error
43
+ import urllib.request
44
+ from pathlib import Path
45
+ from typing import Any, Iterable
46
+
47
+ import pandas as pd
48
+
49
# Module-wide logger; the name is fixed so log lines read the same however the
# script is invoked.
logger = logging.getLogger("push_metadata_stubs")

# Filesystem layout: this file lives one directory below the root that holds
# the vendored CSV.
ROOT = Path(__file__).resolve().parents[1]
CSV_PATH = ROOT / "dataset_summary.csv"

# Hugging Face organisation that owns the generated stub repos.
HF_ORG = "EEGDash"

# Remote endpoints and links embedded in the rendered cards.
EEGDASH_API = "https://data.eegdash.org/api/eegdash"
CATALOG_SPACE = f"https://huggingface.co/spaces/{HF_ORG}/catalog"
EEGDASH_URL = "https://eegdash.org"
GITHUB_URL = "https://github.com/eegdash/EEGDash"
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Same helpers as docs/source/conf.py — lifted verbatim so the output format
62
+ # stays in sync without a sphinx import.
63
+ # ---------------------------------------------------------------------------
64
+
65
+
66
def _clean_value(value: Any) -> str:
    """Return *value* as a stripped string, or ``""`` for missing-data markers.

    ``None``, blank strings and the placeholders ``nan``, ``none``, ``null``,
    ``n/a``, ``—`` and ``-`` (compared case-insensitively) all collapse to the
    empty string.
    """
    text = "" if value is None else str(value).strip()
    placeholders = ("nan", "none", "null", "n/a", "—", "-")
    return "" if text.lower() in placeholders else text
73
+
74
+
75
def _normalize_list(value: Any) -> list[str]:
    """Coerce *value* into a list of non-empty, stripped strings.

    * Falsy input (``None``, ``""``, ``[]``, ``0``) yields ``[]``.
    * Lists and tuples are stringified item by item; blank items are dropped.
    * A string that looks like a Python list literal (``"[...]"``) is parsed
      with ``ast.literal_eval``; if parsing fails the raw string is kept as a
      single entry.
    * Any other string becomes a one-element list, unless it is blank.
    * Any other object is stringified into a one-element list.
    """
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return [str(v).strip() for v in value if str(v).strip()]
    if isinstance(value, str):
        cleaned = value.strip()
        if not cleaned:
            # Whitespace-only strings are truthy but carry no content.
            return []
        if cleaned.startswith("[") and cleaned.endswith("]"):
            try:
                parsed = ast.literal_eval(cleaned)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # literal_eval documents all of these for malformed input;
                # fall through and keep the raw text.
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return [str(v).strip() for v in parsed if str(v).strip()]
        return [cleaned]
    text = str(value).strip()
    return [text] if text else []
91
+
92
+
93
def _format_hours(cell: Any) -> str:
    """Format a duration cell as hours with one decimal and thousands separators.

    Missing-data cells give ``""``; a cell that is not numeric is returned
    as its cleaned text, unchanged.
    """
    raw = _clean_value(cell)
    if raw:
        try:
            return f"{float(raw):,.1f}"
        except ValueError:
            # Not a number: fall through and hand back the cleaned text.
            pass
    return raw
102
+
103
+
104
def _format_stat_counts(cell: Any) -> str:
    """Render a ``[{val, count}, ...]`` JSON cell as ``"val (×count)"``.

    The cell is parsed as JSON first and as a Python literal second. If
    neither parser accepts it, the cleaned text is returned unchanged. A
    parsed value that is not a non-empty list yields ``""``. Entries that are
    not dicts, or that have no ``val``, are skipped. Integer-valued floats are
    shown without the trailing ``.0``, and a missing, empty or zero count
    omits the ``(×count)`` suffix.
    """
    s = _clean_value(cell)
    if not s:
        return ""
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # literal_eval can raise any of these on malformed text (e.g.
            # TypeError for an unhashable dict key); one bad cell must not
            # abort the whole stub.
            return s
    if not isinstance(parsed, list) or not parsed:
        return ""
    entries = []
    for row in parsed:
        if not isinstance(row, dict):
            continue
        val = row.get("val")
        count = row.get("count")
        if val is None:
            continue
        if isinstance(val, float) and val.is_integer():
            val = int(val)
        if isinstance(count, float) and count.is_integer():
            # Same normalisation as ``val`` so "×3.0" never appears.
            count = int(count)
        if count in (None, "", 0):
            entries.append(str(val))
        else:
            entries.append(f"{val} (×{count})")
    return ", ".join(entries)
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # API fetch — same endpoint as docs, same failure-is-fine policy.
142
+ # ---------------------------------------------------------------------------
143
+
144
+
145
def _fetch_api_summary(dataset_id: str, timeout: float = 10.0) -> dict[str, Any]:
    """Fetch the dataset summary from the EEGDash API, best effort.

    Tries the id as given, then a case variant (lower-case for ``ds…`` ids,
    ``EEG2025r…`` for the challenge releases). The first response whose JSON
    body has a truthy ``success`` flag wins.

    Args:
        dataset_id: Dataset slug to look up.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``data`` object of the first successful response, or ``{}`` when
        every attempt fails, the payload is not a JSON object, or
        *dataset_id* is empty. Network and decoding errors are logged at
        DEBUG level and never raised.
    """
    import http.client
    from urllib.parse import quote

    if not dataset_id:
        return {}

    lowered = dataset_id.lower()
    variants = [dataset_id]
    if dataset_id.startswith("ds"):
        variants.append(lowered)
    elif lowered.startswith("eeg2025r"):
        variants.append(f"EEG2025r{lowered[len('eeg2025r'):]}")
    # Drop duplicates (order preserved) so an already-normalised id is not
    # requested twice after a failure.
    variants = list(dict.fromkeys(variants))

    for vid in variants:
        url = f"{EEGDASH_API}/datasets/summary/{quote(vid, safe='')}"
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # OSError covers URLError, timeouts and connection resets;
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            logger.debug("API %s failed: %s", vid, exc)
            continue
        if isinstance(data, dict) and data.get("success"):
            payload = data.get("data")
            # Callers use ``.get`` on the result, so only hand back a mapping.
            return payload if isinstance(payload, dict) else {}
    return {}
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Context builder — CSV row first, API second. Mirrors conf.py field order.
167
+ # ---------------------------------------------------------------------------
168
+
169
+
170
def _build_context(row: pd.Series) -> dict[str, Any]:
    """Merge one CSV row with the API summary into a render context.

    CSV cells win whenever they hold a value; the API response fills the
    gaps. Authors, references, the acknowledgement text, the source URL and
    the creation year are read from the API only.
    """
    dataset_id = _clean_value(row.get("dataset")).lower()
    api = _fetch_api_summary(dataset_id)

    def from_csv(column: str) -> str:
        return _clean_value(row.get(column))

    def from_api(field: str) -> str:
        return _clean_value(api.get(field))

    def pick(row_key: str, api_key: str = "") -> str:
        # A CSV "0" counts as missing, so the API gets a chance to supply it.
        local = from_csv(row_key)
        if local not in ("", "0"):
            return local
        return from_api(api_key) if api_key else ""

    title = from_csv("dataset_title") or _clean_value(
        api.get("computed_title") or api.get("name")
    )

    # DOIs sometimes ship with a "doi:" prefix — strip so links don't double up.
    doi = from_csv("doi") or from_api("dataset_doi")
    if doi.lower().startswith("doi:"):
        doi = doi[4:].strip()

    # Year is the first four characters of the API creation timestamp.
    created = (api.get("timestamps") or {}).get("dataset_created_at") or ""
    year = created[:4] if isinstance(created, str) and len(created) >= 4 else ""

    return {
        "dataset_id": dataset_id,
        "title": title or dataset_id,
        "author_year": from_csv("author_year"),
        "authors": _normalize_list(api.get("authors")),
        "year": year,
        "license": (from_csv("license") or from_api("license")) or "Unknown",
        "doi": doi,
        "source": from_csv("source") or "OpenNeuro",
        "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
        "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
        "source_url": from_api("source_url"),
        "record_modality": from_csv("record_modality"),
        "modality_exp": from_csv("modality of exp"),
        "type_exp": from_csv("type of exp"),
        "pathology": from_csv("Type Subject"),
        "n_subjects": pick("n_subjects", "n_subjects"),
        "n_records": pick("n_records", "total_files"),
        "n_tasks": pick("n_tasks", "n_tasks"),
        "n_channels": _format_stat_counts(row.get("nchans_set")),
        "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")),
        "size": from_csv("size"),
        "duration_hours_total": _format_hours(row.get("duration_hours_total")),
        "references": _normalize_list(api.get("references")),
        "how_to_acknowledge": from_api("how_to_acknowledge"),
    }
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # Render a HF Dataset Card (README.md) from the context.
229
+ # ---------------------------------------------------------------------------
230
+
231
+
232
HF_LICENSE_MAP = {
    # HF's vetted SPDX-ish identifiers. Unknown values map to "other".
    "cc0": "cc0-1.0",
    "cc0-1.0": "cc0-1.0",
    "cc-by-4.0": "cc-by-4.0",
    "cc-by-sa-4.0": "cc-by-sa-4.0",
    "cc-by-nc-4.0": "cc-by-nc-4.0",
    "cc-by-nc-sa-4.0": "cc-by-nc-sa-4.0",
    "mit": "mit",
    "apache-2.0": "apache-2.0",
    "bsd-3-clause": "bsd-3-clause",
}


def _hf_license(raw: str) -> str:
    """Map a free-text licence string to a Hugging Face licence identifier.

    The text is lower-cased, underscores and whitespace runs become single
    dashes, and the known keys are then searched longest first. A key only
    matches as a whole token (not flanked by a letter or digit), so words
    such as "Limited" or "permitted" are not mistaken for the MIT licence.

    Returns:
        The mapped identifier, or ``"other"`` when nothing matches.
    """
    norm = "-".join(raw.lower().replace("_", "-").split()).strip("-")
    for key in sorted(HF_LICENSE_MAP, key=len, reverse=True):
        start = norm.find(key)
        while start != -1:
            end = start + len(key)
            starts_clean = start == 0 or not norm[start - 1].isalnum()
            ends_clean = end == len(norm) or not norm[end].isalnum()
            if starts_clean and ends_clean:
                return HF_LICENSE_MAP[key]
            start = norm.find(key, start + 1)
    return "other"
252
+
253
+
254
def _size_category(n_records: str) -> str:
    """Bucket a record count into a Hugging Face ``size_categories`` label.

    Accepts integer text, float-formatted text such as ``"12.0"`` (what
    pandas produces for an integer column that contains gaps) and
    comma-grouped numbers.

    Returns:
        A label from ``"n<1K"`` up to ``"n>1T"``, or ``"unknown"`` when the
        value cannot be read as a finite number.
    """
    try:
        n = int(float(str(n_records).replace(",", "").strip()))
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; OverflowError: infinity.
        return "unknown"
    buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper_bound, label in buckets:
        if n < upper_bound:
            return label
    return "n>1T"
266
+
267
+
268
def _render_readme(ctx: dict[str, Any]) -> str:
    """Render the Hugging Face dataset card (``README.md``) for one dataset.

    Args:
        ctx: Render context as produced by ``_build_context``. Every value
            read here is a string, except ``authors`` and ``references``,
            which are lists of strings.

    Returns:
        The full card: YAML front matter (tags, licence, size category,
        pretty name, optional authors) followed by the Markdown body (title,
        loading instructions, metadata table, links, citation block, footer).

    Free-text values are escaped where they could break the surrounding
    syntax: the title and authors are written as JSON strings (valid YAML
    double-quoted scalars), table cells have ``|`` escaped and newlines
    collapsed, and every line of the acknowledgement text is blockquoted.
    """

    def tag_slug(text: str) -> str:
        # Same slug rule for every free-text tag.
        return text.lower().replace(" ", "-").replace("/", "-")

    def table_cell(text: str) -> str:
        # A raw "|" or a newline would end the Markdown table cell early.
        return " ".join(str(text).split()).replace("|", "\\|")

    dataset_id = ctx["dataset_id"]

    # --- Front matter ----------------------------------------------------

    tags = ["neuroscience", "eegdash", "brain-computer-interface"]
    rm = ctx["record_modality"].lower()
    tags.insert(0, rm if rm in {"eeg", "meg", "ieeg"} else "eeg")
    if ctx["modality_exp"]:
        tags.append(tag_slug(ctx["modality_exp"]))
    if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
        tags.append(tag_slug(ctx["pathology"]))
    tags = list(dict.fromkeys(tags))  # drop repeats, keep order

    license_slug = _hf_license(ctx["license"])
    size_cat = _size_category(ctx["n_records"])

    yaml_tags = "\n".join(f"- {t}" for t in tags)
    yaml_authors = ""
    if ctx["authors"]:
        yaml_authors = "authors:\n" + "\n".join(
            f"  - {json.dumps(a, ensure_ascii=False)}" for a in ctx["authors"][:8]
        ) + "\n"
    pretty_name = json.dumps(ctx["title"] or dataset_id, ensure_ascii=False)

    # --- Body -------------------------------------------------------------

    hero_lines = []
    if ctx["title"] and ctx["title"].lower() != dataset_id.lower():
        hero_lines.append(f"# {ctx['title']}")
    else:
        hero_lines.append(f"# {dataset_id}")
    if ctx["author_year"]:
        hero_lines.append(f"*{ctx['author_year']}*")
    elif ctx["authors"]:
        head = ctx["authors"][0]
        extra = " et al." if len(ctx["authors"]) > 1 else ""
        yr = f" ({ctx['year']})" if ctx["year"] else ""
        hero_lines.append(f"*{head}{extra}{yr}*")
    hero = "\n\n".join(hero_lines)

    load_block = f"""## Load this dataset

This repo is a **pointer** — the raw EEG data lives at its canonical source
(OpenNeuro / NEMAR). [EEGDash](https://github.com/eegdash/EEGDash) handles the
download, caching, and conversion to a PyTorch / braindecode dataset.

```python
# pip install eegdash
from eegdash import EEGDashDataset

ds = EEGDashDataset(dataset="{dataset_id}", cache_dir="./cache")
print(len(ds), "recordings")
```

Need it in braindecode's HF-native Zarr format? Once mirrored
(`ds.push_to_hub(...)`) you can also do:

```python
from braindecode.datasets import BaseConcatDataset
ds = BaseConcatDataset.pull_from_hub("{HF_ORG}/{dataset_id}")
```
"""

    rows = [
        ("Subjects", ctx["n_subjects"]),
        ("Recordings", ctx["n_records"]),
        ("Tasks", ctx["n_tasks"]),
        ("Channels", ctx["n_channels"]),
        ("Sampling rate (Hz)", ctx["sampling_freqs"]),
        ("Size on disk", ctx["size"]),
        ("Total duration (h)", ctx["duration_hours_total"]),
        ("Experimental modality", ctx["modality_exp"]),
        ("Experimental type", ctx["type_exp"]),
        ("Population", ctx["pathology"]),
        ("Recording type", ctx["record_modality"].upper()),
        ("Source", ctx["source"]),
        ("License", ctx["license"]),
    ]
    # Source and License rows always appear; other rows only when filled in.
    md_rows = "\n".join(
        f"| **{k}** | {table_cell(v) or '—'} |"
        for k, v in rows
        if v or k in {"Source", "License"}
    )

    meta_table = f"""## Dataset metadata

| | |
|---|---|
{md_rows}
"""

    links = []
    if ctx["doi"]:
        links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
    if ctx["source"].lower() == "openneuro":
        links.append(f"- **OpenNeuro:** [{dataset_id}]({ctx['openneuro_url']})")
    if ctx["source"].lower() == "nemar":
        links.append(f"- **NEMAR:** [{dataset_id}]({ctx['nemar_url']})")
    if ctx["source_url"]:
        links.append(f"- **Source:** <{ctx['source_url']}>")
    links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
    links.append(f"- **Docs:** <{EEGDASH_URL}>")
    links.append(f"- **Code:** <{GITHUB_URL}>")
    links_block = "## Links\n\n" + "\n".join(links)

    cite_block = ""
    if ctx["how_to_acknowledge"]:
        # Prefix every line so multi-paragraph text stays inside the quote.
        quoted = "\n".join(
            f"> {line}".rstrip()
            for line in ctx["how_to_acknowledge"].strip().splitlines()
        )
        cite_block = (
            "## How to cite\n\n"
            "Please follow the upstream dataset's citation policy:\n\n"
            f"{quoted}\n"
        )
    elif ctx["references"]:
        cite_block = "## References\n\n" + "\n".join(
            f"- {r}" for r in ctx["references"][:5]
        )

    footer = (
        "\n---\n\n"
        "_This repo is auto-generated from [dataset_summary.csv]"
        f"({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) + the "
        "EEGDash API. Edit the upstream source, not this file._"
    )

    return f"""---
tags:
{yaml_tags}
license: {license_slug}
size_categories:
- {size_cat}
pretty_name: {pretty_name}
{yaml_authors}---

{hero}

{load_block}

{meta_table}

{links_block}

{cite_block}
{footer}
"""
408
+
409
+
410
def _render_pointer(ctx: dict[str, Any]) -> str:
    """Small machine-readable sibling — the same fields the web catalog uses.

    Args:
        ctx: Render context as produced by ``_build_context``.

    Returns:
        Pretty-printed JSON (2-space indent, non-ASCII kept as is) ending
        with a newline. ``source_url`` is the API-provided URL when present;
        otherwise the landing page of the declared source: NEMAR for
        NEMAR-hosted datasets, OpenNeuro for everything else.
    """
    source_url = ctx["source_url"]
    if not source_url:
        # Both landing URLs are always populated, so choose by declared
        # source rather than by "first non-empty".
        if ctx["source"].lower() == "nemar":
            source_url = ctx["nemar_url"] or ctx["openneuro_url"]
        else:
            source_url = ctx["openneuro_url"] or ctx["nemar_url"]

    return json.dumps(
        {
            "dataset_id": ctx["dataset_id"],
            "title": ctx["title"],
            "source": ctx["source"],
            "source_url": source_url,
            "doi": ctx["doi"],
            "license": ctx["license"],
            "loader": {
                "library": "eegdash",
                "class": "EEGDashDataset",
                "kwargs": {"dataset": ctx["dataset_id"]},
            },
            "catalog": CATALOG_SPACE,
            "generated_by": "huggingface-space/scripts/push_metadata_stubs.py",
        },
        indent=2,
        ensure_ascii=False,
    ) + "\n"
431
+
432
+
433
+ # ---------------------------------------------------------------------------
434
+ # Push logic.
435
+ # ---------------------------------------------------------------------------
436
+
437
+
438
def _iter_slugs(df: pd.DataFrame, args: argparse.Namespace) -> Iterable[pd.Series]:
    """Yield the CSV rows selected on the command line.

    ``--dataset`` takes precedence and matches the ``dataset`` column
    case-insensitively. Otherwise ``--all`` yields every row, capped to the
    first ``--limit`` rows when a limit is set. With neither option the
    generator raises ``SystemExit`` on first use.
    """
    if args.dataset:
        requested = {slug.lower() for slug in args.dataset}
        for _, record in df.iterrows():
            if str(record["dataset"]).lower() in requested:
                yield record
    elif args.all:
        frame = df.head(args.limit) if args.limit else df
        for _, record in frame.iterrows():
            yield record
    else:
        raise SystemExit("Pass --dataset <slug> [...] or --all")
451
+
452
+
453
def _push_one(ctx: dict[str, Any], args: argparse.Namespace) -> str:
    """Create (or reuse) the stub repo for one dataset and upload its files.

    The repo is ``<HF_ORG>/<dataset_id>`` of type ``dataset``. ``README.md``
    and ``eegdash.json`` are rendered into a temporary directory, and that
    directory is uploaded in a single commit.

    Returns:
        The repo id that was pushed to.
    """
    from huggingface_hub import HfApi  # noqa: WPS433

    dataset_id = ctx["dataset_id"]
    repo_id = f"{HF_ORG}/{dataset_id}"
    hub = HfApi(token=args.token)
    hub.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        exist_ok=True,
        private=args.private,
    )

    renderers = (("README.md", _render_readme), ("eegdash.json", _render_pointer))
    with tempfile.TemporaryDirectory() as staging:
        for filename, render in renderers:
            (Path(staging) / filename).write_text(render(ctx), encoding="utf-8")
        hub.upload_folder(
            repo_id=repo_id,
            folder_path=staging,
            repo_type="dataset",
            commit_message=f"Metadata stub for {dataset_id}",
        )
    return repo_id
476
+
477
+
478
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build stubs and preview or push them.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success (including dry-run), ``1`` if any push failed.

    Raises:
        SystemExit: on invalid arguments, or when no CSV row matches the
            requested slugs.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--dataset", nargs="+", help="One or more slugs.")
    parser.add_argument("--all", action="store_true", help="Every row in the CSV.")
    parser.add_argument("--limit", type=int, default=0, help="Cap --all to N rows.")
    parser.add_argument("--skip-existing", action="store_true")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help=(
            "Write README + pointer previews (first 3 matching rows) to "
            "--dry-run-out, no push."
        ),
    )
    parser.add_argument("--dry-run-out", type=Path, default=Path("/tmp/stub_preview"))
    parser.add_argument("--private", action="store_true")
    parser.add_argument("--token", default=os.environ.get("HF_TOKEN"))
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args(argv)
    if args.limit < 0:
        # DataFrame.head(-n) means "all but the last n rows", which is never
        # what a cap is meant to do.
        parser.error("--limit must be >= 0")

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
    )

    df = pd.read_csv(CSV_PATH)
    rows = list(_iter_slugs(df, args))
    if not rows:
        raise SystemExit("No rows matched the given slugs.")

    existing: set[str] = set()
    if args.skip_existing and not args.dry_run:
        from huggingface_hub import HfApi  # noqa: WPS433

        # Pass the token so the listing sees the same repos the push will
        # touch (private repos included).
        existing = {
            r.id.split("/", 1)[-1]
            for r in HfApi(token=args.token).list_datasets(author=HF_ORG, limit=2000)
        }

    if args.dry_run:
        args.dry_run_out.mkdir(parents=True, exist_ok=True)
        for r in rows[:3]:
            ctx = _build_context(r)
            (args.dry_run_out / f"{ctx['dataset_id']}_README.md").write_text(
                _render_readme(ctx), encoding="utf-8"
            )
            (args.dry_run_out / f"{ctx['dataset_id']}_eegdash.json").write_text(
                _render_pointer(ctx), encoding="utf-8"
            )
            logger.info("Wrote dry-run preview for %s", ctx["dataset_id"])
        logger.info("Dry-run output: %s", args.dry_run_out)
        return 0

    pushed = 0
    failed: list[tuple[str, str]] = []
    for r in rows:
        slug = str(r["dataset"]).strip().lower()
        if slug in existing:
            logger.info("skipping %s (exists)", slug)
            continue
        try:
            ctx = _build_context(r)
            repo_id = _push_one(ctx, args)
            logger.info("pushed %s", repo_id)
            pushed += 1
        except Exception as exc:  # noqa: BLE001
            # Top-level boundary: record the failure and keep going so one
            # bad dataset does not abort the batch.
            logger.exception("failed %s", slug)
            failed.append((slug, str(exc)))
        # Be polite to the API and HF.
        time.sleep(0.25)

    if failed:
        logger.error("%d failures:", len(failed))
        for slug, err in failed:
            logger.error("  %s — %s", slug, err)
        return 1
    # Count what was actually pushed; ``existing`` holds every repo in the
    # org, not just the ones skipped in this run.
    logger.info("done — %d stubs processed", pushed)
    return 0


if __name__ == "__main__":
    sys.exit(main())