Commit ·
4af5c7a
1
Parent(s): 5c60b17
Richer stubs: canonical aliases, upstream README verbatim, demographics, funding, provenance
Browse files- scripts/push_metadata_stubs.py +341 -78
scripts/push_metadata_stubs.py
CHANGED
|
@@ -167,6 +167,34 @@ def _fetch_api_summary(dataset_id: str, timeout: float = 10.0) -> dict[str, Any]
|
|
| 167 |
# ---------------------------------------------------------------------------
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
def _build_context(row: pd.Series) -> dict[str, Any]:
|
| 171 |
dataset_id = _clean_value(row.get("dataset")).lower()
|
| 172 |
api = _fetch_api_summary(dataset_id)
|
|
@@ -183,44 +211,100 @@ def _build_context(row: pd.Series) -> dict[str, Any]:
|
|
| 183 |
api.get("computed_title") or api.get("name")
|
| 184 |
)
|
| 185 |
doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
|
| 186 |
-
# DOIs sometimes ship with a "doi:" prefix — strip so links don't double up.
|
| 187 |
doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
|
| 189 |
authors = _normalize_list(api.get("authors"))
|
| 190 |
source = _clean_value(row.get("source")) or "OpenNeuro"
|
| 191 |
|
| 192 |
-
# Year from API timestamps (docs does the same)
|
| 193 |
-
year = ""
|
| 194 |
ts = api.get("timestamps") or {}
|
|
|
|
| 195 |
created = ts.get("dataset_created_at") or ""
|
| 196 |
if isinstance(created, str) and len(created) >= 4:
|
| 197 |
year = created[:4]
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
return {
|
| 200 |
"dataset_id": dataset_id,
|
| 201 |
"title": title or dataset_id,
|
| 202 |
"author_year": _clean_value(row.get("author_year")),
|
|
|
|
| 203 |
"authors": authors,
|
|
|
|
|
|
|
|
|
|
| 204 |
"year": year,
|
| 205 |
"license": license_ or "Unknown",
|
| 206 |
"doi": doi,
|
|
|
|
| 207 |
"source": source,
|
| 208 |
"openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
|
| 209 |
"nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
|
| 210 |
-
"source_url": _clean_value(api.get("source_url")),
|
|
|
|
|
|
|
| 211 |
"record_modality": _clean_value(row.get("record_modality")),
|
| 212 |
-
"modality_exp": _clean_value(row.get("modality of exp")),
|
| 213 |
-
"type_exp": _clean_value(row.get("type of exp")),
|
| 214 |
-
"pathology": _clean_value(row.get("Type Subject")),
|
| 215 |
-
"
|
|
|
|
| 216 |
"n_records": pick("n_records", "total_files"),
|
| 217 |
"n_tasks": pick("n_tasks", "n_tasks"),
|
| 218 |
-
"n_channels": _format_stat_counts(row.get("nchans_set")),
|
| 219 |
-
"sampling_freqs": _format_stat_counts(row.get("sampling_freqs")),
|
| 220 |
"size": _clean_value(row.get("size")),
|
| 221 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
"references": _normalize_list(api.get("references")),
|
| 223 |
"how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
}
|
| 225 |
|
| 226 |
|
|
@@ -265,8 +349,33 @@ def _size_category(n_records: str) -> str:
|
|
| 265 |
return "10K<n<100K"
|
| 266 |
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
def _render_readme(ctx: dict[str, Any]) -> str:
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
rm = ctx["record_modality"].lower()
|
| 271 |
if rm in {"eeg", "meg", "ieeg"}:
|
| 272 |
tags.insert(0, rm)
|
|
@@ -274,40 +383,97 @@ def _render_readme(ctx: dict[str, Any]) -> str:
|
|
| 274 |
tags.insert(0, "eeg")
|
| 275 |
if ctx["modality_exp"]:
|
| 276 |
tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
|
|
|
|
|
|
|
| 277 |
if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
|
| 278 |
tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
license_slug = _hf_license(ctx["license"])
|
| 281 |
size_cat = _size_category(ctx["n_records"])
|
| 282 |
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
if ctx["authors"]:
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
# --
|
| 291 |
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
hero_lines.append(f"# {ctx['title']}")
|
| 295 |
-
else:
|
| 296 |
-
hero_lines.append(f"# {ctx['dataset_id']}")
|
| 297 |
if ctx["author_year"]:
|
| 298 |
-
|
| 299 |
elif ctx["authors"]:
|
| 300 |
head = ctx["authors"][0]
|
| 301 |
-
extra =
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
load_block = f"""## Load this dataset
|
| 307 |
|
| 308 |
-
This repo is a **pointer**
|
| 309 |
-
(OpenNeuro / NEMAR)
|
| 310 |
-
|
| 311 |
|
| 312 |
```python
|
| 313 |
# pip install eegdash
|
|
@@ -316,9 +482,9 @@ from eegdash import EEGDashDataset
|
|
| 316 |
ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
|
| 317 |
print(len(ds), "recordings")
|
| 318 |
```
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
|
| 323 |
```python
|
| 324 |
from braindecode.datasets import BaseConcatDataset
|
|
@@ -326,85 +492,182 @@ ds = BaseConcatDataset.pull_from_hub("{HF_ORG}/{ctx['dataset_id']}")
|
|
| 326 |
```
|
| 327 |
"""
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
rows = [
|
| 330 |
("Subjects", ctx["n_subjects"]),
|
|
|
|
| 331 |
("Recordings", ctx["n_records"]),
|
| 332 |
-
("Tasks", ctx["n_tasks"]),
|
|
|
|
| 333 |
("Channels", ctx["n_channels"]),
|
| 334 |
("Sampling rate (Hz)", ctx["sampling_freqs"]),
|
| 335 |
-
("Size on disk", ctx["size"]),
|
| 336 |
("Total duration (h)", ctx["duration_hours_total"]),
|
|
|
|
|
|
|
| 337 |
("Experimental modality", ctx["modality_exp"]),
|
| 338 |
-
("
|
| 339 |
("Population", ctx["pathology"]),
|
| 340 |
-
("
|
|
|
|
|
|
|
| 341 |
("Source", ctx["source"]),
|
| 342 |
("License", ctx["license"]),
|
|
|
|
| 343 |
]
|
| 344 |
md_rows = "\n".join(
|
| 345 |
-
f"| **{k}** | {v
|
| 346 |
)
|
|
|
|
| 347 |
|
| 348 |
-
|
| 349 |
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
{
|
| 353 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
links = []
|
| 356 |
if ctx["doi"]:
|
| 357 |
links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
if ctx["source"].lower() == "openneuro":
|
| 359 |
links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
|
| 360 |
if ctx["source"].lower() == "nemar":
|
| 361 |
links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
|
| 362 |
-
if ctx["source_url"]:
|
| 363 |
links.append(f"- **Source:** <{ctx['source_url']}>")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
|
| 365 |
links.append(f"- **Docs:** <{EEGDASH_URL}>")
|
| 366 |
links.append(f"- **Code:** <{GITHUB_URL}>")
|
| 367 |
links_block = "## Links\n\n" + "\n".join(links)
|
| 368 |
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
f"
|
|
|
|
| 375 |
)
|
| 376 |
-
elif ctx["
|
| 377 |
-
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
)
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
footer = (
|
| 382 |
-
f"
|
| 383 |
-
f"
|
| 384 |
-
f"({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv)
|
| 385 |
-
f"
|
|
|
|
|
|
|
| 386 |
)
|
| 387 |
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
{cite_block}
|
| 406 |
-
{footer}
|
| 407 |
-
"""
|
| 408 |
|
| 409 |
|
| 410 |
def _render_pointer(ctx: dict[str, Any]) -> str:
|
|
|
|
| 167 |
# ---------------------------------------------------------------------------
|
| 168 |
|
| 169 |
|
| 170 |
+
def _parse_canonical_names(cell: Any) -> list[str]:
    """Turn a ``canonical_name`` CSV cell into a list of alias identifiers.

    Intended to match the output of
    ``eegdash.dataset.registry._parse_canonical_names`` so the aliases
    rendered here are the ones the runtime registry would register.

    The cell is expected to hold a JSON array string. When JSON decoding
    fails, the text is retried as a Python literal. Anything that is empty,
    undecodable, or not a list/tuple yields an empty list.

    Args:
        cell: Raw cell value; normalised through ``_clean_value`` first.

    Returns:
        The stripped string form of each element that is a valid Python
        identifier, in original order.
    """
    text = _clean_value(cell)
    if not text:
        return []

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        # Not JSON: fall back to parsing the cell as a Python literal.
        try:
            decoded = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return []

    if not isinstance(decoded, (list, tuple)):
        return []

    # Elements are stringified before validation, so non-string entries are
    # judged by their ``str()`` form.
    candidates = (str(item).strip() for item in decoded)
    return [alias for alias in candidates if alias and alias.isidentifier()]
|
| 196 |
+
|
| 197 |
+
|
| 198 |
def _build_context(row: pd.Series) -> dict[str, Any]:
|
| 199 |
dataset_id = _clean_value(row.get("dataset")).lower()
|
| 200 |
api = _fetch_api_summary(dataset_id)
|
|
|
|
| 211 |
api.get("computed_title") or api.get("name")
|
| 212 |
)
|
| 213 |
doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
|
|
|
|
| 214 |
doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
|
| 215 |
+
paper_doi_raw = _clean_value(api.get("associated_paper_doi"))
|
| 216 |
+
paper_doi = (
|
| 217 |
+
paper_doi_raw[4:].strip()
|
| 218 |
+
if paper_doi_raw.lower().startswith("doi:")
|
| 219 |
+
else paper_doi_raw
|
| 220 |
+
)
|
| 221 |
license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
|
| 222 |
authors = _normalize_list(api.get("authors"))
|
| 223 |
source = _clean_value(row.get("source")) or "OpenNeuro"
|
| 224 |
|
|
|
|
|
|
|
| 225 |
ts = api.get("timestamps") or {}
|
| 226 |
+
year = ""
|
| 227 |
created = ts.get("dataset_created_at") or ""
|
| 228 |
if isinstance(created, str) and len(created) >= 4:
|
| 229 |
year = created[:4]
|
| 230 |
|
| 231 |
+
# Canonical aliases: CSV first (filtered the same way the runtime registry
|
| 232 |
+
# filters), API second as a safety net.
|
| 233 |
+
canonical_names = _parse_canonical_names(row.get("canonical_name"))
|
| 234 |
+
if not canonical_names:
|
| 235 |
+
raw = api.get("canonical_name")
|
| 236 |
+
if isinstance(raw, list):
|
| 237 |
+
canonical_names = [
|
| 238 |
+
str(n).strip()
|
| 239 |
+
for n in raw
|
| 240 |
+
if isinstance(n, str) and str(n).strip().isidentifier()
|
| 241 |
+
]
|
| 242 |
+
|
| 243 |
+
# Duration: prefer CSV hours, else API seconds → hours
|
| 244 |
+
dur_h = _format_hours(row.get("duration_hours_total"))
|
| 245 |
+
if not dur_h:
|
| 246 |
+
sec = _clean_value(api.get("total_duration_s"))
|
| 247 |
+
if sec:
|
| 248 |
+
try:
|
| 249 |
+
dur_h = f"{float(sec) / 3600:,.1f}"
|
| 250 |
+
except ValueError:
|
| 251 |
+
dur_h = ""
|
| 252 |
+
|
| 253 |
+
demographics = api.get("demographics") or {}
|
| 254 |
+
storage = api.get("storage") or {}
|
| 255 |
+
external = api.get("external_links") or {}
|
| 256 |
+
api_tags = api.get("tags") or {}
|
| 257 |
+
|
| 258 |
return {
|
| 259 |
"dataset_id": dataset_id,
|
| 260 |
"title": title or dataset_id,
|
| 261 |
"author_year": _clean_value(row.get("author_year")),
|
| 262 |
+
"canonical_names": canonical_names,
|
| 263 |
"authors": authors,
|
| 264 |
+
"senior_author": _clean_value(api.get("senior_author")),
|
| 265 |
+
"contact_info": _normalize_list(api.get("contact_info")),
|
| 266 |
+
"contributing_labs": _normalize_list(api.get("contributing_labs")),
|
| 267 |
"year": year,
|
| 268 |
"license": license_ or "Unknown",
|
| 269 |
"doi": doi,
|
| 270 |
+
"paper_doi": paper_doi,
|
| 271 |
"source": source,
|
| 272 |
"openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
|
| 273 |
"nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
|
| 274 |
+
"source_url": _clean_value(api.get("source_url")) or _clean_value(external.get("source_url")),
|
| 275 |
+
"osf_url": _clean_value(external.get("osf_url")),
|
| 276 |
+
"github_url": _clean_value(external.get("github_url")),
|
| 277 |
"record_modality": _clean_value(row.get("record_modality")),
|
| 278 |
+
"modality_exp": _clean_value(row.get("modality of exp")) or _clean_value(api_tags.get("modality")),
|
| 279 |
+
"type_exp": _clean_value(row.get("type of exp")) or _clean_value(api_tags.get("type")),
|
| 280 |
+
"pathology": _clean_value(row.get("Type Subject")) or _clean_value(api_tags.get("pathology")),
|
| 281 |
+
"tasks_list": _normalize_list(api.get("tasks")),
|
| 282 |
+
"n_subjects": pick("n_subjects", "n_subjects") or str(_clean_value(demographics.get("subjects_count")) or ""),
|
| 283 |
"n_records": pick("n_records", "total_files"),
|
| 284 |
"n_tasks": pick("n_tasks", "n_tasks"),
|
| 285 |
+
"n_channels": _format_stat_counts(row.get("nchans_set")) or _format_stat_counts(api.get("nchans_counts")),
|
| 286 |
+
"sampling_freqs": _format_stat_counts(row.get("sampling_freqs")) or _format_stat_counts(api.get("sfreq_counts")),
|
| 287 |
"size": _clean_value(row.get("size")),
|
| 288 |
+
"size_bytes": _clean_value(api.get("size_bytes")),
|
| 289 |
+
"duration_hours_total": dur_h,
|
| 290 |
+
"bids_version": _clean_value(api.get("bids_version")),
|
| 291 |
+
"age_min": _clean_value(demographics.get("age_min")),
|
| 292 |
+
"age_max": _clean_value(demographics.get("age_max")),
|
| 293 |
+
"age_mean": _clean_value(demographics.get("age_mean")),
|
| 294 |
+
"sessions": _normalize_list(api.get("sessions")),
|
| 295 |
+
"study_design": _clean_value(api.get("study_design")),
|
| 296 |
+
"study_domain": _clean_value(api.get("study_domain")),
|
| 297 |
+
"experimental_modalities": _normalize_list(api.get("experimental_modalities")),
|
| 298 |
+
"datatypes": _normalize_list(api.get("datatypes")),
|
| 299 |
+
"funding": _normalize_list(api.get("funding")),
|
| 300 |
"references": _normalize_list(api.get("references")),
|
| 301 |
"how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
|
| 302 |
+
"readme": _clean_value(api.get("readme")),
|
| 303 |
+
"nemar_citations": _clean_value(api.get("nemar_citation_count")) or _clean_value(row.get("nemar_citation_count")),
|
| 304 |
+
"storage_backend": _clean_value(storage.get("backend")),
|
| 305 |
+
"storage_base": _clean_value(storage.get("base")),
|
| 306 |
+
"digested_at": _clean_value(ts.get("digested_at")),
|
| 307 |
+
"stats_computed_at": _clean_value(api.get("stats_computed_at")),
|
| 308 |
}
|
| 309 |
|
| 310 |
|
|
|
|
| 349 |
return "10K<n<100K"
|
| 350 |
|
| 351 |
|
| 352 |
+
def _escape_yaml(s: str) -> str:
|
| 353 |
+
"""Quote a YAML string value safely. Assumes the content is plain text."""
|
| 354 |
+
return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def _sanitize_upstream_readme(text: str) -> str:
|
| 358 |
+
"""Defuse markers that could confuse HF's frontmatter parser.
|
| 359 |
+
|
| 360 |
+
An upstream README that happens to start a line with ``---`` on its
|
| 361 |
+
own renders fine in the body of a Markdown doc, but trailing YAML
|
| 362 |
+
blocks at the top of a mixed document can trip some parsers. We also
|
| 363 |
+
strip ingested-time pollution ("Introduction:" header styling etc.
|
| 364 |
+
stays intact — only raw markers get touched).
|
| 365 |
+
"""
|
| 366 |
+
out_lines: list[str] = []
|
| 367 |
+
for ln in text.splitlines():
|
| 368 |
+
if ln.strip() == "---":
|
| 369 |
+
out_lines.append("***") # visual divider instead
|
| 370 |
+
else:
|
| 371 |
+
out_lines.append(ln)
|
| 372 |
+
return "\n".join(out_lines).strip()
|
| 373 |
+
|
| 374 |
+
|
| 375 |
def _render_readme(ctx: dict[str, Any]) -> str:
|
| 376 |
+
# -- Frontmatter -------------------------------------------------------
|
| 377 |
+
|
| 378 |
+
tags = ["neuroscience", "eegdash", "brain-computer-interface", "pytorch"]
|
| 379 |
rm = ctx["record_modality"].lower()
|
| 380 |
if rm in {"eeg", "meg", "ieeg"}:
|
| 381 |
tags.insert(0, rm)
|
|
|
|
| 383 |
tags.insert(0, "eeg")
|
| 384 |
if ctx["modality_exp"]:
|
| 385 |
tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
|
| 386 |
+
if ctx["type_exp"]:
|
| 387 |
+
tags.append(ctx["type_exp"].lower().replace(" ", "-").replace("/", "-"))
|
| 388 |
if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
|
| 389 |
tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
|
| 390 |
+
for t in ctx["tasks_list"][:5]:
|
| 391 |
+
slug = t.lower().replace("_", "-").replace(" ", "-")
|
| 392 |
+
if slug and slug not in tags:
|
| 393 |
+
tags.append(slug)
|
| 394 |
+
# Dedupe while preserving order.
|
| 395 |
+
tags = list(dict.fromkeys(tags))
|
| 396 |
|
| 397 |
license_slug = _hf_license(ctx["license"])
|
| 398 |
size_cat = _size_category(ctx["n_records"])
|
| 399 |
|
| 400 |
+
yaml_parts = ["---"]
|
| 401 |
+
yaml_parts.append(f"pretty_name: {_escape_yaml(ctx['title'] or ctx['dataset_id'])}")
|
| 402 |
+
yaml_parts.append(f"license: {license_slug}")
|
| 403 |
+
yaml_parts.append("tags:")
|
| 404 |
+
for t in tags:
|
| 405 |
+
yaml_parts.append(f" - {t}")
|
| 406 |
+
yaml_parts.append("size_categories:")
|
| 407 |
+
yaml_parts.append(f" - {size_cat}")
|
| 408 |
+
if ctx["record_modality"]:
|
| 409 |
+
yaml_parts.append("task_categories:")
|
| 410 |
+
yaml_parts.append(" - other")
|
| 411 |
if ctx["authors"]:
|
| 412 |
+
yaml_parts.append("authors:")
|
| 413 |
+
for a in ctx["authors"][:12]:
|
| 414 |
+
yaml_parts.append(f" - {_escape_yaml(a)}")
|
| 415 |
+
yaml_parts.append("---")
|
| 416 |
+
frontmatter = "\n".join(yaml_parts)
|
| 417 |
|
| 418 |
+
# -- Hero --------------------------------------------------------------
|
| 419 |
|
| 420 |
+
hero_title = ctx["title"] or ctx["dataset_id"]
|
| 421 |
+
attribution = ""
|
|
|
|
|
|
|
|
|
|
| 422 |
if ctx["author_year"]:
|
| 423 |
+
attribution = ctx["author_year"]
|
| 424 |
elif ctx["authors"]:
|
| 425 |
head = ctx["authors"][0]
|
| 426 |
+
extra = " et al." if len(ctx["authors"]) > 1 else ""
|
| 427 |
+
attribution = head + extra + (f" ({ctx['year']})" if ctx["year"] else "")
|
| 428 |
+
alias_line = ""
|
| 429 |
+
if ctx["canonical_names"]:
|
| 430 |
+
joined = " · ".join(f"`{n}`" for n in ctx["canonical_names"])
|
| 431 |
+
alias_line = f"**Canonical aliases:** {joined}"
|
| 432 |
+
hero_bits = [f"# {hero_title}", f"**Dataset ID:** `{ctx['dataset_id']}`"]
|
| 433 |
+
if attribution:
|
| 434 |
+
hero_bits.append(f"_{attribution}_")
|
| 435 |
+
if alias_line:
|
| 436 |
+
hero_bits.append(alias_line)
|
| 437 |
+
hero = "\n\n".join(hero_bits)
|
| 438 |
+
|
| 439 |
+
# -- Summary line (3-second takeaway) ---------------------------------
|
| 440 |
+
|
| 441 |
+
tl_bits = []
|
| 442 |
+
if ctx["record_modality"]:
|
| 443 |
+
tl_bits.append(ctx["record_modality"].upper())
|
| 444 |
+
if ctx["modality_exp"] and ctx["type_exp"]:
|
| 445 |
+
tl_bits.append(f"{ctx['modality_exp']} {ctx['type_exp'].lower()}")
|
| 446 |
+
elif ctx["modality_exp"]:
|
| 447 |
+
tl_bits.append(ctx["modality_exp"])
|
| 448 |
+
if ctx["pathology"]:
|
| 449 |
+
tl_bits.append(ctx["pathology"].lower())
|
| 450 |
+
if ctx["n_subjects"]:
|
| 451 |
+
tl_bits.append(f"{ctx['n_subjects']} subjects")
|
| 452 |
+
if ctx["n_records"]:
|
| 453 |
+
tl_bits.append(f"{ctx['n_records']} recordings")
|
| 454 |
+
if ctx["license"]:
|
| 455 |
+
tl_bits.append(ctx["license"])
|
| 456 |
+
tldr = "> **At a glance:** " + " · ".join(tl_bits) if tl_bits else ""
|
| 457 |
+
|
| 458 |
+
# -- Load section ------------------------------------------------------
|
| 459 |
+
|
| 460 |
+
aliases_hint = ""
|
| 461 |
+
if ctx["canonical_names"]:
|
| 462 |
+
a0 = ctx["canonical_names"][0]
|
| 463 |
+
aliases_hint = (
|
| 464 |
+
f"\nYou can also load it by canonical alias — these are registered "
|
| 465 |
+
f"classes in `eegdash.dataset`:\n\n"
|
| 466 |
+
f"```python\n"
|
| 467 |
+
f"from eegdash.dataset import {a0}\n"
|
| 468 |
+
f"ds = {a0}(cache_dir=\"./cache\")\n"
|
| 469 |
+
f"```\n"
|
| 470 |
+
)
|
| 471 |
|
| 472 |
load_block = f"""## Load this dataset
|
| 473 |
|
| 474 |
+
This repo is a **pointer**. The raw EEG data lives at its canonical source
|
| 475 |
+
(OpenNeuro / NEMAR); [EEGDash](https://github.com/eegdash/EEGDash) streams it
|
| 476 |
+
on demand and returns a PyTorch / braindecode dataset.
|
| 477 |
|
| 478 |
```python
|
| 479 |
# pip install eegdash
|
|
|
|
| 482 |
ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
|
| 483 |
print(len(ds), "recordings")
|
| 484 |
```
|
| 485 |
+
{aliases_hint}
|
| 486 |
+
If the dataset has been mirrored to the HF Hub in braindecode's Zarr layout,
|
| 487 |
+
you can also pull it directly:
|
| 488 |
|
| 489 |
```python
|
| 490 |
from braindecode.datasets import BaseConcatDataset
|
|
|
|
| 492 |
```
|
| 493 |
"""
|
| 494 |
|
| 495 |
+
# -- Metadata table ---------------------------------------------------
|
| 496 |
+
|
| 497 |
+
age_str = ""
|
| 498 |
+
if ctx["age_min"] or ctx["age_max"] or ctx["age_mean"]:
|
| 499 |
+
parts = []
|
| 500 |
+
if ctx["age_min"] and ctx["age_max"]:
|
| 501 |
+
parts.append(f"{ctx['age_min']}–{ctx['age_max']} yrs")
|
| 502 |
+
if ctx["age_mean"]:
|
| 503 |
+
try:
|
| 504 |
+
parts.append(f"mean {float(ctx['age_mean']):.1f}")
|
| 505 |
+
except ValueError:
|
| 506 |
+
parts.append(f"mean {ctx['age_mean']}")
|
| 507 |
+
age_str = ", ".join(parts)
|
| 508 |
+
|
| 509 |
rows = [
|
| 510 |
("Subjects", ctx["n_subjects"]),
|
| 511 |
+
("Age range", age_str),
|
| 512 |
("Recordings", ctx["n_records"]),
|
| 513 |
+
("Tasks (count)", ctx["n_tasks"]),
|
| 514 |
+
("Sessions", str(len(ctx["sessions"])) if ctx["sessions"] else ""),
|
| 515 |
("Channels", ctx["n_channels"]),
|
| 516 |
("Sampling rate (Hz)", ctx["sampling_freqs"]),
|
|
|
|
| 517 |
("Total duration (h)", ctx["duration_hours_total"]),
|
| 518 |
+
("Size on disk", ctx["size"]),
|
| 519 |
+
("Recording type", ctx["record_modality"].upper() if ctx["record_modality"] else ""),
|
| 520 |
("Experimental modality", ctx["modality_exp"]),
|
| 521 |
+
("Paradigm type", ctx["type_exp"]),
|
| 522 |
("Population", ctx["pathology"]),
|
| 523 |
+
("Study design", ctx["study_design"]),
|
| 524 |
+
("Study domain", ctx["study_domain"]),
|
| 525 |
+
("BIDS version", ctx["bids_version"]),
|
| 526 |
("Source", ctx["source"]),
|
| 527 |
("License", ctx["license"]),
|
| 528 |
+
("NEMAR citations", ctx["nemar_citations"]),
|
| 529 |
]
|
| 530 |
md_rows = "\n".join(
|
| 531 |
+
f"| **{k}** | {v} |" for k, v in rows if str(v or "").strip()
|
| 532 |
)
|
| 533 |
+
meta_table = "## Dataset metadata\n\n| | |\n|---|---|\n" + md_rows
|
| 534 |
|
| 535 |
+
# -- Tasks list (if any) ----------------------------------------------
|
| 536 |
|
| 537 |
+
tasks_block = ""
|
| 538 |
+
if ctx["tasks_list"]:
|
| 539 |
+
items = "\n".join(f"- `{t}`" for t in ctx["tasks_list"])
|
| 540 |
+
tasks_block = f"## Tasks\n\n{items}\n"
|
| 541 |
+
|
| 542 |
+
# -- Upstream README (the star of the show) ---------------------------
|
| 543 |
+
|
| 544 |
+
upstream_block = ""
|
| 545 |
+
if ctx["readme"]:
|
| 546 |
+
body = _sanitize_upstream_readme(ctx["readme"])
|
| 547 |
+
upstream_block = (
|
| 548 |
+
"## Upstream README\n\n"
|
| 549 |
+
"_Verbatim from the dataset's authors — the canonical "
|
| 550 |
+
"description._\n\n"
|
| 551 |
+
f"{body}\n"
|
| 552 |
+
)
|
| 553 |
+
|
| 554 |
+
# -- People -----------------------------------------------------------
|
| 555 |
+
|
| 556 |
+
people_lines = []
|
| 557 |
+
if ctx["authors"]:
|
| 558 |
+
people_lines.append("### Authors")
|
| 559 |
+
for a in ctx["authors"]:
|
| 560 |
+
marker = " _(senior)_" if a.strip() == ctx["senior_author"].strip() else ""
|
| 561 |
+
people_lines.append(f"- {a}{marker}")
|
| 562 |
+
if ctx["contributing_labs"]:
|
| 563 |
+
people_lines.append("\n### Contributing labs")
|
| 564 |
+
for lab in ctx["contributing_labs"]:
|
| 565 |
+
people_lines.append(f"- {lab}")
|
| 566 |
+
if ctx["contact_info"]:
|
| 567 |
+
people_lines.append("\n### Contact")
|
| 568 |
+
for c in ctx["contact_info"]:
|
| 569 |
+
people_lines.append(f"- {c}")
|
| 570 |
+
people_block = "## People\n\n" + "\n".join(people_lines) if people_lines else ""
|
| 571 |
+
|
| 572 |
+
# -- Funding + references ---------------------------------------------
|
| 573 |
+
|
| 574 |
+
funding_block = ""
|
| 575 |
+
if ctx["funding"]:
|
| 576 |
+
items = "\n".join(f"- {f}" for f in ctx["funding"])
|
| 577 |
+
funding_block = f"## Funding\n\n{items}"
|
| 578 |
+
|
| 579 |
+
cite_block = ""
|
| 580 |
+
if ctx["how_to_acknowledge"]:
|
| 581 |
+
cite_block = (
|
| 582 |
+
"## How to cite\n\n"
|
| 583 |
+
"Please follow the upstream dataset's citation policy:\n\n"
|
| 584 |
+
+ "\n".join(
|
| 585 |
+
f"> {ln}" for ln in ctx["how_to_acknowledge"].strip().splitlines()
|
| 586 |
+
)
|
| 587 |
+
)
|
| 588 |
+
if ctx["references"]:
|
| 589 |
+
if cite_block:
|
| 590 |
+
cite_block += "\n\n### References\n\n"
|
| 591 |
+
else:
|
| 592 |
+
cite_block = "## References\n\n"
|
| 593 |
+
cite_block += "\n".join(f"- {r}" for r in ctx["references"])
|
| 594 |
+
|
| 595 |
+
# -- Links ------------------------------------------------------------
|
| 596 |
|
| 597 |
links = []
|
| 598 |
if ctx["doi"]:
|
| 599 |
links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
|
| 600 |
+
if ctx["paper_doi"]:
|
| 601 |
+
links.append(
|
| 602 |
+
f"- **Associated paper:** [{ctx['paper_doi']}]"
|
| 603 |
+
f"(https://doi.org/{ctx['paper_doi']})"
|
| 604 |
+
)
|
| 605 |
if ctx["source"].lower() == "openneuro":
|
| 606 |
links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
|
| 607 |
if ctx["source"].lower() == "nemar":
|
| 608 |
links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
|
| 609 |
+
if ctx["source_url"] and ctx["source_url"] not in (ctx["openneuro_url"], ctx["nemar_url"]):
|
| 610 |
links.append(f"- **Source:** <{ctx['source_url']}>")
|
| 611 |
+
if ctx["osf_url"]:
|
| 612 |
+
links.append(f"- **OSF:** <{ctx['osf_url']}>")
|
| 613 |
+
if ctx["github_url"]:
|
| 614 |
+
links.append(f"- **GitHub:** <{ctx['github_url']}>")
|
| 615 |
links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
|
| 616 |
links.append(f"- **Docs:** <{EEGDASH_URL}>")
|
| 617 |
links.append(f"- **Code:** <{GITHUB_URL}>")
|
| 618 |
links_block = "## Links\n\n" + "\n".join(links)
|
| 619 |
|
| 620 |
+
# -- Provenance (where the data actually lives + when we saw it) ------
|
| 621 |
+
|
| 622 |
+
prov_lines = []
|
| 623 |
+
if ctx["storage_backend"] and ctx["storage_base"]:
|
| 624 |
+
prov_lines.append(
|
| 625 |
+
f"- **Backend:** `{ctx['storage_backend']}` — "
|
| 626 |
+
f"`{ctx['storage_base']}`"
|
| 627 |
)
|
| 628 |
+
elif ctx["storage_backend"]:
|
| 629 |
+
prov_lines.append(f"- **Backend:** `{ctx['storage_backend']}`")
|
| 630 |
+
if ctx["size_bytes"]:
|
| 631 |
+
try:
|
| 632 |
+
sb = float(ctx["size_bytes"])
|
| 633 |
+
prov_lines.append(f"- **Exact size:** {int(sb):,} bytes ({ctx['size']})")
|
| 634 |
+
except ValueError:
|
| 635 |
+
pass
|
| 636 |
+
if ctx["digested_at"]:
|
| 637 |
+
prov_lines.append(f"- **Ingested:** {ctx['digested_at'][:10]}")
|
| 638 |
+
if ctx["stats_computed_at"]:
|
| 639 |
+
prov_lines.append(
|
| 640 |
+
f"- **Stats computed:** {ctx['stats_computed_at'][:10]}"
|
| 641 |
)
|
| 642 |
+
prov_block = "## Provenance\n\n" + "\n".join(prov_lines) if prov_lines else ""
|
| 643 |
+
|
| 644 |
+
# -- Footer -----------------------------------------------------------
|
| 645 |
|
| 646 |
footer = (
|
| 647 |
+
f"---\n\n"
|
| 648 |
+
f"_Auto-generated from "
|
| 649 |
+
f"[dataset_summary.csv]({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) "
|
| 650 |
+
f"and the [EEGDash API]({EEGDASH_API}/datasets/summary/{ctx['dataset_id']}). "
|
| 651 |
+
f"Do not edit this file by hand — update the upstream source and "
|
| 652 |
+
f"re-run `scripts/push_metadata_stubs.py`._"
|
| 653 |
)
|
| 654 |
|
| 655 |
+
sections = [
|
| 656 |
+
frontmatter,
|
| 657 |
+
hero,
|
| 658 |
+
tldr,
|
| 659 |
+
load_block,
|
| 660 |
+
meta_table,
|
| 661 |
+
tasks_block,
|
| 662 |
+
upstream_block,
|
| 663 |
+
people_block,
|
| 664 |
+
funding_block,
|
| 665 |
+
cite_block,
|
| 666 |
+
links_block,
|
| 667 |
+
prov_block,
|
| 668 |
+
footer,
|
| 669 |
+
]
|
| 670 |
+
return "\n\n".join(s for s in sections if s).strip() + "\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
|
| 672 |
|
| 673 |
def _render_pointer(ctx: dict[str, Any]) -> str:
|