bruAristimunha committed on
Commit
4af5c7a
·
1 Parent(s): 5c60b17

Richer stubs: canonical aliases, upstream README verbatim, demographics, funding, provenance

Browse files
Files changed (1) hide show
  1. scripts/push_metadata_stubs.py +341 -78
scripts/push_metadata_stubs.py CHANGED
@@ -167,6 +167,34 @@ def _fetch_api_summary(dataset_id: str, timeout: float = 10.0) -> dict[str, Any]
167
  # ---------------------------------------------------------------------------
168
 
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  def _build_context(row: pd.Series) -> dict[str, Any]:
171
  dataset_id = _clean_value(row.get("dataset")).lower()
172
  api = _fetch_api_summary(dataset_id)
@@ -183,44 +211,100 @@ def _build_context(row: pd.Series) -> dict[str, Any]:
183
  api.get("computed_title") or api.get("name")
184
  )
185
  doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
186
- # DOIs sometimes ship with a "doi:" prefix — strip so links don't double up.
187
  doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
 
 
 
 
 
 
188
  license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
189
  authors = _normalize_list(api.get("authors"))
190
  source = _clean_value(row.get("source")) or "OpenNeuro"
191
 
192
- # Year from API timestamps (docs does the same)
193
- year = ""
194
  ts = api.get("timestamps") or {}
 
195
  created = ts.get("dataset_created_at") or ""
196
  if isinstance(created, str) and len(created) >= 4:
197
  year = created[:4]
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  return {
200
  "dataset_id": dataset_id,
201
  "title": title or dataset_id,
202
  "author_year": _clean_value(row.get("author_year")),
 
203
  "authors": authors,
 
 
 
204
  "year": year,
205
  "license": license_ or "Unknown",
206
  "doi": doi,
 
207
  "source": source,
208
  "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
209
  "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
210
- "source_url": _clean_value(api.get("source_url")),
 
 
211
  "record_modality": _clean_value(row.get("record_modality")),
212
- "modality_exp": _clean_value(row.get("modality of exp")),
213
- "type_exp": _clean_value(row.get("type of exp")),
214
- "pathology": _clean_value(row.get("Type Subject")),
215
- "n_subjects": pick("n_subjects", "n_subjects"),
 
216
  "n_records": pick("n_records", "total_files"),
217
  "n_tasks": pick("n_tasks", "n_tasks"),
218
- "n_channels": _format_stat_counts(row.get("nchans_set")),
219
- "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")),
220
  "size": _clean_value(row.get("size")),
221
- "duration_hours_total": _format_hours(row.get("duration_hours_total")),
 
 
 
 
 
 
 
 
 
 
 
222
  "references": _normalize_list(api.get("references")),
223
  "how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
 
 
 
 
 
 
224
  }
225
 
226
 
@@ -265,8 +349,33 @@ def _size_category(n_records: str) -> str:
265
  return "10K<n<100K"
266
 
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  def _render_readme(ctx: dict[str, Any]) -> str:
269
- tags = ["neuroscience", "eegdash", "brain-computer-interface"]
 
 
270
  rm = ctx["record_modality"].lower()
271
  if rm in {"eeg", "meg", "ieeg"}:
272
  tags.insert(0, rm)
@@ -274,40 +383,97 @@ def _render_readme(ctx: dict[str, Any]) -> str:
274
  tags.insert(0, "eeg")
275
  if ctx["modality_exp"]:
276
  tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
 
 
277
  if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
278
  tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
 
 
 
 
 
 
279
 
280
  license_slug = _hf_license(ctx["license"])
281
  size_cat = _size_category(ctx["n_records"])
282
 
283
- yaml_tags = "\n".join(f"- {t}" for t in tags)
284
- yaml_authors = ""
 
 
 
 
 
 
 
 
 
285
  if ctx["authors"]:
286
- yaml_authors = "authors:\n" + "\n".join(
287
- f" - {a}" for a in ctx["authors"][:8]
288
- ) + "\n"
 
 
289
 
290
- # --- Body -------------------------------------------------------------
291
 
292
- hero_lines = []
293
- if ctx["title"] and ctx["title"].lower() != ctx["dataset_id"].lower():
294
- hero_lines.append(f"# {ctx['title']}")
295
- else:
296
- hero_lines.append(f"# {ctx['dataset_id']}")
297
  if ctx["author_year"]:
298
- hero_lines.append(f"*{ctx['author_year']}*")
299
  elif ctx["authors"]:
300
  head = ctx["authors"][0]
301
- extra = f" et al." if len(ctx["authors"]) > 1 else ""
302
- yr = f" ({ctx['year']})" if ctx["year"] else ""
303
- hero_lines.append(f"*{head}{extra}{yr}*")
304
- hero = "\n\n".join(hero_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  load_block = f"""## Load this dataset
307
 
308
- This repo is a **pointer** the raw EEG data lives at its canonical source
309
- (OpenNeuro / NEMAR). [EEGDash](https://github.com/eegdash/EEGDash) handles the
310
- download, caching, and conversion to a PyTorch / braindecode dataset.
311
 
312
  ```python
313
  # pip install eegdash
@@ -316,9 +482,9 @@ from eegdash import EEGDashDataset
316
  ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
317
  print(len(ds), "recordings")
318
  ```
319
-
320
- Need it in braindecode's HF-native Zarr format? Once mirrored
321
- (`ds.push_to_hub(...)`) you can also do:
322
 
323
  ```python
324
  from braindecode.datasets import BaseConcatDataset
@@ -326,85 +492,182 @@ ds = BaseConcatDataset.pull_from_hub("{HF_ORG}/{ctx['dataset_id']}")
326
  ```
327
  """
328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  rows = [
330
  ("Subjects", ctx["n_subjects"]),
 
331
  ("Recordings", ctx["n_records"]),
332
- ("Tasks", ctx["n_tasks"]),
 
333
  ("Channels", ctx["n_channels"]),
334
  ("Sampling rate (Hz)", ctx["sampling_freqs"]),
335
- ("Size on disk", ctx["size"]),
336
  ("Total duration (h)", ctx["duration_hours_total"]),
 
 
337
  ("Experimental modality", ctx["modality_exp"]),
338
- ("Experimental type", ctx["type_exp"]),
339
  ("Population", ctx["pathology"]),
340
- ("Recording type", ctx["record_modality"].upper()),
 
 
341
  ("Source", ctx["source"]),
342
  ("License", ctx["license"]),
 
343
  ]
344
  md_rows = "\n".join(
345
- f"| **{k}** | {v or '—'} |" for k, v in rows if v or k in {"Source", "License"}
346
  )
 
347
 
348
- meta_table = f"""## Dataset metadata
349
 
350
- | | |
351
- |---|---|
352
- {md_rows}
353
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
  links = []
356
  if ctx["doi"]:
357
  links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
 
 
 
 
 
358
  if ctx["source"].lower() == "openneuro":
359
  links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
360
  if ctx["source"].lower() == "nemar":
361
  links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
362
- if ctx["source_url"]:
363
  links.append(f"- **Source:** <{ctx['source_url']}>")
 
 
 
 
364
  links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
365
  links.append(f"- **Docs:** <{EEGDASH_URL}>")
366
  links.append(f"- **Code:** <{GITHUB_URL}>")
367
  links_block = "## Links\n\n" + "\n".join(links)
368
 
369
- cite_block = ""
370
- if ctx["how_to_acknowledge"]:
371
- cite_block = (
372
- "## How to cite\n\n"
373
- "Please follow the upstream dataset's citation policy:\n\n"
374
- f"> {ctx['how_to_acknowledge'].strip()}\n"
 
375
  )
376
- elif ctx["references"]:
377
- cite_block = "## References\n\n" + "\n".join(
378
- f"- {r}" for r in ctx["references"][:5]
 
 
 
 
 
 
 
 
 
 
379
  )
 
 
 
380
 
381
  footer = (
382
- f"\n---\n\n"
383
- f"_This repo is auto-generated from [dataset_summary.csv]"
384
- f"({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) + the "
385
- f"EEGDash API. Edit the upstream source, not this file._"
 
 
386
  )
387
 
388
- return f"""---
389
- tags:
390
- {yaml_tags}
391
- license: {license_slug}
392
- size_categories:
393
- - {size_cat}
394
- pretty_name: "{ctx['title'] or ctx['dataset_id']}"
395
- {yaml_authors}---
396
-
397
- {hero}
398
-
399
- {load_block}
400
-
401
- {meta_table}
402
-
403
- {links_block}
404
-
405
- {cite_block}
406
- {footer}
407
- """
408
 
409
 
410
  def _render_pointer(ctx: dict[str, Any]) -> str:
 
167
  # ---------------------------------------------------------------------------
168
 
169
 
170
def _parse_canonical_names(cell: Any) -> list[str]:
    """Parse a ``canonical_name`` cell into a list of alias identifiers.

    The cell is normalised with ``_clean_value`` and then decoded as JSON;
    when that fails it is retried as a Python literal through
    ``ast.literal_eval``. The result is meant to mirror
    ``eegdash.dataset.registry._parse_canonical_names`` (that function is
    not visible from here — TODO confirm the two stay in sync).

    Args:
        cell: Raw cell value; typically a JSON array string, possibly empty.

    Returns:
        The string entries of the decoded list/tuple that are valid Python
        identifiers, stripped of surrounding whitespace and in their
        original order. An empty list is returned when the cell is empty,
        cannot be decoded, or does not decode to a list/tuple.
    """
    s = _clean_value(cell)
    if not s:
        return []
    try:
        parsed = json.loads(s)
    except (json.JSONDecodeError, RecursionError):
        try:
            parsed = ast.literal_eval(s)
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. "{[1]: 2}" -> TypeError); a bad cell must yield no
        # aliases rather than abort the whole run.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
    if not isinstance(parsed, (list, tuple)):
        return []
    out: list[str] = []
    for name in parsed:
        # Only genuine strings can be aliases. Stringifying other values
        # would let JSON null/true/NaN through as "None"/"True"/"nan",
        # all of which pass str.isidentifier().
        if not isinstance(name, str):
            continue
        n = name.strip()
        if n and n.isidentifier():
            out.append(n)
    return out
196
+
197
+
198
  def _build_context(row: pd.Series) -> dict[str, Any]:
199
  dataset_id = _clean_value(row.get("dataset")).lower()
200
  api = _fetch_api_summary(dataset_id)
 
211
  api.get("computed_title") or api.get("name")
212
  )
213
  doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
 
214
  doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
215
+ paper_doi_raw = _clean_value(api.get("associated_paper_doi"))
216
+ paper_doi = (
217
+ paper_doi_raw[4:].strip()
218
+ if paper_doi_raw.lower().startswith("doi:")
219
+ else paper_doi_raw
220
+ )
221
  license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
222
  authors = _normalize_list(api.get("authors"))
223
  source = _clean_value(row.get("source")) or "OpenNeuro"
224
 
 
 
225
  ts = api.get("timestamps") or {}
226
+ year = ""
227
  created = ts.get("dataset_created_at") or ""
228
  if isinstance(created, str) and len(created) >= 4:
229
  year = created[:4]
230
 
231
+ # Canonical aliases: CSV first (filtered the same way the runtime registry
232
+ # filters), API second as a safety net.
233
+ canonical_names = _parse_canonical_names(row.get("canonical_name"))
234
+ if not canonical_names:
235
+ raw = api.get("canonical_name")
236
+ if isinstance(raw, list):
237
+ canonical_names = [
238
+ str(n).strip()
239
+ for n in raw
240
+ if isinstance(n, str) and str(n).strip().isidentifier()
241
+ ]
242
+
243
+ # Duration: prefer CSV hours, else API seconds → hours
244
+ dur_h = _format_hours(row.get("duration_hours_total"))
245
+ if not dur_h:
246
+ sec = _clean_value(api.get("total_duration_s"))
247
+ if sec:
248
+ try:
249
+ dur_h = f"{float(sec) / 3600:,.1f}"
250
+ except ValueError:
251
+ dur_h = ""
252
+
253
+ demographics = api.get("demographics") or {}
254
+ storage = api.get("storage") or {}
255
+ external = api.get("external_links") or {}
256
+ api_tags = api.get("tags") or {}
257
+
258
  return {
259
  "dataset_id": dataset_id,
260
  "title": title or dataset_id,
261
  "author_year": _clean_value(row.get("author_year")),
262
+ "canonical_names": canonical_names,
263
  "authors": authors,
264
+ "senior_author": _clean_value(api.get("senior_author")),
265
+ "contact_info": _normalize_list(api.get("contact_info")),
266
+ "contributing_labs": _normalize_list(api.get("contributing_labs")),
267
  "year": year,
268
  "license": license_ or "Unknown",
269
  "doi": doi,
270
+ "paper_doi": paper_doi,
271
  "source": source,
272
  "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
273
  "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
274
+ "source_url": _clean_value(api.get("source_url")) or _clean_value(external.get("source_url")),
275
+ "osf_url": _clean_value(external.get("osf_url")),
276
+ "github_url": _clean_value(external.get("github_url")),
277
  "record_modality": _clean_value(row.get("record_modality")),
278
+ "modality_exp": _clean_value(row.get("modality of exp")) or _clean_value(api_tags.get("modality")),
279
+ "type_exp": _clean_value(row.get("type of exp")) or _clean_value(api_tags.get("type")),
280
+ "pathology": _clean_value(row.get("Type Subject")) or _clean_value(api_tags.get("pathology")),
281
+ "tasks_list": _normalize_list(api.get("tasks")),
282
+ "n_subjects": pick("n_subjects", "n_subjects") or str(_clean_value(demographics.get("subjects_count")) or ""),
283
  "n_records": pick("n_records", "total_files"),
284
  "n_tasks": pick("n_tasks", "n_tasks"),
285
+ "n_channels": _format_stat_counts(row.get("nchans_set")) or _format_stat_counts(api.get("nchans_counts")),
286
+ "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")) or _format_stat_counts(api.get("sfreq_counts")),
287
  "size": _clean_value(row.get("size")),
288
+ "size_bytes": _clean_value(api.get("size_bytes")),
289
+ "duration_hours_total": dur_h,
290
+ "bids_version": _clean_value(api.get("bids_version")),
291
+ "age_min": _clean_value(demographics.get("age_min")),
292
+ "age_max": _clean_value(demographics.get("age_max")),
293
+ "age_mean": _clean_value(demographics.get("age_mean")),
294
+ "sessions": _normalize_list(api.get("sessions")),
295
+ "study_design": _clean_value(api.get("study_design")),
296
+ "study_domain": _clean_value(api.get("study_domain")),
297
+ "experimental_modalities": _normalize_list(api.get("experimental_modalities")),
298
+ "datatypes": _normalize_list(api.get("datatypes")),
299
+ "funding": _normalize_list(api.get("funding")),
300
  "references": _normalize_list(api.get("references")),
301
  "how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
302
+ "readme": _clean_value(api.get("readme")),
303
+ "nemar_citations": _clean_value(api.get("nemar_citation_count")) or _clean_value(row.get("nemar_citation_count")),
304
+ "storage_backend": _clean_value(storage.get("backend")),
305
+ "storage_base": _clean_value(storage.get("base")),
306
+ "digested_at": _clean_value(ts.get("digested_at")),
307
+ "stats_computed_at": _clean_value(api.get("stats_computed_at")),
308
  }
309
 
310
 
 
349
  return "10K<n<100K"
350
 
351
 
352
+ def _escape_yaml(s: str) -> str:
353
+ """Quote a YAML string value safely. Assumes the content is plain text."""
354
+ return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
355
+
356
+
357
+ def _sanitize_upstream_readme(text: str) -> str:
358
+ """Defuse markers that could confuse HF's frontmatter parser.
359
+
360
+ An upstream README that happens to start a line with ``---`` on its
361
+ own renders fine in the body of a Markdown doc, but trailing YAML
362
+ blocks at the top of a mixed document can trip some parsers. We also
363
+ strip ingested-time pollution ("Introduction:" header styling etc.
364
+ stays intact — only raw markers get touched).
365
+ """
366
+ out_lines: list[str] = []
367
+ for ln in text.splitlines():
368
+ if ln.strip() == "---":
369
+ out_lines.append("***") # visual divider instead
370
+ else:
371
+ out_lines.append(ln)
372
+ return "\n".join(out_lines).strip()
373
+
374
+
375
  def _render_readme(ctx: dict[str, Any]) -> str:
376
+ # -- Frontmatter -------------------------------------------------------
377
+
378
+ tags = ["neuroscience", "eegdash", "brain-computer-interface", "pytorch"]
379
  rm = ctx["record_modality"].lower()
380
  if rm in {"eeg", "meg", "ieeg"}:
381
  tags.insert(0, rm)
 
383
  tags.insert(0, "eeg")
384
  if ctx["modality_exp"]:
385
  tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
386
+ if ctx["type_exp"]:
387
+ tags.append(ctx["type_exp"].lower().replace(" ", "-").replace("/", "-"))
388
  if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
389
  tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
390
+ for t in ctx["tasks_list"][:5]:
391
+ slug = t.lower().replace("_", "-").replace(" ", "-")
392
+ if slug and slug not in tags:
393
+ tags.append(slug)
394
+ # Dedupe while preserving order.
395
+ tags = list(dict.fromkeys(tags))
396
 
397
  license_slug = _hf_license(ctx["license"])
398
  size_cat = _size_category(ctx["n_records"])
399
 
400
+ yaml_parts = ["---"]
401
+ yaml_parts.append(f"pretty_name: {_escape_yaml(ctx['title'] or ctx['dataset_id'])}")
402
+ yaml_parts.append(f"license: {license_slug}")
403
+ yaml_parts.append("tags:")
404
+ for t in tags:
405
+ yaml_parts.append(f" - {t}")
406
+ yaml_parts.append("size_categories:")
407
+ yaml_parts.append(f" - {size_cat}")
408
+ if ctx["record_modality"]:
409
+ yaml_parts.append("task_categories:")
410
+ yaml_parts.append(" - other")
411
  if ctx["authors"]:
412
+ yaml_parts.append("authors:")
413
+ for a in ctx["authors"][:12]:
414
+ yaml_parts.append(f" - {_escape_yaml(a)}")
415
+ yaml_parts.append("---")
416
+ frontmatter = "\n".join(yaml_parts)
417
 
418
+ # -- Hero --------------------------------------------------------------
419
 
420
+ hero_title = ctx["title"] or ctx["dataset_id"]
421
+ attribution = ""
 
 
 
422
  if ctx["author_year"]:
423
+ attribution = ctx["author_year"]
424
  elif ctx["authors"]:
425
  head = ctx["authors"][0]
426
+ extra = " et al." if len(ctx["authors"]) > 1 else ""
427
+ attribution = head + extra + (f" ({ctx['year']})" if ctx["year"] else "")
428
+ alias_line = ""
429
+ if ctx["canonical_names"]:
430
+ joined = " · ".join(f"`{n}`" for n in ctx["canonical_names"])
431
+ alias_line = f"**Canonical aliases:** {joined}"
432
+ hero_bits = [f"# {hero_title}", f"**Dataset ID:** `{ctx['dataset_id']}`"]
433
+ if attribution:
434
+ hero_bits.append(f"_{attribution}_")
435
+ if alias_line:
436
+ hero_bits.append(alias_line)
437
+ hero = "\n\n".join(hero_bits)
438
+
439
+ # -- Summary line (3-second takeaway) ---------------------------------
440
+
441
+ tl_bits = []
442
+ if ctx["record_modality"]:
443
+ tl_bits.append(ctx["record_modality"].upper())
444
+ if ctx["modality_exp"] and ctx["type_exp"]:
445
+ tl_bits.append(f"{ctx['modality_exp']} {ctx['type_exp'].lower()}")
446
+ elif ctx["modality_exp"]:
447
+ tl_bits.append(ctx["modality_exp"])
448
+ if ctx["pathology"]:
449
+ tl_bits.append(ctx["pathology"].lower())
450
+ if ctx["n_subjects"]:
451
+ tl_bits.append(f"{ctx['n_subjects']} subjects")
452
+ if ctx["n_records"]:
453
+ tl_bits.append(f"{ctx['n_records']} recordings")
454
+ if ctx["license"]:
455
+ tl_bits.append(ctx["license"])
456
+ tldr = "> **At a glance:** " + " · ".join(tl_bits) if tl_bits else ""
457
+
458
+ # -- Load section ------------------------------------------------------
459
+
460
+ aliases_hint = ""
461
+ if ctx["canonical_names"]:
462
+ a0 = ctx["canonical_names"][0]
463
+ aliases_hint = (
464
+ f"\nYou can also load it by canonical alias — these are registered "
465
+ f"classes in `eegdash.dataset`:\n\n"
466
+ f"```python\n"
467
+ f"from eegdash.dataset import {a0}\n"
468
+ f"ds = {a0}(cache_dir=\"./cache\")\n"
469
+ f"```\n"
470
+ )
471
 
472
  load_block = f"""## Load this dataset
473
 
474
+ This repo is a **pointer**. The raw EEG data lives at its canonical source
475
+ (OpenNeuro / NEMAR); [EEGDash](https://github.com/eegdash/EEGDash) streams it
476
+ on demand and returns a PyTorch / braindecode dataset.
477
 
478
  ```python
479
  # pip install eegdash
 
482
  ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
483
  print(len(ds), "recordings")
484
  ```
485
+ {aliases_hint}
486
+ If the dataset has been mirrored to the HF Hub in braindecode's Zarr layout,
487
+ you can also pull it directly:
488
 
489
  ```python
490
  from braindecode.datasets import BaseConcatDataset
 
492
  ```
493
  """
494
 
495
+ # -- Metadata table ---------------------------------------------------
496
+
497
+ age_str = ""
498
+ if ctx["age_min"] or ctx["age_max"] or ctx["age_mean"]:
499
+ parts = []
500
+ if ctx["age_min"] and ctx["age_max"]:
501
+ parts.append(f"{ctx['age_min']}–{ctx['age_max']} yrs")
502
+ if ctx["age_mean"]:
503
+ try:
504
+ parts.append(f"mean {float(ctx['age_mean']):.1f}")
505
+ except ValueError:
506
+ parts.append(f"mean {ctx['age_mean']}")
507
+ age_str = ", ".join(parts)
508
+
509
  rows = [
510
  ("Subjects", ctx["n_subjects"]),
511
+ ("Age range", age_str),
512
  ("Recordings", ctx["n_records"]),
513
+ ("Tasks (count)", ctx["n_tasks"]),
514
+ ("Sessions", str(len(ctx["sessions"])) if ctx["sessions"] else ""),
515
  ("Channels", ctx["n_channels"]),
516
  ("Sampling rate (Hz)", ctx["sampling_freqs"]),
 
517
  ("Total duration (h)", ctx["duration_hours_total"]),
518
+ ("Size on disk", ctx["size"]),
519
+ ("Recording type", ctx["record_modality"].upper() if ctx["record_modality"] else ""),
520
  ("Experimental modality", ctx["modality_exp"]),
521
+ ("Paradigm type", ctx["type_exp"]),
522
  ("Population", ctx["pathology"]),
523
+ ("Study design", ctx["study_design"]),
524
+ ("Study domain", ctx["study_domain"]),
525
+ ("BIDS version", ctx["bids_version"]),
526
  ("Source", ctx["source"]),
527
  ("License", ctx["license"]),
528
+ ("NEMAR citations", ctx["nemar_citations"]),
529
  ]
530
  md_rows = "\n".join(
531
+ f"| **{k}** | {v} |" for k, v in rows if str(v or "").strip()
532
  )
533
+ meta_table = "## Dataset metadata\n\n| | |\n|---|---|\n" + md_rows
534
 
535
+ # -- Tasks list (if any) ----------------------------------------------
536
 
537
+ tasks_block = ""
538
+ if ctx["tasks_list"]:
539
+ items = "\n".join(f"- `{t}`" for t in ctx["tasks_list"])
540
+ tasks_block = f"## Tasks\n\n{items}\n"
541
+
542
+ # -- Upstream README (the star of the show) ---------------------------
543
+
544
+ upstream_block = ""
545
+ if ctx["readme"]:
546
+ body = _sanitize_upstream_readme(ctx["readme"])
547
+ upstream_block = (
548
+ "## Upstream README\n\n"
549
+ "_Verbatim from the dataset's authors — the canonical "
550
+ "description._\n\n"
551
+ f"{body}\n"
552
+ )
553
+
554
+ # -- People -----------------------------------------------------------
555
+
556
+ people_lines = []
557
+ if ctx["authors"]:
558
+ people_lines.append("### Authors")
559
+ for a in ctx["authors"]:
560
+ marker = " _(senior)_" if a.strip() == ctx["senior_author"].strip() else ""
561
+ people_lines.append(f"- {a}{marker}")
562
+ if ctx["contributing_labs"]:
563
+ people_lines.append("\n### Contributing labs")
564
+ for lab in ctx["contributing_labs"]:
565
+ people_lines.append(f"- {lab}")
566
+ if ctx["contact_info"]:
567
+ people_lines.append("\n### Contact")
568
+ for c in ctx["contact_info"]:
569
+ people_lines.append(f"- {c}")
570
+ people_block = "## People\n\n" + "\n".join(people_lines) if people_lines else ""
571
+
572
+ # -- Funding + references ---------------------------------------------
573
+
574
+ funding_block = ""
575
+ if ctx["funding"]:
576
+ items = "\n".join(f"- {f}" for f in ctx["funding"])
577
+ funding_block = f"## Funding\n\n{items}"
578
+
579
+ cite_block = ""
580
+ if ctx["how_to_acknowledge"]:
581
+ cite_block = (
582
+ "## How to cite\n\n"
583
+ "Please follow the upstream dataset's citation policy:\n\n"
584
+ + "\n".join(
585
+ f"> {ln}" for ln in ctx["how_to_acknowledge"].strip().splitlines()
586
+ )
587
+ )
588
+ if ctx["references"]:
589
+ if cite_block:
590
+ cite_block += "\n\n### References\n\n"
591
+ else:
592
+ cite_block = "## References\n\n"
593
+ cite_block += "\n".join(f"- {r}" for r in ctx["references"])
594
+
595
+ # -- Links ------------------------------------------------------------
596
 
597
  links = []
598
  if ctx["doi"]:
599
  links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
600
+ if ctx["paper_doi"]:
601
+ links.append(
602
+ f"- **Associated paper:** [{ctx['paper_doi']}]"
603
+ f"(https://doi.org/{ctx['paper_doi']})"
604
+ )
605
  if ctx["source"].lower() == "openneuro":
606
  links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
607
  if ctx["source"].lower() == "nemar":
608
  links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
609
+ if ctx["source_url"] and ctx["source_url"] not in (ctx["openneuro_url"], ctx["nemar_url"]):
610
  links.append(f"- **Source:** <{ctx['source_url']}>")
611
+ if ctx["osf_url"]:
612
+ links.append(f"- **OSF:** <{ctx['osf_url']}>")
613
+ if ctx["github_url"]:
614
+ links.append(f"- **GitHub:** <{ctx['github_url']}>")
615
  links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
616
  links.append(f"- **Docs:** <{EEGDASH_URL}>")
617
  links.append(f"- **Code:** <{GITHUB_URL}>")
618
  links_block = "## Links\n\n" + "\n".join(links)
619
 
620
+ # -- Provenance (where the data actually lives + when we saw it) ------
621
+
622
+ prov_lines = []
623
+ if ctx["storage_backend"] and ctx["storage_base"]:
624
+ prov_lines.append(
625
+ f"- **Backend:** `{ctx['storage_backend']}` — "
626
+ f"`{ctx['storage_base']}`"
627
  )
628
+ elif ctx["storage_backend"]:
629
+ prov_lines.append(f"- **Backend:** `{ctx['storage_backend']}`")
630
+ if ctx["size_bytes"]:
631
+ try:
632
+ sb = float(ctx["size_bytes"])
633
+ prov_lines.append(f"- **Exact size:** {int(sb):,} bytes ({ctx['size']})")
634
+ except ValueError:
635
+ pass
636
+ if ctx["digested_at"]:
637
+ prov_lines.append(f"- **Ingested:** {ctx['digested_at'][:10]}")
638
+ if ctx["stats_computed_at"]:
639
+ prov_lines.append(
640
+ f"- **Stats computed:** {ctx['stats_computed_at'][:10]}"
641
  )
642
+ prov_block = "## Provenance\n\n" + "\n".join(prov_lines) if prov_lines else ""
643
+
644
+ # -- Footer -----------------------------------------------------------
645
 
646
  footer = (
647
+ f"---\n\n"
648
+ f"_Auto-generated from "
649
+ f"[dataset_summary.csv]({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) "
650
+ f"and the [EEGDash API]({EEGDASH_API}/datasets/summary/{ctx['dataset_id']}). "
651
+ f"Do not edit this file by hand — update the upstream source and "
652
+ f"re-run `scripts/push_metadata_stubs.py`._"
653
  )
654
 
655
+ sections = [
656
+ frontmatter,
657
+ hero,
658
+ tldr,
659
+ load_block,
660
+ meta_table,
661
+ tasks_block,
662
+ upstream_block,
663
+ people_block,
664
+ funding_block,
665
+ cite_block,
666
+ links_block,
667
+ prov_block,
668
+ footer,
669
+ ]
670
+ return "\n\n".join(s for s in sections if s).strip() + "\n"
 
 
 
 
671
 
672
 
673
  def _render_pointer(ctx: dict[str, Any]) -> str: