Spaces:

EEGDash
/

catalog

Running

App Files Files Community

bruAristimunha committed on Apr 19

Commit

4af5c7a

1 Parent(s): 5c60b17

Richer stubs: canonical aliases, upstream README verbatim, demographics, funding, provenance

Browse files

Files changed (1) hide show

scripts/push_metadata_stubs.py +341 -78

scripts/push_metadata_stubs.py CHANGED Viewed

@@ -167,6 +167,34 @@ def _fetch_api_summary(dataset_id: str, timeout: float = 10.0) -> dict[str, Any]
 # ---------------------------------------------------------------------------
 def _build_context(row: pd.Series) -> dict[str, Any]:
     dataset_id = _clean_value(row.get("dataset")).lower()
     api = _fetch_api_summary(dataset_id)
@@ -183,44 +211,100 @@ def _build_context(row: pd.Series) -> dict[str, Any]:
         api.get("computed_title") or api.get("name")
     )
     doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
-    # DOIs sometimes ship with a "doi:" prefix — strip so links don't double up.
     doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
     license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
     authors = _normalize_list(api.get("authors"))
     source = _clean_value(row.get("source")) or "OpenNeuro"
-    # Year from API timestamps (docs does the same)
-    year = ""
     ts = api.get("timestamps") or {}
     created = ts.get("dataset_created_at") or ""
     if isinstance(created, str) and len(created) >= 4:
         year = created[:4]
     return {
         "dataset_id": dataset_id,
         "title": title or dataset_id,
         "author_year": _clean_value(row.get("author_year")),
         "authors": authors,
         "year": year,
         "license": license_ or "Unknown",
         "doi": doi,
         "source": source,
         "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
         "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
-        "source_url": _clean_value(api.get("source_url")),
         "record_modality": _clean_value(row.get("record_modality")),
-        "modality_exp": _clean_value(row.get("modality of exp")),
-        "type_exp": _clean_value(row.get("type of exp")),
-        "pathology": _clean_value(row.get("Type Subject")),
-        "n_subjects": pick("n_subjects", "n_subjects"),
         "n_records": pick("n_records", "total_files"),
         "n_tasks": pick("n_tasks", "n_tasks"),
-        "n_channels": _format_stat_counts(row.get("nchans_set")),
-        "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")),
         "size": _clean_value(row.get("size")),
-        "duration_hours_total": _format_hours(row.get("duration_hours_total")),
         "references": _normalize_list(api.get("references")),
         "how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
     }
@@ -265,8 +349,33 @@ def _size_category(n_records: str) -> str:
     return "10K<n<100K"
 def _render_readme(ctx: dict[str, Any]) -> str:
-    tags = ["neuroscience", "eegdash", "brain-computer-interface"]
     rm = ctx["record_modality"].lower()
     if rm in {"eeg", "meg", "ieeg"}:
         tags.insert(0, rm)
@@ -274,40 +383,97 @@ def _render_readme(ctx: dict[str, Any]) -> str:
         tags.insert(0, "eeg")
     if ctx["modality_exp"]:
         tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
     if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
         tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
     license_slug = _hf_license(ctx["license"])
     size_cat = _size_category(ctx["n_records"])
-    yaml_tags = "\n".join(f"- {t}" for t in tags)
-    yaml_authors = ""
     if ctx["authors"]:
-        yaml_authors = "authors:\n" + "\n".join(
-            f"  - {a}" for a in ctx["authors"][:8]
-        ) + "\n"
-    # --- Body -------------------------------------------------------------
-    hero_lines = []
-    if ctx["title"] and ctx["title"].lower() != ctx["dataset_id"].lower():
-        hero_lines.append(f"# {ctx['title']}")
-    else:
-        hero_lines.append(f"# {ctx['dataset_id']}")
     if ctx["author_year"]:
-        hero_lines.append(f"*{ctx['author_year']}*")
     elif ctx["authors"]:
         head = ctx["authors"][0]
-        extra = f" et al." if len(ctx["authors"]) > 1 else ""
-        yr = f" ({ctx['year']})" if ctx["year"] else ""
-        hero_lines.append(f"*{head}{extra}{yr}*")
-    hero = "\n\n".join(hero_lines)
     load_block = f"""## Load this dataset
-This repo is a **pointer** — the raw EEG data lives at its canonical source
-(OpenNeuro / NEMAR). [EEGDash](https://github.com/eegdash/EEGDash) handles the
-download, caching, and conversion to a PyTorch / braindecode dataset.
 ```python
 # pip install eegdash
@@ -316,9 +482,9 @@ from eegdash import EEGDashDataset
 ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
 print(len(ds), "recordings")
 ```
-Need it in braindecode's HF-native Zarr format? Once mirrored
-(`ds.push_to_hub(...)`) you can also do:
 ```python
 from braindecode.datasets import BaseConcatDataset
@@ -326,85 +492,182 @@ ds = BaseConcatDataset.pull_from_hub("{HF_ORG}/{ctx['dataset_id']}")
 ```
 """
     rows = [
         ("Subjects", ctx["n_subjects"]),
         ("Recordings", ctx["n_records"]),
-        ("Tasks", ctx["n_tasks"]),
         ("Channels", ctx["n_channels"]),
         ("Sampling rate (Hz)", ctx["sampling_freqs"]),
-        ("Size on disk", ctx["size"]),
         ("Total duration (h)", ctx["duration_hours_total"]),
         ("Experimental modality", ctx["modality_exp"]),
-        ("Experimental type", ctx["type_exp"]),
         ("Population", ctx["pathology"]),
-        ("Recording type", ctx["record_modality"].upper()),
         ("Source", ctx["source"]),
         ("License", ctx["license"]),
     ]
     md_rows = "\n".join(
-        f"| **{k}** | {v or '—'} |" for k, v in rows if v or k in {"Source", "License"}
     )
-    meta_table = f"""## Dataset metadata
-| | |
-|---|---|
-{md_rows}
-"""
     links = []
     if ctx["doi"]:
         links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
     if ctx["source"].lower() == "openneuro":
         links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
     if ctx["source"].lower() == "nemar":
         links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
-    if ctx["source_url"]:
         links.append(f"- **Source:** <{ctx['source_url']}>")
     links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
     links.append(f"- **Docs:** <{EEGDASH_URL}>")
     links.append(f"- **Code:** <{GITHUB_URL}>")
     links_block = "## Links\n\n" + "\n".join(links)
-    cite_block = ""
-    if ctx["how_to_acknowledge"]:
-        cite_block = (
-            "## How to cite\n\n"
-            "Please follow the upstream dataset's citation policy:\n\n"
-            f"> {ctx['how_to_acknowledge'].strip()}\n"
         )
-    elif ctx["references"]:
-        cite_block = "## References\n\n" + "\n".join(
-            f"- {r}" for r in ctx["references"][:5]
         )
     footer = (
-        f"\n---\n\n"
-        f"_This repo is auto-generated from [dataset_summary.csv]"
-        f"({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) + the "
-        f"EEGDash API. Edit the upstream source, not this file._"
     )
-    return f"""---
-tags:
-{yaml_tags}
-license: {license_slug}
-size_categories:
-- {size_cat}
-pretty_name: "{ctx['title'] or ctx['dataset_id']}"
-{yaml_authors}---
-{hero}
-{load_block}
-{meta_table}
-{links_block}
-{cite_block}
-{footer}
-"""
 def _render_pointer(ctx: dict[str, Any]) -> str:

 # ---------------------------------------------------------------------------
+def _parse_canonical_names(cell: Any) -> list[str]:
+    """Parse the CSV ``canonical_name`` cell into a list of alias names.
+    Intended to match the output of
+    ``eegdash.dataset.registry._parse_canonical_names`` so the rendered
+    aliases agree with the ones the runtime registry would register.
+    The cell is expected to hold a JSON array string; when JSON decoding
+    fails, a Python-literal list/tuple (e.g. single-quoted strings) is
+    accepted through ``ast.literal_eval``.
+    Returns the entries that are non-empty, valid Python identifiers, in
+    their original order. An empty, unparseable, or non-sequence cell
+    yields ``[]`` instead of raising.
+    """
+    s = _clean_value(cell)
+    if not s:
+        return []
+    try:
+        parsed = json.loads(s)
+    except json.JSONDecodeError:
+        try:
+            parsed = ast.literal_eval(s)
+        # literal_eval raises TypeError for unhashable set/dict members
+        # (e.g. "{[1]: 2}") and RecursionError for deeply nested input;
+        # treat those like any other malformed cell.
+        except (ValueError, SyntaxError, TypeError, RecursionError):
+            return []
+    if not isinstance(parsed, (list, tuple)):
+        return []
+    names = (str(item).strip() for item in parsed)
+    return [n for n in names if n and n.isidentifier()]
 def _build_context(row: pd.Series) -> dict[str, Any]:
     dataset_id = _clean_value(row.get("dataset")).lower()
     api = _fetch_api_summary(dataset_id)
         api.get("computed_title") or api.get("name")
     )
     doi_raw = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
     doi = doi_raw[4:].strip() if doi_raw.lower().startswith("doi:") else doi_raw
+    paper_doi_raw = _clean_value(api.get("associated_paper_doi"))
+    paper_doi = (
+        paper_doi_raw[4:].strip()
+        if paper_doi_raw.lower().startswith("doi:")
+        else paper_doi_raw
+    )
     license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
     authors = _normalize_list(api.get("authors"))
     source = _clean_value(row.get("source")) or "OpenNeuro"
     ts = api.get("timestamps") or {}
+    year = ""
     created = ts.get("dataset_created_at") or ""
     if isinstance(created, str) and len(created) >= 4:
         year = created[:4]
+    # Canonical aliases: CSV first (filtered the same way the runtime registry
+    # filters), API second as a safety net.
+    canonical_names = _parse_canonical_names(row.get("canonical_name"))
+    if not canonical_names:
+        raw = api.get("canonical_name")
+        if isinstance(raw, list):
+            canonical_names = [
+                str(n).strip()
+                for n in raw
+                if isinstance(n, str) and str(n).strip().isidentifier()
+            ]
+    # Duration: prefer CSV hours, else API seconds → hours
+    dur_h = _format_hours(row.get("duration_hours_total"))
+    if not dur_h:
+        sec = _clean_value(api.get("total_duration_s"))
+        if sec:
+            try:
+                dur_h = f"{float(sec) / 3600:,.1f}"
+            except ValueError:
+                dur_h = ""
+    demographics = api.get("demographics") or {}
+    storage = api.get("storage") or {}
+    external = api.get("external_links") or {}
+    api_tags = api.get("tags") or {}
     return {
         "dataset_id": dataset_id,
         "title": title or dataset_id,
         "author_year": _clean_value(row.get("author_year")),
+        "canonical_names": canonical_names,
         "authors": authors,
+        "senior_author": _clean_value(api.get("senior_author")),
+        "contact_info": _normalize_list(api.get("contact_info")),
+        "contributing_labs": _normalize_list(api.get("contributing_labs")),
         "year": year,
         "license": license_ or "Unknown",
         "doi": doi,
+        "paper_doi": paper_doi,
         "source": source,
         "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
         "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
+        "source_url": _clean_value(api.get("source_url")) or _clean_value(external.get("source_url")),
+        "osf_url": _clean_value(external.get("osf_url")),
+        "github_url": _clean_value(external.get("github_url")),
         "record_modality": _clean_value(row.get("record_modality")),
+        "modality_exp": _clean_value(row.get("modality of exp")) or _clean_value(api_tags.get("modality")),
+        "type_exp": _clean_value(row.get("type of exp")) or _clean_value(api_tags.get("type")),
+        "pathology": _clean_value(row.get("Type Subject")) or _clean_value(api_tags.get("pathology")),
+        "tasks_list": _normalize_list(api.get("tasks")),
+        "n_subjects": pick("n_subjects", "n_subjects") or str(_clean_value(demographics.get("subjects_count")) or ""),
         "n_records": pick("n_records", "total_files"),
         "n_tasks": pick("n_tasks", "n_tasks"),
+        "n_channels": _format_stat_counts(row.get("nchans_set")) or _format_stat_counts(api.get("nchans_counts")),
+        "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")) or _format_stat_counts(api.get("sfreq_counts")),
         "size": _clean_value(row.get("size")),
+        "size_bytes": _clean_value(api.get("size_bytes")),
+        "duration_hours_total": dur_h,
+        "bids_version": _clean_value(api.get("bids_version")),
+        "age_min": _clean_value(demographics.get("age_min")),
+        "age_max": _clean_value(demographics.get("age_max")),
+        "age_mean": _clean_value(demographics.get("age_mean")),
+        "sessions": _normalize_list(api.get("sessions")),
+        "study_design": _clean_value(api.get("study_design")),
+        "study_domain": _clean_value(api.get("study_domain")),
+        "experimental_modalities": _normalize_list(api.get("experimental_modalities")),
+        "datatypes": _normalize_list(api.get("datatypes")),
+        "funding": _normalize_list(api.get("funding")),
         "references": _normalize_list(api.get("references")),
         "how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
+        "readme": _clean_value(api.get("readme")),
+        "nemar_citations": _clean_value(api.get("nemar_citation_count")) or _clean_value(row.get("nemar_citation_count")),
+        "storage_backend": _clean_value(storage.get("backend")),
+        "storage_base": _clean_value(storage.get("base")),
+        "digested_at": _clean_value(ts.get("digested_at")),
+        "stats_computed_at": _clean_value(api.get("stats_computed_at")),
     }
     return "10K<n<100K"
+def _escape_yaml(s: str) -> str:
+    """Return ``s`` as a single-line YAML double-quoted scalar.
+    Backslashes and double quotes are backslash-escaped. Line breaks and
+    other control characters are written as YAML escape sequences so a
+    multi-line title or author name cannot spill out of the
+    ``key: "value"`` line it is emitted on. Tabs and all printable text
+    are passed through unchanged.
+    """
+    short = {"\\": "\\\\", '"': '\\"', "\n": "\\n", "\r": "\\r"}
+    out: list[str] = []
+    for ch in s:
+        if ch in short:
+            out.append(short[ch])
+        elif ch != "\t" and (ch < " " or ch in "\x7f\x85\u2028\u2029"):
+            # Remaining C0 controls, DEL, and the Unicode line separators
+            # (NEL, LS, PS) must not appear raw inside the scalar.
+            out.append(f"\\u{ord(ch):04x}")
+        else:
+            out.append(ch)
+    return '"' + "".join(out) + '"'
+def _sanitize_upstream_readme(text: str) -> str:
+    """Neutralise bare ``---`` lines in an upstream README.
+    A line that is exactly ``---`` (ignoring surrounding whitespace)
+    could be mistaken for a YAML frontmatter delimiter by some parsers,
+    so each such line is replaced with ``***``. Every other line is kept
+    as-is. Lines are re-joined with ``\\n`` and the result is stripped of
+    leading/trailing whitespace; nothing else is removed or rewritten.
+    NOTE(review): the replacement ignores context, so a ``---`` that
+    underlines a heading or sits inside a fenced code block is rewritten
+    too — confirm that is acceptable for upstream READMEs.
+    """
+    cleaned = [
+        "***" if line.strip() == "---" else line  # visual divider instead
+        for line in text.splitlines()
+    ]
+    return "\n".join(cleaned).strip()
 def _render_readme(ctx: dict[str, Any]) -> str:
+    # -- Frontmatter -------------------------------------------------------
+    tags = ["neuroscience", "eegdash", "brain-computer-interface", "pytorch"]
     rm = ctx["record_modality"].lower()
     if rm in {"eeg", "meg", "ieeg"}:
         tags.insert(0, rm)
         tags.insert(0, "eeg")
     if ctx["modality_exp"]:
         tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
+    if ctx["type_exp"]:
+        tags.append(ctx["type_exp"].lower().replace(" ", "-").replace("/", "-"))
     if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
         tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))
+    for t in ctx["tasks_list"][:5]:
+        slug = t.lower().replace("_", "-").replace(" ", "-")
+        if slug and slug not in tags:
+            tags.append(slug)
+    # Dedupe while preserving order.
+    tags = list(dict.fromkeys(tags))
     license_slug = _hf_license(ctx["license"])
     size_cat = _size_category(ctx["n_records"])
+    yaml_parts = ["---"]
+    yaml_parts.append(f"pretty_name: {_escape_yaml(ctx['title'] or ctx['dataset_id'])}")
+    yaml_parts.append(f"license: {license_slug}")
+    yaml_parts.append("tags:")
+    for t in tags:
+        yaml_parts.append(f"  - {t}")
+    yaml_parts.append("size_categories:")
+    yaml_parts.append(f"  - {size_cat}")
+    if ctx["record_modality"]:
+        yaml_parts.append("task_categories:")
+        yaml_parts.append("  - other")
     if ctx["authors"]:
+        yaml_parts.append("authors:")
+        for a in ctx["authors"][:12]:
+            yaml_parts.append(f"  - {_escape_yaml(a)}")
+    yaml_parts.append("---")
+    frontmatter = "\n".join(yaml_parts)
+    # -- Hero --------------------------------------------------------------
+    hero_title = ctx["title"] or ctx["dataset_id"]
+    attribution = ""
     if ctx["author_year"]:
+        attribution = ctx["author_year"]
     elif ctx["authors"]:
         head = ctx["authors"][0]
+        extra = " et al." if len(ctx["authors"]) > 1 else ""
+        attribution = head + extra + (f" ({ctx['year']})" if ctx["year"] else "")
+    alias_line = ""
+    if ctx["canonical_names"]:
+        joined = " · ".join(f"`{n}`" for n in ctx["canonical_names"])
+        alias_line = f"**Canonical aliases:** {joined}"
+    hero_bits = [f"# {hero_title}", f"**Dataset ID:** `{ctx['dataset_id']}`"]
+    if attribution:
+        hero_bits.append(f"_{attribution}_")
+    if alias_line:
+        hero_bits.append(alias_line)
+    hero = "\n\n".join(hero_bits)
+    # -- Summary line (3-second takeaway) ---------------------------------
+    tl_bits = []
+    if ctx["record_modality"]:
+        tl_bits.append(ctx["record_modality"].upper())
+    if ctx["modality_exp"] and ctx["type_exp"]:
+        tl_bits.append(f"{ctx['modality_exp']} {ctx['type_exp'].lower()}")
+    elif ctx["modality_exp"]:
+        tl_bits.append(ctx["modality_exp"])
+    if ctx["pathology"]:
+        tl_bits.append(ctx["pathology"].lower())
+    if ctx["n_subjects"]:
+        tl_bits.append(f"{ctx['n_subjects']} subjects")
+    if ctx["n_records"]:
+        tl_bits.append(f"{ctx['n_records']} recordings")
+    if ctx["license"]:
+        tl_bits.append(ctx["license"])
+    tldr = "> **At a glance:** " + " · ".join(tl_bits) if tl_bits else ""
+    # -- Load section ------------------------------------------------------
+    aliases_hint = ""
+    if ctx["canonical_names"]:
+        a0 = ctx["canonical_names"][0]
+        aliases_hint = (
+            f"\nYou can also load it by canonical alias — these are registered "
+            f"classes in `eegdash.dataset`:\n\n"
+            f"```python\n"
+            f"from eegdash.dataset import {a0}\n"
+            f"ds = {a0}(cache_dir=\"./cache\")\n"
+            f"```\n"
+        )
     load_block = f"""## Load this dataset
+This repo is a **pointer**. The raw EEG data lives at its canonical source
+(OpenNeuro / NEMAR); [EEGDash](https://github.com/eegdash/EEGDash) streams it
+on demand and returns a PyTorch / braindecode dataset.
 ```python
 # pip install eegdash
 ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
 print(len(ds), "recordings")
 ```
+{aliases_hint}
+If the dataset has been mirrored to the HF Hub in braindecode's Zarr layout,
+you can also pull it directly:
 ```python
 from braindecode.datasets import BaseConcatDataset
 ```
 """
+    # -- Metadata table ---------------------------------------------------
+    age_str = ""
+    if ctx["age_min"] or ctx["age_max"] or ctx["age_mean"]:
+        parts = []
+        if ctx["age_min"] and ctx["age_max"]:
+            parts.append(f"{ctx['age_min']}–{ctx['age_max']} yrs")
+        if ctx["age_mean"]:
+            try:
+                parts.append(f"mean {float(ctx['age_mean']):.1f}")
+            except ValueError:
+                parts.append(f"mean {ctx['age_mean']}")
+        age_str = ", ".join(parts)
     rows = [
         ("Subjects", ctx["n_subjects"]),
+        ("Age range", age_str),
         ("Recordings", ctx["n_records"]),
+        ("Tasks (count)", ctx["n_tasks"]),
+        ("Sessions", str(len(ctx["sessions"])) if ctx["sessions"] else ""),
         ("Channels", ctx["n_channels"]),
         ("Sampling rate (Hz)", ctx["sampling_freqs"]),
         ("Total duration (h)", ctx["duration_hours_total"]),
+        ("Size on disk", ctx["size"]),
+        ("Recording type", ctx["record_modality"].upper() if ctx["record_modality"] else ""),
         ("Experimental modality", ctx["modality_exp"]),
+        ("Paradigm type", ctx["type_exp"]),
         ("Population", ctx["pathology"]),
+        ("Study design", ctx["study_design"]),
+        ("Study domain", ctx["study_domain"]),
+        ("BIDS version", ctx["bids_version"]),
         ("Source", ctx["source"]),
         ("License", ctx["license"]),
+        ("NEMAR citations", ctx["nemar_citations"]),
     ]
     md_rows = "\n".join(
+        f"| **{k}** | {v} |" for k, v in rows if str(v or "").strip()
     )
+    meta_table = "## Dataset metadata\n\n| | |\n|---|---|\n" + md_rows
+    # -- Tasks list (if any) ----------------------------------------------
+    tasks_block = ""
+    if ctx["tasks_list"]:
+        items = "\n".join(f"- `{t}`" for t in ctx["tasks_list"])
+        tasks_block = f"## Tasks\n\n{items}\n"
+    # -- Upstream README (the star of the show) ---------------------------
+    upstream_block = ""
+    if ctx["readme"]:
+        body = _sanitize_upstream_readme(ctx["readme"])
+        upstream_block = (
+            "## Upstream README\n\n"
+            "_Verbatim from the dataset's authors — the canonical "
+            "description._\n\n"
+            f"{body}\n"
+        )
+    # -- People -----------------------------------------------------------
+    people_lines = []
+    if ctx["authors"]:
+        people_lines.append("### Authors")
+        for a in ctx["authors"]:
+            marker = " _(senior)_" if a.strip() == ctx["senior_author"].strip() else ""
+            people_lines.append(f"- {a}{marker}")
+    if ctx["contributing_labs"]:
+        people_lines.append("\n### Contributing labs")
+        for lab in ctx["contributing_labs"]:
+            people_lines.append(f"- {lab}")
+    if ctx["contact_info"]:
+        people_lines.append("\n### Contact")
+        for c in ctx["contact_info"]:
+            people_lines.append(f"- {c}")
+    people_block = "## People\n\n" + "\n".join(people_lines) if people_lines else ""
+    # -- Funding + references ---------------------------------------------
+    funding_block = ""
+    if ctx["funding"]:
+        items = "\n".join(f"- {f}" for f in ctx["funding"])
+        funding_block = f"## Funding\n\n{items}"
+    cite_block = ""
+    if ctx["how_to_acknowledge"]:
+        cite_block = (
+            "## How to cite\n\n"
+            "Please follow the upstream dataset's citation policy:\n\n"
+            + "\n".join(
+                f"> {ln}" for ln in ctx["how_to_acknowledge"].strip().splitlines()
+            )
+        )
+    if ctx["references"]:
+        if cite_block:
+            cite_block += "\n\n### References\n\n"
+        else:
+            cite_block = "## References\n\n"
+        cite_block += "\n".join(f"- {r}" for r in ctx["references"])
+    # -- Links ------------------------------------------------------------
     links = []
     if ctx["doi"]:
         links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
+    if ctx["paper_doi"]:
+        links.append(
+            f"- **Associated paper:** [{ctx['paper_doi']}]"
+            f"(https://doi.org/{ctx['paper_doi']})"
+        )
     if ctx["source"].lower() == "openneuro":
         links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
     if ctx["source"].lower() == "nemar":
         links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
+    if ctx["source_url"] and ctx["source_url"] not in (ctx["openneuro_url"], ctx["nemar_url"]):
         links.append(f"- **Source:** <{ctx['source_url']}>")
+    if ctx["osf_url"]:
+        links.append(f"- **OSF:** <{ctx['osf_url']}>")
+    if ctx["github_url"]:
+        links.append(f"- **GitHub:** <{ctx['github_url']}>")
     links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
     links.append(f"- **Docs:** <{EEGDASH_URL}>")
     links.append(f"- **Code:** <{GITHUB_URL}>")
     links_block = "## Links\n\n" + "\n".join(links)
+    # -- Provenance (where the data actually lives + when we saw it) ------
+    prov_lines = []
+    if ctx["storage_backend"] and ctx["storage_base"]:
+        prov_lines.append(
+            f"- **Backend:** `{ctx['storage_backend']}` — "
+            f"`{ctx['storage_base']}`"
         )
+    elif ctx["storage_backend"]:
+        prov_lines.append(f"- **Backend:** `{ctx['storage_backend']}`")
+    if ctx["size_bytes"]:
+        try:
+            sb = float(ctx["size_bytes"])
+            prov_lines.append(f"- **Exact size:** {int(sb):,} bytes ({ctx['size']})")
+        except ValueError:
+            pass
+    if ctx["digested_at"]:
+        prov_lines.append(f"- **Ingested:** {ctx['digested_at'][:10]}")
+    if ctx["stats_computed_at"]:
+        prov_lines.append(
+            f"- **Stats computed:** {ctx['stats_computed_at'][:10]}"
         )
+    prov_block = "## Provenance\n\n" + "\n".join(prov_lines) if prov_lines else ""
+    # -- Footer -----------------------------------------------------------
     footer = (
+        f"---\n\n"
+        f"_Auto-generated from "
+        f"[dataset_summary.csv]({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) "
+        f"and the [EEGDash API]({EEGDASH_API}/datasets/summary/{ctx['dataset_id']}). "
+        f"Do not edit this file by hand — update the upstream source and "
+        f"re-run `scripts/push_metadata_stubs.py`._"
     )
+    sections = [
+        frontmatter,
+        hero,
+        tldr,
+        load_block,
+        meta_table,
+        tasks_block,
+        upstream_block,
+        people_block,
+        funding_block,
+        cite_block,
+        links_block,
+        prov_block,
+        footer,
+    ]
+    return "\n\n".join(s for s in sections if s).strip() + "\n"
 def _render_pointer(ctx: dict[str, Any]) -> str: