Spaces:

EEGDash
/

catalog

Running

App Files Files Community

bruAristimunha committed on Apr 19

Commit

5c60b17

1 Parent(s): c799eda

Add metadata-stub generator — reuses eegdash API + CSV, renders HF dataset cards

Browse files

Files changed (1) hide show

scripts/push_metadata_stubs.py +556 -0

scripts/push_metadata_stubs.py ADDED Viewed

	@@ -0,0 +1,556 @@

+#!/usr/bin/env python
+"""Generate and push per-dataset metadata stubs to the ``EEGDash`` HF org.
+Lives inside the Space on purpose: the Space already vendors
+``dataset_summary.csv`` and hits the same live EEGDash API that
+``docs/source/conf.py`` uses. No rehosting of EEG data — each repo is a
+Markdown card + a small ``eegdash.json`` pointer.
+The field-priority rules mirror ``_build_dataset_context`` in the docs
+Sphinx config: CSV row wins when it has a value, otherwise fall back to
+the API response. That keeps the eegdash.org dataset pages and the HF
+stubs in lock-step — edit the CSV (or the API), both re-render the same
+way.
+Usage::
+    # Dry-run: write one stub README to /tmp/stub_preview/
+    python scripts/push_metadata_stubs.py --dataset ds002718 --dry-run
+    # Push a single stub
+    python scripts/push_metadata_stubs.py --dataset ds002718
+    # Push every row in the CSV, skipping repos that already exist
+    python scripts/push_metadata_stubs.py --all --skip-existing
+    # Sample 10 for a smoke test
+    python scripts/push_metadata_stubs.py --all --limit 10
+Requires ``huggingface-cli login`` (or ``HF_TOKEN`` env var) when pushing.
+"""
+from __future__ import annotations
+import argparse
+import ast
+import json
+import logging
+import os
+import sys
+import tempfile
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+from typing import Any, Iterable
+import pandas as pd
# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Dataset summary table expected directly under ROOT.
CSV_PATH = ROOT / "dataset_summary.csv"

# Hugging Face organisation used for stub repo ids and for the catalog Space URL.
HF_ORG = "EEGDash"
CATALOG_SPACE = "https://huggingface.co/spaces/" + HF_ORG + "/catalog"

# External endpoints referenced by the fetch logic and the rendered cards.
EEGDASH_API = "https://data.eegdash.org/api/eegdash"
EEGDASH_URL = "https://eegdash.org"
GITHUB_URL = "https://github.com/eegdash/EEGDash"

# Fixed logger name (not ``__name__``) so log lines read the same however the
# script is launched.
logger = logging.getLogger("push_metadata_stubs")
+# ---------------------------------------------------------------------------
+# Same helpers as docs/source/conf.py — lifted verbatim so the output format
+# stays in sync without a sphinx import.
+# ---------------------------------------------------------------------------
def _clean_value(value: Any) -> str:
    """Return ``value`` as a stripped string, or ``""`` when it is a blank marker.

    ``None``, empty/whitespace-only text and the case-insensitive placeholders
    ``nan``, ``none``, ``null``, ``n/a``, ``—`` and ``-`` all collapse to the
    empty string; anything else is returned as ``str(value).strip()``.
    """
    if value is None:
        return ""
    text = str(value).strip()
    placeholders = ("nan", "none", "null", "n/a", "—", "-")
    if text == "" or text.lower() in placeholders:
        return ""
    return text
def _normalize_list(value: Any) -> list[str]:
    """Coerce ``value`` into a list of stripped strings.

    * Falsy input gives ``[]``.
    * A ``list`` is stringified element-wise; blank elements are dropped.
    * A string that looks like a bracketed literal (``"[...]"``) is parsed with
      ``ast.literal_eval``; if that yields a list/tuple it is treated like a
      list, otherwise the stripped string becomes the single element.
    * Any other value becomes a one-element list holding its stripped ``str``.
    """

    def _non_blank(items: Any) -> list[str]:
        kept = []
        for item in items:
            text = str(item).strip()
            if text:
                kept.append(text)
        return kept

    if not value:
        return []
    if isinstance(value, list):
        return _non_blank(value)
    if not isinstance(value, str):
        return [str(value).strip()]

    text = value.strip()
    if text[:1] == "[" and text[-1:] == "]":
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return _non_blank(parsed)
    return [text]
def _format_hours(cell: Any) -> str:
    """Format an hours cell with thousands separators and one decimal place.

    Blank cells (as decided by ``_clean_value``) give ``""``. Text that cannot
    be parsed as a float is returned unchanged after cleaning.
    """
    text = _clean_value(cell)
    if text:
        try:
            return f"{float(text):,.1f}"
        except ValueError:
            pass
    return text
def _format_stat_counts(cell: Any) -> str:
    """Render a ``[{val, count}, ...]`` cell as ``"val (×count), ..."``.

    The cell is cleaned with ``_clean_value`` and parsed as JSON first, then as
    a Python literal (for cells written with single quotes).

    Returns:
        * ``""`` when the cell is blank, or parses to anything other than a
          non-empty list.
        * The cleaned text unchanged when it cannot be parsed at all.
        * Otherwise a comma-separated rendering of every dict entry that has a
          non-``None`` ``val``. Entries whose ``count`` is missing, empty or
          zero are rendered as the bare value.

    Whole-number floats are rendered as integers for both ``val`` and
    ``count`` (``256.0`` -> ``256``).
    """
    s = _clean_value(cell)
    if not s:
        return ""
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can raise more than ValueError/SyntaxError on
            # malformed input (e.g. TypeError for an unhashable dict key);
            # treat all of them as "not parseable" rather than crashing.
            return s
    if not isinstance(parsed, list) or not parsed:
        return ""

    def _whole_as_int(number: Any) -> Any:
        if isinstance(number, float) and number.is_integer():
            return int(number)
        return number

    entries = []
    for row in parsed:
        if not isinstance(row, dict):
            continue
        val = row.get("val")
        if val is None:
            continue
        val = _whole_as_int(val)
        count = _whole_as_int(row.get("count"))
        if count in (None, "", 0):
            entries.append(str(val))
        else:
            entries.append(f"{val} (×{count})")
    return ", ".join(entries)
+# ---------------------------------------------------------------------------
+# API fetch — same endpoint as docs, same failure-is-fine policy.
+# ---------------------------------------------------------------------------
def _fetch_api_summary(dataset_id: str, timeout: float = 10.0) -> dict[str, Any]:
    """Fetch the summary record for ``dataset_id`` from the EEGDash API.

    Spelling variants of the id are tried in order: the id as given, then a
    lower-cased form for ``ds…`` ids or the ``EEG2025r<N>`` form for ids that
    start with ``eeg2025r`` (case-insensitive). The first response whose
    ``success`` field is truthy wins.

    Args:
        dataset_id: Dataset identifier to look up.
        timeout: Per-request timeout in seconds.

    Returns:
        The response's ``data`` mapping, or ``{}`` when every variant fails or
        the payload is not a mapping. Network, HTTP and decoding errors are
        logged at DEBUG level and never raised (failure is fine).
    """
    # Function-scope imports: the module header only imports urllib.error /
    # urllib.request, so these names are brought in here.
    from http.client import HTTPException
    from urllib.parse import quote

    lowered = dataset_id.lower()
    variants = [dataset_id]
    if dataset_id.startswith("ds"):
        variants.append(lowered)
    elif lowered.startswith("eeg2025r"):
        variants.append(f"EEG2025r{lowered.replace('eeg2025r', '')}")

    # dict.fromkeys de-duplicates while keeping order, so an already-lowercase
    # id is not requested twice.
    for vid in dict.fromkeys(variants):
        url = f"{EEGDASH_API}/datasets/summary/{quote(vid, safe='')}"
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except (OSError, ValueError, HTTPException) as exc:
            # OSError covers URLError/HTTPError/timeouts/connection resets;
            # ValueError covers JSON and UTF-8 decoding failures.
            logger.debug("API %s failed: %s", vid, exc)
            continue
        if isinstance(data, dict) and data.get("success"):
            payload = data.get("data")
            return payload if isinstance(payload, dict) else {}
    return {}
+# ---------------------------------------------------------------------------
+# Context builder — CSV row first, API second. Mirrors conf.py field order.
+# ---------------------------------------------------------------------------
def _build_context(row: pd.Series) -> dict[str, Any]:
    """Build the rendering context for one dataset row.

    The CSV ``row`` takes precedence; the live API response (fetched via
    ``_fetch_api_summary``) fills gaps. Authors, references, the
    acknowledgement text, the source URL and the creation year come from the
    API only.

    Args:
        row: One row of the dataset summary table.

    Returns:
        A dict of plain strings / lists of strings, consumed by
        ``_render_readme`` and ``_render_pointer``. Missing values are empty
        strings, except ``title`` (falls back to the dataset id), ``license``
        (``"Unknown"``) and ``source`` (``"OpenNeuro"``).
    """
    dataset_id = _clean_value(row.get("dataset")).lower()
    api = _fetch_api_summary(dataset_id)

    def pick(row_key: str, api_key: str = "") -> str:
        """Return the CSV count if present and non-zero, else the API value."""
        v = _clean_value(row.get(row_key))
        if v:
            try:
                number = float(v)
            except ValueError:
                number = None
            # Whole-number floats such as "12.0" (how pandas yields an int
            # column with gaps) are rendered as integers; any spelling of
            # zero counts as "missing".
            if number is not None and number.is_integer():
                v = str(int(number)) if number else ""
        if v:
            return v
        if api_key:
            return _clean_value(api.get(api_key))
        return ""

    title = _clean_value(row.get("dataset_title")) or _clean_value(
        api.get("computed_title") or api.get("name")
    )

    doi = _clean_value(row.get("doi")) or _clean_value(api.get("dataset_doi"))
    # DOIs sometimes ship with a "doi:" prefix or as a full resolver URL —
    # strip either so the rendered https://doi.org/ link doesn't double up.
    doi_prefixes = (
        "doi:",
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    )
    doi_lower = doi.lower()
    for prefix in doi_prefixes:
        if doi_lower.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break

    license_ = _clean_value(row.get("license")) or _clean_value(api.get("license"))
    authors = _normalize_list(api.get("authors"))
    source = _clean_value(row.get("source")) or "OpenNeuro"

    # Year from API timestamps (first four characters of the creation stamp).
    year = ""
    ts = api.get("timestamps")
    if not isinstance(ts, dict):
        ts = {}
    created = ts.get("dataset_created_at") or ""
    if isinstance(created, str) and len(created) >= 4:
        year = created[:4]

    return {
        "dataset_id": dataset_id,
        "title": title or dataset_id,
        "author_year": _clean_value(row.get("author_year")),
        "authors": authors,
        "year": year,
        "license": license_ or "Unknown",
        "doi": doi,
        "source": source,
        "openneuro_url": f"https://openneuro.org/datasets/{dataset_id}",
        "nemar_url": f"https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}",
        "source_url": _clean_value(api.get("source_url")),
        "record_modality": _clean_value(row.get("record_modality")),
        "modality_exp": _clean_value(row.get("modality of exp")),
        "type_exp": _clean_value(row.get("type of exp")),
        "pathology": _clean_value(row.get("Type Subject")),
        "n_subjects": pick("n_subjects", "n_subjects"),
        "n_records": pick("n_records", "total_files"),
        "n_tasks": pick("n_tasks", "n_tasks"),
        "n_channels": _format_stat_counts(row.get("nchans_set")),
        "sampling_freqs": _format_stat_counts(row.get("sampling_freqs")),
        "size": _clean_value(row.get("size")),
        "duration_hours_total": _format_hours(row.get("duration_hours_total")),
        "references": _normalize_list(api.get("references")),
        "how_to_acknowledge": _clean_value(api.get("how_to_acknowledge")),
    }
+# ---------------------------------------------------------------------------
+# Render a HF Dataset Card (README.md) from the context.
+# ---------------------------------------------------------------------------
HF_LICENSE_MAP = {
    # HF's vetted SPDX-ish identifiers. Unknown values map to "other".
    "cc0": "cc0-1.0",
    "cc0-1.0": "cc0-1.0",
    "cc-by-4.0": "cc-by-4.0",
    "cc-by-sa-4.0": "cc-by-sa-4.0",
    "cc-by-nc-4.0": "cc-by-nc-4.0",
    "cc-by-nc-sa-4.0": "cc-by-nc-sa-4.0",
    "mit": "mit",
    "apache-2.0": "apache-2.0",
    "bsd-3-clause": "bsd-3-clause",
}


def _contains_token(text: str, token: str) -> bool:
    """Return True if ``token`` occurs in ``text`` bounded by non-alphanumerics."""
    start = text.find(token)
    while start != -1:
        end = start + len(token)
        left_ok = start == 0 or not text[start - 1].isalnum()
        right_ok = end == len(text) or not text[end].isalnum()
        if left_ok and right_ok:
            return True
        start = text.find(token, start + 1)
    return False


def _hf_license(raw: str) -> str:
    """Map a free-text license string to a key from ``HF_LICENSE_MAP``.

    The text is lower-cased and underscores/spaces become dashes, so
    ``"CC BY 4.0"`` matches ``"cc-by-4.0"``. A key only matches as a whole
    token: a bare substring test would classify "Permitted …" or "Limited …"
    as ``mit``. Longer keys are tried first so the most specific wins.

    Args:
        raw: License text; empty or ``None`` is accepted.

    Returns:
        The mapped identifier, or ``"other"`` when nothing matches.
    """
    norm = (raw or "").strip().lower().replace("_", "-").replace(" ", "-")
    for key in sorted(HF_LICENSE_MAP, key=len, reverse=True):
        if _contains_token(norm, key):
            return HF_LICENSE_MAP[key]
    return "other"
def _size_category(n_records: str) -> str:
    """Map a record count to a Hugging Face ``size_categories`` bucket.

    Args:
        n_records: The count as text (or a number). Float-like spellings such
            as ``"123.0"`` and thousands separators (``"1,234"``) are accepted.

    Returns:
        A bucket label such as ``"n<1K"`` or ``"1K<n<10K"``, or ``"unknown"``
        when the count cannot be parsed.
    """
    try:
        n = int(n_records)
    except (TypeError, ValueError):
        try:
            n = int(float(str(n_records).replace(",", "")))
        except (TypeError, ValueError, OverflowError):
            return "unknown"

    # (exclusive upper bound, label), smallest first.
    buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper, label in buckets:
        if n < upper:
            return label
    return "n>1T"
def _render_readme(ctx: dict[str, Any]) -> str:
    """Render the Hugging Face dataset card (``README.md``) for one dataset.

    The card is a YAML front-matter block (tags, license, size category,
    pretty name, up to eight authors) followed by a Markdown body: title line,
    loading instructions, a metadata table, links, and either the upstream
    acknowledgement text or up to five references.

    Free text from upstream is escaped for the format it lands in: YAML
    scalars are double-quoted, table cells have pipes escaped and newlines
    collapsed, and every line of a multi-line acknowledgement is kept inside
    the block quote.

    Args:
        ctx: Context dict as produced by ``_build_context``.

    Returns:
        The complete README text.
    """

    def _yaml_str(text: Any) -> str:
        # A JSON string literal is also a valid YAML double-quoted scalar, so
        # quotes, colons and backslashes in titles/names cannot break the
        # front matter.
        return json.dumps(str(text), ensure_ascii=False)

    def _one_line(text: Any) -> str:
        # Collapse all whitespace runs (including newlines) to single spaces.
        return " ".join(str(text).split())

    def _cell(text: Any) -> str:
        # A raw "|" or newline would split the Markdown table row.
        return _one_line(text).replace("|", "\\|")

    # --- Front matter -------------------------------------------------------
    tags = ["neuroscience", "eegdash", "brain-computer-interface"]
    rm = ctx["record_modality"].lower()
    # The recording modality leads the tag list; anything unrecognised is
    # tagged as EEG.
    tags.insert(0, rm if rm in {"eeg", "meg", "ieeg"} else "eeg")
    if ctx["modality_exp"]:
        tags.append(ctx["modality_exp"].lower().replace(" ", "-"))
    if ctx["pathology"] and ctx["pathology"].lower() not in {"unknown", "healthy"}:
        tags.append(ctx["pathology"].lower().replace(" ", "-").replace("/", "-"))

    license_slug = _hf_license(ctx["license"])
    size_cat = _size_category(ctx["n_records"])
    yaml_tags = "\n".join(f"- {t}" for t in tags)
    pretty_name = _yaml_str(ctx["title"] or ctx["dataset_id"])
    yaml_authors = ""
    if ctx["authors"]:
        yaml_authors = "authors:\n" + "\n".join(
            f"  - {_yaml_str(a)}" for a in ctx["authors"][:8]
        ) + "\n"

    # --- Body -------------------------------------------------------------
    hero_lines = []
    if ctx["title"] and ctx["title"].lower() != ctx["dataset_id"].lower():
        hero_lines.append(f"# {ctx['title']}")
    else:
        hero_lines.append(f"# {ctx['dataset_id']}")
    if ctx["author_year"]:
        hero_lines.append(f"*{ctx['author_year']}*")
    elif ctx["authors"]:
        head = ctx["authors"][0]
        extra = " et al." if len(ctx["authors"]) > 1 else ""
        yr = f" ({ctx['year']})" if ctx["year"] else ""
        hero_lines.append(f"*{head}{extra}{yr}*")
    hero = "\n\n".join(hero_lines)

    load_block = f"""## Load this dataset
This repo is a **pointer** — the raw EEG data lives at its canonical source
(OpenNeuro / NEMAR). [EEGDash](https://github.com/eegdash/EEGDash) handles the
download, caching, and conversion to a PyTorch / braindecode dataset.
```python
# pip install eegdash
from eegdash import EEGDashDataset
ds = EEGDashDataset(dataset="{ctx['dataset_id']}", cache_dir="./cache")
print(len(ds), "recordings")
```
Need it in braindecode's HF-native Zarr format? Once mirrored
(`ds.push_to_hub(...)`) you can also do:
```python
from braindecode.datasets import BaseConcatDataset
ds = BaseConcatDataset.pull_from_hub("{HF_ORG}/{ctx['dataset_id']}")
```
"""

    rows = [
        ("Subjects", ctx["n_subjects"]),
        ("Recordings", ctx["n_records"]),
        ("Tasks", ctx["n_tasks"]),
        ("Channels", ctx["n_channels"]),
        ("Sampling rate (Hz)", ctx["sampling_freqs"]),
        ("Size on disk", ctx["size"]),
        ("Total duration (h)", ctx["duration_hours_total"]),
        ("Experimental modality", ctx["modality_exp"]),
        ("Experimental type", ctx["type_exp"]),
        ("Population", ctx["pathology"]),
        ("Recording type", ctx["record_modality"].upper()),
        ("Source", ctx["source"]),
        ("License", ctx["license"]),
    ]
    # Empty rows are dropped, except Source and License which always appear.
    md_rows = "\n".join(
        f"| **{k}** | {_cell(v) if v else '—'} |"
        for k, v in rows
        if v or k in {"Source", "License"}
    )
    meta_table = f"""## Dataset metadata
| | |
|---|---|
{md_rows}
"""

    links = []
    if ctx["doi"]:
        links.append(f"- **DOI:** [{ctx['doi']}](https://doi.org/{ctx['doi']})")
    if ctx["source"].lower() == "openneuro":
        links.append(f"- **OpenNeuro:** [{ctx['dataset_id']}]({ctx['openneuro_url']})")
    if ctx["source"].lower() == "nemar":
        links.append(f"- **NEMAR:** [{ctx['dataset_id']}]({ctx['nemar_url']})")
    if ctx["source_url"]:
        links.append(f"- **Source:** <{ctx['source_url']}>")
    links.append(f"- **Browse 700+ datasets:** [EEGDash catalog]({CATALOG_SPACE})")
    links.append(f"- **Docs:** <{EEGDASH_URL}>")
    links.append(f"- **Code:** <{GITHUB_URL}>")
    links_block = "## Links\n\n" + "\n".join(links)

    cite_block = ""
    if ctx["how_to_acknowledge"]:
        # Prefix every line so a multi-line acknowledgement stays inside the
        # block quote.
        quoted = "\n".join(
            ("> " + line).rstrip()
            for line in ctx["how_to_acknowledge"].strip().splitlines()
        )
        cite_block = (
            "## How to cite\n\n"
            "Please follow the upstream dataset's citation policy:\n\n"
            f"{quoted}\n"
        )
    elif ctx["references"]:
        cite_block = "## References\n\n" + "\n".join(
            f"- {_one_line(r)}" for r in ctx["references"][:5]
        )

    footer = (
        "\n---\n\n"
        "_This repo is auto-generated from [dataset_summary.csv]"
        f"({GITHUB_URL}/blob/main/eegdash/dataset/dataset_summary.csv) + the "
        "EEGDash API. Edit the upstream source, not this file._"
    )

    return f"""---
tags:
{yaml_tags}
license: {license_slug}
size_categories:
- {size_cat}
pretty_name: {pretty_name}
{yaml_authors}---
{hero}
{load_block}
{meta_table}
{links_block}
{cite_block}
{footer}
"""
def _render_pointer(ctx: dict[str, Any]) -> str:
    """Render the machine-readable ``eegdash.json`` sibling of the card.

    Args:
        ctx: Context dict as produced by ``_build_context``.

    Returns:
        Pretty-printed JSON (2-space indent, non-ASCII kept as-is) with a
        trailing newline. ``source_url`` is the API-provided URL when there is
        one; otherwise the NEMAR page for NEMAR-sourced datasets and the
        OpenNeuro page for everything else.
    """
    source_url = ctx["source_url"]
    if not source_url:
        # ``openneuro_url`` is always populated, so a plain ``or`` chain would
        # never reach the NEMAR link; choose by declared source instead.
        if ctx["source"].lower() == "nemar":
            source_url = ctx["nemar_url"] or ctx["openneuro_url"]
        else:
            source_url = ctx["openneuro_url"] or ctx["nemar_url"]

    payload = {
        "dataset_id": ctx["dataset_id"],
        "title": ctx["title"],
        "source": ctx["source"],
        "source_url": source_url,
        "doi": ctx["doi"],
        "license": ctx["license"],
        "loader": {
            "library": "eegdash",
            "class": "EEGDashDataset",
            "kwargs": {"dataset": ctx["dataset_id"]},
        },
        "catalog": CATALOG_SPACE,
        "generated_by": "huggingface-space/scripts/push_metadata_stubs.py",
    }
    return json.dumps(payload, indent=2, ensure_ascii=False) + "\n"
+# ---------------------------------------------------------------------------
+# Push logic.
+# ---------------------------------------------------------------------------
def _iter_slugs(df: pd.DataFrame, args: argparse.Namespace) -> Iterable[pd.Series]:
    """Yield the rows of ``df`` selected by the command-line arguments.

    * ``args.dataset`` set: rows whose ``dataset`` column matches one of the
      given slugs, compared case-insensitively. Takes precedence over
      ``args.all``.
    * ``args.all`` set: every row, or only the first ``args.limit`` rows when
      the limit is non-zero.

    Raises:
        SystemExit: when neither selector was given. Because this is a
            generator, the error surfaces on first iteration.
    """
    if args.dataset:
        wanted = {slug.lower() for slug in args.dataset}
        for _, record in df.iterrows():
            if str(record["dataset"]).lower() in wanted:
                yield record
    elif args.all:
        subset = df.head(args.limit) if args.limit else df
        for _, record in subset.iterrows():
            yield record
    else:
        raise SystemExit("Pass --dataset <slug> [...] or --all")
def _push_one(ctx: dict[str, Any], args: argparse.Namespace) -> str:
    """Create (if needed) the dataset repo for ``ctx`` and upload its two files.

    The repo is ``<HF_ORG>/<dataset_id>``. ``README.md`` and ``eegdash.json``
    are written to a temporary directory and uploaded as one commit.

    Args:
        ctx: Context dict as produced by ``_build_context``.
        args: Parsed CLI arguments; ``token`` and ``private`` are read.

    Returns:
        The repo id that was pushed to.
    """
    from huggingface_hub import HfApi  # noqa: WPS433

    client = HfApi(token=args.token)
    dataset_id = ctx["dataset_id"]
    repo_id = f"{HF_ORG}/{dataset_id}"
    client.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        exist_ok=True,
        private=args.private,
    )

    with tempfile.TemporaryDirectory() as workdir:
        staging = Path(workdir)
        (staging / "README.md").write_text(_render_readme(ctx), encoding="utf-8")
        (staging / "eegdash.json").write_text(_render_pointer(ctx), encoding="utf-8")
        client.upload_folder(
            repo_id=repo_id,
            folder_path=workdir,
            repo_type="dataset",
            commit_message=f"Metadata stub for {dataset_id}",
        )
    return repo_id
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: render stubs and either preview or push them.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success (including dry-run), ``1`` if any push failed.

    Raises:
        SystemExit: when no selector is given or no CSV row matches.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--dataset", nargs="+", help="One or more slugs.")
    parser.add_argument("--all", action="store_true", help="Every row in the CSV.")
    parser.add_argument("--limit", type=int, default=0, help="Cap --all to N rows.")
    parser.add_argument("--skip-existing", action="store_true")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Write up to three stub READMEs + pointers to --dry-run-out, no push.",
    )
    parser.add_argument("--dry-run-out", type=Path, default=Path("/tmp/stub_preview"))
    parser.add_argument("--private", action="store_true")
    parser.add_argument("--token", default=os.environ.get("HF_TOKEN"))
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s — %(message)s",
    )

    df = pd.read_csv(CSV_PATH)
    rows = list(_iter_slugs(df, args))
    if not rows:
        raise SystemExit("No rows matched the given slugs.")

    if args.dry_run:
        args.dry_run_out.mkdir(parents=True, exist_ok=True)
        # Preview at most three datasets so an accidental --all stays cheap.
        for r in rows[:3]:
            ctx = _build_context(r)
            (args.dry_run_out / f"{ctx['dataset_id']}_README.md").write_text(
                _render_readme(ctx), encoding="utf-8"
            )
            (args.dry_run_out / f"{ctx['dataset_id']}_eegdash.json").write_text(
                _render_pointer(ctx), encoding="utf-8"
            )
            logger.info("Wrote dry-run preview for %s", ctx["dataset_id"])
        logger.info("Dry-run output: %s", args.dry_run_out)
        return 0

    existing: set[str] = set()
    if args.skip_existing:
        from huggingface_hub import HfApi  # noqa: WPS433

        # Authenticate with the same token used for pushing, and lower-case
        # the names so they compare equal to the lower-cased slugs below.
        # No ``limit``: the whole org listing is needed to skip reliably.
        existing = {
            r.id.split("/", 1)[-1].lower()
            for r in HfApi(token=args.token).list_datasets(author=HF_ORG)
        }

    failed: list[tuple[str, str]] = []
    pushed = 0
    skipped = 0
    for r in rows:
        slug = str(r["dataset"]).strip().lower()
        if slug in existing:
            logger.info("skipping %s (exists)", slug)
            skipped += 1
            continue
        try:
            ctx = _build_context(r)
            repo_id = _push_one(ctx, args)
            logger.info("pushed %s", repo_id)
            pushed += 1
        except Exception as exc:  # noqa: BLE001
            # Top-level boundary: one bad dataset must not abort the batch.
            logger.exception("failed %s", slug)
            failed.append((slug, str(exc)))
        # Be polite to the API and HF.
        time.sleep(0.25)

    if skipped:
        logger.info("%d existing repos skipped", skipped)
    if failed:
        logger.error("%d failures:", len(failed))
        for slug, err in failed:
            logger.error("  %s — %s", slug, err)
        return 1
    logger.info("done — %d stubs processed", pushed)
    return 0
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())