"""Load the published OPTCG embeddings corpus from HF Hub. Pulls `cards_with_embeddings.parquet` and `provenance.json` from the configured dataset repo, applies the same numpy-array-to-list coercion that the upstream CLI uses, and stacks the embedding column into a single float32 matrix that downstream code reuses without restacking. """ from __future__ import annotations import logging from pathlib import Path from typing import Any import numpy as np import pandas as pd from huggingface_hub import hf_hub_download from optcg_cards.provenance import EmbedProvenance, read_provenance logger = logging.getLogger(__name__) REPO_ID = "t22000t/optcg-en-card-embeddings" PARQUET_FILE = "cards_with_embeddings.parquet" PROVENANCE_FILE = "provenance.json" def load_corpus( token: str | None, ) -> tuple[list[dict[str, Any]], np.ndarray, EmbedProvenance, dict[str, int]]: """Return `(cards, matrix, embed_provenance, id_to_idx)` for the published embeddings corpus. The `embedding` column is stripped from `cards` after stacking into `matrix`. All list-typed columns are coerced to plain Python lists. The token is passed to `hf_hub_download` but never written to logs. """ logger.info( "Loading corpus from %s (authenticated=%s)", REPO_ID, "yes" if token else "no", ) parquet_path = hf_hub_download( repo_id=REPO_ID, filename=PARQUET_FILE, repo_type="dataset", token=token, ) prov_path = hf_hub_download( repo_id=REPO_ID, filename=PROVENANCE_FILE, repo_type="dataset", token=token, ) cards = _read_parquet_records(Path(parquet_path)) if not cards: raise RuntimeError("Embeddings parquet returned 0 rows") matrix = np.stack( [np.asarray(c["embedding"], dtype=np.float32) for c in cards], axis=0, ) for card in cards: card.pop("embedding", None) id_to_idx = {card["id"]: i for i, card in enumerate(cards)} _, embed_prov = read_provenance(Path(prov_path)) if embed_prov is None: raise RuntimeError("Embeddings provenance is missing the `embed` block") return cards, matrix, embed_prov, id_to_idx def _read_parquet_records(path: Path) -> list[dict[str, Any]]: # Mirrors the coercion loop in optcg_cards.cli._read_parquet # (cli.py:429-443). Pandas materializes list-typed parquet columns # as ndarrays; downstream code expects plain Python lists. df = pd.read_parquet(str(path)) records = df.to_dict(orient="records") for record in records: for key, value in record.items(): if isinstance(value, np.ndarray): record[key] = value.tolist() return records