# Source: t22000t/optcg-deck-builder Gradio Space — "Initial commit" (16eaadc)
"""Load the published OPTCG embeddings corpus from HF Hub.
Pulls `cards_with_embeddings.parquet` and `provenance.json` from the
configured dataset repo, applies the same numpy-array-to-list coercion
that the upstream CLI uses, and stacks the embedding column into a
single float32 matrix that downstream code reuses without restacking.
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from optcg_cards.provenance import EmbedProvenance, read_provenance
logger = logging.getLogger(__name__)
REPO_ID = "t22000t/optcg-en-card-embeddings"
PARQUET_FILE = "cards_with_embeddings.parquet"
PROVENANCE_FILE = "provenance.json"
def load_corpus(
    token: str | None,
) -> tuple[list[dict[str, Any]], np.ndarray, EmbedProvenance, dict[str, int]]:
    """Fetch the published embeddings corpus from the Hub.

    Returns `(cards, matrix, embed_provenance, id_to_idx)`. The
    `embedding` column is stacked into `matrix` (float32) and then
    removed from each card dict; list-typed columns are plain Python
    lists. The token is forwarded to `hf_hub_download` but never
    written to logs.

    Raises:
        RuntimeError: if the parquet has no rows or the provenance
            file lacks an `embed` block.
    """
    logger.info(
        "Loading corpus from %s (authenticated=%s)",
        REPO_ID,
        "yes" if token else "no",
    )

    def _fetch(filename: str) -> str:
        # Both artifacts live in the same dataset repo; only the
        # filename differs between the two downloads.
        return hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
            token=token,
        )

    # Download both artifacts up front, then parse.
    parquet_path = Path(_fetch(PARQUET_FILE))
    provenance_path = Path(_fetch(PROVENANCE_FILE))

    cards = _read_parquet_records(parquet_path)
    if not cards:
        raise RuntimeError("Embeddings parquet returned 0 rows")

    vectors = [np.asarray(card["embedding"], dtype=np.float32) for card in cards]
    matrix = np.stack(vectors, axis=0)
    # Embeddings now live only in `matrix`; drop the per-card copies.
    for card in cards:
        card.pop("embedding", None)

    id_to_idx = {card["id"]: position for position, card in enumerate(cards)}

    _, embed_prov = read_provenance(provenance_path)
    if embed_prov is None:
        raise RuntimeError("Embeddings provenance is missing the `embed` block")
    return cards, matrix, embed_prov, id_to_idx
def _read_parquet_records(path: Path) -> list[dict[str, Any]]:
    """Read *path* into one dict per row, coercing ndarray cells to lists.

    Mirrors the coercion loop in optcg_cards.cli._read_parquet
    (cli.py:429-443): pandas materializes list-typed parquet columns
    as ndarrays, while downstream code expects plain Python lists.
    """
    frame = pd.read_parquet(str(path))
    return [
        {
            column: (cell.tolist() if isinstance(cell, np.ndarray) else cell)
            for column, cell in row.items()
        }
        for row in frame.to_dict(orient="records")
    ]