Spaces:

t22000t
/

optcg-explorer

Sleeping

App Files Files Community

optcg-explorer / spaceutil /data.py

t22000t

Initial commit: optcg-explorer Gradio Space

3ab07bd 4 days ago

raw

history blame contribute delete

2.74 kB

	"""Load the published OPTCG embeddings corpus from HF Hub.

	Pulls `cards_with_embeddings.parquet` and `provenance.json` from the
	configured dataset repo, applies the same numpy-array-to-list coercion
	that the upstream CLI uses, and stacks the embedding column into a
	single float32 matrix that downstream code reuses without restacking.
	"""

	from __future__ import annotations

	import logging
	from pathlib import Path
	from typing import Any

	import numpy as np
	import pandas as pd
	from huggingface_hub import hf_hub_download
	from optcg_cards.provenance import EmbedProvenance, read_provenance

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Hugging Face Hub repo holding the corpus; `load_corpus` downloads from it
	# with `repo_type="dataset"`.
	REPO_ID = "t22000t/optcg-en-card-embeddings"
	# File in REPO_ID with the card rows, including the `embedding` column.
	PARQUET_FILE = "cards_with_embeddings.parquet"
	# File in REPO_ID that `load_corpus` hands to `read_provenance`.
	PROVENANCE_FILE = "provenance.json"


	def load_corpus(
	    token: str | None,
	) -> tuple[list[dict[str, Any]], np.ndarray, EmbedProvenance, dict[str, int]]:
	    """Return `(cards, matrix, embed_provenance, id_to_idx)` for the
	    published embeddings corpus.

	    Downloads `PARQUET_FILE` and `PROVENANCE_FILE` from the `REPO_ID`
	    dataset repo, stacks every card's `embedding` into one float32
	    matrix, and then strips the `embedding` key from each card dict.

	    Args:
	        token: Hub access token forwarded to `hf_hub_download`, or `None`
	            for an unauthenticated download. Only whether a token was
	            supplied is logged, never the token itself.

	    Returns:
	        A 4-tuple of:
	        - `cards`: one dict per parquet row, without the `embedding` key;
	          ndarray-valued columns are converted to plain Python lists.
	        - `matrix`: float32 array with one row per card, in the same
	          order as `cards`.
	        - `embed_provenance`: the second element returned by
	          `read_provenance` for the downloaded provenance file.
	        - `id_to_idx`: maps each card's `id` to its row index in
	          `matrix`. If an `id` occurs more than once, the last
	          occurrence wins.

	    Raises:
	        RuntimeError: If the parquet file has no rows, or if the
	            provenance file yields no embed provenance.
	    """
	    logger.info(
	        "Loading corpus from %s (authenticated=%s)",
	        REPO_ID,
	        "yes" if token else "no",
	    )
	    parquet_path = hf_hub_download(
	        repo_id=REPO_ID,
	        filename=PARQUET_FILE,
	        repo_type="dataset",
	        token=token,
	    )
	    prov_path = hf_hub_download(
	        repo_id=REPO_ID,
	        filename=PROVENANCE_FILE,
	        repo_type="dataset",
	        token=token,
	    )

	    cards = _read_parquet_records(Path(parquet_path))
	    if not cards:
	        raise RuntimeError("Embeddings parquet returned 0 rows")

	    # Stack once here so downstream code can reuse the matrix as-is.
	    matrix = np.stack(
	        [np.asarray(c["embedding"], dtype=np.float32) for c in cards],
	        axis=0,
	    )

	    # The vectors now live in `matrix`; drop the per-card copies.
	    for card in cards:
	        card.pop("embedding", None)

	    id_to_idx = {card["id"]: i for i, card in enumerate(cards)}

	    _, embed_prov = read_provenance(Path(prov_path))
	    if embed_prov is None:
	        raise RuntimeError("Embeddings provenance is missing the `embed` block")

	    return cards, matrix, embed_prov, id_to_idx


	def _read_parquet_records(path: Path) -> list[dict[str, Any]]:
	    """Read a parquet file into a list of row dicts with plain-list values.

	    Args:
	        path: Location of the parquet file to read.

	    Returns:
	        One dict per row, keyed by column name. Any value that pandas
	        materialized as a `numpy.ndarray` is replaced by `value.tolist()`;
	        all other values are passed through unchanged.
	    """
	    # Mirrors the coercion loop in optcg_cards.cli._read_parquet
	    # (cli.py:429-443). Pandas materializes list-typed parquet columns
	    # as ndarrays; downstream code expects plain Python lists.
	    df = pd.read_parquet(str(path))
	    return [
	        {
	            key: value.tolist() if isinstance(value, np.ndarray) else value
	            for key, value in record.items()
	        }
	        for record in df.to_dict(orient="records")
	    ]