# Provenance (scraped Hugging Face Space page chrome, kept as a comment so the
# module parses): author t22000t, commit "Update archetype-map Space", 06202d6.
"""Load the four published HuggingFace datasets and merge cards onto embeddings.
Uses huggingface_hub.hf_hub_download to fetch parquets directly, bypassing
the `datasets` library's hashing path (which references
`transformers.PreTrainedTokenizerBase` via lazy lookup; that lookup fails
on HF Spaces when transformers is in sys.modules even at known-good versions).
Files cache to ~/.cache/huggingface/hub; subsequent calls hit disk.
"""
from __future__ import annotations
from functools import lru_cache
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
# Hugging Face Hub dataset repositories, keyed by game id.
# Each value is a (cards_repo, embeddings_repo) pair; load_game() fetches
# cards.parquet from the first and embeddings.parquet from the second
# (both with repo_type="dataset").
REPOS = {
"sts1": (
"t22000t/slay-the-spire-1-cards",
"t22000t/slay-the-spire-1-card-embeddings",
),
"sts2": (
"t22000t/slay-the-spire-2-cards",
"t22000t/slay-the-spire-2-card-embeddings",
),
}
@lru_cache(maxsize=2)
def load_game(game: str) -> tuple[pd.DataFrame, np.ndarray]:
    """Return (cards_df_with_umap, embeddings_matrix) for one game.

    cards_df_with_umap has all 22 metadata columns plus umap_x, umap_y, embedding.
    embeddings_matrix is shape (n_cards, 1024), float32, unit-normalized.

    Args:
        game: one of the keys of REPOS ("sts1" or "sts2").

    Raises:
        ValueError: if `game` is not a key of REPOS.

    NOTE: results are memoized by lru_cache, so every caller receives the
    SAME DataFrame/ndarray objects — treat them as read-only.
    """
    if game not in REPOS:
        raise ValueError(f"game must be one of {list(REPOS)}, got {game!r}")
    cards_repo, emb_repo = REPOS[game]
    # hf_hub_download caches under ~/.cache/huggingface/hub; repeat calls hit disk.
    cards_path = hf_hub_download(
        repo_id=cards_repo, filename="cards.parquet", repo_type="dataset",
    )
    embs_path = hf_hub_download(
        repo_id=emb_repo, filename="embeddings.parquet", repo_type="dataset",
    )
    cards = pd.read_parquet(cards_path)
    embs = pd.read_parquet(embs_path)
    # Inner join on card id: keep only cards that have an embedding row.
    df = cards.merge(
        embs[["id", "embedding", "umap_x", "umap_y"]],
        on="id",
        how="inner",
    )
    emb = np.vstack(df["embedding"].values).astype(np.float32)
    # Enforce the unit-norm contract documented above (topk_similar's cosine
    # math depends on it). Idempotent for already-normalized rows; zero-norm
    # rows are left untouched to avoid division by zero.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    emb /= np.where(norms == 0.0, 1.0, norms)
    return df, emb
def topk_similar(
df: pd.DataFrame,
emb: np.ndarray,
query_vec: np.ndarray,
k: int = 10,
exclude_idx: int | None = None,
) -> pd.DataFrame:
"""Top-k cosine-similar rows. Vectors must be unit-normalized.
Vendored from src/sts_cards/search.py.
"""
sims = emb @ query_vec
if exclude_idx is not None:
sims[exclude_idx] = -np.inf
k = min(k, len(sims))
top_idx = np.argpartition(-sims, k - 1)[:k]
top_idx = top_idx[np.argsort(-sims[top_idx])]
columns = [c for c in ("name", "type", "rarity", "cost", "color", "description") if c in df.columns]
out = df.iloc[top_idx][columns].copy()
out.insert(0, "similarity", sims[top_idx].round(4))
return out.reset_index(drop=True)