Spaces:
Sleeping
Sleeping
| """Load the four published HuggingFace datasets and merge cards onto embeddings. | |
| Uses huggingface_hub.hf_hub_download to fetch parquets directly, bypassing | |
| the `datasets` library's hashing path (which references | |
| `transformers.PreTrainedTokenizerBase` via lazy lookup; that lookup fails | |
| on HF Spaces when transformers is in sys.modules even at known-good versions). | |
| Files cache to ~/.cache/huggingface/hub; subsequent calls hit disk. | |
| """ | |
| from __future__ import annotations | |
| from functools import lru_cache | |
| import numpy as np | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download | |
# Game key -> (cards dataset repo, embeddings dataset repo) on the HF Hub.
# Each game publishes two datasets: card metadata and precomputed embeddings.
REPOS = {
    "sts1": (
        "t22000t/slay-the-spire-1-cards",
        "t22000t/slay-the-spire-1-card-embeddings",
    ),
    "sts2": (
        "t22000t/slay-the-spire-2-cards",
        "t22000t/slay-the-spire-2-card-embeddings",
    ),
}
@lru_cache(maxsize=None)
def load_game(game: str) -> tuple[pd.DataFrame, np.ndarray]:
    """Return (cards_df_with_umap, embeddings_matrix) for one game.

    cards_df_with_umap has all 22 metadata columns plus umap_x, umap_y, embedding.
    embeddings_matrix is shape (n_cards, 1024), float32, unit-normalized.

    Results are memoized in-process via lru_cache (the import was previously
    unused) on top of huggingface_hub's on-disk cache, so repeat calls skip
    the parquet re-read entirely. Callers must treat the returned DataFrame
    and array as read-only, since they are shared between calls.

    Raises:
        ValueError: if ``game`` is not a key of REPOS.
    """
    if game not in REPOS:
        raise ValueError(f"game must be one of {list(REPOS)}, got {game!r}")
    cards_repo, emb_repo = REPOS[game]
    # hf_hub_download caches under ~/.cache/huggingface/hub and returns
    # a local file path; subsequent processes hit disk, not the network.
    cards_path = hf_hub_download(
        repo_id=cards_repo, filename="cards.parquet", repo_type="dataset",
    )
    embs_path = hf_hub_download(
        repo_id=emb_repo, filename="embeddings.parquet", repo_type="dataset",
    )
    cards = pd.read_parquet(cards_path)
    embs = pd.read_parquet(embs_path)
    # Inner join: keep only cards that actually have an embedding row.
    df = cards.merge(
        embs[["id", "embedding", "umap_x", "umap_y"]],
        on="id",
        how="inner",
    )
    # Stack the per-row embedding lists into one dense (n_cards, dim) matrix.
    emb = np.vstack(df["embedding"].to_numpy()).astype(np.float32)
    return df, emb
| def topk_similar( | |
| df: pd.DataFrame, | |
| emb: np.ndarray, | |
| query_vec: np.ndarray, | |
| k: int = 10, | |
| exclude_idx: int | None = None, | |
| ) -> pd.DataFrame: | |
| """Top-k cosine-similar rows. Vectors must be unit-normalized. | |
| Vendored from src/sts_cards/search.py. | |
| """ | |
| sims = emb @ query_vec | |
| if exclude_idx is not None: | |
| sims[exclude_idx] = -np.inf | |
| k = min(k, len(sims)) | |
| top_idx = np.argpartition(-sims, k - 1)[:k] | |
| top_idx = top_idx[np.argsort(-sims[top_idx])] | |
| columns = [c for c in ("name", "type", "rarity", "cost", "color", "description") if c in df.columns] | |
| out = df.iloc[top_idx][columns].copy() | |
| out.insert(0, "similarity", sims[top_idx].round(4)) | |
| return out.reset_index(drop=True) | |