Spaces:
Sleeping
Sleeping
| """Card-document construction for embedding. | |
| Vendored from src/sts_cards/normalize.py, keep in sync. Only the helpers | |
| needed by Space 2 (Synergy Inspector) and Space 3 (Build Me a Deck) are | |
| included; the full normalize_card() function lives in the parent package. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from typing import Any | |
| import numpy as np | |
# The card attributes exposed to the embedding model. This tuple has to stay
# identical to the one in the parent package: if user-supplied cards were
# encoded with a different field set than the indexed cards, similarity
# scores between the two would stop being meaningful.
MECHANICS_FIELDS: tuple[str, ...] = (
    "name",
    "type",
    "rarity",
    "color",
    "cost",
    "description",
    "description_upgraded",
    "keywords",
)
def normalize_card_name_in_text(name: str | None, text: str | None) -> str:
    """Replace the card's own name in its description with `~`.

    Following the minimaxir/mtg-embeddings convention: card text shouldn't
    leak the card's name to the embedding model, otherwise vectors are
    dominated by name surface form rather than mechanics.

    Args:
        name: The card's name. It is matched literally (regex metacharacters
            are escaped) and case-insensitively. ``None``, ``""`` and any
            non-string value (for example a float NaN placeholder) are
            treated as "no name".
        text: The description text to scrub.

    Returns:
        ``text`` with every occurrence of ``name`` replaced by ``~``;
        ``text`` unchanged when there is no usable name; ``""`` when
        ``text`` is empty or ``None``.
    """
    if not text:
        return ""
    # A float NaN is truthy, so a plain `not name` check lets it through and
    # re.escape() then raises TypeError. Only real, non-empty strings are
    # usable as a pattern.
    if not name or not isinstance(name, str):
        return text
    return re.sub(re.escape(name), "~", text, flags=re.IGNORECASE)
def build_card_document(row: dict[str, Any] | Any) -> str:
    """Produce the prettified-JSON string that gets embedded.

    Indentation is intentional, measurably improves embedding quality
    per minimaxir's writeup. Accepts both dicts and pandas Series.

    Only the fields listed in ``MECHANICS_FIELDS`` are emitted, in that
    order. A field is left out when its value is ``None``, a float NaN, an
    empty string or an empty list.

    Args:
        row: The card record. Values are read with ``row.get(key)`` when the
            object has a ``get`` method, otherwise with ``getattr``.

    Returns:
        A JSON object serialized with ``indent=2`` and ``ensure_ascii=False``.
    """
    def get(key: str, default: Any = None) -> Any:
        # Objects with .get() are read by key; anything else by attribute.
        if hasattr(row, "get"):
            return row.get(key, default)
        return getattr(row, key, default)

    # A missing name may arrive as float NaN, which is truthy and would
    # survive a plain `or ""`, then crash the regex-based name scrubbing.
    # Anything that is not a string counts as "no name".
    name = get("name")
    if not isinstance(name, str):
        name = ""

    doc: dict[str, Any] = {}
    for field in MECHANICS_FIELDS:
        val = get(field)
        if val is None or (isinstance(val, float) and np.isnan(val)):
            continue
        if field == "keywords" and isinstance(val, str):
            # Keywords may be stored as a JSON-encoded string; a string that
            # is not valid JSON is kept as a single keyword.
            try:
                val = json.loads(val) if val else []
            except json.JSONDecodeError:
                val = [val]
        if field in ("description", "description_upgraded") and isinstance(val, str):
            # Hide the card's own name so the text describes mechanics only.
            val = normalize_card_name_in_text(name, val)
        if val == "" or val == []:
            continue
        doc[field] = val
    return json.dumps(doc, indent=2, ensure_ascii=False)