# t22000t's picture
# Update archetype-map Space
# 06202d6
"""Card-document construction for embedding.
Vendored from src/sts_cards/normalize.py, keep in sync. Only the helpers
needed by Space 2 (Synergy Inspector) and Space 3 (Build Me a Deck) are
included; the full normalize_card() function lives in the parent package.
"""
from __future__ import annotations
import json
import re
from typing import Any
import numpy as np
# Ordered whitelist of card attributes exposed to the embedding model.
# Keep it identical to the parent package: if user-supplied cards were
# encoded with a different field set than the indexed cards, similarity
# scores between the two would stop being meaningful.
MECHANICS_FIELDS: tuple[str, ...] = (
    "name",
    "type",
    "rarity",
    "color",
    "cost",
    "description",
    "description_upgraded",
    "keywords",
)
def normalize_card_name_in_text(name: str | None, text: str | None) -> str:
    """Mask a card's own name inside its rules text with ``~``.

    This follows the minimaxir/mtg-embeddings convention: the description
    should not leak the card's name to the embedding model, otherwise the
    vectors are dominated by the name's surface form rather than mechanics.

    Args:
        name: The card's name; when empty or ``None`` the text is returned
            untouched.
        text: The description to scrub; when empty or ``None`` the result
            is the empty string.

    Returns:
        *text* with every case-insensitive, literal occurrence of *name*
        replaced by ``~``.
    """
    if text and name:
        # re.escape makes the name match literally, whatever punctuation
        # it contains.
        return re.sub(re.escape(name), "~", text, flags=re.IGNORECASE)
    # Either nothing to scrub or nothing to scrub it with.
    return text if text else ""
def build_card_document(row: dict[str, Any] | Any) -> str:
    """Produce the prettified-JSON string that gets embedded.

    Walks ``MECHANICS_FIELDS`` in order, copies every usable value from
    *row* into a dict and dumps it as indented JSON. Indentation is
    intentional: per minimaxir's writeup it measurably improves embedding
    quality.

    Per-field handling:
      * ``None`` and NaN values are skipped.
      * A string ``keywords`` value is parsed as JSON; if it is not valid
        JSON it is wrapped as a single-element list.
      * ``description`` / ``description_upgraded`` strings have the card's
        own name masked via ``normalize_card_name_in_text``.
      * Empty strings and empty lists are skipped.

    Args:
        row: An object exposing ``.get(key, default)`` (dict, pandas Series)
            or any object carrying the fields as attributes.

    Returns:
        A JSON object string (``indent=2``, non-ASCII kept as-is) holding
        the retained fields in ``MECHANICS_FIELDS`` order.
    """

    def get(key: str, default: Any = None) -> Any:
        # Mapping-style access when available, attribute access otherwise.
        if hasattr(row, "get"):
            return row.get(key, default)
        return getattr(row, key, default)

    def to_native(value: Any) -> Any:
        # Values pulled out of a DataFrame row can be NumPy scalars or
        # arrays. json.dumps rejects most of them, and ``array == ""`` is
        # element-wise, so the emptiness check below would raise on them.
        if isinstance(value, (np.generic, np.ndarray)):
            return value.tolist()
        return value

    raw_name = get("name")
    # A missing name in a pandas row is NaN, which is truthy; only a real
    # string can be used as the masking pattern.
    name = raw_name if isinstance(raw_name, str) else ""

    doc: dict[str, Any] = {}
    for field in MECHANICS_FIELDS:
        val = to_native(get(field))
        if val is None or (isinstance(val, float) and np.isnan(val)):
            continue
        if field == "keywords" and isinstance(val, str):
            # Keywords may arrive JSON-encoded (e.g. from a CSV column).
            try:
                val = json.loads(val) if val else []
            except json.JSONDecodeError:
                val = [val]
        if field in ("description", "description_upgraded") and isinstance(val, str):
            val = normalize_card_name_in_text(name, val)
        if val == "" or val == []:
            continue
        doc[field] = val
    return json.dumps(doc, indent=2, ensure_ascii=False)