File size: 2,347 Bytes
06202d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Card-document construction for embedding.

Vendored from src/sts_cards/normalize.py, keep in sync. Only the helpers
needed by Space 2 (Synergy Inspector) and Space 3 (Build Me a Deck) are
included; the full normalize_card() function lives in the parent package.
"""

from __future__ import annotations

import json
import re
from typing import Any

import numpy as np

# The card fields exposed to the embedding model, listed in the order they
# are emitted. This tuple has to stay identical to the one in the parent
# package: if user-supplied cards were encoded with a different field set
# than the indexed cards, similarity scores would stop being meaningful.
MECHANICS_FIELDS: tuple[str, ...] = (
    "name",
    "type",
    "rarity",
    "color",
    "cost",
    "description",
    "description_upgraded",
    "keywords",
)


def normalize_card_name_in_text(name: str | None, text: str | None) -> str:
    """Mask every occurrence of a card's own name in its text with `~`.

    This follows the minimaxir/mtg-embeddings convention: if the card text
    leaks the card's name to the embedding model, the vectors end up
    dominated by the name's surface form rather than by the mechanics.

    Matching is case-insensitive and literal (the name is regex-escaped).
    A missing or empty `text` yields `""`; a missing or empty `name`
    returns `text` unchanged.
    """
    if text and name:
        literal_name = re.escape(name)
        return re.sub(literal_name, "~", text, flags=re.IGNORECASE)
    # Either there is nothing to mask, or there is no text at all.
    return text or ""


def build_card_document(row: dict[str, Any] | Any) -> str:
    """Produce the prettified-JSON string that gets embedded.

    Only the fields named in ``MECHANICS_FIELDS`` are emitted, in that
    order. A field is left out when it is missing, ``None``, a float NaN,
    an empty string or an empty list. The indentation is intentional: it
    measurably improves embedding quality per minimaxir's writeup.

    Args:
        row: Either an object exposing ``.get(key, default)`` (a dict or a
            pandas Series) or any object whose fields are attributes.

    Returns:
        The card as a JSON object rendered with ``indent=2`` and with
        non-ASCII characters kept as-is.
    """
    def get(key: str, default: Any = None) -> Any:
        if hasattr(row, "get"):
            return row.get(key, default)
        return getattr(row, key, default)

    # Only a real string can be masked out of the descriptions. A truthy
    # non-string name (e.g. float NaN, which the loop below already treats
    # as "missing") would otherwise reach re.escape() and raise TypeError.
    raw_name = get("name")
    name = raw_name if isinstance(raw_name, str) else ""

    doc: dict[str, Any] = {}
    for field in MECHANICS_FIELDS:
        val = get(field)
        # Unwrap NumPy scalars to plain Python values: json.dumps cannot
        # serialise e.g. np.int64 / np.bool_, and the NaN check below only
        # recognises Python floats.
        if isinstance(val, np.generic):
            val = val.item()
        if val is None or (isinstance(val, float) and np.isnan(val)):
            continue
        if field == "keywords" and isinstance(val, str):
            # Keywords may arrive JSON-encoded; a string that is not valid
            # JSON is kept as a single keyword.
            try:
                val = json.loads(val) if val else []
            except json.JSONDecodeError:
                val = [val]
        if field in ("description", "description_upgraded") and isinstance(val, str):
            val = normalize_card_name_in_text(name, val)
        if val == "" or val == []:
            continue
        doc[field] = val
    return json.dumps(doc, indent=2, ensure_ascii=False)