optcg-deck-builder / tests /test_data.py
t22000t's picture
Initial commit: optcg-deck-builder Gradio Space
16eaadc
"""TDD for spaceutil.data.load_corpus."""
from __future__ import annotations
import logging
import numpy as np
from optcg_cards.provenance import EmbedProvenance
def test_load_corpus_returns_expected_shape(patched_hf_download):
    """Check the types and sizes of the four values load_corpus returns."""
    from spaceutil.data import load_corpus

    expected_rows = 200
    expected_dim = 1024

    card_rows, vectors, provenance, index_by_id = load_corpus(token="fake-token")

    # Card records: a plain list with one entry per corpus row.
    assert isinstance(card_rows, list)
    assert len(card_rows) == expected_rows

    # Embedding matrix: float32 ndarray, one row per card.
    assert isinstance(vectors, np.ndarray)
    assert vectors.shape == (expected_rows, expected_dim)
    assert vectors.dtype == np.float32

    # Provenance object and the id -> row-index lookup.
    assert isinstance(provenance, EmbedProvenance)
    assert isinstance(index_by_id, dict)
    assert len(index_by_id) == expected_rows
def test_embedding_key_dropped_from_cards(patched_hf_download):
    """No returned card may still contain an ``embedding`` key."""
    from spaceutil.data import load_corpus

    # Unpack all four values so a change in return arity still fails here.
    card_rows, _vectors, _provenance, _index = load_corpus(token="fake-token")
    for row in card_rows:
        still_has_embedding = "embedding" in row
        assert not still_has_embedding, "embedding column must be stripped after stacking"
def test_list_columns_coerced_to_python_lists(patched_hf_download):
    """``colors`` and any non-None ``family`` must be lists, not ndarrays."""
    from spaceutil.data import load_corpus

    card_rows, _vectors, _provenance, _index = load_corpus(token="fake-token")
    for row in card_rows:
        colors = row["colors"]
        assert isinstance(colors, list), "colors must be list, not ndarray"
        assert not isinstance(colors, np.ndarray)

        family = row["family"]
        if family is None:
            # A None family is allowed; only non-None values are type-checked.
            continue
        assert isinstance(family, list)
        assert not isinstance(family, np.ndarray)
def test_id_to_idx_consistency(patched_hf_download):
    """Each card id maps to an index that points back at a card with that id."""
    from spaceutil.data import load_corpus

    card_rows, vectors, _provenance, index_by_id = load_corpus(token="fake-token")
    for row in card_rows:
        card_id = row["id"]
        position = index_by_id[card_id]
        # Round-trip: the looked-up position must hold a card with the same id.
        assert card_rows[position]["id"] == card_id
        # The same position must address a 1024-wide row of the matrix.
        assert vectors[position].shape == (1024,)
def test_provenance_recovered(patched_hf_download):
    """The returned provenance carries the expected model id, dim and instruction."""
    from spaceutil.data import load_corpus

    _cards, _vectors, provenance, _index = load_corpus(token="fake-token")

    assert provenance.model_id == "Qwen/Qwen3-Embedding-0.6B"
    assert provenance.embedding_dim == 1024

    # The instruction template must contain both of these fragments.
    for fragment in ("Instruct", "{card_document}"):
        assert fragment in provenance.task_instruction
def test_no_image_url_columns_exposed(patched_hf_download):
    """CLAUDE.md hard rule: no image/url/art columns.

    Every card key is lower-cased and checked against a fixed tuple of
    forbidden substrings.
    """
    from spaceutil.data import load_corpus

    banned_fragments = ("image", "art_url", "thumbnail", "img_")

    card_rows, _vectors, _provenance, _index = load_corpus(token="fake-token")
    for row in card_rows:
        for key in row:
            # Lower-case once per key so the match is case-insensitive.
            lowered = key.lower()
            for fragment in banned_fragments:
                assert fragment not in lowered, f"forbidden column {key!r}"
def test_token_never_logged(patched_hf_download, caplog):
    """HF_TOKEN must not appear in captured logs.

    Runs load_corpus with a recognisable secret while capturing at DEBUG
    level, then checks three places the secret could surface: each record's
    rendered message, each record's raw args, and the fully formatted
    capture text.
    """
    from spaceutil.data import load_corpus

    secret = "hf_super_secret_token_12345"
    with caplog.at_level(logging.DEBUG):
        load_corpus(token=secret)

    for record in caplog.records:
        assert secret not in record.getMessage()
        # args may be None/empty; fall back to "" so str() is always safe.
        assert secret not in str(record.args or "")

    # The formatted text also contains exception tracebacks attached via
    # exc_info, which getMessage() and args do not cover.
    assert secret not in caplog.text
def test_matrix_is_l2_normalized(patched_hf_download):
    """Synthetic vectors are pre-normalized; load_corpus must preserve that.

    Each row of the returned matrix must have Euclidean norm 1 within an
    absolute tolerance of 1e-5.
    """
    from spaceutil.data import load_corpus

    _cards, vectors, _provenance, _index = load_corpus(token="fake-token")
    row_norms = np.linalg.norm(vectors, axis=1)
    np.testing.assert_allclose(actual=row_norms, desired=1.0, atol=1e-5)
def test_load_corpus_accepts_none_token(patched_hf_download):
    """After the HF repo is flipped public, token becomes optional.

    Calling with ``token=None`` must still return the full 200-card corpus
    and its (200, 1024) matrix.
    """
    from spaceutil.data import load_corpus

    # Provenance and index are unpacked only to keep the 4-tuple arity check.
    card_rows, vectors, _provenance, _index = load_corpus(token=None)

    assert len(card_rows) == 200
    assert vectors.shape == (200, 1024)