Spaces:
Sleeping
Sleeping
| """TDD for spaceutil.data.load_corpus.""" | |
| from __future__ import annotations | |
| import logging | |
| import numpy as np | |
| from optcg_cards.provenance import EmbedProvenance | |
def test_load_corpus_returns_expected_shape(patched_hf_download):
    """Check the types and sizes of the four values load_corpus returns.

    With the ``patched_hf_download`` fixture active (defined outside this
    file), the corpus is expected to hold 200 cards with 1024-dimensional
    float32 embeddings.
    """
    from spaceutil.data import load_corpus

    n_cards, dim = 200, 1024
    cards, matrix, embed_prov, id_to_idx = load_corpus(token="fake-token")

    # Card records come back as a plain list.
    assert isinstance(cards, list)
    assert len(cards) == n_cards

    # One embedding row per card.
    assert isinstance(matrix, np.ndarray)
    assert matrix.shape == (n_cards, dim)
    assert matrix.dtype == np.float32

    assert isinstance(embed_prov, EmbedProvenance)

    # One index entry per card.
    assert isinstance(id_to_idx, dict)
    assert len(id_to_idx) == n_cards
def test_embedding_key_dropped_from_cards(patched_hf_download):
    """No returned card dict may still contain an ``"embedding"`` key."""
    from spaceutil.data import load_corpus

    corpus = load_corpus(token="fake-token")
    cards, _matrix, _prov, _index = corpus
    for record in cards:
        assert "embedding" not in record, "embedding column must be stripped after stacking"
def test_list_columns_coerced_to_python_lists(patched_hf_download):
    """``colors`` and (when present) ``family`` must be Python lists.

    ``colors`` is checked on every card; ``family`` may be ``None`` and is
    only type-checked when it has a value.
    """
    from spaceutil.data import load_corpus

    cards, _matrix, _prov, _index = load_corpus(token="fake-token")
    for record in cards:
        colors = record["colors"]
        assert isinstance(colors, list), "colors must be list, not ndarray"
        assert not isinstance(colors, np.ndarray)

        family = record["family"]
        if family is None:
            # Nothing to type-check for cards without a family.
            continue
        assert isinstance(family, list)
        assert not isinstance(family, np.ndarray)
def test_id_to_idx_consistency(patched_hf_download):
    """``id_to_idx`` must map each card id to a position holding that id.

    The same position is also used to index the matrix, whose rows must be
    1024-long vectors.
    """
    from spaceutil.data import load_corpus

    cards, matrix, _prov, id_to_idx = load_corpus(token="fake-token")
    for record in cards:
        card_id = record["id"]
        row = id_to_idx[card_id]
        assert cards[row]["id"] == card_id
        assert matrix[row].shape == (1024,)
def test_provenance_recovered(patched_hf_download):
    """The provenance object must report the expected model and instruction.

    Checks the model id, the embedding dimension, and that the task
    instruction contains both ``Instruct`` and the ``{card_document}``
    placeholder.
    """
    from spaceutil.data import load_corpus

    _cards, _matrix, embed_prov, _index = load_corpus(token="fake-token")

    assert embed_prov.model_id == "Qwen/Qwen3-Embedding-0.6B"
    assert embed_prov.embedding_dim == 1024
    for needle in ("Instruct", "{card_document}"):
        assert needle in embed_prov.task_instruction
def test_no_image_url_columns_exposed(patched_hf_download):
    """CLAUDE.md hard rule: no image/url/art columns.

    Every key of every card is lowercased and rejected if it contains any
    of the forbidden substrings.
    """
    from spaceutil.data import load_corpus

    # NOTE(review): a bare "url" key would pass this list; only "art_url"
    # is rejected. Confirm that matches the rule's intent.
    forbidden_substrings = ("image", "art_url", "thumbnail", "img_")

    cards, _matrix, _prov, _index = load_corpus(token="fake-token")
    for record in cards:
        for key in record:
            lowered = key.lower()
            hit = any(sub in lowered for sub in forbidden_substrings)
            assert not hit, f"forbidden column {key!r}"
def test_token_never_logged(patched_hf_download, caplog):
    """HF_TOKEN must not appear in captured logs.

    Runs load_corpus with a recognisable secret while capturing everything
    from DEBUG upward, then checks that the secret is absent from:

    * each record's rendered message,
    * each record's raw ``args``,
    * the fully formatted capture (``caplog.text``), which also contains
      rendered tracebacks and stack info that the per-record checks miss.
    """
    from spaceutil.data import load_corpus

    secret = "hf_super_secret_token_12345"
    with caplog.at_level(logging.DEBUG):
        load_corpus(token=secret)

    for record in caplog.records:
        assert secret not in record.getMessage()
        assert secret not in str(record.args or "")

    # A token embedded in a logged exception would only show up in the
    # formatted output, not in getMessage()/args.
    assert secret not in caplog.text
def test_matrix_is_l2_normalized(patched_hf_download):
    """Synthetic vectors are pre-normalized; load_corpus must preserve that.

    Every row of the returned matrix must have Euclidean norm 1 within an
    absolute tolerance of 1e-5.
    """
    from spaceutil.data import load_corpus

    _cards, matrix, _prov, _index = load_corpus(token="fake-token")
    row_norms = np.linalg.norm(matrix, axis=1)
    np.testing.assert_allclose(row_norms, 1.0, atol=1e-5)
def test_load_corpus_accepts_none_token(patched_hf_download):
    """After the HF repo is flipped public, token becomes optional.

    Passing ``token=None`` must still yield the full 200-card corpus and
    its (200, 1024) matrix.
    """
    from spaceutil.data import load_corpus

    n_cards, dim = 200, 1024
    cards, matrix, _prov, _index = load_corpus(token=None)
    assert len(cards) == n_cards
    assert matrix.shape == (n_cards, dim)