"""Test-first (TDD) specification for ``spaceutil.data.load_corpus``.

Every test requests the ``patched_hf_download`` fixture, which is defined
outside this module (presumably it swaps the real Hugging Face download for
a synthetic corpus -- TODO confirm against conftest).
"""
from __future__ import annotations

import logging

import numpy as np
from optcg_cards.provenance import EmbedProvenance

# Sizes the assertions below expect from the fixture-provided corpus.
_CARD_COUNT = 200
_EMBED_DIM = 1024

# Placeholder credential used by every test that does not care about the token.
_FAKE_TOKEN = "fake-token"


def _invoke_load_corpus(token):
    """Import ``load_corpus`` at call time and call it with ``token=token``.

    The import stays inside the function so that an import failure surfaces
    in the individual test that triggered it, not at collection time.
    """
    from spaceutil.data import load_corpus

    return load_corpus(token=token)


def test_load_corpus_returns_expected_shape(patched_hf_download):
    """The loader yields a 4-tuple with the expected types and sizes."""
    cards, matrix, embed_prov, id_to_idx = _invoke_load_corpus(_FAKE_TOKEN)

    # Card records.
    assert isinstance(cards, list)
    assert len(cards) == _CARD_COUNT

    # Embedding matrix: one float32 row per card.
    assert isinstance(matrix, np.ndarray)
    assert matrix.shape == (_CARD_COUNT, _EMBED_DIM)
    assert matrix.dtype == np.float32

    # Provenance record and id -> row-index mapping.
    assert isinstance(embed_prov, EmbedProvenance)
    assert isinstance(id_to_idx, dict)
    assert len(id_to_idx) == _CARD_COUNT


def test_embedding_key_dropped_from_cards(patched_hf_download):
    """No card record may still carry an ``embedding`` key."""
    cards, _matrix, _prov, _index = _invoke_load_corpus(_FAKE_TOKEN)

    for record in cards:
        assert "embedding" not in record, "embedding column must be stripped after stacking"


def test_list_columns_coerced_to_python_lists(patched_hf_download):
    """``colors`` (and ``family`` when present) are plain lists, not ndarrays."""
    cards, _matrix, _prov, _index = _invoke_load_corpus(_FAKE_TOKEN)

    for record in cards:
        colors = record["colors"]
        assert isinstance(colors, list), "colors must be list, not ndarray"
        assert not isinstance(colors, np.ndarray)

        family = record["family"]
        if family is None:
            # ``family`` is allowed to be missing; nothing further to check.
            continue
        assert isinstance(family, list)
        assert not isinstance(family, np.ndarray)


def test_id_to_idx_consistency(patched_hf_download):
    """``id_to_idx`` maps each card id to a row holding that same card."""
    cards, matrix, _prov, id_to_idx = _invoke_load_corpus(_FAKE_TOKEN)

    for record in cards:
        card_id = record["id"]
        row = id_to_idx[card_id]
        assert cards[row]["id"] == card_id
        assert matrix[row].shape == (_EMBED_DIM,)


def test_provenance_recovered(patched_hf_download):
    """The embedding provenance exposes model id, dimension and instruction."""
    _cards, _matrix, embed_prov, _index = _invoke_load_corpus(_FAKE_TOKEN)

    assert embed_prov.model_id == "Qwen/Qwen3-Embedding-0.6B"
    assert embed_prov.embedding_dim == _EMBED_DIM

    instruction = embed_prov.task_instruction
    assert "Instruct" in instruction
    assert "{card_document}" in instruction


def test_no_image_url_columns_exposed(patched_hf_download):
    """CLAUDE.md hard rule: no image/url/art columns."""
    cards, _matrix, _prov, _index = _invoke_load_corpus(_FAKE_TOKEN)

    forbidden_substrings = ("image", "art_url", "thumbnail", "img_")
    for record in cards:
        for key in record:
            # Compare case-insensitively; lower-case each key only once.
            lowered = key.lower()
            for fragment in forbidden_substrings:
                assert fragment not in lowered, f"forbidden column {key!r}"


def test_token_never_logged(patched_hf_download, caplog):
    """HF_TOKEN must not appear in captured logs."""
    secret = "hf_super_secret_token_12345"

    # Capture everything down to DEBUG while the loader runs.
    with caplog.at_level(logging.DEBUG):
        _invoke_load_corpus(secret)

    for entry in caplog.records:
        # Check both the rendered message and the raw format arguments.
        assert secret not in entry.getMessage()
        assert secret not in str(entry.args or "")


def test_matrix_is_l2_normalized(patched_hf_download):
    """Synthetic vectors are pre-normalized; load_corpus must preserve that."""
    _cards, matrix, _prov, _index = _invoke_load_corpus(_FAKE_TOKEN)

    row_norms = np.linalg.norm(matrix, axis=1)
    np.testing.assert_allclose(row_norms, 1.0, atol=1e-5)


def test_load_corpus_accepts_none_token(patched_hf_download):
    """After the HF repo is flipped public, token becomes optional."""
    cards, matrix, _prov, _index = _invoke_load_corpus(None)

    assert len(cards) == _CARD_COUNT
    assert matrix.shape == (_CARD_COUNT, _EMBED_DIM)