Spaces:
Sleeping
Sleeping
File size: 3,626 Bytes
"""TDD for spaceutil.data.load_corpus."""
from __future__ import annotations
import logging
import numpy as np
from optcg_cards.provenance import EmbedProvenance
def test_load_corpus_returns_expected_shape(patched_hf_download):
    """Check the types and sizes of the four values load_corpus returns.

    Pins a list of 200 cards, a (200, 1024) float32 matrix, an
    ``EmbedProvenance`` instance, and a dict index with 200 entries.
    """
    # Imported lazily; presumably so the fixture is active first - TODO confirm.
    from spaceutil.data import load_corpus

    n_cards, dim = 200, 1024
    cards, matrix, embed_prov, id_to_idx = load_corpus(token="fake-token")

    # Card records.
    assert isinstance(cards, list)
    assert len(cards) == n_cards

    # Embedding matrix.
    assert isinstance(matrix, np.ndarray)
    assert matrix.shape == (n_cards, dim)
    assert matrix.dtype == np.float32

    # Provenance object and id -> index lookup.
    assert isinstance(embed_prov, EmbedProvenance)
    assert isinstance(id_to_idx, dict)
    assert len(id_to_idx) == n_cards
def test_embedding_key_dropped_from_cards(patched_hf_download):
    """No returned card may still contain an ``embedding`` key."""
    from spaceutil.data import load_corpus

    records, _, _, _ = load_corpus(token="fake-token")

    # Collect every offending record so a failure shows all of them at once.
    leftovers = [rec for rec in records if "embedding" in rec]
    assert not leftovers, "embedding column must be stripped after stacking"
def test_list_columns_coerced_to_python_lists(patched_hf_download):
    """``colors`` and any non-null ``family`` must be plain lists, not ndarrays."""
    from spaceutil.data import load_corpus

    records, _, _, _ = load_corpus(token="fake-token")
    for rec in records:
        colors = rec["colors"]
        assert isinstance(colors, list), "colors must be list, not ndarray"
        assert not isinstance(colors, np.ndarray)

        family = rec["family"]
        if family is None:
            # A null family is allowed; only non-null values are type-checked.
            continue
        assert isinstance(family, list)
        assert not isinstance(family, np.ndarray)
def test_id_to_idx_consistency(patched_hf_download):
    """Every card id resolves to an index whose card has that id and a 1024-long row."""
    from spaceutil.data import load_corpus

    cards, matrix, _, id_to_idx = load_corpus(token="fake-token")
    for card_id in (card["id"] for card in cards):
        row = id_to_idx[card_id]
        # The index must point back at a card carrying the same id ...
        assert cards[row]["id"] == card_id
        # ... and at a matrix row of the expected length.
        assert matrix[row].shape == (1024,)
def test_provenance_recovered(patched_hf_download):
    """The returned provenance exposes the expected model id, dimension and instruction."""
    from spaceutil.data import load_corpus

    _, _, prov, _ = load_corpus(token="fake-token")

    assert prov.model_id == "Qwen/Qwen3-Embedding-0.6B"
    assert prov.embedding_dim == 1024

    # The instruction template must contain both of these fragments.
    instruction = prov.task_instruction
    for fragment in ("Instruct", "{card_document}"):
        assert fragment in instruction
def test_no_image_url_columns_exposed(patched_hf_download):
    """CLAUDE.md hard rule: no image/url/art columns.

    Fails if any card key contains, case-insensitively, one of the
    forbidden substrings listed below.
    """
    from spaceutil.data import load_corpus

    records, _, _, _ = load_corpus(token="fake-token")

    # NOTE(review): the docstring mentions url columns, but there is no bare
    # "url" entry here - confirm whether that is intentional.
    banned = ("image", "art_url", "thumbnail", "img_")
    for rec in records:
        for key in rec:
            lowered = key.lower()
            assert not any(piece in lowered for piece in banned), f"forbidden column {key!r}"
def test_token_never_logged(patched_hf_download, caplog):
    """HF_TOKEN must not appear in captured logs.

    Calls ``load_corpus`` with a recognisable fake secret while capturing
    at DEBUG level, then checks that the secret is absent from each
    record's rendered message, from its raw ``args``, and from the fully
    formatted captured text.
    """
    from spaceutil.data import load_corpus

    secret = "hf_super_secret_token_12345"
    with caplog.at_level(logging.DEBUG):
        load_corpus(token=secret)

    for record in caplog.records:
        assert secret not in record.getMessage()
        # args are checked separately in case the message never interpolates them.
        assert secret not in str(record.args or "")

    # getMessage() omits exception and stack info, so also scan the formatted
    # output, where a token embedded in a logged traceback would show up.
    assert secret not in caplog.text
def test_matrix_is_l2_normalized(patched_hf_download):
    """Synthetic vectors are pre-normalized; load_corpus must preserve that.

    Each row's L2 norm must equal 1.0 within an absolute tolerance of 1e-5.
    """
    from spaceutil.data import load_corpus

    _, embeddings, _, _ = load_corpus(token="fake-token")
    row_norms = np.linalg.norm(embeddings, axis=1)
    np.testing.assert_allclose(row_norms, 1.0, atol=1e-5)
def test_load_corpus_accepts_none_token(patched_hf_download):
    """After the HF repo is flipped public, token becomes optional.

    Passing ``token=None`` must still give 200 cards and a (200, 1024) matrix.
    """
    from spaceutil.data import load_corpus

    # Provenance and the id index are not inspected here.
    cards, matrix, _, _ = load_corpus(token=None)

    expected_rows = 200
    assert len(cards) == expected_rows
    assert matrix.shape == (expected_rows, 1024)
|