File size: 3,621 Bytes
3ab07bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""TDD for spaceutil.data.load_corpus."""

from __future__ import annotations

import logging

import numpy as np
from optcg_cards.provenance import EmbedProvenance


def test_load_corpus_returns_expected_shape(patched_hf_download):
    """Check the type and size of each of the four values from ``load_corpus``.

    The ``patched_hf_download`` fixture is defined outside this file; the
    sizes asserted here (20 cards, 1024-wide vectors) presumably mirror the
    synthetic data it serves -- confirm against the fixture.
    """
    from spaceutil.data import load_corpus

    expected_rows = 20
    expected_dim = 1024

    cards, matrix, embed_prov, id_to_idx = load_corpus(token="fake-token")

    # Card records come back as a plain Python list.
    assert isinstance(cards, list)
    assert len(cards) == expected_rows

    # Embeddings come back as a single float32 ndarray, one row per card.
    assert isinstance(matrix, np.ndarray)
    assert matrix.shape == (expected_rows, expected_dim)
    assert matrix.dtype == np.float32

    # Provenance object and the id -> index lookup.
    assert isinstance(embed_prov, EmbedProvenance)
    assert isinstance(id_to_idx, dict)
    assert len(id_to_idx) == expected_rows


def test_embedding_key_dropped_from_cards(patched_hf_download):
    """No card record returned by ``load_corpus`` may carry an ``"embedding"`` key."""
    from spaceutil.data import load_corpus

    records, _matrix, _prov, _index = load_corpus(token="fake-token")

    for record in records:
        still_has_embedding = "embedding" in record
        assert not still_has_embedding, "embedding column must be stripped after stacking"


def test_list_columns_coerced_to_python_lists(patched_hf_download):
    """``colors`` and (when present) ``family`` must be Python lists, not ndarrays.

    ``family`` is allowed to be ``None``; in that case it is not type-checked.
    """
    from spaceutil.data import load_corpus

    records, _matrix, _prov, _index = load_corpus(token="fake-token")

    for record in records:
        colors = record["colors"]
        assert isinstance(colors, list), "colors must be list, not ndarray"
        assert not isinstance(colors, np.ndarray)

        family = record["family"]
        if family is None:
            # A missing family is acceptable; nothing further to verify.
            continue
        assert isinstance(family, list)
        assert not isinstance(family, np.ndarray)


def test_id_to_idx_consistency(patched_hf_download):
    """Every card id must resolve, via ``id_to_idx``, to a card with that id.

    The resolved index is also used to fetch a matrix row, which must be a
    1024-element vector.
    """
    from spaceutil.data import load_corpus

    cards, matrix, _prov, id_to_idx = load_corpus(token="fake-token")

    for record in cards:
        card_id = record["id"]
        row = id_to_idx[card_id]
        # The lookup must land on a record bearing the same id ...
        assert cards[row]["id"] == card_id
        # ... and the same index must address a full-width embedding row.
        assert matrix[row].shape == (1024,)


def test_provenance_recovered(patched_hf_download):
    """The provenance returned by ``load_corpus`` must carry the expected fields.

    Checks the model id, the embedding dimension, and that the task
    instruction contains both the ``Instruct`` marker and the
    ``{card_document}`` placeholder.
    """
    from spaceutil.data import load_corpus

    _cards, _matrix, provenance, _index = load_corpus(token="fake-token")

    assert provenance.model_id == "Qwen/Qwen3-Embedding-0.6B"
    assert provenance.embedding_dim == 1024

    instruction = provenance.task_instruction
    for required_fragment in ("Instruct", "{card_document}"):
        assert required_fragment in instruction


def test_no_image_url_columns_exposed(patched_hf_download):
    """CLAUDE.md hard rule: no image/url/art columns.

    Fails if any card key contains, case-insensitively, one of the
    forbidden substrings listed below.
    """
    from spaceutil.data import load_corpus

    records, _matrix, _prov, _index = load_corpus(token="fake-token")

    banned_fragments = ("image", "art_url", "thumbnail", "img_")
    for record in records:
        for column in record:
            # Lowercase once per key so the match ignores case.
            lowered = column.lower()
            for fragment in banned_fragments:
                assert fragment not in lowered, f"forbidden column {column!r}"


def test_token_never_logged(patched_hf_download, caplog):
    """HF_TOKEN must not appear in captured logs.

    ``load_corpus`` is run with a recognisable secret while log capture is
    set to DEBUG. The secret must then be absent from:

    * each record's interpolated message,
    * each record's raw ``args``, and
    * the rendered capture text, which also contains any exception or
      stack text attached to a record (not visible through ``getMessage()``).
    """
    from spaceutil.data import load_corpus

    secret = "hf_super_secret_token_12345"
    with caplog.at_level(logging.DEBUG):
        load_corpus(token=secret)

    for record in caplog.records:
        assert secret not in record.getMessage()
        # ``args`` may be None/empty; normalise to a string before searching.
        assert secret not in str(record.args or "")

    # Per-record checks above miss text carried by exc_info / stack_info
    # (e.g. logger.exception with an error message embedding the token);
    # the formatted output includes it.
    assert secret not in caplog.text


def test_matrix_is_l2_normalized(patched_hf_download):
    """Synthetic vectors are pre-normalized; load_corpus must preserve that.

    Each row of the returned matrix must have Euclidean norm 1 within an
    absolute tolerance of 1e-5.
    """
    from spaceutil.data import load_corpus

    _cards, matrix, _prov, _index = load_corpus(token="fake-token")

    # axis=1 yields one norm per row.
    row_norms = np.linalg.norm(matrix, axis=1)
    np.testing.assert_allclose(actual=row_norms, desired=1.0, atol=1e-5)


def test_load_corpus_accepts_none_token(patched_hf_download):
    """After the HF repo is flipped public, token becomes optional.

    Calling ``load_corpus`` with ``token=None`` must still return the full
    20-card corpus and its (20, 1024) matrix.
    """
    from spaceutil.data import load_corpus

    # Provenance and the id lookup are unpacked but not checked here.
    cards, matrix, _prov, _index = load_corpus(token=None)

    card_count = len(cards)
    assert card_count == 20
    assert matrix.shape == (20, 1024)