optcg-explorer / tests /conftest.py
t22000t's picture
Initial commit: optcg-explorer Gradio Space
3ab07bd
"""Test fixtures for optcg-explorer-space.
A 20-row synthetic corpus at 1024-dim mimics the shape of the published
`cards_with_embeddings.parquet`. Embeddings are random unit vectors with
a fixed seed so neighbour rankings are deterministic.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import pytest
from optcg_cards.provenance import (
EmbedProvenance,
FetchProvenance,
write_provenance,
)
# Width of every synthetic embedding vector and number of rows in the corpus.
EMBEDDING_DIM = 1024
N_CARDS = 20

# Value pools that the card fixture cycles through by card index (modulo the
# pool length). The colour pool holds six single-colour entries followed by
# two dual-colour entries.
_COLORS_POOL = [
    *([color] for color in ("Red", "Green", "Blue", "Purple", "Black", "Yellow")),
    ["Red", "Green"],
    ["Blue", "Yellow"],
]
_CARD_TYPES = ["Character", "Event", "Stage", "Leader"]
_RARITIES = ["C", "UC", "R", "SR", "L"]
def _unit_vector(rng: np.random.Generator, dim: int) -> list[float]:
    """Draw a standard-normal float32 vector of length ``dim`` and L2-normalize it.

    The sample is taken from ``rng``, so the result is reproducible for a
    given generator state. The vector is returned as a plain list of floats.
    """
    sample = rng.standard_normal(dim).astype(np.float32)
    length = np.linalg.norm(sample)
    return (sample / length).tolist()
@pytest.fixture
def synthetic_cards() -> list[dict[str, Any]]:
    """Build ``N_CARDS`` OPTCG-shaped card dicts with unit-norm embeddings.

    Every field is derived from the card index except ``umap_x``, ``umap_y``
    and ``embedding``, which are drawn (in that order, per card) from a
    generator seeded with 42, so each invocation yields identical rows.

    Returns:
        A list of ``N_CARDS`` dicts. Each ``embedding`` is a list of
        ``EMBEDDING_DIM`` floats produced by ``_unit_vector``.
    """
    rng = np.random.default_rng(seed=42)
    cards: list[dict[str, Any]] = []
    for i in range(N_CARDS):
        card_type = _CARD_TYPES[i % len(_CARD_TYPES)]
        cards.append(
            {
                "id": f"OP01-{i:03d}",
                "code": f"OP01-{i:03d}",
                "name": f"Card {i}",
                "card_type": card_type,
                # Copy the pool entry: handing out the module-level list itself
                # would let a test that mutates one card's colours corrupt the
                # pool (and sibling cards) for every later test.
                "colors": list(_COLORS_POOL[i % len(_COLORS_POOL)]),
                "cost": (i % 10),
                "power": 1000 * (1 + i % 9),
                "counter": (i % 3) * 1000 if (i % 3) else None,
                "life": 5 if card_type == "Leader" else None,
                "attribute": "Slash" if i % 2 else "Strike",
                "family": ["Straw Hat Crew"] if i % 2 else ["Animal Kingdom Pirates"],
                "effect_text": (
                    f"Effect for card {i}. Blocker. Draw 1."
                    if i % 4 == 0
                    else f"Effect for card {i}."
                ),
                "trigger_text": "Trigger: Draw 1." if i % 5 == 0 else "",
                "rarity": _RARITIES[i % len(_RARITIES)],
                "pack_id": "OP01",
                "set_code": "OP01",
                "set_name": "Romance Dawn",
                "language": "en",
                # The order of these three draws fixes the random stream; keep it.
                "umap_x": float(rng.uniform(-10, 10)),
                "umap_y": float(rng.uniform(-10, 10)),
                "embedding": _unit_vector(rng, EMBEDDING_DIM),
            }
        )
    return cards
@pytest.fixture
def synthetic_embed_provenance() -> EmbedProvenance:
    """Return a fixed ``EmbedProvenance`` record for the synthetic embeddings.

    All values are constants; ``embedding_dim`` follows ``EMBEDDING_DIM`` and
    no Matryoshka truncation dimension is set.
    """
    instruction = (
        "Instruct: Represent this One Piece Card Game card so that "
        "mechanically similar cards are close in embedding space.\n"
        "Text: {card_document}"
    )
    fields = dict(
        model_id="Qwen/Qwen3-Embedding-0.6B",
        embedding_dim=EMBEDDING_DIM,
        matryoshka_dim=None,
        task_instruction=instruction,
        embedded_at="2026-05-13T00:00:00+00:00",
        sentence_transformers_version="5.4.1",
    )
    return EmbedProvenance(**fields)
@pytest.fixture
def synthetic_fetch_provenance() -> FetchProvenance:
    """Return a fixed ``FetchProvenance`` record for the synthetic corpus.

    The record reports ``N_CARDS`` English cards from the single pack
    ``OP01``; every other value is a constant.
    """
    fields = dict(
        source="vegapull",
        source_url="https://en.onepiece-cardgame.com/cardlist/",
        source_attribution="vegapull scraping en.onepiece-cardgame.com",
        source_fetched_at="2026-05-13T00:00:00+00:00",
        language="en",
        n_cards=N_CARDS,
        pack_ids_included=["OP01"],
        latest_pack_id="OP01",
        vegapull_version="1.2.2",
    )
    return FetchProvenance(**fields)
@pytest.fixture
def synthetic_repo(
    tmp_path: Path,
    synthetic_cards: list[dict[str, Any]],
    synthetic_fetch_provenance: FetchProvenance,
    synthetic_embed_provenance: EmbedProvenance,
) -> dict[str, Path]:
    """Write the synthetic corpus into ``tmp_path`` in the published HF repo layout.

    Layout::

        tmp/
            cards_with_embeddings.parquet
            provenance.json

    Returns:
        A dict with the keys ``"parquet"``, ``"provenance"`` and ``"root"``
        mapping to the two written files and to ``tmp_path`` itself.
    """
    paths = {
        "parquet": tmp_path / "cards_with_embeddings.parquet",
        "provenance": tmp_path / "provenance.json",
        "root": tmp_path,
    }

    # Card rows go to parquet without the DataFrame index column.
    frame = pd.DataFrame(synthetic_cards)
    frame.to_parquet(paths["parquet"], index=False)

    # Both provenance records are written into one JSON file.
    write_provenance(
        paths["provenance"],
        fetch=synthetic_fetch_provenance,
        embed=synthetic_embed_provenance,
    )
    return paths
@pytest.fixture
def patched_hf_download(
    monkeypatch: pytest.MonkeyPatch,
    synthetic_repo: dict[str, Path],
):
    """Redirect ``huggingface_hub.hf_hub_download`` to the local synthetic repo.

    With this fixture active, spaceutil.data.load_corpus reads the files
    materialized by ``synthetic_repo`` instead of going to the network.
    The replacement callable is returned to the requesting test.
    """
    import huggingface_hub

    def fake_download(
        repo_id: str,
        filename: str,
        repo_type: str | None = None,
        token: str | None = None,
        **kwargs: Any,
    ) -> str:
        # The token is accepted but deliberately never logged or asserted on
        # here; the log-capture test in test_data.py checks the no-log
        # invariant.
        served = (
            ("cards_with_embeddings.parquet", "parquet"),
            ("provenance.json", "provenance"),
        )
        for known_name, repo_key in served:
            if filename == known_name:
                return str(synthetic_repo[repo_key])
        raise FileNotFoundError(f"Unexpected filename in synthetic repo: {filename}")

    monkeypatch.setattr(huggingface_hub, "hf_hub_download", fake_download)

    # spaceutil.data may hold its own reference to hf_hub_download; patch that
    # too when the module is importable, and carry on quietly when it is not.
    try:
        import spaceutil.data as data_mod

        monkeypatch.setattr(data_mod, "hf_hub_download", fake_download, raising=False)
    except ImportError:
        pass

    return fake_download