Spaces:
Sleeping
Sleeping
| """Test fixtures for optcg-explorer-space. | |
| A 20-row synthetic corpus at 1024-dim mimics the shape of the published | |
| `cards_with_embeddings.parquet`. Embeddings are random unit vectors with | |
| a fixed seed so neighbour rankings are deterministic. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Any | |
| import numpy as np | |
| import pandas as pd | |
| import pytest | |
| from optcg_cards.provenance import ( | |
| EmbedProvenance, | |
| FetchProvenance, | |
| write_provenance, | |
| ) | |
# Width of every synthetic embedding vector.
EMBEDDING_DIM = 1024
# Number of rows in the synthetic corpus.
N_CARDS = 20

# Colour combinations cycled through by card index: six mono-colour
# entries followed by two dual-colour entries.
_COLORS_POOL = [
    *([color] for color in ("Red", "Green", "Blue", "Purple", "Black", "Yellow")),
    ["Red", "Green"],
    ["Blue", "Yellow"],
]

# Card type labels cycled through by card index.
_CARD_TYPES = [
    "Character",
    "Event",
    "Stage",
    "Leader",
]

# Rarity codes cycled through by card index.
_RARITIES = [
    "C",
    "UC",
    "R",
    "SR",
    "L",
]
def _unit_vector(rng: np.random.Generator, dim: int) -> list[float]:
    """Draw a random direction and return it as a plain list of floats.

    Samples ``dim`` standard-normal values from ``rng``, casts them to
    float32, and scales the vector by its Euclidean norm so the result has
    unit length.
    """
    sample = rng.standard_normal(dim).astype(np.float32)
    # Normalize in place so the array keeps its float32 dtype.
    np.divide(sample, np.linalg.norm(sample), out=sample)
    return sample.tolist()
def synthetic_cards() -> list[dict[str, Any]]:
    """Build the synthetic corpus of OPTCG-shaped card records.

    Returns ``N_CARDS`` dicts, each carrying card metadata derived from the
    card index, two UMAP coordinates drawn uniformly from [-10, 10), and an
    ``EMBEDDING_DIM``-long L2-normalized embedding. The generator is seeded
    with 42, so repeated calls yield identical values.

    Each card receives its own copy of its colour list, so mutating one
    card's ``"colors"`` cannot leak into ``_COLORS_POOL`` or other cards.
    """
    rng = np.random.default_rng(seed=42)

    def build_card(i: int) -> dict[str, Any]:
        """Assemble the record for the card at index ``i``."""
        code = f"OP01-{i:03d}"
        card_type = _CARD_TYPES[i % len(_CARD_TYPES)]
        counter_steps = i % 3
        return {
            "id": code,
            "code": code,
            "name": f"Card {i}",
            "card_type": card_type,
            # Copy so the card does not alias the shared module-level pool.
            "colors": list(_COLORS_POOL[i % len(_COLORS_POOL)]),
            "cost": (i % 10),
            "power": 1000 * (1 + i % 9),
            "counter": counter_steps * 1000 if counter_steps else None,
            "life": 5 if card_type == "Leader" else None,
            "attribute": "Slash" if i % 2 else "Strike",
            "family": ["Straw Hat Crew"] if i % 2 else ["Animal Kingdom Pirates"],
            "effect_text": (
                f"Effect for card {i}. Blocker. Draw 1."
                if i % 4 == 0
                else f"Effect for card {i}."
            ),
            "trigger_text": "Trigger: Draw 1." if i % 5 == 0 else "",
            "rarity": _RARITIES[i % len(_RARITIES)],
            "pack_id": "OP01",
            "set_code": "OP01",
            "set_name": "Romance Dawn",
            "language": "en",
            # The draw order (x, y, embedding) determines the generated
            # values; keep these three entries in this order.
            "umap_x": float(rng.uniform(-10, 10)),
            "umap_y": float(rng.uniform(-10, 10)),
            "embedding": _unit_vector(rng, EMBEDDING_DIM),
        }

    return [build_card(i) for i in range(N_CARDS)]
def synthetic_embed_provenance() -> EmbedProvenance:
    """Return a fixed ``EmbedProvenance`` record for the synthetic corpus.

    All fields are hard-coded constants; ``embedding_dim`` follows
    ``EMBEDDING_DIM``.
    """
    # Plain string (not an f-string): "{card_document}" is kept verbatim.
    instruction = (
        "Instruct: Represent this One Piece Card Game card so that "
        "mechanically similar cards are close in embedding space.\n"
        "Text: {card_document}"
    )
    provenance = EmbedProvenance(
        model_id="Qwen/Qwen3-Embedding-0.6B",
        embedding_dim=EMBEDDING_DIM,
        matryoshka_dim=None,
        task_instruction=instruction,
        embedded_at="2026-05-13T00:00:00+00:00",
        sentence_transformers_version="5.4.1",
    )
    return provenance
def synthetic_fetch_provenance() -> FetchProvenance:
    """Return a fixed ``FetchProvenance`` record for the synthetic corpus.

    All fields are hard-coded constants; ``n_cards`` follows ``N_CARDS``.
    """
    # The synthetic corpus contains a single pack, which is therefore also
    # the latest one.
    only_pack = "OP01"
    fetched_at = "2026-05-13T00:00:00+00:00"
    provenance = FetchProvenance(
        source="vegapull",
        source_url="https://en.onepiece-cardgame.com/cardlist/",
        source_attribution="vegapull scraping en.onepiece-cardgame.com",
        source_fetched_at=fetched_at,
        language="en",
        n_cards=N_CARDS,
        pack_ids_included=[only_pack],
        latest_pack_id=only_pack,
        vegapull_version="1.2.2",
    )
    return provenance
def synthetic_repo(
    tmp_path: Path,
    synthetic_cards: list[dict[str, Any]],
    synthetic_fetch_provenance: FetchProvenance,
    synthetic_embed_provenance: EmbedProvenance,
) -> dict[str, Path]:
    """Write the synthetic corpus to disk in the published HF repo layout.

    Files created under ``tmp_path``::

        cards_with_embeddings.parquet
        provenance.json

    Returns a mapping with the keys ``"parquet"``, ``"provenance"`` and
    ``"root"`` pointing at the two files and at ``tmp_path`` itself.

    NOTE(review): the parameter names match sibling function names, which
    suggests these are injected as pytest fixtures - confirm.
    """
    layout = {
        "parquet": tmp_path / "cards_with_embeddings.parquet",
        "provenance": tmp_path / "provenance.json",
        "root": tmp_path,
    }

    # Card rows become a DataFrame and are stored without the index column.
    frame = pd.DataFrame(synthetic_cards)
    frame.to_parquet(layout["parquet"], index=False)

    write_provenance(
        layout["provenance"],
        fetch=synthetic_fetch_provenance,
        embed=synthetic_embed_provenance,
    )
    return layout
def patched_hf_download(
    monkeypatch: pytest.MonkeyPatch,
    synthetic_repo: dict[str, Path],
):
    """Redirect ``huggingface_hub.hf_hub_download`` to the local synthetic repo.

    Installs a stand-in that resolves the two known filenames to paths from
    ``synthetic_repo`` so that spaceutil.data.load_corpus reads local files
    instead of the network. Returns the stand-in function.
    """
    import huggingface_hub

    # Requested filename -> key in the synthetic_repo mapping.
    repo_keys = {
        "cards_with_embeddings.parquet": "parquet",
        "provenance.json": "provenance",
    }

    def fake_download(
        repo_id: str,
        filename: str,
        repo_type: str | None = None,
        token: str | None = None,
        **kwargs: Any,
    ) -> str:
        # Token must never be logged; assertion-free here, the log-capture
        # test in test_data.py verifies the no-log invariant.
        repo_key = repo_keys.get(filename)
        if repo_key is None:
            raise FileNotFoundError(f"Unexpected filename in synthetic repo: {filename}")
        return str(synthetic_repo[repo_key])

    monkeypatch.setattr(huggingface_hub, "hf_hub_download", fake_download)

    # Also patch the symbol re-exported into spaceutil.data once it exists.
    # Best effort: if the module cannot be imported there is nothing to patch.
    try:
        import spaceutil.data as data_mod

        monkeypatch.setattr(data_mod, "hf_hub_download", fake_download, raising=False)
    except ImportError:
        pass

    return fake_download