| """Cached Qwen3-Embedding encoder for free-text queries. | |
| The upstream `optcg_cards.embed.encode_query` (embed.py:172-202) loads | |
| the model fresh on every call. That's fine for the CLI but unusable | |
| inside a Gradio Space (~2-5 s of model construction per query on CPU). | |
| This module wraps it with a module-level cache: the model is loaded | |
| once on first use, warmed up with a single encode pass, and reused for | |
| all subsequent queries. The task instruction and matryoshka truncation | |
| still come from the `EmbedProvenance`, so the encoded query stays | |
| comparable to the published vectors. | |
| The deviation from CLAUDE.md's "lazy-import heavy deps" rule is | |
| intentional: eager loading at module import lets HF Spaces' "Building" | |
| indicator absorb the ~30-60 s cold-start, sparing the first user. | |
| """ | |
from __future__ import annotations

import logging
from typing import Any

import numpy as np

from optcg_cards.provenance import EmbedProvenance

logger = logging.getLogger(__name__)

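# Cache keyed by SentenceTransformer model id. A Space typically serves a
# single model, so in practice this holds one entry for the process lifetime.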
_model_cache: dict[str, Any] = {}


def get_encoder(embed_prov: EmbedProvenance):
    """Load + cache a SentenceTransformer for the given model id."""
    key = embed_prov.model_id
    cached = _model_cache.get(key)
    if cached is not None:
        return cached

    from sentence_transformers import SentenceTransformer

    logger.info("Loading encoder %s (first use; subsequent calls reuse)", key)
    model = SentenceTransformer(key)
    # Warmup: the first encode pass is always slower (graph build + kernel
    # selection). Pay it now so the first user query doesn't.
    _ = model.encode(["warmup"], normalize_embeddings=True, show_progress_bar=False)
    _model_cache[key] = model
    return model
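
# Note: the cache above is not guarded by a lock. If two requests race on a
# cold start, both may construct the model; the later assignment wins and the
# extra instance is garbage-collected, so the pattern stays correct, just
# briefly wasteful.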


def encode_query_via_optcg(query: str, embed_prov: EmbedProvenance) -> np.ndarray:
    """Encode a free-text query using the cached model.

    Mirrors the logic of optcg_cards.embed.encode_query (embed.py:172-202)
    so query vectors are comparable to the published corpus, but reuses
    the cached model instead of re-instantiating SentenceTransformer.
    """
    model = get_encoder(embed_prov)
    text = embed_prov.task_instruction.format(card_document=query)
    vector = model.encode(
        [text],
        normalize_embeddings=True,
        show_progress_bar=False,
        convert_to_numpy=True,
    )[0]
    if embed_prov.matryoshka_dim is not None:
        vector = vector[: embed_prov.matryoshka_dim]
    # Matryoshka truncation breaks the unit norm; re-normalize so cosine/dot
    # scores stay comparable to the published vectors. (A no-op for the
    # untruncated, already-normalized vector.)
    norm = float(np.linalg.norm(vector))
    if norm > 0:
        vector = vector / norm
    return np.asarray(vector, dtype=np.float32)
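

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the EmbedProvenance constructor
    # arguments below are illustrative assumptions. The real provenance is
    # loaded from the published embedding artifacts, and the task instruction
    # here is a placeholder, not the shipped one; only the field names
    # (model_id, task_instruction, matryoshka_dim) come from the code above.
    demo_prov = EmbedProvenance(
        model_id="Qwen/Qwen3-Embedding-0.6B",
        task_instruction=(
            "Instruct: Retrieve cards relevant to the query.\n"
            "Query: {card_document}"
        ),
        matryoshka_dim=256,
    )
    vec = encode_query_via_optcg("red leader with rush", demo_prov)
    # Expect a unit-norm float32 vector of matryoshka_dim length.
    print(vec.shape, vec.dtype, float(np.linalg.norm(vec)))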