RetailMind / modules /shared.py
hodfa840's picture
perf: reduce per-query latency — shared model, single encode, float32 LLM
d624b44
raw
history blame contribute delete
556 Bytes
"""Shared SentenceTransformer singleton — loaded once, used everywhere."""
from __future__ import annotations
import logging
from sentence_transformers import SentenceTransformer
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
# Cached SentenceTransformer instance; stays None until get_embedding_model()
# constructs the model on its first call.
_model: SentenceTransformer | None = None
def get_embedding_model() -> SentenceTransformer:
    """Return the module-wide SentenceTransformer, creating it on first use.

    The first call constructs ``SentenceTransformer("all-MiniLM-L6-v2")``,
    stores it in the module-level ``_model`` variable, and logs an info
    message before and after construction. Every later call returns that
    same stored instance without constructing anything.

    Returns:
        The cached ``SentenceTransformer`` instance.
    """
    global _model

    # Fast path: the model was already built by an earlier call.
    if _model is not None:
        return _model

    # Slow path: build the model once and remember it for subsequent calls.
    logger.info("Loading SentenceTransformer (all-MiniLM-L6-v2)…")
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    logger.info("SentenceTransformer ready.")
    return _model