# ============================================================================= 
# smollm.py
# SmolLM2 Inference Engine
# SmolLM2 Service Space
# Copyright 2026 - Volkan Kücükbudak
# Apache License V2 + ESOL 1.1
# =============================================================================
import logging
from typing import Optional

import torch

import model as model_module

logger = logging.getLogger("smollm")

# Lazily-initialized globals; populated by load() on first use.
_tokenizer = None
_model = None
_device: Optional[str] = None


def load() -> None:
    """Lazy model loader — called on first request.

    Idempotent: returns immediately once the model is already in memory.

    NOTE(review): not guarded by a lock — two concurrent first requests
    could both trigger a load; confirm this service runs single-worker.
    """
    global _tokenizer, _model, _device
    if _model is not None:
        return

    # Deferred import keeps module import cheap when the model is never used.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = model_module.get_model_id()
    kwargs = model_module.get_model_kwargs()
    _device = "cuda" if torch.cuda.is_available() else "cpu"

    # Lazy %-style args: formatting is skipped when the level is disabled.
    logger.info("Loading %s on %s...", model_id, _device)
    _tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
    _model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs).to(_device)
    logger.info("Model ready [%s]", _device)

    # Update model card on startup
    model_module.push_model_card({
        "model_id": model_id,
        "device": _device,
    })


async def complete(
    prompt: str,
    system_prompt: str = "",
    max_tokens: int = 150,
    temperature: float = 0.2,
) -> str:
    """
    Run SmolLM2 inference.

    Args:
        prompt: User message to complete.
        system_prompt: Optional system message; omitted when blank.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.

    Returns:
        Generated text string.

    Raises:
        RuntimeError on inference failure.

    NOTE(review): generation runs synchronously inside this coroutine and
    blocks the event loop for its full duration; consider
    run_in_executor if concurrent requests must stay responsive.
    """
    load()

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    text = _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Tokenize via __call__ so generate() also receives the attention mask;
    # encode() alone yields only input_ids and triggers an HF warning.
    encoded = _tokenizer(text, return_tensors="pt").to(_device)
    input_ids = encoded["input_ids"]

    sampling = temperature > 0
    with torch.no_grad():
        outputs = _model.generate(
            input_ids,
            attention_mask=encoded.get("attention_mask"),
            max_new_tokens=max_tokens,
            temperature=temperature if sampling else None,
            do_sample=sampling,
            top_p=0.9 if sampling else None,
            pad_token_id=_tokenizer.eos_token_id,
        )

    # Drop the prompt tokens; decode only the newly generated suffix.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    return _tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def is_ready() -> bool:
    """Whether the model has been loaded into memory."""
    return _model is not None


def device_info() -> str:
    """Device the model was loaded on, or 'not loaded' before load()."""
    return _device or "not loaded"