"""LLM abstraction layer — Ollama, GGUF, LM Studio and HuggingFace backends.

Every backend implements :class:`LLMProvider` and exposes a single async
``chat`` coroutine. Use :func:`get_llm_provider` to build the backend
selected by ``cfg.LLM_BACKEND``.
"""

from __future__ import annotations

import asyncio
import logging
import re
from typing import List

from app.config import cfg

logger = logging.getLogger("qmodel.llm")

# Marker appended to system prompts to switch off Qwen3 thinking mode.
_NO_THINK_TAG = "/no_think"

# A complete <think>...</think> block, and an unterminated one that runs to
# the end of the text (e.g. when generation stopped mid-thought).
_THINK_BLOCK_RE = re.compile(r"<think>[\s\S]*?</think>", re.IGNORECASE)
_UNCLOSED_THINK_RE = re.compile(r"<think>[\s\S]*$", re.IGNORECASE)


def _with_no_think(messages: List[dict]) -> List[dict]:
    """Return *messages* with ``/no_think`` appended to each system prompt.

    System messages that already contain the marker, and all non-system
    messages, are passed through as the same dict objects. A patched system
    message is a new dict holding only ``role`` and ``content``; the input
    list and its dicts are never mutated.
    """
    return [
        {"role": "system", "content": msg["content"] + "\n" + _NO_THINK_TAG}
        if msg["role"] == "system" and _NO_THINK_TAG not in msg["content"]
        else msg
        for msg in messages
    ]


def _strip_think_blocks(text: str) -> str:
    """Remove ``<think>`` blocks (closed or unterminated) and trim whitespace."""
    text = _THINK_BLOCK_RE.sub("", text)
    text = _UNCLOSED_THINK_RE.sub("", text)
    return text.strip()


class LLMProvider:
    """Abstract base for LLM providers."""

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Send *messages* to the model and return the reply text.

        Args:
            messages: Chat history as dicts with ``role`` and ``content`` keys.
            temperature: Sampling temperature forwarded to the backend.
            max_tokens: Upper bound on generated tokens, forwarded to the backend.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError


class OllamaProvider(LLMProvider):
    """Ollama-based LLM provider."""

    def __init__(self, host: str, model: str):
        """Create an ``ollama.Client`` bound to *host* for *model*.

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        self.host = host
        self.model = model
        try:
            import ollama
        except ImportError as exc:
            raise ImportError("Install ollama: pip install ollama") from exc
        self.client = ollama.Client(host=host)

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Run a chat completion through Ollama and return the cleaned reply.

        The blocking client call runs in the default executor. Any
        ``<think>`` block found in the reply is removed before returning.

        Raises:
            Exception: Any backend error is logged and re-raised unchanged.
        """
        # Qwen3 models return empty with think=False. Use think=True with
        # /no_think in the system prompt so the model responds immediately
        # without actually producing a <think> block.
        patched = _with_no_think(messages)
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.client.chat(
                    model=self.model,
                    messages=patched,
                    options={"temperature": temperature, "num_predict": max_tokens},
                    think=True,
                ),
            )
            content = (result["message"]["content"] or "").strip()
            # Strip any <think> blocks that slip through
            return _strip_think_blocks(content)
        except Exception as exc:
            logger.error("Ollama chat failed: %s", exc)
            raise


class GGUFProvider(LLMProvider):
    """llama-cpp-python GGUF provider — runs GGUF models directly in-process."""

    def __init__(self, model_path: str, n_ctx: int = 4096, n_gpu_layers: int = -1):
        """Load the GGUF model at *model_path*.

        ``n_ctx`` and ``n_gpu_layers`` are passed straight to
        ``llama_cpp.Llama``.

        Raises:
            ImportError: If ``llama-cpp-python`` is not installed.
        """
        try:
            from llama_cpp import Llama
        except ImportError as exc:
            raise ImportError(
                "Install llama-cpp-python: pip install llama-cpp-python"
            ) from exc
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Run a chat completion in-process and return the raw reply text.

        The blocking ``create_chat_completion`` call runs in the default
        executor. Unlike the Ollama backend, ``<think>`` blocks are NOT
        stripped here.

        Raises:
            Exception: Any backend error is logged and re-raised unchanged.
        """
        # Disable Qwen3 thinking mode by appending /no_think to the system message
        patched = _with_no_think(messages)
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.llm.create_chat_completion(
                    messages=patched,
                    temperature=temperature,
                    max_tokens=max_tokens,
                ),
            )
            content = result["choices"][0]["message"]["content"] or ""
            logger.debug("GGUF raw response (%d chars): %.500s", len(content), content)
            # Return raw content — callers handle <think> stripping so they
            # can still extract structured data from inside think blocks.
            return content.strip()
        except Exception as exc:
            logger.error("GGUF chat failed: %s", exc)
            raise


class LMStudioProvider(LLMProvider):
    """LM Studio provider — connects to LM Studio's OpenAI-compatible local API."""

    def __init__(self, base_url: str, model: str):
        """Store the API base URL (trailing slashes removed) and model name."""
        self.base_url = base_url.rstrip("/")
        self.model = model

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """POST *messages* to ``/v1/chat/completions`` and return the reply text.

        A ``null`` content field in the response is treated as an empty reply.

        Raises:
            Exception: HTTP, transport and parsing errors are logged and
                re-raised unchanged.
        """
        # Imported lazily so httpx is only required when this backend is used.
        import httpx

        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    f"{self.base_url}/v1/chat/completions", json=payload
                )
                resp.raise_for_status()
                data = resp.json()
                return (data["choices"][0]["message"]["content"] or "").strip()
        except Exception as exc:
            logger.error("LM Studio chat failed: %s", exc)
            raise


class HuggingFaceProvider(LLMProvider):
    """Hugging Face transformers-based LLM provider."""

    def __init__(self, model_name: str, device: str):
        """Load tokenizer and model for *model_name* and build a generation pipeline.

        Raises:
            ImportError: If ``transformers`` cannot be imported.
        """
        self.model_name = model_name
        self.device = device
        try:
            from transformers import (
                AutoModelForCausalLM,
                AutoTokenizer,
                TextGenerationPipeline,
            )
        except ImportError as exc:
            raise ImportError(
                "Install transformers: pip install transformers torch"
            ) from exc
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=device,
            torch_dtype="auto",
        )
        self.pipeline = TextGenerationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if device != "cpu" else None,
        )

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Generate a reply with the local pipeline and return only the new text.

        Sampling is enabled only when *temperature* is greater than zero.
        The blocking pipeline call runs in the default executor.

        Raises:
            Exception: Any pipeline error is logged and re-raised unchanged.
        """
        prompt = self._format_messages(messages)
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.pipeline(
                    prompt,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    do_sample=temperature > 0,
                ),
            )
            generated = result[0]["generated_text"]
            # Drop the leading prompt-length prefix; this assumes the pipeline
            # echoes the prompt at the start of generated_text.
            return generated[len(prompt):].strip()
        except Exception as exc:
            logger.error("HF chat failed: %s", exc)
            raise

    def _format_messages(self, messages: List[dict]) -> str:
        """Flatten chat *messages* into a plain-text prompt.

        System content is emitted bare followed by a blank line; user and
        assistant turns are prefixed with ``User: `` / ``Assistant: ``.
        Messages with any other role are skipped. The prompt always ends
        with ``Assistant: `` to cue the model's reply.
        """
        parts = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                parts.append(f"{content}\n\n")
            elif role == "user":
                parts.append(f"User: {content}\n")
            elif role == "assistant":
                parts.append(f"Assistant: {content}\n")
        parts.append("Assistant: ")
        return "".join(parts)


def get_llm_provider() -> LLMProvider:
    """Factory function to get the configured LLM provider.

    The backend is chosen by ``cfg.LLM_BACKEND`` (case-insensitive): one of
    ``ollama``, ``hf``, ``gguf`` or ``lmstudio``.

    Raises:
        ValueError: If ``cfg.LLM_BACKEND`` names an unknown backend.
    """
    backend = cfg.LLM_BACKEND.lower()

    if backend == "ollama":
        logger.info("Using Ollama backend: %s @ %s", cfg.OLLAMA_MODEL, cfg.OLLAMA_HOST)
        return OllamaProvider(cfg.OLLAMA_HOST, cfg.OLLAMA_MODEL)
    elif backend == "hf":
        logger.info(
            "Using HuggingFace backend: %s on %s", cfg.HF_MODEL_NAME, cfg.HF_DEVICE
        )
        return HuggingFaceProvider(cfg.HF_MODEL_NAME, cfg.HF_DEVICE)
    elif backend == "gguf":
        logger.info(
            "Using GGUF backend: %s (ctx=%d, gpu_layers=%d)",
            cfg.GGUF_MODEL_PATH,
            cfg.GGUF_N_CTX,
            cfg.GGUF_N_GPU_LAYERS,
        )
        return GGUFProvider(
            cfg.GGUF_MODEL_PATH, cfg.GGUF_N_CTX, cfg.GGUF_N_GPU_LAYERS
        )
    elif backend == "lmstudio":
        logger.info(
            "Using LM Studio backend: %s @ %s", cfg.LMSTUDIO_MODEL, cfg.LMSTUDIO_URL
        )
        return LMStudioProvider(cfg.LMSTUDIO_URL, cfg.LMSTUDIO_MODEL)
    else:
        raise ValueError(f"Unknown LLM_BACKEND: {cfg.LLM_BACKEND!r}")