| """ |
| SLM-Native Backends — First-class support for Small Language Models. |
| |
| Purpose Agent is the world's first agentic framework designed natively for SLMs. |
| These backends handle the unique challenges of small models: |
| - Grammar-constrained JSON output (SLMs can't reliably produce JSON from prompts alone) |
| - Prompt compression for small context windows (8K-32K) |
| - Adaptive prompting (shorter system prompts, schema-first format) |
| - Token budget management |
| |
| Supported backends: |
| - OllamaBackend: Local serving via Ollama (CPU/GPU, any GGUF model) |
| - LlamaCppBackend: Direct llama-cpp-python (CPU/Apple Silicon, GGUF) |
| - TransformersBackend: HuggingFace transformers (GPU, native weights) |
| |
| All backends implement the same LLMBackend interface — swap freely. |
| """ |
|
|
from __future__ import annotations

import json
import logging
import os
import re
from typing import Any, AsyncIterator, Iterator

from purpose_agent.llm_backend import ChatMessage, LLMBackend

logger = logging.getLogger(__name__)


class SLMPromptCompressor:
    """
    Compresses prompts for small context windows without losing critical info.

    Strategies (from TinyAgent, arXiv:2409.00608, and LLMLingua-2, arXiv:2403.12968):
    1. Schema-first: move the JSON schema to the top, compress descriptions
    2. History truncation: summarize old steps, keep recent ones verbatim
    3. Example reduction: fewer few-shot examples for SLMs
    4. Whitespace stripping: remove unnecessary formatting

    No external dependencies — pure Python compression.
    For better compression, install llmlingua: pip install llmlingua
    """

    def __init__(self, max_tokens: int = 4096, aggressive: bool = False):
        self.max_tokens = max_tokens
        self.aggressive = aggressive

    def compress(self, text: str, budget: int | None = None) -> str:
        """Compress text to fit within a token budget."""
        budget = budget or self.max_tokens
        # Rough heuristic: ~4 characters per token for English text.
        char_budget = budget * 4

        if len(text) <= char_budget:
            return text

        # Pass 1: cheap whitespace stripping.
        compressed = text
        compressed = re.sub(r'\n{3,}', '\n\n', compressed)
        compressed = re.sub(r'[ \t]{2,}', ' ', compressed)
        compressed = re.sub(r'^\s+', '', compressed, flags=re.MULTILINE)

        if len(compressed) <= char_budget:
            return compressed

        # Pass 2 (aggressive mode): drop markdown emphasis and headers, then
        # shorten boilerplate instruction phrases.
        if self.aggressive:
            compressed = re.sub(r'\*\*([^*]+)\*\*', r'\1', compressed)
            compressed = re.sub(r'#{1,3}\s+', '', compressed)

            replacements = {
                "You MUST respond with": "Respond with",
                "Based on the current state and your goal, ": "",
                "Respond in this exact JSON format:": "JSON format:",
                "Step-by-step justification": "Justification",
                "Specific observable state changes": "State changes",
            }
            for old, new in replacements.items():
                compressed = compressed.replace(old, new)

            if len(compressed) <= char_budget:
                return compressed

        # Last resort: hard truncation. Keep the head (instructions, schema)
        # and the tail (most recent context); drop the middle.
        keep_start = char_budget * 2 // 3
        keep_end = char_budget // 3
        compressed = compressed[:keep_start] + "\n...[truncated]...\n" + compressed[-keep_end:]

        return compressed
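    # Worked example (illustrative numbers): with max_tokens=1000 the
    # character budget is ~4000. A 10,000-char prompt that survives the
    # cheap passes unchanged is truncated to its first ~2666 and last
    # ~1333 characters:
    #
    #     compressor = SLMPromptCompressor(max_tokens=1000, aggressive=True)
    #     short = compressor.compress("x" * 10_000)
    #     assert len(short) <= 4000 + len("\n...[truncated]...\n")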
|
|
    def compress_messages(
        self, messages: list[ChatMessage], budget: int | None = None
    ) -> list[ChatMessage]:
        """Compress a message list to fit within a token budget."""
        budget = budget or self.max_tokens
        total_chars = sum(len(m.content) for m in messages)
        char_budget = budget * 4

        if total_chars <= char_budget:
            return messages

        # Per-message caps (not a strict global split): the system prompt may
        # use up to a third of the budget, the latest message up to half, and
        # each older message up to a quarter.
        result = []
        for i, msg in enumerate(messages):
            if msg.role == "system":
                result.append(ChatMessage(
                    role="system",
                    content=self.compress(msg.content, budget=budget // 3),
                ))
            elif i == len(messages) - 1:
                # The most recent message carries the current task, so it
                # gets the largest share.
                result.append(ChatMessage(
                    role=msg.role,
                    content=self.compress(msg.content, budget=budget // 2),
                ))
            else:
                result.append(ChatMessage(
                    role=msg.role,
                    content=self.compress(msg.content, budget=budget // 4),
                ))
        return result
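    # Worked example (illustrative numbers): with budget=1200, an over-budget
    # [system, user, user] conversation is capped at 1200 // 3 = 400 tokens
    # for the system prompt, 1200 // 4 = 300 for the older user message, and
    # 1200 // 2 = 600 for the latest one.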
|
|
|
|
|
|
class OllamaBackend(LLMBackend):
    """
    Local model serving via Ollama with grammar-constrained JSON output.

    Ollama's grammar engine (via llama.cpp) forces valid JSON output from
    ANY model — even tiny ones that can't produce reliable JSON from prompts.
    This is the key advantage for SLM agent use.

    Setup:
    1. Install Ollama: https://ollama.ai
    2. Pull a model: ollama pull qwen3:1.7b
    3. Use this backend:

    Example:
        backend = OllamaBackend(model="qwen3:1.7b")    # 1.7B params, runs on CPU
        backend = OllamaBackend(model="llama3.2:1b")   # 1B params, ultra-light
        backend = OllamaBackend(model="phi4-mini")     # 3.8B, best tool use
        backend = OllamaBackend(model="smollm2:1.7b")  # HF-native SLM

    Also works with large models:
        backend = OllamaBackend(model="qwen3:32b")     # full LLM
    """

    def __init__(
        self,
        model: str = "qwen3:1.7b",
        host: str = "http://localhost:11434",
        context_window: int = 8192,
        compress_prompts: bool = True,
        num_ctx: int | None = None,
    ):
        self.model = model
        self.host = host
        self.context_window = context_window
        self.compress_prompts = compress_prompts
        self.num_ctx = num_ctx or context_window
        # Small context windows get aggressive compression by default.
        self.compressor = SLMPromptCompressor(
            max_tokens=context_window, aggressive=(context_window <= 8192)
        )
        self._token_count = 0
|
    def _get_client(self):
        """Lazily import the ollama client so the dependency stays optional."""
        try:
            from ollama import Client
            return Client(host=self.host)
        except ImportError as e:
            raise ImportError(
                "Ollama client not installed. Run: pip install ollama\n"
                "Also install the Ollama server: https://ollama.ai"
            ) from e

    @staticmethod
    def _strip_thinking(text: str) -> str:
        """Strip the <think>...</think> block that reasoning models
        (e.g. qwen3) emit before their final answer."""
        return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    def generate(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        stop: list[str] | None = None,
    ) -> str:
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = client.chat(
            model=self.model,
            messages=msg_dicts,
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
                "stop": stop or [],
            },
        )

        content = self._strip_thinking(response.message.content or "")
        self._token_count += response.get("eval_count", 0) + response.get("prompt_eval_count", 0)
        return content
|
|
    def generate_structured(
        self,
        messages: list[ChatMessage],
        schema: dict[str, Any],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> dict[str, Any]:
        """
        Grammar-constrained JSON generation.

        Ollama uses llama.cpp's grammar engine to FORCE valid JSON output
        matching the schema. This works even with tiny models that can't
        produce valid JSON from prompts alone.
        """
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        # Passing a JSON schema as `format` enables constrained decoding.
        response = client.chat(
            model=self.model,
            messages=msg_dicts,
            format=schema,
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
            },
        )

        content = response.message.content or "{}"
        self._token_count += response.get("eval_count", 0) + response.get("prompt_eval_count", 0)
        return json.loads(content)
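    # Example (a sketch; assumes a running Ollama server with the model
    # pulled, and an illustrative schema):
    #
    #     backend = OllamaBackend(model="qwen3:1.7b")
    #     result = backend.generate_structured(
    #         [ChatMessage(role="user", content="Pick the next action.")],
    #         schema={
    #             "type": "object",
    #             "properties": {
    #                 "action": {"type": "string"},
    #                 "confidence": {"type": "number"},
    #             },
    #             "required": ["action", "confidence"],
    #         },
    #     )   # always parses, e.g. {"action": "search", "confidence": 0.8}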
|
|
    def generate_stream(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> Iterator[str]:
        """Streaming generation — yields tokens as they're produced.

        Note: token usage is not tracked for streamed responses.
        """
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        stream = client.chat(
            model=self.model,
            messages=msg_dicts,
            stream=True,
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
            },
        )

        for chunk in stream:
            token = chunk.get("message", {}).get("content", "")
            if token:
                yield token
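    # Example (assumes a running Ollama server): print tokens as they arrive.
    #
    #     backend = OllamaBackend(model="qwen3:1.7b")
    #     for tok in backend.generate_stream([ChatMessage(role="user", content="hi")]):
    #         print(tok, end="", flush=True)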
|
|
    @property
    def total_tokens(self) -> int:
        return self._token_count
|
|
|
|
|
|
class LlamaCppBackend(LLMBackend):
    """
    Direct llama-cpp-python backend for GGUF models.

    Best for: CPU inference, Apple Silicon, edge deployment, offline use.

    Example:
        backend = LlamaCppBackend(model_path="./qwen2.5-1.5b-instruct-q4_k_m.gguf")
        backend = LlamaCppBackend(
            model_path="./phi-4-mini-q4.gguf",
            n_ctx=4096,
            n_gpu_layers=35,  # offload 35 layers to the GPU
        )
    """

    def __init__(
        self,
        model_path: str,
        n_ctx: int = 4096,
        n_gpu_layers: int = 0,
        verbose: bool = False,
    ):
        try:
            from llama_cpp import Llama
        except ImportError as e:
            raise ImportError(
                "llama-cpp-python not installed. Run: pip install llama-cpp-python"
            ) from e

        self.model_path = model_path
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=verbose,
        )
        # Contexts this small always warrant aggressive compression.
        self.compressor = SLMPromptCompressor(max_tokens=n_ctx, aggressive=True)
        self._token_count = 0
|
    def generate(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        stop: list[str] | None = None,
    ) -> str:
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            stop=stop,
        )

        content = response["choices"][0]["message"]["content"] or ""
        usage = response.get("usage", {})
        self._token_count += usage.get("total_tokens", 0)
        return content
|
    def generate_structured(
        self,
        messages: list[ChatMessage],
        schema: dict[str, Any],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> dict[str, Any]:
        """Grammar-constrained JSON via a llama.cpp GBNF grammar."""
        from llama_cpp import LlamaGrammar

        # Convert the JSON schema into a GBNF grammar that constrains decoding.
        grammar = LlamaGrammar.from_json_schema(json.dumps(schema))
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            grammar=grammar,
        )

        content = response["choices"][0]["message"]["content"] or "{}"
        usage = response.get("usage", {})
        self._token_count += usage.get("total_tokens", 0)
        return json.loads(content)
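    # Example (a sketch; the GGUF path is the placeholder from the class
    # docstring):
    #
    #     backend = LlamaCppBackend(model_path="./qwen2.5-1.5b-instruct-q4_k_m.gguf")
    #     plan = backend.generate_structured(
    #         [ChatMessage(role="user", content="Plan the next three steps.")],
    #         schema={
    #             "type": "object",
    #             "properties": {"steps": {"type": "array", "items": {"type": "string"}}},
    #             "required": ["steps"],
    #         },
    #     )   # e.g. {"steps": ["...", "...", "..."]}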
|
|
    def generate_stream(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> Iterator[str]:
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        stream = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            delta = chunk.get("choices", [{}])[0].get("delta", {})
            token = delta.get("content", "")
            if token:
                yield token

    @property
    def total_tokens(self) -> int:
        return self._token_count
|
|
|
|
|
|
# Recommended SLMs: key -> (Ollama model tag, context window in tokens,
# short description).
SLM_REGISTRY = {
    "phi-4-mini": ("phi4-mini", 16384, "3.8B, best schema compliance, Microsoft"),
    "qwen3-1.7b": ("qwen3:1.7b", 32768, "1.7B, strong function calling, 32K context"),
    "qwen3-0.6b": ("qwen3:0.6b", 32768, "0.6B, ultra-light, 32K context"),
    "qwen2.5-1.5b": ("qwen2.5:1.5b", 32768, "1.5B, proven tool use"),
    "llama-3.2-3b": ("llama3.2:3b", 131072, "3B, 128K context, Meta"),
    "llama-3.2-1b": ("llama3.2:1b", 131072, "1B, smallest Llama, 128K context"),
    "smollm2-1.7b": ("smollm2:1.7b", 8192, "1.7B, HF native, 8K context (tight!)"),
    "gemma-3-1b": ("gemma3:1b", 32768, "1B, Google, text-only Gemma 3"),
}
|
|
|
|
def create_slm_backend(
    model_key: str = "qwen3-1.7b",
    host: str = "http://localhost:11434",
) -> OllamaBackend:
    """
    Create an SLM backend from the registry.

    Usage:
        backend = create_slm_backend("phi-4-mini")    # best overall
        backend = create_slm_backend("qwen3-0.6b")    # ultra-light
        backend = create_slm_backend("llama-3.2-1b")  # smallest Llama
    """
    if model_key not in SLM_REGISTRY:
        available = ", ".join(SLM_REGISTRY.keys())
        raise ValueError(f"Unknown SLM '{model_key}'. Available: {available}")

    ollama_name, ctx_window, desc = SLM_REGISTRY[model_key]
    logger.info(f"Creating SLM backend: {model_key} ({desc})")

    return OllamaBackend(
        model=ollama_name,
        host=host,
        context_window=ctx_window,
        compress_prompts=True,
    )
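
# End-to-end usage (a sketch; assumes a local Ollama server with the model
# already pulled, e.g. `ollama pull qwen3:1.7b`):
#
#     backend = create_slm_backend("qwen3-1.7b")
#     reply = backend.generate([ChatMessage(role="user", content="Hello!")])
#     step = backend.generate_structured(
#         [ChatMessage(role="user", content="Next step?")],
#         schema={"type": "object", "properties": {"step": {"type": "string"}},
#                 "required": ["step"]},
#     )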
|
|