Rohan03 committed
Commit 7eacaee · verified · Parent: 8f2700b

v0.2.0: Add purpose_agent/slm_backends.py

Files changed (1)
  1. purpose_agent/slm_backends.py +446 -0
purpose_agent/slm_backends.py ADDED
@@ -0,0 +1,446 @@
"""
SLM-Native Backends — First-class support for Small Language Models.

Purpose Agent is an agentic framework designed natively for SLMs.
These backends handle the unique challenges of small models:
- Grammar-constrained JSON output (SLMs can't reliably produce JSON from prompts alone)
- Prompt compression for small context windows (8K-32K)
- Adaptive prompting (shorter system prompts, schema-first format)
- Token budget management

Supported backends:
- OllamaBackend: Local serving via Ollama (CPU/GPU, any GGUF model)
- LlamaCppBackend: Direct llama-cpp-python (CPU/Apple Silicon, GGUF)
- TransformersBackend: HuggingFace transformers (GPU, native weights)

All backends implement the same LLMBackend interface — swap freely.
"""

from __future__ import annotations

import json
import logging
import re
from typing import Any, Iterator

from purpose_agent.llm_backend import ChatMessage, LLMBackend

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# SLM Prompt Compressor — reduces prompt size for small context windows
# ---------------------------------------------------------------------------

class SLMPromptCompressor:
    """
    Compresses prompts for small context windows without losing critical info.

    Strategies (from TinyAgent arxiv:2409.00608 + LLMLingua-2 arxiv:2403.12968):
    1. Schema-first: Move JSON schema to top, compress descriptions
    2. History truncation: Summarize old steps, keep recent ones verbatim
    3. Example reduction: Fewer few-shot examples for SLMs
    4. Whitespace stripping: Remove unnecessary formatting

    No external dependencies — pure Python compression.
    For better compression, install llmlingua: pip install llmlingua
    """

    def __init__(self, max_tokens: int = 4096, aggressive: bool = False):
        self.max_tokens = max_tokens
        self.aggressive = aggressive

    def compress(self, text: str, budget: int | None = None) -> str:
        """Compress text to fit within token budget."""
        budget = budget or self.max_tokens
        # Rough estimate: 1 token ≈ 4 chars
        char_budget = budget * 4

        if len(text) <= char_budget:
            return text

        compressed = text
        # Stage 1: Strip excessive whitespace
        compressed = re.sub(r'\n{3,}', '\n\n', compressed)
        compressed = re.sub(r'[ \t]{2,}', ' ', compressed)
        compressed = re.sub(r'^\s+', '', compressed, flags=re.MULTILINE)

        if len(compressed) <= char_budget:
            return compressed

        # Stage 2: Shorten verbose sections
        if self.aggressive:
            # Remove markdown formatting
            compressed = re.sub(r'\*\*([^*]+)\*\*', r'\1', compressed)
            compressed = re.sub(r'#{1,3}\s+', '', compressed)
            # Shorten common verbose phrases
            replacements = {
                "You MUST respond with": "Respond with",
                "Based on the current state and your goal, ": "",
                "Respond in this exact JSON format:": "JSON format:",
                "Step-by-step justification": "Justification",
                "Specific observable state changes": "State changes",
            }
            for old, new in replacements.items():
                compressed = compressed.replace(old, new)

        if len(compressed) <= char_budget:
            return compressed

        # Stage 3: Truncate from middle (keep start + end)
        keep_start = char_budget * 2 // 3
        keep_end = char_budget // 3
        compressed = compressed[:keep_start] + "\n...[truncated]...\n" + compressed[-keep_end:]

        return compressed
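
    # Worked example (illustrative, not from the original commit): with
    # budget=100 the character budget is 400. A 1,000-char prompt is first
    # whitespace-stripped; if still over budget (and aggressive), verbose
    # phrases are shortened; as a last resort, stage 3 keeps the first
    # 400 * 2 // 3 = 266 chars and the last 400 // 3 = 133 chars around the
    # "...[truncated]..." marker.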

    def compress_messages(
        self, messages: list[ChatMessage], budget: int | None = None
    ) -> list[ChatMessage]:
        """Compress a message list to fit within token budget."""
        budget = budget or self.max_tokens
        total_chars = sum(len(m.content) for m in messages)
        char_budget = budget * 4

        if total_chars <= char_budget:
            return messages

        result = []
        # Always keep system prompt (compress it), always keep last user message
        for i, msg in enumerate(messages):
            if msg.role == "system":
                result.append(ChatMessage(
                    role="system",
                    content=self.compress(msg.content, budget=budget // 3),
                ))
            elif i == len(messages) - 1:
                # Last message — keep more of it
                result.append(ChatMessage(
                    role=msg.role,
                    content=self.compress(msg.content, budget=budget // 2),
                ))
            else:
                result.append(ChatMessage(
                    role=msg.role,
                    content=self.compress(msg.content, budget=budget // 4),
                ))
        return result
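

# Illustrative usage (not part of the original commit): squeeze an oversized
# prompt into a tiny 64-token budget. Values here are made up for the demo.
def _demo_compressor() -> None:
    compressor = SLMPromptCompressor(max_tokens=64, aggressive=True)
    long_prompt = "You MUST respond with JSON.\n\n\n\n" + ("filler " * 500)
    short = compressor.compress(long_prompt)
    # 64 tokens * 4 chars/token = 256-char budget, plus the truncation marker
    assert len(short) <= 64 * 4 + len("\n...[truncated]...\n")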


# ---------------------------------------------------------------------------
# Ollama Backend — Best for local SLMs
# ---------------------------------------------------------------------------

class OllamaBackend(LLMBackend):
    """
    Local model serving via Ollama with grammar-constrained JSON output.

    Ollama's grammar engine (via llama.cpp) forces valid JSON output from
    ANY model — even tiny ones that can't produce reliable JSON from prompts.
    This is the key advantage for SLM agent use.

    Setup:
    1. Install Ollama: https://ollama.ai
    2. Pull a model: ollama pull qwen3:1.7b
    3. Use this backend:

    Example:
        backend = OllamaBackend(model="qwen3:1.7b")    # 1.7B params, runs on CPU
        backend = OllamaBackend(model="llama3.2:1b")   # 1B params, ultra-light
        backend = OllamaBackend(model="phi4-mini")     # 3.8B, best tool-use
        backend = OllamaBackend(model="smollm2:1.7b")  # HF native SLM

    Also works with large models:
        backend = OllamaBackend(model="qwen3:32b")     # Full LLM
    """

    def __init__(
        self,
        model: str = "qwen3:1.7b",
        host: str = "http://localhost:11434",
        context_window: int = 8192,
        compress_prompts: bool = True,
        num_ctx: int | None = None,
    ):
        self.model = model
        self.host = host
        self.context_window = context_window
        self.compress_prompts = compress_prompts
        self.num_ctx = num_ctx or context_window
        self.compressor = SLMPromptCompressor(
            max_tokens=context_window, aggressive=(context_window <= 8192)
        )
        self._token_count = 0

    def _get_client(self):
        """Lazy import ollama client."""
        try:
            from ollama import Client
            return Client(host=self.host)
        except ImportError:
            raise ImportError(
                "Ollama client not installed. Run: pip install ollama\n"
                "Also install the Ollama server: https://ollama.ai"
            )

    def generate(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        stop: list[str] | None = None,
    ) -> str:
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = client.chat(
            model=self.model,
            messages=msg_dicts,
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
                "stop": stop or [],
            },
        )

        content = response.message.content or ""
        # Track token usage for cost accounting (counts may be absent on some responses)
        self._token_count += (response.prompt_eval_count or 0) + (response.eval_count or 0)
        return content

    def generate_structured(
        self,
        messages: list[ChatMessage],
        schema: dict[str, Any],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> dict[str, Any]:
        """
        Grammar-constrained JSON generation.

        Ollama uses llama.cpp's grammar engine to FORCE valid JSON output
        matching the schema. This works even with tiny models that can't
        produce valid JSON from prompts alone.
        """
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = client.chat(
            model=self.model,
            messages=msg_dicts,
            format=schema,  # Grammar-constrained output!
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
            },
        )

        content = response.message.content or "{}"
        self._token_count += (response.prompt_eval_count or 0) + (response.eval_count or 0)
        return json.loads(content)
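
    # Example (illustrative, not from the original commit): shape of a call.
    #   schema = {
    #       "type": "object",
    #       "properties": {"action": {"type": "string"}},
    #       "required": ["action"],
    #   }
    #   backend.generate_structured(msgs, schema)  # -> {"action": "..."}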

    def generate_stream(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> Iterator[str]:
        """Streaming generation — yields tokens as they're produced."""
        client = self._get_client()

        if self.compress_prompts:
            messages = self.compressor.compress_messages(messages, self.context_window)

        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        stream = client.chat(
            model=self.model,
            messages=msg_dicts,
            stream=True,
            options={
                "temperature": temperature,
                "num_predict": max_tokens,
                "num_ctx": self.num_ctx,
            },
        )

        for chunk in stream:
            token = chunk.message.content or ""
            if token:
                yield token

    @property
    def total_tokens(self) -> int:
        return self._token_count
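

# Illustrative usage (not part of the original commit). Assumes a local Ollama
# server is running and the model has been pulled, e.g. `ollama pull qwen3:1.7b`.
def _demo_ollama_structured() -> None:
    backend = OllamaBackend(model="qwen3:1.7b")
    schema = {
        "type": "object",
        "properties": {
            "action": {"type": "string"},
            "confidence": {"type": "number"},
        },
        "required": ["action", "confidence"],
    }
    result = backend.generate_structured(
        [ChatMessage(role="user", content="Pick the next action: open or close the file?")],
        schema=schema,
    )
    # Grammar-constrained decoding guarantees these keys exist with these types.
    print(result["action"], result["confidence"])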


# ---------------------------------------------------------------------------
# LlamaCpp Backend — Direct CPU/Apple Silicon/GGUF
# ---------------------------------------------------------------------------

class LlamaCppBackend(LLMBackend):
    """
    Direct llama-cpp-python backend for GGUF models.

    Best for: CPU inference, Apple Silicon, edge deployment, offline use.

    Example:
        backend = LlamaCppBackend(model_path="./qwen2.5-1.5b-instruct-q4_k_m.gguf")
        backend = LlamaCppBackend(
            model_path="./phi-4-mini-q4.gguf",
            n_ctx=4096,
            n_gpu_layers=35,  # Offload to GPU
        )
    """

    def __init__(
        self,
        model_path: str,
        n_ctx: int = 4096,
        n_gpu_layers: int = 0,
        verbose: bool = False,
    ):
        try:
            from llama_cpp import Llama
        except ImportError:
            raise ImportError("llama-cpp-python not installed. Run: pip install llama-cpp-python")

        self.model_path = model_path
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=verbose,
        )
        self.compressor = SLMPromptCompressor(max_tokens=n_ctx, aggressive=True)
        self._token_count = 0

    def generate(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        stop: list[str] | None = None,
    ) -> str:
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            stop=stop,
        )

        content = response["choices"][0]["message"]["content"] or ""
        usage = response.get("usage", {})
        self._token_count += usage.get("total_tokens", 0)
        return content

    def generate_structured(
        self,
        messages: list[ChatMessage],
        schema: dict[str, Any],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> dict[str, Any]:
        """Grammar-constrained JSON via llama.cpp GBNF grammar."""
        from llama_cpp import LlamaGrammar

        grammar = LlamaGrammar.from_json_schema(json.dumps(schema))
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        response = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            grammar=grammar,
        )

        content = response["choices"][0]["message"]["content"] or "{}"
        usage = response.get("usage", {})
        self._token_count += usage.get("total_tokens", 0)
        return json.loads(content)
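
    # Note (illustrative, not from the original commit): from_json_schema
    # compiles the JSON Schema into a GBNF grammar, restricting token sampling
    # to strings that parse as schema-valid JSON, so json.loads above should
    # only fail if the completion was cut off by max_tokens.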

    def generate_stream(
        self,
        messages: list[ChatMessage],
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> Iterator[str]:
        messages = self.compressor.compress_messages(messages)
        msg_dicts = [{"role": m.role, "content": m.content} for m in messages]

        stream = self.llm.create_chat_completion(
            messages=msg_dicts,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            delta = chunk.get("choices", [{}])[0].get("delta", {})
            token = delta.get("content", "")
            if token:
                yield token

    @property
    def total_tokens(self) -> int:
        return self._token_count
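

# Illustrative usage (not part of the original commit). The GGUF path is
# hypothetical; any instruct-tuned GGUF model file works.
def _demo_llamacpp_stream() -> None:
    backend = LlamaCppBackend(
        model_path="./models/qwen2.5-1.5b-instruct-q4_k_m.gguf",  # hypothetical path
        n_ctx=4096,
    )
    prompt = [ChatMessage(role="user", content="Explain GBNF grammars in one sentence.")]
    for token in backend.generate_stream(prompt, temperature=0.5, max_tokens=64):
        print(token, end="", flush=True)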


# ---------------------------------------------------------------------------
# Model Registry — Easy model selection for SLMs
# ---------------------------------------------------------------------------

# Recommended SLMs for agent tasks, ranked by capability
SLM_REGISTRY = {
    # Model ID → (Ollama name, context window, description)
    "phi-4-mini": ("phi4-mini", 16384, "3.8B, best schema compliance, Microsoft"),
    "qwen3-1.7b": ("qwen3:1.7b", 32768, "1.7B, strong function calling, 32K context"),
    "qwen3-0.6b": ("qwen3:0.6b", 32768, "0.6B, ultra-light, 32K context"),
    "qwen2.5-1.5b": ("qwen2.5:1.5b", 32768, "1.5B, proven tool-use"),
    "llama-3.2-3b": ("llama3.2:3b", 131072, "3B, 128K context, Meta"),
    "llama-3.2-1b": ("llama3.2:1b", 131072, "1B, smallest Llama, 128K context"),
    "smollm2-1.7b": ("smollm2:1.7b", 8192, "1.7B, HF native, 8K context (tight!)"),
    "gemma-3-1b": ("gemma3:1b", 32768, "1B, Google, text-only Gemma 3"),
}

def create_slm_backend(
    model_key: str = "qwen3-1.7b",
    host: str = "http://localhost:11434",
) -> OllamaBackend:
    """
    Create an SLM backend from the registry.

    Usage:
        backend = create_slm_backend("phi-4-mini")    # Best overall
        backend = create_slm_backend("qwen3-0.6b")    # Ultra-light
        backend = create_slm_backend("llama-3.2-1b")  # Smallest Llama
    """
    if model_key not in SLM_REGISTRY:
        available = ", ".join(SLM_REGISTRY.keys())
        raise ValueError(f"Unknown SLM '{model_key}'. Available: {available}")

    ollama_name, ctx_window, desc = SLM_REGISTRY[model_key]
    logger.info(f"Creating SLM backend: {model_key} ({desc})")

    return OllamaBackend(
        model=ollama_name,
        host=host,
        context_window=ctx_window,
        compress_prompts=True,
    )
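

# Illustrative sketch (not part of the original commit): because every backend
# implements the same LLMBackend interface, callers can swap them freely, as
# the module docstring promises. The commented entry shows the drop-in shape;
# the GGUF path is hypothetical.
def _demo_swap_backends() -> None:
    msgs = [ChatMessage(role="user", content="Say hello in five words.")]
    backends: list[LLMBackend] = [
        create_slm_backend("qwen3-1.7b"),  # Ollama-served SLM from the registry
        # LlamaCppBackend(model_path="./models/some-slm-q4_k_m.gguf"),  # hypothetical
    ]
    for backend in backends:
        print(backend.generate(msgs, temperature=0.2, max_tokens=32))
        print("tokens so far:", backend.total_tokens)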