""" LLM Backend — Swappable inference layer. Supports: HuggingFace Inference Providers, OpenAI, Anthropic, local models, or any custom backend. Swap by changing one constructor call. Design: Abstract base class with structured output support. Inspired by smolagents Model interface + HF Inference Providers API. """ from __future__ import annotations import json import logging import os from abc import ABC, abstractmethod from dataclasses import dataclass, field from typing import Any logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Message types (OpenAI-compatible chat format) # --------------------------------------------------------------------------- @dataclass class ChatMessage: role: str # "system", "user", "assistant" content: str # --------------------------------------------------------------------------- # Abstract LLM Backend # --------------------------------------------------------------------------- class LLMBackend(ABC): """ Abstract LLM backend. All modules call this — swap the implementation to change the underlying model without touching any other code. Subclasses must implement `generate()` which takes messages and returns a string. Optionally implement `generate_structured()` for JSON-schema constrained generation (used by the Purpose Function for reliable scoring). """ @abstractmethod def generate( self, messages: list[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048, stop: list[str] | None = None, ) -> str: """Generate a text completion from chat messages.""" ... def generate_structured( self, messages: list[ChatMessage], schema: dict[str, Any], temperature: float = 0.3, max_tokens: int = 1024, ) -> dict[str, Any]: """ Generate with JSON schema constraint. Default implementation: append schema instruction to last message and parse JSON from response. Override for native structured output. """ schema_instruction = ( f"\n\nYou MUST respond with valid JSON matching this schema:\n" f"```json\n{json.dumps(schema, indent=2)}\n```\n" f"Respond ONLY with the JSON object, no other text." ) augmented = list(messages) last = augmented[-1] augmented[-1] = ChatMessage( role=last.role, content=last.content + schema_instruction ) raw = self.generate(augmented, temperature=temperature, max_tokens=max_tokens) # Extract JSON from response (handle markdown code blocks) text = raw.strip() if text.startswith("```"): lines = text.split("\n") # Remove first and last ``` lines json_lines = [] inside = False for line in lines: if line.strip().startswith("```") and not inside: inside = True continue elif line.strip() == "```" and inside: break elif inside: json_lines.append(line) text = "\n".join(json_lines) return json.loads(text) # --------------------------------------------------------------------------- # HuggingFace Inference Provider Backend # --------------------------------------------------------------------------- class HFInferenceBackend(LLMBackend): """ Uses huggingface_hub InferenceClient for HF Inference Providers. Supports: Cerebras, Novita, Fireworks, Together, SambaNova, etc. Models: Qwen, Llama, Mistral, DeepSeek — anything on HF Hub. Example: backend = HFInferenceBackend( model_id="Qwen/Qwen3-32B", provider="cerebras", ) """ def __init__( self, model_id: str = "Qwen/Qwen3-32B", provider: str = "auto", api_key: str | None = None, ): from huggingface_hub import InferenceClient self.model_id = model_id self.provider = provider self.client = InferenceClient( provider=provider, api_key=api_key or os.environ.get("HF_TOKEN"), ) def generate( self, messages: list[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048, stop: list[str] | None = None, ) -> str: msg_dicts = [{"role": m.role, "content": m.content} for m in messages] response = self.client.chat_completion( model=self.model_id, messages=msg_dicts, temperature=temperature, max_tokens=max_tokens, stop=stop or [], ) return response.choices[0].message.content def generate_structured( self, messages: list[ChatMessage], schema: dict[str, Any], temperature: float = 0.3, max_tokens: int = 1024, ) -> dict[str, Any]: msg_dicts = [{"role": m.role, "content": m.content} for m in messages] response = self.client.chat_completion( model=self.model_id, messages=msg_dicts, temperature=temperature, max_tokens=max_tokens, response_format={ "type": "json_schema", "json_schema": {"schema": schema}, }, ) return json.loads(response.choices[0].message.content) # --------------------------------------------------------------------------- # OpenAI-Compatible Backend (OpenAI, Azure, vLLM, Ollama, LiteLLM) # --------------------------------------------------------------------------- class OpenAICompatibleBackend(LLMBackend): """ Works with any OpenAI-compatible API endpoint. Examples: # OpenAI backend = OpenAICompatibleBackend(model="gpt-4o") # Local Ollama backend = OpenAICompatibleBackend( model="llama3.2", base_url="http://localhost:11434/v1", api_key="ollama", ) # vLLM server backend = OpenAICompatibleBackend( model="meta-llama/Llama-3.2-3B-Instruct", base_url="http://localhost:8000/v1", api_key="token-placeholder", ) # HF Inference via OpenAI SDK (for structured output with .parse()) backend = OpenAICompatibleBackend( model="Qwen/Qwen3-32B", base_url="https://router.huggingface.co/cerebras/v1", api_key=os.environ["HF_TOKEN"], ) """ def __init__( self, model: str = "gpt-4o", base_url: str | None = None, api_key: str | None = None, ): from openai import OpenAI self.model = model self.client = OpenAI( base_url=base_url, api_key=api_key or os.environ.get("OPENAI_API_KEY"), ) def generate( self, messages: list[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048, stop: list[str] | None = None, ) -> str: msg_dicts = [{"role": m.role, "content": m.content} for m in messages] response = self.client.chat.completions.create( model=self.model, messages=msg_dicts, temperature=temperature, max_tokens=max_tokens, stop=stop, ) return response.choices[0].message.content def generate_structured( self, messages: list[ChatMessage], schema: dict[str, Any], temperature: float = 0.3, max_tokens: int = 1024, ) -> dict[str, Any]: msg_dicts = [{"role": m.role, "content": m.content} for m in messages] response = self.client.chat.completions.create( model=self.model, messages=msg_dicts, temperature=temperature, max_tokens=max_tokens, response_format={ "type": "json_schema", "json_schema": {"name": "purpose_score", "schema": schema}, }, ) return json.loads(response.choices[0].message.content) # --------------------------------------------------------------------------- # Mock Backend (for testing without API calls) # --------------------------------------------------------------------------- class MockLLMBackend(LLMBackend): """ Deterministic mock backend for testing the framework without LLM calls. Returns canned responses based on keywords in the prompt, or a default. You can register custom response handlers. """ def __init__(self): self._handlers: list[tuple[str, str | callable]] = [] self._structured_default: dict[str, Any] = {} self._call_log: list[dict] = [] def register_handler( self, keyword: str, response: str | callable ) -> "MockLLMBackend": """Add a keyword-matched response handler. Checked in order.""" self._handlers.append((keyword, response)) return self def set_structured_default(self, default: dict[str, Any]) -> "MockLLMBackend": """Set the default response for structured generation.""" self._structured_default = default return self @property def call_log(self) -> list[dict]: return self._call_log def generate( self, messages: list[ChatMessage], temperature: float = 0.7, max_tokens: int = 2048, stop: list[str] | None = None, ) -> str: full_text = " ".join(m.content for m in messages) self._call_log.append({ "method": "generate", "messages": [{"role": m.role, "content": m.content[:200]} for m in messages], }) for keyword, response in self._handlers: if keyword.lower() in full_text.lower(): if callable(response): return response(messages) return response # Default: echo the last user message with a generic response last_user = next( (m.content for m in reversed(messages) if m.role == "user"), "no input", ) return f"[MockLLM] Acknowledged: {last_user[:100]}" def generate_structured( self, messages: list[ChatMessage], schema: dict[str, Any], temperature: float = 0.3, max_tokens: int = 1024, ) -> dict[str, Any]: self._call_log.append({ "method": "generate_structured", "schema_keys": list(schema.get("properties", {}).keys()), }) # Try keyword handlers first — they may return JSON strings or dicts full_text = " ".join(m.content for m in messages) for keyword, response in self._handlers: if keyword.lower() in full_text.lower(): if callable(response): result = response(messages) else: result = response # If handler returned a string, try to parse as JSON if isinstance(result, str): try: return json.loads(result) except (json.JSONDecodeError, TypeError): pass elif isinstance(result, dict): return result # Fall back to structured default if self._structured_default: return self._structured_default # Build a minimal valid response from the schema props = schema.get("properties", {}) result = {} for key, prop in props.items(): ptype = prop.get("type", "string") if ptype == "number": result[key] = 5.0 elif ptype == "integer": result[key] = 5 elif ptype == "boolean": result[key] = True else: result[key] = f"mock_{key}" return result