"""LLM provider abstraction for the interactive demo. The demo points at any OpenAI-compatible ``/v1/chat/completions`` endpoint: local Ollama, Hugging Face's Inference Providers router, OpenAI itself, vLLM, OpenRouter, etc. Everything funnels through one factory so the UI only has to learn one shape. The browser passes ``base_url``, ``model``, and (optionally) ``api_key`` on every request. If ``api_key`` is missing we fall back to a per-provider env var so a Hugging Face Space can ship a default working config without hard-coding secrets in client bundles. """ from __future__ import annotations import logging import os from collections.abc import Callable from typing import Optional from fastapi import HTTPException from pydantic import BaseModel, ConfigDict, Field _log = logging.getLogger(__name__) HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1" OPENAI_BASE_URL = "https://api.openai.com/v1" OLLAMA_OPENAI_BASE_URL = "http://localhost:11434/v1" PHYSIX_INFER_BASE_URL = "https://pratyush-01-physix-infer.hf.space/v1" class LlmStepRequest(BaseModel): """Provider-agnostic step payload. The browser names a base URL + model + (optional) key. The server fans these into an ``openai.OpenAI`` client. ``base_url`` is required so we never silently default to the wrong endpoint when the user swaps providers mid-session. """ model_config = ConfigDict(extra="forbid") base_url: str = Field( description=( "OpenAI-compatible /v1 base URL. E.g. http://localhost:11434/v1, " "https://router.huggingface.co/v1, https://api.openai.com/v1." ), ) model: str = Field( description=( "Model id understood by the chosen base URL. For HF this is the " "repo id (optionally suffixed with :provider, e.g. ':fastest'); " "for Ollama it's the local tag; for OpenAI it's the model name." ), ) api_key: Optional[str] = Field( default=None, description=( "Bearer token forwarded as Authorization header. 
Falls back to " "HF_TOKEN / OPENAI_API_KEY / OLLAMA_API_KEY env vars on the " "server based on `base_url` if omitted." ), ) temperature: float = Field(default=0.4, ge=0.0, le=2.0) max_tokens: int = Field(default=2048, ge=64, le=8192) request_timeout_s: float = Field(default=120.0, ge=5.0, le=600.0) # A policy is "give me prompt messages, get back the assistant content". LlmPolicy = Callable[[list[dict[str, str]]], str] LlmPolicyFactory = Callable[[LlmStepRequest], LlmPolicy] def resolve_api_key(request: LlmStepRequest) -> Optional[str]: """Pick the bearer token to use for this request. Browser-supplied keys win. When the browser sends nothing we fall back to a server-side env var picked from the URL — this lets a public Hugging Face Space ship a usable default by setting ``HF_TOKEN`` as a Space secret while still letting power users bring their own. """ if request.api_key: return request.api_key base_url = (request.base_url or "").lower() # The PhysiX-Infer sister Space serves Qwen + the trained 3B with no # auth — it's open-access by design (rate-limited only by sleep). if "physix-infer" in base_url: return "physix-infer" if "huggingface" in base_url: return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") if "openai.com" in base_url: return os.environ.get("OPENAI_API_KEY") if "openrouter" in base_url: return os.environ.get("OPENROUTER_API_KEY") if "localhost" in base_url or "127.0.0.1" in base_url: # Ollama doesn't require a key; the SDK still wants something # truthy in some versions, so we hand it a placeholder. return os.environ.get("OLLAMA_API_KEY", "ollama") return None def default_openai_compat_policy_factory(request: LlmStepRequest) -> LlmPolicy: """Build a chat policy for any OpenAI-compatible endpoint. 
Raises HTTPException(502) on provider errors.""" try: from openai import ( # type: ignore[import-not-found] APIConnectionError, APITimeoutError, AuthenticationError, BadRequestError, NotFoundError, OpenAI, ) except ImportError as exc: # pragma: no cover raise HTTPException( status_code=503, detail=( "The 'openai' Python package is not installed on the server. " "Install with: pip install -e '.[demo]'" ), ) from exc api_key = resolve_api_key(request) client = OpenAI( base_url=request.base_url, api_key=api_key or "missing", timeout=request.request_timeout_s, # Identifies us to providers that rate-limit by UA. Cheap to # send; helps the demo not look like a generic SDK probe. default_headers={"User-Agent": "physix-live-demo/0.1"}, ) def _create(*, with_json: bool): kwargs: dict[str, object] = { "model": request.model, "messages": prompt_holder["prompt"], "temperature": request.temperature, "max_tokens": request.max_tokens, } if with_json: # Encourages JSON output where supported (OpenAI, vLLM, # Ollama-OpenAI). HF router silently ignores this on # providers that don't support it; ones that do reject # land in the BadRequestError fallback below. kwargs["response_format"] = {"type": "json_object"} return client.chat.completions.create(**kwargs) # type: ignore[arg-type] # Captures the prompt for `_create` without re-parameterising it. prompt_holder: dict[str, list[dict[str, str]]] = {"prompt": []} def _policy(prompt: list[dict[str, str]]) -> str: prompt_holder["prompt"] = prompt try: try: response = _create(with_json=True) except (BadRequestError, TypeError) as exc: # Provider rejected response_format (or the SDK shape). # Retry without it; if it still fails, that's a real # error and we let it bubble to the outer handler. 
_log.info( "Retrying without response_format for %s: %s", request.base_url, exc, ) response = _create(with_json=False) except AuthenticationError as exc: raise HTTPException( status_code=502, detail=_format_auth_error(request, exc), ) from exc except NotFoundError as exc: raise HTTPException( status_code=502, detail=_format_not_found_error(request, exc), ) from exc except (APIConnectionError, APITimeoutError) as exc: raise HTTPException( status_code=502, detail=_format_connection_error(request, exc), ) from exc except Exception as exc: # noqa: BLE001 — last-resort UI surface raise HTTPException( status_code=502, detail=_format_provider_error(request, exc), ) from exc choice = response.choices[0] if response.choices else None content = (choice.message.content if choice and choice.message else "") or "" return str(content) return _policy def _format_provider_error(request: LlmStepRequest, exc: Exception) -> str: """Last-resort error formatter for unclassified provider failures. Most failures should land in one of the typed handlers below (`_format_auth_error`, `_format_not_found_error`, `_format_connection_error`). This is the catch-all when an OpenAI-compatible endpoint returns something we don't recognise. The string-matching here exists only because the test suite exercises this path directly without going through the full SDK exception hierarchy. """ base_msg = f"Chat completion failed via {request.base_url} for model {request.model!r}: {exc}" text = str(exc).lower() # HF Router 400s for unservable models land here (they're # ``BadRequestError`` from the SDK, so they bypass NotFoundError). # Detect both wordings the router emits. 
if "model_not_supported" in text or "is not supported by any provider" in text or "not supported by provider" in text: return _format_model_not_supported_error(request, exc) if "401" in text or "unauthorized" in text or "invalid api key" in text: return _format_auth_error(request, exc) if "404" in text or "not found" in text or "no such model" in text: return _format_not_found_error(request, exc) if "connection" in text or "refused" in text or "timeout" in text: return _format_connection_error(request, exc) return base_msg def _format_model_not_supported_error( request: LlmStepRequest, exc: Exception ) -> str: """The HF Router accepted the request but no enabled provider serves this model. The user-actionable fix depends on whether they own the model card or not, so we offer both paths.""" base = ( f"HF Router can't serve {request.model!r}: no inference provider " f"is enabled for this model. ({exc})" ) return ( f"{base}\n\n" "Hint: open the model card at " f"https://huggingface.co/{request.model.split(':')[0]} → " "'Inference Providers' panel. If it lists no providers, the model " "isn't routable yet. Two ways to fix:\n" " • Pick a model that already serves through the router — e.g. " "Qwen/Qwen2.5-7B-Instruct, meta-llama/Llama-3.3-70B-Instruct, " "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B.\n" " • For your own models, deploy them via the model card's " "'Deploy → Inference Endpoints' button (paid) or run the weights " "locally with Ollama / vLLM and switch this connection to that " "endpoint." ) def _format_auth_error(request: LlmStepRequest, exc: Exception) -> str: base = ( f"Authentication failed at {request.base_url} for model " f"{request.model!r}: {exc}" ) if "huggingface" in request.base_url.lower(): return ( f"{base}\n\n" "Hint: HF Router needs a token with the 'Make calls to " "Inference Providers' fine-grained permission. Re-create the " "token at https://huggingface.co/settings/tokens with that " "scope checked, then paste it into the API key field." 
) return ( f"{base}\n\n" "Hint: the API key is missing or rejected. Open the connection " "panel and paste a valid token, or set the matching env var on " "the server (HF_TOKEN, OPENAI_API_KEY, etc.)." ) def _format_not_found_error(request: LlmStepRequest, exc: Exception) -> str: base = ( f"Model not found at {request.base_url}: {request.model!r} " f"({exc})" ) if "huggingface" in request.base_url.lower(): return ( f"{base}\n\n" "Hint: the model isn't being served by any HF Inference " "Provider right now. Check the model card at " f"https://huggingface.co/{request.model.split(':')[0]} — " "the 'Deploy → Inference API' panel must list at least one " "provider as 'warm'. You can also append ':fastest' to the " "model id to let HF auto-pick a provider." ) if "11434" in request.base_url: return ( f"{base}\n\n" "Hint: that Ollama tag isn't pulled. Run " f"'ollama pull {request.model}' and retry." ) return base def _format_connection_error(request: LlmStepRequest, exc: Exception) -> str: base = f"Could not reach {request.base_url}: {exc}" if "11434" in request.base_url: return ( f"{base}\n\n" "Hint: 'ollama serve' isn't running on this machine. Start " "it in another terminal and retry." ) return ( f"{base}\n\n" "Hint: the endpoint isn't reachable from the server. Check the " "URL, your network, and any firewall in front of it." ) # ----------------------------------------------------------------------- # Ollama-only model lister (kept for the local-dev convenience dropdown). 
# -----------------------------------------------------------------------


class LlmModelInfo(BaseModel):
    """One Ollama model tag that is pulled on the local machine."""

    model_config = ConfigDict(frozen=True)

    name: str
    size_bytes: Optional[int] = None
    parameter_size: Optional[str] = None
    family: Optional[str] = None


class LlmModelsResponse(BaseModel):
    """Model list plus an optional human-readable failure reason."""

    models: list[LlmModelInfo] = Field(default_factory=list)
    error: Optional[str] = None


LlmModelsLister = Callable[[], LlmModelsResponse]


def default_ollama_models_lister() -> LlmModelsResponse:
    """Best-effort enumeration of the locally-pulled Ollama model tags.

    Never raises: a missing package or an unreachable daemon is reported
    through ``LlmModelsResponse.error`` instead.
    """
    try:
        import ollama  # type: ignore[import-not-found]
    except ImportError:
        return LlmModelsResponse(
            models=[],
            error=(
                "The 'ollama' Python package is not installed on the server. "
                "Install with: pip install -e '.[demo]'"
            ),
        )

    try:
        listing = ollama.Client().list()
    except Exception as exc:  # noqa: BLE001 — surfaced in the response body
        return LlmModelsResponse(
            models=[],
            error=(
                f"Could not reach the local Ollama daemon ({exc}). "
                "Is 'ollama serve' running?"
            ),
        )

    # Accept both shapes the client may hand back: an object carrying a
    # ``.models`` attribute, or a plain dict keyed by "models".
    entries = getattr(listing, "models", None)
    if entries is None and isinstance(listing, dict):
        entries = listing.get("models", [])

    infos: list[LlmModelInfo] = []
    for entry in entries or []:
        tag = _model_attr(entry, "model") or _model_attr(entry, "name")
        if not isinstance(tag, str) or not tag:
            # Skip entries without a usable tag name.
            continue
        details = _model_attr(entry, "details")
        infos.append(
            LlmModelInfo(
                name=tag,
                size_bytes=_coerce_int(_model_attr(entry, "size")),
                parameter_size=_model_attr(details, "parameter_size"),
                family=_model_attr(details, "family"),
            )
        )
    return LlmModelsResponse(models=sorted(infos, key=lambda m: m.name))


def _model_attr(obj: object, key: str) -> object:
    """Read ``key`` from a dict or attribute-style object; None if absent."""
    if isinstance(obj, dict):
        return obj.get(key)
    return getattr(obj, key, None) if obj is not None else None


def _coerce_int(value: object) -> Optional[int]:
    """Best-effort int conversion; None for None or unconvertible values."""
    try:
        return None if value is None else int(value)
    except (TypeError, ValueError):
        return None