File size: 3,142 Bytes
6d9c72b
 
 
 
 
 
cf0a8ed
6d9c72b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466cc3d
 
 
 
 
6d9c72b
466cc3d
6d9c72b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466cc3d
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""Async HTTP client for vLLM OpenAI-compatible API."""
import logging
from typing import Any

import httpx

from apohara_context_forge.config import settings

logger = logging.getLogger(__name__)


class vLLMClient:
    """Async client for a vLLM OpenAI-compatible server.

    Usable either as an async context manager::

        async with vLLMClient() as client:
            result = await client.chat([...])

    or standalone, in which case the underlying HTTP client is created
    lazily on first request and should be released with :meth:`aclose`.
    """

    def __init__(self, base_url: str | None = None, api_key: str | None = None):
        """Store connection settings; no I/O happens here.

        Args:
            base_url: Server root URL; falls back to ``settings.vllm_base_url``.
            api_key: Bearer token; falls back to ``settings.vllm_api_key``.
        """
        self._base_url = base_url or settings.vllm_base_url
        self._api_key = api_key or settings.vllm_api_key
        self._client: httpx.AsyncClient | None = None

    def _ensure_client(self) -> httpx.AsyncClient:
        """Create the shared httpx client on first use (idempotent).

        Previously this construction was duplicated in ``__aenter__``,
        ``complete`` and ``chat``; centralizing it keeps the base URL,
        auth header and timeout in one place.
        """
        if self._client is None:
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                headers={"Authorization": f"Bearer {self._api_key}"},
                timeout=60.0,
            )
        return self._client

    async def __aenter__(self):
        self._ensure_client()
        return self

    async def __aexit__(self, *args):
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying httpx client. Safe to call multiple times."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    async def _post(
        self, path: str, payload: dict[str, Any], error_prefix: str
    ) -> dict[str, Any]:
        """POST *payload* to *path* and return the parsed JSON response.

        On any transport/HTTP error, logs ``"<error_prefix>: <exc>"`` and
        returns ``{"error": str(exc)}`` instead of raising, matching the
        best-effort contract of the public methods.
        """
        client = self._ensure_client()
        try:
            response = await client.post(path, json=payload)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            # Lazy %-formatting: message is only built if the record is emitted.
            logger.error("%s: %s", error_prefix, e)
            return {"error": str(e)}

    async def complete(
        self,
        prompt: str,
        max_tokens: int = 256,
        temperature: float = 0.7,
        **kwargs,
    ) -> dict[str, Any]:
        """Send a text-completion request to ``/v1/completions``.

        Args:
            prompt: The raw prompt string.
            max_tokens: Generation cap forwarded to the server.
            temperature: Sampling temperature forwarded to the server.
            **kwargs: Extra OpenAI-compatible fields merged into the payload
                (may override the keys above, as in the original behavior).

        Returns:
            The server's JSON response, or ``{"error": ...}`` on failure.
        """
        payload = {
            "model": settings.vllm_model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            **kwargs,
        }
        return await self._post("/v1/completions", payload, "vLLM request failed")

    async def chat(
        self,
        messages: list[dict[str, str]],
        max_tokens: int = 256,
        temperature: float = 0.7,
        **kwargs,
    ) -> dict[str, Any]:
        """Send a chat-completion request to ``/v1/chat/completions``.

        Args:
            messages: OpenAI-style message dicts (``role``/``content`` keys
                presumed — TODO confirm against callers).
            max_tokens: Generation cap forwarded to the server.
            temperature: Sampling temperature forwarded to the server.
            **kwargs: Extra OpenAI-compatible fields merged into the payload.

        Returns:
            The server's JSON response, or ``{"error": ...}`` on failure.
        """
        payload = {
            "model": settings.vllm_model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            **kwargs,
        }
        return await self._post(
            "/v1/chat/completions", payload, "vLLM chat request failed"
        )


# Canonical PEP-8 alias. Tests and the MCP server import the upper-case form;
# the lower-case original stays for backward compatibility with older callers.
# NOTE: both names bind the same class object, so isinstance() checks and
# subclasses behave identically regardless of which name a caller imports.
VLLMClient = vLLMClient