| """Async HTTP client for vLLM OpenAI-compatible API.""" | |
| import logging | |
| from typing import Any | |
| import httpx | |
| from apohara_context_forge.config import settings | |
| logger = logging.getLogger(__name__) | |
| class vLLMClient: | |
| """Async client for vLLM server.""" | |
| def __init__(self, base_url: str | None = None, api_key: str | None = None): | |
| self._base_url = base_url or settings.vllm_base_url | |
| self._api_key = api_key or settings.vllm_api_key | |
| self._client: httpx.AsyncClient | None = None | |
| async def __aenter__(self): | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| return self | |
| async def __aexit__(self, *args): | |
| await self.aclose() | |
| async def aclose(self) -> None: | |
| """Close the underlying httpx client. Safe to call multiple times.""" | |
| if self._client is not None: | |
| await self._client.aclose() | |
| self._client = None | |
| async def complete( | |
| self, | |
| prompt: str, | |
| max_tokens: int = 256, | |
| temperature: float = 0.7, | |
| **kwargs, | |
| ) -> dict[str, Any]: | |
| """Send completion request to vLLM.""" | |
| if self._client is None: | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| payload = { | |
| "model": settings.vllm_model, | |
| "prompt": prompt, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature, | |
| **kwargs, | |
| } | |
| try: | |
| response = await self._client.post("/v1/completions", json=payload) | |
| response.raise_for_status() | |
| return response.json() | |
| except httpx.HTTPError as e: | |
| logger.error(f"vLLM request failed: {e}") | |
| return {"error": str(e)} | |
| async def chat( | |
| self, | |
| messages: list[dict[str, str]], | |
| max_tokens: int = 256, | |
| temperature: float = 0.7, | |
| **kwargs, | |
| ) -> dict[str, Any]: | |
| """Send chat completion request.""" | |
| if self._client is None: | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| payload = { | |
| "model": settings.vllm_model, | |
| "messages": messages, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature, | |
| **kwargs, | |
| } | |
| try: | |
| response = await self._client.post("/v1/chat/completions", json=payload) | |
| response.raise_for_status() | |
| return response.json() | |
| except httpx.HTTPError as e: | |
| logger.error(f"vLLM chat request failed: {e}") | |
| return {"error": str(e)} | |
# Canonical PEP-8 alias. Tests and the MCP server import the upper-case form;
# the lower-case original stays for backward compatibility with older callers.
VLLMClient = vLLMClient