| """Async HTTP client for vLLM OpenAI-compatible API.""" | |
| import logging | |
| from typing import Any | |
| import httpx | |
| from apohara_context_forge.config import settings | |
| logger = logging.getLogger(__name__) | |
| class vLLMClient: | |
| """Async client for vLLM server.""" | |
| def __init__(self, base_url: str | None = None, api_key: str | None = None): | |
| self._base_url = base_url or settings.vllm_base_url | |
| self._api_key = api_key or settings.vllm_api_key | |
| self._client: httpx.AsyncClient | None = None | |
| async def __aenter__(self): | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| return self | |
| async def __aexit__(self, *args): | |
| await self.aclose() | |
| async def aclose(self) -> None: | |
| """Close the underlying httpx client. Safe to call multiple times.""" | |
| if self._client is not None: | |
| await self._client.aclose() | |
| self._client = None | |
| async def complete( | |
| self, | |
| prompt: str, | |
| max_tokens: int = 256, | |
| temperature: float = 0.7, | |
| **kwargs, | |
| ) -> dict[str, Any]: | |
| """Send completion request to vLLM.""" | |
| if self._client is None: | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| payload = { | |
| "model": settings.vllm_model, | |
| "prompt": prompt, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature, | |
| **kwargs, | |
| } | |
| try: | |
| response = await self._client.post("/v1/completions", json=payload) | |
| response.raise_for_status() | |
| return response.json() | |
| except httpx.HTTPError as e: | |
| logger.error(f"vLLM request failed: {e}") | |
| return {"error": str(e)} | |
| async def chat( | |
| self, | |
| messages: list[dict[str, str]], | |
| max_tokens: int = 256, | |
| temperature: float = 0.7, | |
| **kwargs, | |
| ) -> dict[str, Any]: | |
| """Send chat completion request.""" | |
| if self._client is None: | |
| self._client = httpx.AsyncClient( | |
| base_url=self._base_url, | |
| headers={"Authorization": f"Bearer {self._api_key}"}, | |
| timeout=60.0, | |
| ) | |
| payload = { | |
| "model": settings.vllm_model, | |
| "messages": messages, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature, | |
| **kwargs, | |
| } | |
| try: | |
| response = await self._client.post("/v1/chat/completions", json=payload) | |
| response.raise_for_status() | |
| return response.json() | |
| except httpx.HTTPError as e: | |
| logger.error(f"vLLM chat request failed: {e}") | |
| return {"error": str(e)} | |
# Canonical PEP-8 alias. Tests and the MCP server import the upper-case form;
# the lower-case original stays for backward compatibility with older callers.
VLLMClient = vLLMClient