# repomind/serving/vllm_client.py
# REPOMIND v0.1 — repo-scale coding agent demo (commit e3a472a)
"""vLLM-backed client (OpenAI-compatible API).
Targets a local vLLM server running:
vllm serve Qwen/Qwen3-Coder-Next-FP8 \\
--tool-call-parser qwen3_coder \\
--max-model-len 262144 \\
--kv-cache-dtype fp8
The server returns tool-calls in the OpenAI function-calling format, which
we translate to our internal ToolCall dataclass.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any, Dict, List
from .base import LLMClient, LLMResponse, ToolCall
@dataclass
class VLLMClient(LLMClient):
    """Client for a local vLLM server exposing the OpenAI-compatible API.

    Defaults target a Qwen3-Coder deployment; all connection and sampling
    parameters can be overridden per-instance or per-call via ``complete``.
    """

    base_url: str = "http://localhost:8000/v1"
    model: str = "Qwen/Qwen3-Coder-Next-FP8"
    api_key: str = "EMPTY"  # vLLM ignores the key but the SDK requires one
    timeout: float = 300.0
    temperature: float = 0.2
    max_tokens: int = 2048

    def __post_init__(self):
        """Lazily import the OpenAI SDK and build the underlying client."""
        try:
            from openai import OpenAI  # type: ignore
        except ImportError as e:
            raise ImportError("pip install openai") from e
        self._client = OpenAI(base_url=self.base_url, api_key=self.api_key, timeout=self.timeout)

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse:
        """Send one chat-completion request and normalize the reply.

        Args:
            messages: Conversation history in OpenAI chat format.
            tools: OpenAI function-calling tool schemas; omitted when empty.
            **kwargs: Per-call overrides (``temperature``, ``max_tokens``,
                ``tool_choice``).

        Returns:
            An ``LLMResponse`` with text content, parsed tool calls, token
            usage counts, and the raw SDK response object.
        """
        request: Dict[str, Any] = {
            "model": self.model,
            "messages": messages,
            "temperature": kwargs.get("temperature", self.temperature),
            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
        }
        if tools:
            request["tools"] = tools
            request["tool_choice"] = kwargs.get("tool_choice", "auto")

        resp = self._client.chat.completions.create(**request)
        message = resp.choices[0].message

        # Translate OpenAI-format tool calls into our internal dataclass.
        calls: List[ToolCall] = []
        for call in message.tool_calls or []:
            try:
                parsed = json.loads(call.function.arguments or "{}")
            except json.JSONDecodeError:
                # Malformed arguments from the model: degrade to empty args
                # rather than failing the whole turn.
                parsed = {}
            calls.append(ToolCall(id=call.id, name=call.function.name, arguments=parsed))

        usage: Dict[str, Any] = {}
        if getattr(resp, "usage", None):
            usage = {
                "prompt": resp.usage.prompt_tokens,
                "completion": resp.usage.completion_tokens,
            }

        return LLMResponse(
            content=message.content or "",
            tool_calls=calls,
            usage=usage,
            raw=resp,
        )