| """vLLM-backed client (OpenAI-compatible API). |
| |
| Targets a local vLLM server running: |
| vllm serve Qwen/Qwen3-Coder-Next-FP8 \\ |
| --tool-call-parser qwen3_coder \\ |
| --max-model-len 262144 \\ |
| --kv-cache-dtype fp8 |
| |
| The server returns tool-calls in the OpenAI function-calling format, which |
| we translate to our internal ToolCall dataclass. |
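
Example (a minimal sketch; it assumes the server above is reachable on
localhost:8000 and that the caller supplies its own tool schemas):

    client = VLLMClient()
    resp = client.complete(
        messages=[{"role": "user", "content": "List the repo's entry points."}],
        tools=[...],  # OpenAI-format function schemas; see complete()
    )
    for call in resp.tool_calls:
        print(call.name, call.arguments)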
| """ |
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any, Dict, List

from .base import LLMClient, LLMResponse, ToolCall


@dataclass
class VLLMClient(LLMClient):
    base_url: str = "http://localhost:8000/v1"
    model: str = "Qwen/Qwen3-Coder-Next-FP8"
    api_key: str = "EMPTY"
    timeout: float = 300.0
    temperature: float = 0.2
    max_tokens: int = 2048

    def __post_init__(self) -> None:
        # Imported lazily so the module loads without the optional dependency.
        try:
            from openai import OpenAI
        except ImportError as e:
            raise ImportError(
                "VLLMClient requires the 'openai' package: pip install openai"
            ) from e
        self._client = OpenAI(
            base_url=self.base_url, api_key=self.api_key, timeout=self.timeout
        )

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse:
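        """Send one chat turn and return the normalized response.

        ``tools`` must be OpenAI function-calling schemas. Illustrative
        shape only (``read_file`` is not part of this module):

            {"type": "function",
             "function": {"name": "read_file",
                          "description": "Read a workspace file.",
                          "parameters": {"type": "object",
                                         "properties": {"path": {"type": "string"}},
                                         "required": ["path"]}}}

        To continue a tool-using exchange, append each result to ``messages``
        as {"role": "tool", "tool_call_id": ..., "content": ...} and call
        complete() again.
        """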
        # Per-call kwargs override the client-level defaults.
        kw = {
            "model": self.model,
            "messages": messages,
            "temperature": kwargs.get("temperature", self.temperature),
            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
        }
        if tools:
            kw["tools"] = tools
            kw["tool_choice"] = kwargs.get("tool_choice", "auto")

        resp = self._client.chat.completions.create(**kw)
        msg = resp.choices[0].message
        content = msg.content or ""

        # Translate OpenAI-format tool calls into our internal dataclass.
        # Malformed JSON arguments degrade to an empty dict instead of raising.
        tool_calls: List[ToolCall] = []
        for tc in (msg.tool_calls or []):
            try:
                args = json.loads(tc.function.arguments or "{}")
            except json.JSONDecodeError:
                args = {}
            tool_calls.append(ToolCall(id=tc.id, name=tc.function.name, arguments=args))

        # Token usage may be missing from some responses; guard before reading.
        usage = {}
        if getattr(resp, "usage", None):
            usage = {
                "prompt": resp.usage.prompt_tokens,
                "completion": resp.usage.completion_tokens,
            }
        return LLMResponse(content=content, tool_calls=tool_calls, usage=usage, raw=resp)
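

# A minimal smoke test, kept as a sketch: it assumes a vLLM server is already
# running as described in the module docstring, and the "echo" tool below is a
# throwaway definition used purely for illustration.
if __name__ == "__main__":
    demo_tools = [
        {
            "type": "function",
            "function": {
                "name": "echo",
                "description": "Echo the given text back verbatim.",
                "parameters": {
                    "type": "object",
                    "properties": {"text": {"type": "string"}},
                    "required": ["text"],
                },
            },
        }
    ]
    client = VLLMClient()
    resp = client.complete(
        messages=[{"role": "user", "content": "Call echo with text='hi'."}],
        tools=demo_tools,
    )
    print("content:", resp.content)
    for call in resp.tool_calls:
        print("tool call:", call.name, call.arguments)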