"""vLLM-backed client (OpenAI-compatible API).

Targets a local vLLM server running:
    vllm serve Qwen/Qwen3-Coder-Next-FP8 \\
        --tool-call-parser qwen3_coder \\
        --max-model-len 262144 \\
        --kv-cache-dtype fp8

The server returns tool-calls in the OpenAI function-calling format, which
we translate to our internal ToolCall dataclass.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any, Dict, List

from .base import LLMClient, LLMResponse, ToolCall


@dataclass
class VLLMClient(LLMClient):
    base_url: str = "http://localhost:8000/v1"
    model: str = "Qwen/Qwen3-Coder-Next-FP8"
    api_key: str = "EMPTY"          # vLLM ignores the key but the SDK requires one
    timeout: float = 300.0
    temperature: float = 0.2
    max_tokens: int = 2048

    def __post_init__(self) -> None:
        # Import lazily so the module can be imported without the SDK installed.
        try:
            from openai import OpenAI  # type: ignore
        except ImportError as e:
            raise ImportError(
                "VLLMClient requires the 'openai' package: pip install openai"
            ) from e
        self._client = OpenAI(base_url=self.base_url, api_key=self.api_key, timeout=self.timeout)

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse:
        """Send one chat completion request and normalize the response."""
        kw: Dict[str, Any] = {
            "model": self.model,
            "messages": messages,
            "temperature": kwargs.get("temperature", self.temperature),
            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
        }
        if tools:
            kw["tools"] = tools
            kw["tool_choice"] = kwargs.get("tool_choice", "auto")

        resp = self._client.chat.completions.create(**kw)
        choice = resp.choices[0].message
        content = choice.content or ""
        tool_calls: List[ToolCall] = []
        for tc in (choice.tool_calls or []):
            try:
                args = json.loads(tc.function.arguments or "{}")
            except json.JSONDecodeError:
                args = {}
            tool_calls.append(ToolCall(id=tc.id, name=tc.function.name, arguments=args))

        usage = {}
        if getattr(resp, "usage", None):
            usage = {
                "prompt": resp.usage.prompt_tokens,
                "completion": resp.usage.completion_tokens,
            }
        return LLMResponse(content=content, tool_calls=tool_calls, usage=usage, raw=resp)
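
The tool-call translation in `complete()` can be exercised without a running server. A minimal sketch, using a local stand-in for the internal `ToolCall` dataclass (the real one lives in `.base`; its assumed shape is `id`, `name`, `arguments`) and a hypothetical `parse_tool_call` helper that mirrors the defensive JSON parse above, where malformed arguments degrade to `{}`:

```python
import json
from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class ToolCall:
    # Stand-in for .base.ToolCall; field names assumed from complete() above.
    id: str
    name: str
    arguments: Dict[str, Any] = field(default_factory=dict)


def parse_tool_call(tc_id: str, name: str, raw_args: Optional[str]) -> ToolCall:
    """Mirror complete()'s fallback: malformed or missing JSON becomes {}."""
    try:
        args = json.loads(raw_args or "{}")
    except json.JSONDecodeError:
        args = {}
    return ToolCall(id=tc_id, name=name, arguments=args)


good = parse_tool_call("call_1", "read_file", '{"path": "a.py"}')
bad = parse_tool_call("call_2", "read_file", '{"path": ')  # truncated JSON
print(good.arguments, bad.arguments)
```

The silent `{}` fallback keeps one garbled tool call from aborting the whole turn; the caller can still surface the failure via the tool's own validation.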