| """vLLM-backed client (OpenAI-compatible API). |
| |
| Targets a local vLLM server running: |
| vllm serve Qwen/Qwen3-Coder-Next-FP8 \\ |
| --tool-call-parser qwen3_coder \\ |
| --max-model-len 262144 \\ |
| --kv-cache-dtype fp8 |
| |
| The server returns tool-calls in the OpenAI function-calling format, which |
| we translate to our internal ToolCall dataclass. |
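
Example (a minimal sketch; it assumes the server above is reachable on
localhost:8000 and that the caller supplies its own tool schemas):

    client = VLLMClient()
    resp = client.complete(
        messages=[{"role": "user", "content": "List the repo's entry points."}],
        tools=[...],  # OpenAI-format function schemas; see complete()
    )
    for call in resp.tool_calls:
        print(call.name, call.arguments)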
| """ |
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any, Dict, List

from .base import LLMClient, LLMResponse, ToolCall


@dataclass
class VLLMClient(LLMClient):
    base_url: str = "http://localhost:8000/v1"
    model: str = "Qwen/Qwen3-Coder-Next-FP8"
    api_key: str = "EMPTY"
    timeout: float = 300.0
    temperature: float = 0.2
    max_tokens: int = 2048

    def __post_init__(self) -> None:
        # Imported lazily so the module loads without the optional dependency.
        try:
            from openai import OpenAI
        except ImportError as e:
            raise ImportError(
                "VLLMClient requires the 'openai' package: pip install openai"
            ) from e
        self._client = OpenAI(
            base_url=self.base_url, api_key=self.api_key, timeout=self.timeout
        )

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse:
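        """Send one chat turn and return the normalized response.

        ``tools`` must be OpenAI function-calling schemas. Illustrative
        shape only (``read_file`` is not part of this module):

            {"type": "function",
             "function": {"name": "read_file",
                          "description": "Read a workspace file.",
                          "parameters": {"type": "object",
                                         "properties": {"path": {"type": "string"}},
                                         "required": ["path"]}}}

        To continue a tool-using exchange, append each result to ``messages``
        as {"role": "tool", "tool_call_id": ..., "content": ...} and call
        complete() again.
        """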
        # Per-call kwargs override the client-level defaults.
        kw = {
            "model": self.model,
            "messages": messages,
            "temperature": kwargs.get("temperature", self.temperature),
            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
        }
        if tools:
            kw["tools"] = tools
            kw["tool_choice"] = kwargs.get("tool_choice", "auto")

        resp = self._client.chat.completions.create(**kw)
        msg = resp.choices[0].message
        content = msg.content or ""

        # Translate OpenAI-format tool calls into our internal dataclass.
        # Malformed JSON arguments degrade to an empty dict instead of raising.
        tool_calls: List[ToolCall] = []
        for tc in (msg.tool_calls or []):
            try:
                args = json.loads(tc.function.arguments or "{}")
            except json.JSONDecodeError:
                args = {}
            tool_calls.append(ToolCall(id=tc.id, name=tc.function.name, arguments=args))

        # Token usage may be missing from some responses; guard before reading.
        usage = {}
        if getattr(resp, "usage", None):
            usage = {
                "prompt": resp.usage.prompt_tokens,
                "completion": resp.usage.completion_tokens,
            }
        return LLMResponse(content=content, tool_calls=tool_calls, usage=usage, raw=resp)
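

# A minimal smoke test, kept as a sketch: it assumes a vLLM server is already
# running as described in the module docstring, and the "echo" tool below is a
# throwaway definition used purely for illustration.
if __name__ == "__main__":
    demo_tools = [
        {
            "type": "function",
            "function": {
                "name": "echo",
                "description": "Echo the given text back verbatim.",
                "parameters": {
                    "type": "object",
                    "properties": {"text": {"type": "string"}},
                    "required": ["text"],
                },
            },
        }
    ]
    client = VLLMClient()
    resp = client.complete(
        messages=[{"role": "user", "content": "Call echo with text='hi'."}],
        tools=demo_tools,
    )
    print("content:", resp.content)
    for call in resp.tool_calls:
        print("tool call:", call.name, call.arguments)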