# vegarl/tests/test_serving_backend.py
from __future__ import annotations
from types import SimpleNamespace
from llmserve_env.models import ServeAction, WorkloadSnapshot, default_action
from server.serving_backend import RealOpenAIBackend, SimulatedServingBackend, create_serving_backend
def _workload() -> WorkloadSnapshot:
    """Build the fixed steady-phase workload snapshot shared by these tests."""
    snapshot_fields = {
        "arrival_rate": 8.0,
        "queue_depth": 5,
        "mean_prompt_length": 128.0,
        "prompt_length_bucket": 1,
        "priority_fraction": 0.25,
        "phase": "steady",
    }
    return WorkloadSnapshot(**snapshot_fields)
class _FakeChatCompletions:
def create(self, **kwargs):
del kwargs
return SimpleNamespace(
usage=SimpleNamespace(prompt_tokens=120, completion_tokens=40, total_tokens=160),
)
class _FakeClient:
    """Minimal OpenAI-client stand-in exposing only ``chat.completions``."""

    def __init__(self):
        fake_completions = _FakeChatCompletions()
        self.chat = SimpleNamespace(completions=fake_completions)
def test_create_serving_backend_default_is_sim() -> None:
    """The factory should hand back the simulated backend for ``mode="sim"``."""
    created = create_serving_backend(mode="sim", seed=42)
    assert isinstance(created, SimulatedServingBackend)
def test_real_openai_backend_produces_metrics_without_network() -> None:
    """A stubbed client is enough for the real backend to emit a full metrics step."""
    backend = RealOpenAIBackend(
        seed=1,
        client=_FakeClient(),
        model="gpt-4.1-mini",
        max_requests_per_step=2,
    )
    step_metrics = backend.run_step("static_workload", default_action(), _workload())
    # Both requests should complete, with positive latency/cost aggregates.
    assert step_metrics.requests_served == 2
    assert step_metrics.throughput_tps >= 1.0
    assert step_metrics.estimated_cost_per_1k > 0.0
    assert step_metrics.p50_ttft_ms > 0.0
    assert step_metrics.p50_itl_ms > 0.0
def test_real_backend_respects_truncation_via_kv_budget() -> None:
    """A tiny KV-budget fraction should force at least one eviction event."""
    backend = RealOpenAIBackend(
        seed=1,
        client=_FakeClient(),
        model="gpt-4.1-mini",
        max_requests_per_step=1,
    )
    tight_action = ServeAction(
        batch_cap=1,
        kv_budget_fraction=0.1,
        speculation_depth=0,
        quantization_tier="FP16",
    )
    step_metrics = backend.run_step("static_workload", tight_action, _workload())
    assert step_metrics.eviction_events >= 1