Spaces:

ronitraj
/

vegarl

Running

App Files Files Community

vegarl / tests /test_serving_backend.py

ronitraj

Deploy Space without oversized raw dataset

4fbc241 29 days ago

raw

history blame contribute delete

1.87 kB

	from __future__ import annotations

	from types import SimpleNamespace

	from llmserve_env.models import ServeAction, WorkloadSnapshot, default_action
	from server.serving_backend import RealOpenAIBackend, SimulatedServingBackend, create_serving_backend


	def _workload() -> WorkloadSnapshot:
	    """Return the fixed steady-phase workload snapshot shared by the backend tests."""
	    snapshot_fields = {
	        "arrival_rate": 8.0,
	        "queue_depth": 5,
	        "mean_prompt_length": 128.0,
	        "prompt_length_bucket": 1,
	        "priority_fraction": 0.25,
	        "phase": "steady",
	    }
	    return WorkloadSnapshot(**snapshot_fields)


	class _FakeChatCompletions:
	    """Stand-in for a chat-completions endpoint that answers without any network call."""

	    def create(self, **kwargs):
	        """Ignore every keyword argument and return a canned response.

	        The returned object exposes only a ``usage`` attribute carrying
	        fixed prompt, completion and total token counts.
	        """
	        del kwargs  # request parameters are deliberately unused
	        token_usage = SimpleNamespace(prompt_tokens=120, completion_tokens=40, total_tokens=160)
	        return SimpleNamespace(usage=token_usage)


	class _FakeClient:
	    """Client double whose ``chat.completions`` attribute is a ``_FakeChatCompletions``."""

	    def __init__(self):
	        completions_stub = _FakeChatCompletions()
	        self.chat = SimpleNamespace(completions=completions_stub)


	def test_create_serving_backend_default_is_sim() -> None:
	    """Asking the factory for the "sim" mode yields a SimulatedServingBackend."""
	    # NOTE(review): the test name says "default", yet the mode is passed
	    # explicitly here — confirm whether a call without ``mode`` should be covered too.
	    sim_backend = create_serving_backend(seed=42, mode="sim")
	    assert isinstance(sim_backend, SimulatedServingBackend)


	def test_real_openai_backend_produces_metrics_without_network() -> None:
	    """Run one step of RealOpenAIBackend against the fake client and sanity-check the metrics."""
	    fake_client = _FakeClient()
	    openai_backend = RealOpenAIBackend(
	        seed=1,
	        client=fake_client,
	        model="gpt-4.1-mini",
	        max_requests_per_step=2,
	    )
	    step_metrics = openai_backend.run_step("static_workload", default_action(), _workload())

	    # The backend was built with max_requests_per_step=2, and the test expects both to be served.
	    assert step_metrics.requests_served == 2
	    assert step_metrics.throughput_tps >= 1.0
	    assert step_metrics.estimated_cost_per_1k > 0.0
	    assert step_metrics.p50_ttft_ms > 0.0
	    assert step_metrics.p50_itl_ms > 0.0


	def test_real_backend_respects_truncation_via_kv_budget() -> None:
	    """With kv_budget_fraction=0.1, one step must report at least one eviction event."""
	    openai_backend = RealOpenAIBackend(
	        seed=1,
	        client=_FakeClient(),
	        model="gpt-4.1-mini",
	        max_requests_per_step=1,
	    )
	    tight_budget_action = ServeAction(
	        batch_cap=1,
	        kv_budget_fraction=0.1,
	        speculation_depth=0,
	        quantization_tier="FP16",
	    )
	    step_metrics = openai_backend.run_step("static_workload", tight_budget_action, _workload())
	    assert step_metrics.eviction_events >= 1