from __future__ import annotations

from types import SimpleNamespace

from llmserve_env.models import ServeAction, WorkloadSnapshot, default_action
from server.serving_backend import (
    RealOpenAIBackend,
    SimulatedServingBackend,
    create_serving_backend,
)


def _workload() -> WorkloadSnapshot:
    """Build a fixed steady-state workload snapshot shared by the tests below."""
    return WorkloadSnapshot(
        arrival_rate=8.0,
        queue_depth=5,
        mean_prompt_length=128.0,
        prompt_length_bucket=1,
        priority_fraction=0.25,
        phase="steady",
    )


class _FakeChatCompletions:
    """Stub of the OpenAI chat.completions endpoint returning fixed token usage."""

    def create(self, **kwargs):
        del kwargs  # request parameters are irrelevant to the fake
        return SimpleNamespace(
            usage=SimpleNamespace(prompt_tokens=120, completion_tokens=40, total_tokens=160),
        )


class _FakeClient:
    """Minimal stand-in for an OpenAI client, exposing only chat.completions."""

    def __init__(self):
        self.chat = SimpleNamespace(completions=_FakeChatCompletions())


def test_create_serving_backend_default_is_sim() -> None:
    backend = create_serving_backend(mode="sim", seed=42)
    assert isinstance(backend, SimulatedServingBackend)


def test_real_openai_backend_produces_metrics_without_network() -> None:
    # The fake client keeps the test offline while exercising the real backend path.
    backend = RealOpenAIBackend(
        seed=1, client=_FakeClient(), model="gpt-4.1-mini", max_requests_per_step=2
    )
    metrics = backend.run_step("static_workload", default_action(), _workload())
    assert metrics.requests_served == 2
    assert metrics.throughput_tps >= 1.0
    assert metrics.estimated_cost_per_1k > 0.0
    assert metrics.p50_ttft_ms > 0.0
    assert metrics.p50_itl_ms > 0.0


def test_real_backend_respects_truncation_via_kv_budget() -> None:
    backend = RealOpenAIBackend(
        seed=1, client=_FakeClient(), model="gpt-4.1-mini", max_requests_per_step=1
    )
    # A tight KV budget should force the backend to truncate/evict cached context.
    action = ServeAction(
        batch_cap=1, kv_budget_fraction=0.1, speculation_depth=0, quantization_tier="FP16"
    )
    metrics = backend.run_step("static_workload", action, _workload())
    assert metrics.eviction_events >= 1