File size: 1,868 Bytes
4fbc241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import annotations

from types import SimpleNamespace

from llmserve_env.models import ServeAction, WorkloadSnapshot, default_action
from server.serving_backend import RealOpenAIBackend, SimulatedServingBackend, create_serving_backend


def _workload() -> WorkloadSnapshot:
    """Build the fixed steady-phase workload snapshot shared by every test below."""
    snapshot = WorkloadSnapshot(
        arrival_rate=8.0,
        queue_depth=5,
        mean_prompt_length=128.0,
        prompt_length_bucket=1,
        priority_fraction=0.25,
        phase="steady",
    )
    return snapshot


class _FakeChatCompletions:
    def create(self, **kwargs):
        del kwargs
        return SimpleNamespace(
            usage=SimpleNamespace(prompt_tokens=120, completion_tokens=40, total_tokens=160),
        )


class _FakeClient:
    """Minimal OpenAI-client double exposing only ``client.chat.completions.create``."""

    def __init__(self):
        completions = _FakeChatCompletions()
        self.chat = SimpleNamespace(completions=completions)


def test_create_serving_backend_default_is_sim() -> None:
    """Factory invoked with mode="sim" yields the simulated backend implementation."""
    # NOTE(review): the test name says "default" but mode="sim" is passed explicitly —
    # confirm the factory's default mode is actually "sim" (or call without mode).
    built = create_serving_backend(mode="sim", seed=42)
    assert isinstance(built, SimulatedServingBackend)


def test_real_openai_backend_produces_metrics_without_network() -> None:
    """A RealOpenAIBackend wired to a fake client still emits plausible step metrics."""
    backend = RealOpenAIBackend(
        seed=1,
        client=_FakeClient(),
        model="gpt-4.1-mini",
        max_requests_per_step=2,
    )
    result = backend.run_step("static_workload", default_action(), _workload())
    # Two requests allowed per step, and the fake client answers both.
    assert result.requests_served == 2
    assert result.throughput_tps >= 1.0
    assert result.estimated_cost_per_1k > 0.0
    assert result.p50_ttft_ms > 0.0
    assert result.p50_itl_ms > 0.0


def test_real_backend_respects_truncation_via_kv_budget() -> None:
    """A tiny KV-cache budget must trigger at least one eviction during the step."""
    tight_action = ServeAction(
        batch_cap=1,
        kv_budget_fraction=0.1,
        speculation_depth=0,
        quantization_tier="FP16",
    )
    backend = RealOpenAIBackend(
        seed=1,
        client=_FakeClient(),
        model="gpt-4.1-mini",
        max_requests_per_step=1,
    )
    observed = backend.run_step("static_workload", tight_action, _workload())
    assert observed.eviction_events >= 1