Spaces:

ronitraj
/

vegarl

Running

File size: 6,600 Bytes

4fbc241

"""Comprehensive tests for the trace simulator and sub-components."""
from __future__ import annotations

from llmserve_env.models import QuantizationTier, ServeAction, WorkloadSnapshot
from server.kv_cache_simulator import KVCacheSimulator
from server.speculative_decoder import SpeculativeDecoder
from server.trace_simulator import TraceSimulator


def _make_action(**overrides) -> ServeAction:
    """Build a ``ServeAction`` from baseline settings plus keyword overrides.

    The baseline is ``batch_cap=32``, ``kv_budget_fraction=1.0``,
    ``speculation_depth=0``, FP16 quantization, and both
    ``prefill_decode_split`` and ``priority_routing`` disabled. Any keyword
    passed in ``overrides`` replaces the baseline value of the same name;
    keywords not in the baseline are forwarded to ``ServeAction`` as-is.
    """
    # Later entries win in a dict display, so overrides take precedence.
    settings = {
        "batch_cap": 32,
        "kv_budget_fraction": 1.0,
        "speculation_depth": 0,
        "quantization_tier": QuantizationTier.FP16,
        "prefill_decode_split": False,
        "priority_routing": False,
        **overrides,
    }
    return ServeAction(**settings)


def _make_workload(**overrides) -> WorkloadSnapshot:
    """Build a ``WorkloadSnapshot`` from baseline values plus keyword overrides.

    The baseline is ``arrival_rate=10.0``, ``queue_depth=20``,
    ``mean_prompt_length=128.0``, ``prompt_length_bucket=1``,
    ``priority_fraction=0.0`` and ``phase="steady"``. Any keyword passed in
    ``overrides`` replaces the baseline value of the same name; keywords not
    in the baseline are forwarded to ``WorkloadSnapshot`` as-is.
    """
    # Later entries win in a dict display, so overrides take precedence.
    snapshot_fields = {
        "arrival_rate": 10.0,
        "queue_depth": 20,
        "mean_prompt_length": 128.0,
        "prompt_length_bucket": 1,
        "priority_fraction": 0.0,
        "phase": "steady",
        **overrides,
    }
    return WorkloadSnapshot(**snapshot_fields)


# ─── TraceSimulator ───────────────────────────────────────────────

class TestTraceSimulatorSmoke:
    """Smoke checks: a simulation step on valid input yields sane metrics."""

    def test_returns_metrics_snapshot(self):
        """A default step on the static task reports positive headline metrics."""
        simulator = TraceSimulator()
        action, workload = _make_action(), _make_workload()
        result = simulator.simulate_step("static_workload", action, workload)

        assert result.throughput_tps > 0
        assert result.p50_ttft_ms > 0
        # The tail latency can never be below the median.
        assert result.p99_ttft_ms >= result.p50_ttft_ms
        assert result.gpu_memory_used_gb > 0
        assert result.estimated_cost_per_1k > 0

    def test_all_tasks_produce_metrics(self):
        """Each known task id yields a throughput of at least 1.0."""
        simulator = TraceSimulator()
        task_ids = ("static_workload", "bursty_workload", "adversarial_multitenant")
        for task in task_ids:
            result = simulator.simulate_step(task, _make_action(), _make_workload())
            assert result.throughput_tps >= 1.0

    def test_varied_actions_no_crash(self):
        """Sweep batch cap, KV budget and speculation depth on one simulator."""
        simulator = TraceSimulator()
        # Flattened sweep; ordering matches batch -> kv -> speculation nesting.
        sweep = [
            (batch_cap, kv_fraction, depth)
            for batch_cap in (1, 8, 64, 256, 512)
            for kv_fraction in (0.1, 0.5, 1.0)
            for depth in (0, 2, 8)
        ]
        for batch_cap, kv_fraction, depth in sweep:
            action = _make_action(
                batch_cap=batch_cap,
                kv_budget_fraction=kv_fraction,
                speculation_depth=depth,
            )
            result = simulator.simulate_step("static_workload", action, _make_workload())
            assert result.throughput_tps >= 1.0
            assert result.requests_served >= 0


class TestTraceSimulatorMonotonicity:
    """Higher batch_cap should generally increase throughput."""

    def test_throughput_increases_with_batch(self):
        """Throughput must not drop as ``batch_cap`` grows under a deep queue.

        One simulator instance is stepped once per batch cap, in ascending
        order, against the same workload (queue_depth=200, arrival_rate=50.0).
        """
        sim = TraceSimulator()
        workload = _make_workload(queue_depth=200, arrival_rate=50.0)
        throughputs = [
            sim.simulate_step(
                "static_workload", _make_action(batch_cap=batch), workload
            ).throughput_tps
            for batch in (4, 32, 128, 512)
        ]
        # Throughput should be non-decreasing (allow ties): compare each
        # measurement with its successor.
        for current, following in zip(throughputs, throughputs[1:]):
            assert current <= following, f"Throughput decreased: {throughputs}"


class TestTraceSimulatorOOM:
    """Memory-pressure scenario: large batch cap with the full KV budget."""

    def test_high_load_caps_memory(self):
        """``gpu_memory_used_gb`` stays at or below 38.0 under heavy load."""
        simulator = TraceSimulator()
        heavy_action = _make_action(batch_cap=512, kv_budget_fraction=1.0)
        heavy_workload = _make_workload(
            queue_depth=500,
            arrival_rate=200.0,
            mean_prompt_length=4096.0,
        )
        result = simulator.simulate_step(
            "adversarial_multitenant", heavy_action, heavy_workload
        )
        oom_cap = 38.0  # OOM cap
        assert result.gpu_memory_used_gb <= oom_cap


class TestTraceSimulatorQuantization:
    """Quantized tiers (INT8/INT4) must not be costlier or slower than FP16."""

    @staticmethod
    def _step_with_tier(simulator, tier, workload):
        """Run one static-workload step using the given quantization tier."""
        action = _make_action(quantization_tier=tier)
        return simulator.simulate_step("static_workload", action, workload)

    def test_int8_cheaper_than_fp16(self):
        """INT8 cost per 1k is no higher than FP16 on the same simulator."""
        simulator = TraceSimulator()
        workload = _make_workload()
        # FP16 is stepped first, then INT8, on the same simulator instance.
        baseline = self._step_with_tier(simulator, QuantizationTier.FP16, workload)
        quantized = self._step_with_tier(simulator, QuantizationTier.INT8, workload)
        assert quantized.estimated_cost_per_1k <= baseline.estimated_cost_per_1k

    def test_int4_faster_than_fp16(self):
        """INT4 throughput is no lower than FP16 on the same simulator."""
        simulator = TraceSimulator()
        workload = _make_workload()
        # FP16 is stepped first, then INT4, on the same simulator instance.
        baseline = self._step_with_tier(simulator, QuantizationTier.FP16, workload)
        quantized = self._step_with_tier(simulator, QuantizationTier.INT4, workload)
        assert quantized.throughput_tps >= baseline.throughput_tps


# ─── KVCacheSimulator ─────────────────────────────────────────────

class TestKVCacheSimulator:
    """Checks on the (occupancy, evictions) pair returned by ``apply``."""

    def test_low_load_no_evictions(self):
        """A short queue with the full budget evicts nothing."""
        cache = KVCacheSimulator()
        occupancy, evictions = cache.apply(
            queue_depth=5,
            mean_prompt_length=64.0,
            kv_budget_fraction=1.0,
        )
        assert evictions == 0
        assert 0.0 <= occupancy <= 1.0

    def test_high_load_causes_evictions(self):
        """A long queue of long prompts on a 10% budget evicts and reports 1.0."""
        cache = KVCacheSimulator()
        occupancy, evictions = cache.apply(
            queue_depth=500,
            mean_prompt_length=4096.0,
            kv_budget_fraction=0.1,
        )
        assert evictions > 0
        assert occupancy == 1.0

    def test_full_budget_less_evictions(self):
        """The full budget evicts no more than a 10% budget for the same load."""
        cache = KVCacheSimulator()
        load = {"queue_depth": 100, "mean_prompt_length": 512.0}
        # Same instance for both calls; the tight budget is applied first.
        _, tight_budget_evictions = cache.apply(**load, kv_budget_fraction=0.1)
        _, full_budget_evictions = cache.apply(**load, kv_budget_fraction=1.0)
        assert full_budget_evictions <= tight_budget_evictions


# ─── SpeculativeDecoder ───────────────────────────────────────────

class TestSpeculativeDecoder:
    """Checks on the (acceptance, itl) pair returned by ``estimate``."""

    def test_no_speculation(self):
        """Depth 0 yields zero acceptance and an ITL value of exactly 1.0."""
        decoder = SpeculativeDecoder()
        acceptance_rate, itl_factor = decoder.estimate("static_workload", 0, 128.0)
        assert acceptance_rate == 0.0
        assert itl_factor == 1.0

    def test_static_has_high_acceptance(self):
        """Static workload at depth 4 clears an acceptance of 0.4."""
        decoder = SpeculativeDecoder()
        acceptance_rate, _ = decoder.estimate("static_workload", 4, 128.0)
        # depth=4 yields ~0.49 with depth decay
        assert acceptance_rate > 0.4

    def test_adversarial_has_low_acceptance(self):
        """Adversarial task with 4096.0 as third argument stays below 0.5."""
        decoder = SpeculativeDecoder()
        acceptance_rate, _ = decoder.estimate("adversarial_multitenant", 4, 4096.0)
        assert acceptance_rate < 0.5

    def test_itl_speedup_bounded(self):
        """At depth 8 the ITL value lies within [0.5, 1.0]."""
        decoder = SpeculativeDecoder()
        _, itl_factor = decoder.estimate("static_workload", 8, 128.0)
        assert 0.5 <= itl_factor <= 1.0