| """Tests for the grader, baseline agent, and score calibration.""" |
| from __future__ import annotations |
|
|
| from llmserve_env.models import default_action |
| from server.grader import GraderEngine |
| from server.llmserve_environment import LLMServeEnvironment |
|
|
|
|
| def _run_episode(task_id: str, seed: int = 42) -> LLMServeEnvironment: |
| env = LLMServeEnvironment(seed=seed) |
| env.reset(task_id=task_id, seed=seed) |
| while not env.state.done: |
| env.step(default_action()) |
| return env |
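

# Convenience sketch: run a baseline episode and grade its log in one call.
# Assumes, as the tests below already do, that GraderEngine().grade() accepts the
# EpisodeLog returned by export_episode_log(); the helper name itself is new here.
def _grade_baseline(task_id: str, seed: int = 42) -> dict:
    env = _run_episode(task_id, seed=seed)
    return GraderEngine().grade(env.export_episode_log())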
|
|
|
|
class TestGrader:
    def test_score_in_valid_range(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id)
            result = grader.grade(env.export_episode_log())
            assert 0.0 <= result["score"] <= 1.0, f"Score out of range for {task_id}: {result['score']}"
|
|
    def test_score_has_breakdown(self):
        grader = GraderEngine()
        env = _run_episode("static_workload")
        result = grader.grade(env.export_episode_log())
        assert "breakdown" in result
        breakdown = result["breakdown"]
        assert "throughput" in breakdown
        assert "slo" in breakdown
        assert "memory" in breakdown
        assert "cost" in breakdown
|
|
    def test_empty_log_returns_zero(self):
        from llmserve_env.models import EpisodeLog, ServeState

        grader = GraderEngine()
        empty_log = EpisodeLog(
            task_id="static_workload",
            actions=[],
            observations=[],
            rewards=[],
            final_state=ServeState(
                episode_id="test",
                step_count=0,
                task_id="static_workload",
                total_requests_served=0,
                total_slo_violations=0,
                cumulative_reward=0.0,
                elapsed_simulated_time_s=0.0,
                workload_phase="warmup",
                done=True,
            ),
        )
        result = grader.grade(empty_log)
        assert result["score"] == 0.0
|
|
    def test_grader_is_deterministic(self):
        grader = GraderEngine()
        env = _run_episode("static_workload", seed=0)
        log = env.export_episode_log()
        score_1 = grader.grade(log)["score"]
        score_2 = grader.grade(log)["score"]
        assert score_1 == score_2
|
|
|
|
class TestBaseline:
    def test_baseline_scores_all_tasks(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id, seed=0)
            result = grader.grade(env.export_episode_log())
            assert 0.0 < result["score"] <= 1.0, f"Baseline score out of (0, 1] for {task_id}: {result['score']}"
|
|
    def test_baseline_deterministic_across_runs(self):
        grader = GraderEngine()
        scores = []
        for _ in range(3):
            env = _run_episode("static_workload", seed=0)
            result = grader.grade(env.export_episode_log())
            scores.append(result["score"])
        assert all(s == scores[0] for s in scores), f"Baseline scores not deterministic: {scores}"
|
|
|
|
class TestScoreOrdering:
    def test_breakdown_components_bounded(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id)
            result = grader.grade(env.export_episode_log())
            for key, val in result["breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{key} out of [0,1] for {task_id}: {val}"
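
    # Ordering sanity sketch: an empty log grades to 0.0 and a baseline episode grades
    # above 0.0 (per the tests above), so a full baseline run should never score below a
    # zero-activity log. The EpisodeLog / ServeState fields mirror test_empty_log_returns_zero;
    # _grade_baseline is the helper sketched near the top of this module.
    def test_baseline_not_below_empty_log(self):
        from llmserve_env.models import EpisodeLog, ServeState

        grader = GraderEngine()
        empty_log = EpisodeLog(
            task_id="static_workload",
            actions=[],
            observations=[],
            rewards=[],
            final_state=ServeState(
                episode_id="test",
                step_count=0,
                task_id="static_workload",
                total_requests_served=0,
                total_slo_violations=0,
                cumulative_reward=0.0,
                elapsed_simulated_time_s=0.0,
                workload_phase="warmup",
                done=True,
            ),
        )
        empty_score = grader.grade(empty_log)["score"]
        baseline_score = _grade_baseline("static_workload", seed=0)["score"]
        assert empty_score <= baseline_score, (
            f"Empty log ({empty_score}) outscored baseline ({baseline_score})"
        )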
|
|