"""Tests for the grader, baseline agent, and score calibration."""
from __future__ import annotations

from llmserve_env.models import default_action
from server.grader import GraderEngine
from server.llmserve_environment import LLMServeEnvironment

def _run_episode(task_id: str, seed: int = 42) -> LLMServeEnvironment:
    """Run an episode to completion, taking the default action at every step."""
    env = LLMServeEnvironment(seed=seed)
    env.reset(task_id=task_id, seed=seed)
    while not env.state.done:
        env.step(default_action())
    return env


# ─── Grader ──────────────────────────────────────────────────────
class TestGrader:
    """Grader output is bounded, includes a breakdown, handles empty logs, and is deterministic."""

    def test_score_in_valid_range(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id)
            result = grader.grade(env.export_episode_log())
            assert 0.0 <= result["score"] <= 1.0, f"Score out of range for {task_id}: {result['score']}"

    def test_score_has_breakdown(self):
        grader = GraderEngine()
        env = _run_episode("static_workload")
        result = grader.grade(env.export_episode_log())
        assert "breakdown" in result
        breakdown = result["breakdown"]
        assert "throughput" in breakdown
        assert "slo" in breakdown
        assert "memory" in breakdown
        assert "cost" in breakdown

    def test_empty_log_returns_zero(self):
        from llmserve_env.models import EpisodeLog, ServeState

        grader = GraderEngine()
        empty_log = EpisodeLog(
            task_id="static_workload",
            actions=[],
            observations=[],
            rewards=[],
            final_state=ServeState(
                episode_id="test",
                step_count=0,
                task_id="static_workload",
                total_requests_served=0,
                total_slo_violations=0,
                cumulative_reward=0.0,
                elapsed_simulated_time_s=0.0,
                workload_phase="warmup",
                done=True,
            ),
        )
        result = grader.grade(empty_log)
        assert result["score"] == 0.0

    def test_grader_is_deterministic(self):
        grader = GraderEngine()
        env = _run_episode("static_workload", seed=0)
        log = env.export_episode_log()
        score_1 = grader.grade(log)["score"]
        score_2 = grader.grade(log)["score"]
        assert score_1 == score_2


# ─── Baseline ────────────────────────────────────────────────────
class TestBaseline:
    """The default-action baseline earns a nonzero score and is reproducible across runs."""

    def test_baseline_scores_all_tasks(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id, seed=0)
            result = grader.grade(env.export_episode_log())
            assert 0.0 < result["score"] <= 1.0, f"Baseline score too low for {task_id}: {result['score']}"

    def test_baseline_deterministic_across_runs(self):
        grader = GraderEngine()
        scores = []
        for _ in range(3):
            env = _run_episode("static_workload", seed=0)
            result = grader.grade(env.export_episode_log())
            scores.append(result["score"])
        assert all(s == scores[0] for s in scores), f"Baseline scores not deterministic: {scores}"


# ─── Score Ordering ──────────────────────────────────────────────
class TestScoreOrdering:
    """Each component of the score breakdown stays within [0, 1]."""

    def test_breakdown_components_bounded(self):
        grader = GraderEngine()
        for task_id in ["static_workload", "bursty_workload", "adversarial_multitenant"]:
            env = _run_episode(task_id)
            result = grader.grade(env.export_episode_log())
            for key, val in result["breakdown"].items():
                assert 0.0 <= val <= 1.0, f"{key} out of [0,1] for {task_id}: {val}"