# vegarl / evaluate.py
# ronitraj — "Deploy Space without oversized raw dataset" (commit 4fbc241)
#!/usr/bin/env python3
"""Evaluate agents on InferenceGym tasks and print benchmark table.
Usage:
python evaluate.py --agent ppo --task all --episodes 20 --seed 42
python evaluate.py --agent heuristic --task static_workload --episodes 10
python evaluate.py --agent random --task all --episodes 10
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import numpy as np # noqa: E402
from server.llmserve_environment import LLMServeEnvironment # noqa: E402
# Canonical task identifiers accepted by the environment's reset(task_id=...).
TASK_IDS = ["static_workload", "bursty_workload", "adversarial_multitenant"]
# Agent implementations selectable via --agent (the CLI also accepts "all").
AGENT_TYPES = ["random", "heuristic", "ppo"]
# Directory holding pretrained PPO checkpoints named ppo_<label>.pt.
WEIGHTS_DIR = Path(__file__).resolve().parent / "weights"
def _get_agent(agent_type: str, task_id: str):
"""Return an agent object with a .act(obs, task_id) method."""
if agent_type == "heuristic":
from server.baseline_agent import HeuristicPolicy
return HeuristicPolicy()
if agent_type == "random":
import random as rnd
from agents.random_agent import random_action
rng = rnd.Random(42)
class _RandomAgent:
def reset(self): pass
def act(self, obs, tid): return random_action(rng)
return _RandomAgent()
if agent_type == "ppo":
from agents.ppo_agent import PPOAgent
label_map = {
"static_workload": "task1_static",
"bursty_workload": "task2_bursty",
"adversarial_multitenant": "task3_adversarial",
}
label = label_map.get(task_id, "task1_static")
weight_path = WEIGHTS_DIR / f"ppo_{label}.pt"
if not weight_path.exists():
print(f"[WARN] PPO weights not found at {weight_path}, falling back to heuristic")
from server.baseline_agent import HeuristicPolicy
return HeuristicPolicy()
return PPOAgent(str(weight_path))
raise ValueError(f"Unknown agent type: {agent_type}")
def run_episode(env: LLMServeEnvironment, agent, task_id: str, seed: int) -> float:
    """Play a single episode of ``task_id`` and return the summed reward.

    The agent's optional ``reset()`` hook is invoked first; the episode ends
    when the environment reports ``done`` or the task's step budget runs out.
    """
    if hasattr(agent, "reset"):
        agent.reset()
    observation = env.reset(seed=seed, task_id=task_id)
    cfg = env.task_config
    # Fall back to 60 steps when the environment exposes no task config.
    step_budget = int(cfg["max_steps"]) if cfg else 60
    cumulative = 0.0
    steps_taken = 0
    while steps_taken < step_budget:
        observation = env.step(agent.act(observation, task_id))
        # Treat a missing or None reward as zero.
        reward = getattr(observation, "reward", 0.0) or 0.0
        cumulative += float(reward)
        steps_taken += 1
        if getattr(observation, "done", False):
            break
    return cumulative
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: evaluate selected agents on selected tasks.

    Prints a benchmark table to stdout and optionally writes the results as
    JSON to ``--output``. Returns a process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(description="Evaluate agents on InferenceGym")
    parser.add_argument("--agent", default="ppo", choices=AGENT_TYPES + ["all"])
    # Validate --task up front so a typo fails fast with a usage message
    # instead of erroring deep inside the environment.
    parser.add_argument("--task", default="all", choices=TASK_IDS + ["all"])
    parser.add_argument("--episodes", type=int, default=20)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args(argv)

    tasks = TASK_IDS if args.task == "all" else [args.task]
    selected_agents = AGENT_TYPES if args.agent == "all" else [args.agent]
    env = LLMServeEnvironment(seed=args.seed, mode="sim")
    results: dict = {}

    print(f"\n{'Agent':<12} {'Task':<28} {'Mean Reward':>12} {'Std':>8} {'Episodes':>9}")
    print("-" * 72)
    for agent_type in selected_agents:
        agent_results = {}
        for task_id in tasks:
            agent = _get_agent(agent_type, task_id)
            # Vary the seed per episode: runs are independent yet reproducible.
            rewards = [
                run_episode(env, agent, task_id, args.seed + ep)
                for ep in range(args.episodes)
            ]
            mean_r = float(np.mean(rewards))
            std_r = float(np.std(rewards))
            agent_results[task_id] = {
                "mean_reward": round(mean_r, 4),
                "std_reward": round(std_r, 4),
                "episodes": args.episodes,
            }
            print(f"{agent_type:<12} {task_id:<28} {mean_r:>12.4f} {std_r:>8.4f} {args.episodes:>9d}")
        if args.agent == "all":
            # Nest per-agent results when comparing multiple agents; for a
            # single agent keep the flat per-task mapping (original shape).
            results[agent_type] = agent_results
        else:
            results = agent_results

    if args.output:
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to {args.output}")
    print(f"\n{json.dumps(results, indent=2)}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())