"""
benchmark.py
-------------
Robust benchmarking and leaderboard system for UIEnv.
Evaluates multiple agents on identical environment conditions, computes
standardised metrics, and produces a ranked leaderboard.
Fairness guarantee
------------------
Each agent is evaluated on fresh UIEnv instances seeded deterministically
from the same base seed (base seed + episode index), so every agent faces
the exact same sequence of user types, devices, and random-drop rolls.
Agent-internal RNG is independent of the environment's.
Usage
-----
python benchmark.py # default: 50 episodes
python benchmark.py --episodes 200 # custom episode count
"""
from __future__ import annotations
import argparse
import json
import time
from dataclasses import dataclass, field, asdict
from typing import Protocol, runtime_checkable
from env import UIEnv, Action, Observation
# ======================================================================
# Agent Protocol -- any agent plugged into the benchmark must satisfy this
# ======================================================================
@runtime_checkable
class Agent(Protocol):
"""Minimal interface every agent must expose."""
NAME: str
def reset(self) -> None: ...
def act(self, obs: Observation) -> Action: ...
def update(self, info: dict) -> None: ...
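# ---------------------------------------------------------------------- #
# Illustrative sketch (not part of the benchmark itself): any plain class  #
# exposing NAME plus these three methods passes the runtime protocol check #
# performed in BenchmarkRunner.__init__. The hypothetical wrapper below    #
# delegates to an existing agent, so it never constructs an Action itself. #
# ---------------------------------------------------------------------- #
class LoggingAgent:
    """Hypothetical pass-through wrapper that prints each delegated action."""
    def __init__(self, inner: Agent) -> None:
        self._inner = inner
        self.NAME = f"logged-{inner.NAME}"
    def reset(self) -> None:
        self._inner.reset()
    def act(self, obs: Observation) -> Action:
        action = self._inner.act(obs)
        print(f"[{self.NAME}] chose {action!r}")  # trace the inner agent's choice
        return action
    def update(self, info: dict) -> None:
        self._inner.update(info)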
# ======================================================================
# Per-episode result record
# ======================================================================
@dataclass
class EpisodeResult:
"""Immutable record of a single episode's outcome."""
episode: int
outcome: str # "complete" | "drop" | "distrust" | "continue"
total_reward: float
steps: int
final_progress: float
# ======================================================================
# Per-agent aggregate metrics
# ======================================================================
@dataclass
class AgentMetrics:
"""Aggregate metrics for one agent across all episodes."""
agent_name: str
score: float # 0.7 * completion_rate + 0.3 * avg_reward
completion_rate: float
drop_rate: float
avg_reward: float
avg_steps: float
total_episodes: int
episodes: list[EpisodeResult] = field(default_factory=list, repr=False)
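# Worked example of the composite score documented above (numbers are
# illustrative only): an agent that completes 80% of its episodes with an
# average episode reward of 0.5 scores 0.7 * 0.80 + 0.3 * 0.5 = 0.71.
# The 0.7 / 0.3 weighting places more weight on completion rate than on
# average reward when ranking agents.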
# ======================================================================
# BenchmarkRunner
# ======================================================================
class BenchmarkRunner:
"""
Evaluates a list of agents on UIEnv and produces a ranked leaderboard.
Parameters
----------
agents : list
Agent instances satisfying the Agent protocol.
episodes : int
Number of episodes per agent (default 50).
env_seed : int
        Base seed for UIEnv -- per-episode seeds are derived from it
        identically for every agent to ensure fairness.
verbose : bool
If True, print per-episode progress during evaluation.
"""
def __init__(
self,
agents: list,
episodes: int = 50,
env_seed: int = 42,
verbose: bool = False,
) -> None:
self._agents = agents
self._episodes = episodes
self._env_seed = env_seed
self._verbose = verbose
        # Validate configuration and the agent interface at init time
        if episodes <= 0:
            raise ValueError("episodes must be a positive integer")
        for agent in agents:
            if not isinstance(agent, Agent):
                raise TypeError(
                    f"{agent!r} does not satisfy the Agent protocol "
                    f"(needs NAME, reset, act, update)"
                )
# ------------------------------------------------------------------ #
# Core evaluation loop #
# ------------------------------------------------------------------ #
def _evaluate_agent(self, agent) -> AgentMetrics:
"""
Run one agent for N episodes and collect metrics.
        A fresh UIEnv is created for each episode, seeded with the base seed
        plus the episode index, so every agent faces the same stochastic
        sequence and an even rotation of task difficulties.
"""
total_reward: float = 0.0
completions: int = 0
drops: int = 0
total_steps: int = 0
episode_results: list[EpisodeResult] = []
tasks = ["easy", "medium", "hard"]
for ep in range(self._episodes):
# Rotate through all task difficulties evenly
current_task = tasks[ep % len(tasks)]
env = UIEnv(seed=self._env_seed + ep, task=current_task)
obs = env.reset()
agent.reset()
ep_reward: float = 0.0
done = False
while not done:
action = agent.act(obs)
obs, reward, done, info = env.step(action)
agent.update(info)
ep_reward += reward
outcome = info["outcome"]
steps = info["step_count"]
progress = info["progress"]
total_reward += ep_reward
total_steps += steps
if outcome == "complete":
completions += 1
elif outcome == "drop":
drops += 1
episode_results.append(
EpisodeResult(
episode=ep,
outcome=outcome,
total_reward=ep_reward,
steps=steps,
final_progress=progress,
)
)
if self._verbose:
print(
f" [{agent.NAME}] ep={ep:03d} "
f"outcome={outcome:<10s} "
f"reward={ep_reward:+.3f} "
f"steps={steps}"
)
n = self._episodes
completion_rate = completions / n
drop_rate = drops / n
avg_reward = total_reward / n
avg_steps = total_steps / n
score = 0.7 * completion_rate + 0.3 * avg_reward
return AgentMetrics(
agent_name=agent.NAME,
score=score,
completion_rate=completion_rate,
drop_rate=drop_rate,
avg_reward=avg_reward,
avg_steps=avg_steps,
total_episodes=n,
episodes=episode_results,
)
# ------------------------------------------------------------------ #
# Public API #
# ------------------------------------------------------------------ #
def run(self) -> list[AgentMetrics]:
"""
Evaluate all agents and return a leaderboard sorted by score (desc).
Returns
-------
list[AgentMetrics]
One entry per agent, sorted best-first.
"""
results: list[AgentMetrics] = []
for agent in self._agents:
if self._verbose:
print(f"\n> Evaluating {agent.NAME} ({self._episodes} episodes) ...")
t0 = time.perf_counter()
metrics = self._evaluate_agent(agent)
elapsed = time.perf_counter() - t0
if self._verbose:
print(f" Done in {elapsed:.2f}s")
results.append(metrics)
# Sort descending by score
results.sort(key=lambda m: m.score, reverse=True)
return results
# ------------------------------------------------------------------ #
# Display #
# ------------------------------------------------------------------ #
@staticmethod
def print_leaderboard(leaderboard: list[AgentMetrics]) -> None:
"""Print a professional leaderboard table to stdout."""
hdr = (
f" {'Rank':<6s}"
f"{'Agent':<20s}"
f"{'Score':>8s}"
f"{'Completion':>12s}"
f"{'Drop':>8s}"
f"{'AvgReward':>11s}"
f"{'AvgSteps':>10s}"
)
sep = "-" * len(hdr)
print()
print("=" * len(hdr))
print(" LEADERBOARD".center(len(hdr)))
print("=" * len(hdr))
print(hdr)
print(sep)
for rank, m in enumerate(leaderboard, start=1):
medal = {1: "(1st)", 2: "(2nd)", 3: "(3rd)"}.get(rank, "")
print(
f" {f'#{rank} {medal}':<6s}"
f"{m.agent_name:<20s}"
f"{m.score:>8.4f}"
f"{m.completion_rate * 100:>11.1f}%"
f"{m.drop_rate * 100:>7.1f}%"
f"{m.avg_reward:>11.4f}"
f"{m.avg_steps:>10.1f}"
)
print(sep)
print()
@staticmethod
def print_comparison(leaderboard: list[AgentMetrics]) -> None:
"""Print head-to-head delta between rank #1 and all others."""
if len(leaderboard) < 2:
return
best = leaderboard[0]
print(" HEAD-TO-HEAD vs " + best.agent_name)
print(" " + "-" * 50)
for other in leaderboard[1:]:
d_score = best.score - other.score
d_comp = (best.completion_rate - other.completion_rate) * 100
d_drop = (best.drop_rate - other.drop_rate) * 100
d_rew = best.avg_reward - other.avg_reward
print(
f" vs {other.agent_name:<16s} "
f"score: +{d_score:.4f} "
f"completion: {d_comp:+.1f}pp "
f"drop: {d_drop:+.1f}pp "
f"reward: {d_rew:+.4f}"
)
print()
@staticmethod
def export_json(leaderboard: list[AgentMetrics], path: str = "leaderboard.json") -> None:
"""Export the leaderboard to a JSON file (without per-episode logs)."""
data = []
for m in leaderboard:
d = asdict(m)
del d["episodes"] # keep export compact
data.append(d)
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
print(f" Leaderboard exported to {path}")
# ======================================================================
# Main -- run benchmark with all available agents
# ======================================================================
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="UIEnv Agent Benchmark")
parser.add_argument("--episodes", type=int, default=50, help="Episodes per agent")
parser.add_argument("--seed", type=int, default=42, help="Environment seed")
parser.add_argument("--verbose", action="store_true", help="Show per-episode logs")
parser.add_argument("--export", action="store_true", help="Export leaderboard JSON")
args = parser.parse_args()
# -- Import agents --
from agents.random_agent import RandomAgent
from agents.heuristic_agent import HeuristicAgent
agents = [
RandomAgent(seed=99),
HeuristicAgent(seed=99),
]
# -- Run benchmark --
runner = BenchmarkRunner(
agents=agents,
episodes=args.episodes,
env_seed=args.seed,
verbose=args.verbose,
)
leaderboard = runner.run()
# -- Display results --
runner.print_leaderboard(leaderboard)
runner.print_comparison(leaderboard)
if args.export:
runner.export_json(leaderboard)