Spaces:
Sleeping
Sleeping
| """ | |
| benchmark.py | |
| ------------- | |
| Robust benchmarking and leaderboard system for UIEnv. | |
| Evaluates multiple agents on identical environment conditions, computes | |
| standardised metrics, and produces a ranked leaderboard. | |
| Fairness guarantee | |
| ------------------ | |
| Each agent is evaluated on a *fresh* UIEnv instance created with the same | |
| seed, so every agent faces the exact same sequence of user types, devices, | |
| and random-drop rolls. Agent-internal RNG is independent. | |
| Usage | |
| ----- | |
| python benchmark.py # default: 50 episodes | |
| python benchmark.py --episodes 200 # custom episode count | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import time | |
| from dataclasses import dataclass, field, asdict | |
| from typing import Protocol, runtime_checkable | |
| from env import UIEnv, Action, Observation | |
| # ====================================================================== | |
| # Agent Protocol -- any agent plugged into the benchmark must satisfy this | |
| # ====================================================================== | |
@runtime_checkable
class Agent(Protocol):
    """Minimal structural interface every benchmarked agent must expose.

    ``@runtime_checkable`` is required: BenchmarkRunner validates agents
    with ``isinstance(agent, Agent)``, which raises TypeError on a plain
    (non-runtime-checkable) Protocol.
    """

    NAME: str  # human-readable agent identifier shown in the leaderboard

    def reset(self) -> None: ...
    def act(self, obs: Observation) -> Action: ...
    def update(self, info: dict) -> None: ...
| # ====================================================================== | |
| # Per-episode result record | |
| # ====================================================================== | |
@dataclass(frozen=True)
class EpisodeResult:
    """Immutable record of a single episode's outcome.

    The ``@dataclass`` decorator is required: the benchmark constructs
    records with ``EpisodeResult(episode=..., ...)`` and exports them via
    ``dataclasses.asdict``; ``frozen=True`` enforces the documented
    immutability.
    """

    episode: int          # 0-based episode index within the run
    outcome: str          # "complete" | "drop" | "distrust" | "continue"
    total_reward: float   # sum of per-step rewards over the episode
    steps: int            # number of env.step() calls taken
    final_progress: float # task progress at episode end
| # ====================================================================== | |
| # Per-agent aggregate metrics | |
| # ====================================================================== | |
| class AgentMetrics: | |
| """Aggregate metrics for one agent across all episodes.""" | |
| agent_name: str | |
| score: float # 0.7 * completion_rate + 0.3 * avg_reward | |
| completion_rate: float | |
| drop_rate: float | |
| avg_reward: float | |
| avg_steps: float | |
| total_episodes: int | |
| episodes: list[EpisodeResult] = field(default_factory=list, repr=False) | |
| # ====================================================================== | |
| # BenchmarkRunner | |
| # ====================================================================== | |
class BenchmarkRunner:
    """
    Evaluates a list of agents on UIEnv and produces a ranked leaderboard.

    Parameters
    ----------
    agents : list
        Agent instances satisfying the Agent protocol.
    episodes : int
        Number of episodes per agent (default 50). Must be positive.
    env_seed : int
        Base seed for UIEnv -- same for every agent to ensure fairness.
    verbose : bool
        If True, print per-episode progress during evaluation.
    tasks : list[str] | None
        Task difficulties rotated through evenly across episodes.
        Defaults to ("easy", "medium", "hard").

    Raises
    ------
    ValueError
        If ``episodes`` is not positive.
    TypeError
        If any agent does not satisfy the Agent protocol.
    """

    # Default task rotation; override via the ``tasks`` parameter.
    DEFAULT_TASKS: tuple[str, ...] = ("easy", "medium", "hard")

    def __init__(
        self,
        agents: list,
        episodes: int = 50,
        env_seed: int = 42,
        verbose: bool = False,
        tasks: list[str] | None = None,
    ) -> None:
        # Fail fast: zero episodes would divide by zero when averaging.
        if episodes <= 0:
            raise ValueError(f"episodes must be positive, got {episodes}")
        self._agents = agents
        self._episodes = episodes
        self._env_seed = env_seed
        self._verbose = verbose
        # Copy so caller-side mutation cannot skew the rotation mid-run.
        self._tasks = list(tasks) if tasks else list(self.DEFAULT_TASKS)
        # Validate agent interface at init time (Agent must be a
        # @runtime_checkable Protocol for isinstance to work here).
        for agent in agents:
            if not isinstance(agent, Agent):
                raise TypeError(
                    f"{agent!r} does not satisfy the Agent protocol "
                    f"(needs NAME, reset, act, update)"
                )

    # ------------------------------------------------------------------ #
    # Core evaluation loop                                               #
    # ------------------------------------------------------------------ #
    def _evaluate_agent(self, agent) -> AgentMetrics:
        """
        Run one agent for N episodes and collect metrics.

        A fresh UIEnv is created per episode, seeded deterministically
        (base seed + episode index), so every agent faces the same
        stochastic sequence and an even mix of task difficulties.
        """
        total_reward: float = 0.0
        completions: int = 0
        drops: int = 0
        total_steps: int = 0
        episode_results: list[EpisodeResult] = []
        for ep in range(self._episodes):
            # Rotate through all task difficulties evenly.
            current_task = self._tasks[ep % len(self._tasks)]
            env = UIEnv(seed=self._env_seed + ep, task=current_task)
            obs = env.reset()
            agent.reset()
            ep_reward: float = 0.0
            done = False
            while not done:
                action = agent.act(obs)
                obs, reward, done, info = env.step(action)
                agent.update(info)
                ep_reward += reward
            # `info` from the terminal step carries the episode summary.
            outcome = info["outcome"]
            steps = info["step_count"]
            progress = info["progress"]
            total_reward += ep_reward
            total_steps += steps
            if outcome == "complete":
                completions += 1
            elif outcome == "drop":
                drops += 1
            episode_results.append(
                EpisodeResult(
                    episode=ep,
                    outcome=outcome,
                    total_reward=ep_reward,
                    steps=steps,
                    final_progress=progress,
                )
            )
            if self._verbose:
                print(
                    f" [{agent.NAME}] ep={ep:03d} "
                    f"outcome={outcome:<10s} "
                    f"reward={ep_reward:+.3f} "
                    f"steps={steps}"
                )
        n = self._episodes
        completion_rate = completions / n
        drop_rate = drops / n
        avg_reward = total_reward / n
        avg_steps = total_steps / n
        # Composite score: weighted blend of completion and reward.
        score = 0.7 * completion_rate + 0.3 * avg_reward
        return AgentMetrics(
            agent_name=agent.NAME,
            score=score,
            completion_rate=completion_rate,
            drop_rate=drop_rate,
            avg_reward=avg_reward,
            avg_steps=avg_steps,
            total_episodes=n,
            episodes=episode_results,
        )

    # ------------------------------------------------------------------ #
    # Public API                                                         #
    # ------------------------------------------------------------------ #
    def run(self) -> list[AgentMetrics]:
        """
        Evaluate all agents and return a leaderboard sorted by score (desc).

        Returns
        -------
        list[AgentMetrics]
            One entry per agent, sorted best-first.
        """
        results: list[AgentMetrics] = []
        for agent in self._agents:
            if self._verbose:
                print(f"\n> Evaluating {agent.NAME} ({self._episodes} episodes) ...")
            t0 = time.perf_counter()
            metrics = self._evaluate_agent(agent)
            elapsed = time.perf_counter() - t0
            if self._verbose:
                print(f" Done in {elapsed:.2f}s")
            results.append(metrics)
        # Sort descending by score: best agent first.
        results.sort(key=lambda m: m.score, reverse=True)
        return results
| # ------------------------------------------------------------------ # | |
| # Display # | |
| # ------------------------------------------------------------------ # | |
def print_leaderboard(leaderboard: list[AgentMetrics]) -> None:
    """Print a professional leaderboard table to stdout.

    Parameters
    ----------
    leaderboard : list[AgentMetrics]
        Agent metrics sorted best-first (as returned by BenchmarkRunner.run).
    """
    # Rank column is 10 wide: "#1 (1st)" is 8 characters, so the previous
    # width of 6 overflowed on the top-3 rows and misaligned every column.
    hdr = (
        f" {'Rank':<10s}"
        f"{'Agent':<20s}"
        f"{'Score':>8s}"
        f"{'Completion':>12s}"
        f"{'Drop':>8s}"
        f"{'AvgReward':>11s}"
        f"{'AvgSteps':>10s}"
    )
    sep = "-" * len(hdr)
    print()
    print("=" * len(hdr))
    print(" LEADERBOARD".center(len(hdr)))
    print("=" * len(hdr))
    print(hdr)
    print(sep)
    for rank, m in enumerate(leaderboard, start=1):
        medal = {1: "(1st)", 2: "(2nd)", 3: "(3rd)"}.get(rank, "")
        print(
            f" {f'#{rank} {medal}':<10s}"
            f"{m.agent_name:<20s}"
            f"{m.score:>8.4f}"
            f"{m.completion_rate * 100:>11.1f}%"
            f"{m.drop_rate * 100:>7.1f}%"
            f"{m.avg_reward:>11.4f}"
            f"{m.avg_steps:>10.1f}"
        )
    print(sep)
    print()
def print_comparison(leaderboard: list[AgentMetrics]) -> None:
    """Print head-to-head deltas between the rank-#1 agent and every other."""
    if len(leaderboard) < 2:
        return  # nothing to compare against
    best, *rest = leaderboard
    print(" HEAD-TO-HEAD vs " + best.agent_name)
    print(" " + "-" * 50)
    for challenger in rest:
        score_gap = best.score - challenger.score
        comp_gap = (best.completion_rate - challenger.completion_rate) * 100
        drop_gap = (best.drop_rate - challenger.drop_rate) * 100
        reward_gap = best.avg_reward - challenger.avg_reward
        row = (
            f" vs {challenger.agent_name:<16s} "
            f"score: +{score_gap:.4f} "
            f"completion: {comp_gap:+.1f}pp "
            f"drop: {drop_gap:+.1f}pp "
            f"reward: {reward_gap:+.4f}"
        )
        print(row)
    print()
def export_json(leaderboard: list[AgentMetrics], path: str = "leaderboard.json") -> None:
    """Export the leaderboard to a JSON file (without per-episode logs)."""
    # Drop the bulky per-episode records so the export stays compact.
    payload = [
        {key: value for key, value in asdict(metrics).items() if key != "episodes"}
        for metrics in leaderboard
    ]
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)
    print(f" Leaderboard exported to {path}")
| # ====================================================================== | |
| # Main -- run benchmark with all available agents | |
| # ====================================================================== | |
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="UIEnv Agent Benchmark") | |
| parser.add_argument("--episodes", type=int, default=50, help="Episodes per agent") | |
| parser.add_argument("--seed", type=int, default=42, help="Environment seed") | |
| parser.add_argument("--verbose", action="store_true", help="Show per-episode logs") | |
| parser.add_argument("--export", action="store_true", help="Export leaderboard JSON") | |
| args = parser.parse_args() | |
| # -- Import agents -- | |
| from agents.random_agent import RandomAgent | |
| from agents.heuristic_agent import HeuristicAgent | |
| agents = [ | |
| RandomAgent(seed=99), | |
| HeuristicAgent(seed=99), | |
| ] | |
| # -- Run benchmark -- | |
| runner = BenchmarkRunner( | |
| agents=agents, | |
| episodes=args.episodes, | |
| env_seed=args.seed, | |
| verbose=args.verbose, | |
| ) | |
| leaderboard = runner.run() | |
| # -- Display results -- | |
| runner.print_leaderboard(leaderboard) | |
| runner.print_comparison(leaderboard) | |
| if args.export: | |
| runner.export_json(leaderboard) | |