| """ |
| Evaluation harness for comparing agents. |
| |
| Runs multiple episodes per agent and computes comprehensive metrics. |
| """ |

import sys
import os
import io

# Force UTF-8 output so the report prints reliably on consoles that default to
# a legacy code page (common on Windows).
if (sys.stdout.encoding or "").lower() != "utf-8":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Make the project root importable when this script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.assistant_env import ExecutiveAssistantEnv
from agents.random_agent import RandomAgent
from agents.rule_based_agent import RuleBasedAgent
from agents.rl_agent import RLAgent


def evaluate_agent(env, agent, num_episodes=50, label="Agent"):
    """Evaluate an agent over multiple episodes.

    Args:
        env: Environment exposing a gym-style reset()/step() interface.
        agent: Agent exposing act(state).
        num_episodes: Number of evaluation episodes to run.
        label: Human-readable name used in the report.

    Returns:
        Dict with aggregated metrics.
    """
    all_rewards = []
    all_metrics = []
    all_steps = []

    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False
        steps = 0

        # Roll out one full episode.
        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            total_reward += reward
            steps += 1

        all_rewards.append(total_reward)
        all_steps.append(steps)

        # The final step's info dict carries the episode-level metrics.
        metrics = info.get("metrics", {})
        all_metrics.append(metrics)

    avg_reward = sum(all_rewards) / len(all_rewards)
    avg_steps = sum(all_steps) / len(all_steps)

    # Average each metric across episodes (keys are taken from the first
    # episode; episodes missing a key count as 0).
    avg_metrics = {}
    if all_metrics:
        for key in all_metrics[0]:
            values = [m.get(key, 0) for m in all_metrics]
            avg_metrics[key] = sum(values) / len(values)

    return {
        "label": label,
        "avg_reward": round(avg_reward, 2),
        "avg_steps": round(avg_steps, 1),
        "best_reward": round(max(all_rewards), 2),
        "worst_reward": round(min(all_rewards), 2),
        "metrics": {k: round(v, 3) for k, v in avg_metrics.items()},
    }
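

# Minimal standalone usage sketch for evaluate_agent (illustrative only; it
# reuses the same constructor arguments as main() below):
#
#     env = ExecutiveAssistantEnv(difficulty="medium", seed=123)
#     summary = evaluate_agent(env, RandomAgent(seed=123), num_episodes=10, label="Random")
#     print(summary["avg_reward"], summary["metrics"])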


def main():
    """Run full evaluation comparison."""
    NUM_EVAL = 100
    # Make sure the logs/ directory exists.
    os.makedirs("logs", exist_ok=True)
| print("=" * 60) |
| print("π AI Executive Assistant β Agent Evaluation") |
| print("=" * 60) |
|
|
| agents = { |
| "Random": RandomAgent(seed=123), |
| "Rule-Based": RuleBasedAgent(), |
| "Q-Learning (untrained)": RLAgent(epsilon=0.0, seed=123), |
| } |
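
    # Note: the Q-learning agent above runs with epsilon=0.0, i.e. purely greedy
    # action selection, and no training happens in this script, so its numbers
    # reflect the untrained Q-table, serving as a learning-free baseline next to
    # the random and rule-based agents (assuming RLAgent follows the usual
    # epsilon-greedy convention).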

    results = []
    for name, agent in agents.items():
        print(f"\nEvaluating {name}...")
        # Fresh, identically seeded environment per agent so the comparison is fair.
        env = ExecutiveAssistantEnv(difficulty="medium", seed=123)
        result = evaluate_agent(env, agent, NUM_EVAL, label=name)
        results.append(result)

    print("\n" + "=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)
    print(f"{'Agent':<25} {'Avg Reward':>10} {'Completion':>12} {'Efficiency':>12}")
    print("-" * 60)

    for r in results:
        m = r["metrics"]
        print(
            f"{r['label']:<25} "
            f"{r['avg_reward']:>10.1f} "
            f"{m.get('task_completion_rate', 0):>12.1%} "
            f"{m.get('efficiency_score', 0):>12.1f}"
        )

    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()