"""
Evaluation harness for comparing agents.

Runs multiple episodes per agent and computes comprehensive metrics.
"""
import sys
import os
import io

# Fix Windows console encoding for Unicode output (the report uses emoji).
# The reported encoding may be upper-case (e.g. "UTF-8") or None when the
# stream is redirected, so normalise before comparing instead of a raw `!=`.
if (sys.stdout.encoding or "").lower() != "utf-8":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Make the project root importable when this file is run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.assistant_env import ExecutiveAssistantEnv
from agents.random_agent import RandomAgent
from agents.rule_based_agent import RuleBasedAgent
from agents.rl_agent import RLAgent


def evaluate_agent(env, agent, num_episodes=50, label="Agent"):
    """Evaluate an agent over multiple episodes.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` with a
            Gym-style ``(state, reward, done, info)`` return contract.
        agent: Agent exposing ``act(state) -> action``.
        num_episodes: Number of episodes to run (must be >= 1).
        label: Human-readable name echoed back in the result dict.

    Returns:
        Dict with aggregated metrics: average/best/worst reward, average
        step count, and the per-episode ``info["metrics"]`` values
        averaged key-by-key.

    Raises:
        ValueError: If ``num_episodes`` is less than 1.
    """
    if num_episodes < 1:
        # Previously this fell through to an opaque ZeroDivisionError.
        raise ValueError("num_episodes must be >= 1")

    all_rewards = []
    all_metrics = []
    all_steps = []

    for _ in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False
        steps = 0
        # BUG FIX: `info` was referenced after the loop but only bound
        # inside it — an episode that terminated in 0 steps raised
        # NameError. Initialise it per episode.
        info = {}

        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            total_reward += reward
            steps += 1

        all_rewards.append(total_reward)
        all_steps.append(steps)
        # The final step's info carries the episode's aggregated metrics.
        all_metrics.append(info.get("metrics", {}))

    # Aggregate across episodes.
    avg_reward = sum(all_rewards) / len(all_rewards)
    avg_steps = sum(all_steps) / len(all_steps)

    avg_metrics = {}
    if all_metrics:
        # Metric keys are taken from the first episode; episodes missing a
        # key contribute 0 to that key's average.
        for key in all_metrics[0]:
            values = [m.get(key, 0) for m in all_metrics]
            avg_metrics[key] = sum(values) / len(values)

    return {
        "label": label,
        "avg_reward": round(avg_reward, 2),
        "avg_steps": round(avg_steps, 1),
        "best_reward": round(max(all_rewards), 2),
        "worst_reward": round(min(all_rewards), 2),
        "metrics": {k: round(v, 3) for k, v in avg_metrics.items()},
    }


def main():
    """Run full evaluation comparison."""
    NUM_EVAL = 100
    os.makedirs("logs", exist_ok=True)

    print("=" * 60)
    print("📊 AI Executive Assistant — Agent Evaluation")
    print("=" * 60)

    agents = {
        "Random": RandomAgent(seed=123),
        "Rule-Based": RuleBasedAgent(),
        "Q-Learning (untrained)": RLAgent(epsilon=0.0, seed=123),
    }

    results = []
    for name, agent in agents.items():
        print(f"\n Evaluating {name}...")
        # Fresh env per agent with a fixed seed so every agent is
        # evaluated on an identical task distribution.
        env = ExecutiveAssistantEnv(difficulty="medium", seed=123)
        result = evaluate_agent(env, agent, NUM_EVAL, label=name)
        results.append(result)

    # Print results table
    print("\n" + "=" * 60)
    print("📈 EVALUATION RESULTS")
    print("=" * 60)
    print(f"{'Agent':<25} {'Avg Reward':>10} {'Completion':>12} {'Efficiency':>12}")
    print("-" * 60)
    for r in results:
        m = r["metrics"]
        print(
            f"{r['label']:<25} "
            f"{r['avg_reward']:>10.1f} "
            f"{m.get('task_completion_rate', 0):>11.1%} "
            f"{m.get('efficiency_score', 0):>11.1f}"
        )
    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()