""" CivicAI Evaluation & Metrics Script Runs multiple episodes across all tasks, computes metrics, generates reward curve plots and comparison tables. """ from __future__ import annotations import json import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import numpy as np from civicai.environment import CivicAIEnv from civicai.models import Action, SubsidyPolicy from agents.orchestrator import Orchestrator def run_evaluation(episodes_per_task: int = 5) -> dict: """Run comprehensive evaluation across all tasks and agent types.""" tasks = ["stabilize_economy", "manage_pandemic", "control_crisis"] results = {} for task_id in tasks: print(f"\n{'='*50}") print(f" Evaluating: {task_id}") print(f"{'='*50}") # Multi-agent baseline agent_rewards = [] for ep in range(episodes_per_task): env = CivicAIEnv() orch = Orchestrator(env) result = orch.run_episode(task_id) agent_rewards.append(result["avg_reward"]) print(f" Agent ep{ep}: avg_reward={result['avg_reward']:.4f}") # Random baseline random_rewards = [] import random for ep in range(episodes_per_task): env = CivicAIEnv() obs = env.reset(task_id) total_r = 0.0 count = 0 for _ in range(50): action = Action( tax_rate=random.uniform(0.1, 0.5), healthcare_budget=random.uniform(0.05, 0.4), education_budget=random.uniform(0.05, 0.3), police_budget=random.uniform(0.03, 0.2), subsidy_policy=random.choice(list(SubsidyPolicy)), ) obs, r, done, _ = env.step(action) total_r += r count += 1 if done: break random_rewards.append(total_r / max(1, count)) results[task_id] = { "agent_mean": round(float(np.mean(agent_rewards)), 4), "agent_std": round(float(np.std(agent_rewards)), 4), "random_mean": round(float(np.mean(random_rewards)), 4), "random_std": round(float(np.std(random_rewards)), 4), "improvement": round(float(np.mean(agent_rewards) - np.mean(random_rewards)), 4), } return results def generate_plots(results: dict) -> None: """Generate evaluation plots.""" os.makedirs("assets", exist_ok=True) # --- Comparison Bar Chart --- fig, ax = plt.subplots(figsize=(10, 6)) fig.patch.set_facecolor("#0f172a") ax.set_facecolor("#1e293b") tasks = list(results.keys()) x = np.arange(len(tasks)) width = 0.35 agent_means = [results[t]["agent_mean"] for t in tasks] random_means = [results[t]["random_mean"] for t in tasks] agent_stds = [results[t]["agent_std"] for t in tasks] random_stds = [results[t]["random_std"] for t in tasks] bars1 = ax.bar(x - width/2, random_means, width, yerr=random_stds, label="Random", color="#ef4444", alpha=0.8, capsize=4) bars2 = ax.bar(x + width/2, agent_means, width, yerr=agent_stds, label="Multi-Agent", color="#06b6d4", alpha=0.8, capsize=4) ax.set_ylabel("Avg Reward", color="white", fontsize=12) ax.set_title("CivicAI: Agent vs Random Performance", color="white", fontsize=14, fontweight="bold") ax.set_xticks(x) task_labels = ["Economic\nStability", "Pandemic\nManagement", "Social\nCrisis"] ax.set_xticklabels(task_labels, color="white") ax.tick_params(colors="white") ax.legend(facecolor="#1e293b", edgecolor="#334155", labelcolor="white") ax.spines["bottom"].set_color("#334155") ax.spines["left"].set_color("#334155") ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.set_ylim(0, 1) plt.tight_layout() plt.savefig("assets/comparison_chart.png", dpi=150, facecolor="#0f172a") plt.close() print(" Saved: assets/comparison_chart.png") # --- Reward Curve --- fig, ax = plt.subplots(figsize=(10, 5)) fig.patch.set_facecolor("#0f172a") ax.set_facecolor("#1e293b") env = CivicAIEnv() orch = Orchestrator(env) result = orch.run_episode("stabilize_economy") curve = result["reward_curve"] ax.plot(curve, color="#06b6d4", linewidth=2, label="Multi-Agent Reward") ax.fill_between(range(len(curve)), curve, alpha=0.15, color="#06b6d4") ax.axhline(y=np.mean(curve), color="#a855f7", linestyle="--", alpha=0.7, label=f"Mean: {np.mean(curve):.3f}") ax.set_xlabel("Turn", color="white", fontsize=12) ax.set_ylabel("Reward", color="white", fontsize=12) ax.set_title("CivicAI: Reward Curve (Economic Stability)", color="white", fontsize=14, fontweight="bold") ax.tick_params(colors="white") ax.legend(facecolor="#1e293b", edgecolor="#334155", labelcolor="white") ax.spines["bottom"].set_color("#334155") ax.spines["left"].set_color("#334155") ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) plt.tight_layout() plt.savefig("assets/reward_curve.png", dpi=150, facecolor="#0f172a") plt.close() print(" Saved: assets/reward_curve.png") def print_results_table(results: dict) -> None: """Print a formatted results table.""" print(f"\n{'='*70}") print(f" {'Task':<25} {'Random':>10} {'Agent':>10} {'Improve':>10}") print(f" {'-'*55}") for task_id, r in results.items(): print(f" {task_id:<25} {r['random_mean']:>10.4f} {r['agent_mean']:>10.4f} {r['improvement']:>+10.4f}") print(f"{'='*70}") if __name__ == "__main__": episodes = int(sys.argv[1]) if len(sys.argv) > 1 else 3 print("\n[CivicAI] Evaluation Suite\n") results = run_evaluation(episodes) print_results_table(results) generate_plots(results) # Save results os.makedirs("assets", exist_ok=True) with open("assets/evaluation_results.json", "w") as f: json.dump(results, f, indent=2) print("\n Results saved to assets/evaluation_results.json")