Spaces:
Sleeping
Sleeping
| """ | |
| CivicAI Evaluation & Metrics Script | |
| Runs multiple episodes across all tasks, computes metrics, | |
| generates reward curve plots and comparison tables. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from civicai.environment import CivicAIEnv | |
| from civicai.models import Action, SubsidyPolicy | |
| from agents.orchestrator import Orchestrator | |
_DEFAULT_TASKS = ("stabilize_economy", "manage_pandemic", "control_crisis")


def _run_random_episode(task_id: str, max_steps: int) -> float:
    """Play one episode of ``task_id`` with randomly sampled actions.

    Returns the mean per-step reward over the steps actually taken (the
    episode stops early when the environment reports ``done``).
    """
    import random  # local import: `random` is not imported at file level

    env = CivicAIEnv()
    env.reset(task_id)
    total_reward = 0.0
    steps = 0
    for _ in range(max_steps):
        action = Action(
            tax_rate=random.uniform(0.1, 0.5),
            healthcare_budget=random.uniform(0.05, 0.4),
            education_budget=random.uniform(0.05, 0.3),
            police_budget=random.uniform(0.03, 0.2),
            subsidy_policy=random.choice(list(SubsidyPolicy)),
        )
        _, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1
        if done:
            break
    # max(1, ...) keeps the division safe when max_steps is 0.
    return total_reward / max(1, steps)


def run_evaluation(
    episodes_per_task: int = 5,
    tasks: list[str] | None = None,
    max_random_steps: int = 50,
) -> dict:
    """Run comprehensive evaluation across all tasks and agent types.

    For every task, ``episodes_per_task`` episodes are played by the
    multi-agent ``Orchestrator`` and the same number by a random-action
    baseline, and the per-episode average rewards are summarised.

    Args:
        episodes_per_task: Number of episodes per task and per agent type.
            Must be at least 1.
        tasks: Task ids to evaluate. Defaults to the three built-in tasks.
        max_random_steps: Upper bound on steps per random-baseline episode.

    Returns:
        A dict keyed by task id. Each value holds ``agent_mean``,
        ``agent_std``, ``random_mean``, ``random_std`` and ``improvement``
        (agent mean minus random mean), all rounded to 4 decimals.

    Raises:
        ValueError: If ``episodes_per_task`` is smaller than 1. Without this
            check the statistics would be taken over empty lists and come
            back as NaN.
    """
    if episodes_per_task < 1:
        raise ValueError(
            f"episodes_per_task must be >= 1, got {episodes_per_task}"
        )

    task_ids = _DEFAULT_TASKS if tasks is None else tasks
    results = {}
    for task_id in task_ids:
        print(f"\n{'='*50}")
        print(f" Evaluating: {task_id}")
        print(f"{'='*50}")

        # Multi-agent baseline
        agent_rewards = []
        for ep in range(episodes_per_task):
            env = CivicAIEnv()
            orch = Orchestrator(env)
            result = orch.run_episode(task_id)
            agent_rewards.append(result["avg_reward"])
            print(f" Agent ep{ep}: avg_reward={result['avg_reward']:.4f}")

        # Random baseline
        random_rewards = [
            _run_random_episode(task_id, max_random_steps)
            for _ in range(episodes_per_task)
        ]

        agent_mean = float(np.mean(agent_rewards))
        random_mean = float(np.mean(random_rewards))
        results[task_id] = {
            "agent_mean": round(agent_mean, 4),
            "agent_std": round(float(np.std(agent_rewards)), 4),
            "random_mean": round(random_mean, 4),
            "random_std": round(float(np.std(random_rewards)), 4),
            "improvement": round(agent_mean - random_mean, 4),
        }
    return results
# Human-readable tick labels per task id; unknown ids fall back to the id itself.
_TASK_LABELS = {
    "stabilize_economy": "Economic\nStability",
    "manage_pandemic": "Pandemic\nManagement",
    "control_crisis": "Social\nCrisis",
}
_FIGURE_BG = "#0f172a"
_AXES_BG = "#1e293b"
_SPINE_COLOR = "#334155"


def _new_dark_axes(figsize):
    """Create a figure/axes pair with the dark background colours applied."""
    fig, ax = plt.subplots(figsize=figsize)
    fig.patch.set_facecolor(_FIGURE_BG)
    ax.set_facecolor(_AXES_BG)
    return fig, ax


def _apply_dark_theme(ax):
    """Style ticks, legend and spines of ``ax`` for the dark background."""
    ax.tick_params(colors="white")
    ax.legend(facecolor=_AXES_BG, edgecolor=_SPINE_COLOR, labelcolor="white")
    ax.spines["bottom"].set_color(_SPINE_COLOR)
    ax.spines["left"].set_color(_SPINE_COLOR)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)


def _save_and_close(fig, path):
    """Write ``fig`` to ``path`` as a PNG, close it and report the path."""
    fig.tight_layout()
    fig.savefig(path, dpi=150, facecolor=_FIGURE_BG)
    plt.close(fig)
    print(f" Saved: {path}")


def generate_plots(results: dict) -> None:
    """Generate evaluation plots.

    Writes two PNG files into the ``assets`` directory (created if missing):

    * ``comparison_chart.png`` - grouped bars of random vs. multi-agent mean
      reward per task, with the standard deviations as error bars.
    * ``reward_curve.png`` - the per-turn reward curve of one freshly run
      ``stabilize_economy`` episode.

    Args:
        results: Mapping of task id to a metrics dict providing
            ``agent_mean``, ``agent_std``, ``random_mean`` and ``random_std``.
    """
    os.makedirs("assets", exist_ok=True)

    # --- Comparison Bar Chart ---
    fig, ax = _new_dark_axes((10, 6))
    tasks = list(results.keys())
    x = np.arange(len(tasks))
    width = 0.35
    agent_means = [results[t]["agent_mean"] for t in tasks]
    random_means = [results[t]["random_mean"] for t in tasks]
    agent_stds = [results[t]["agent_std"] for t in tasks]
    random_stds = [results[t]["random_std"] for t in tasks]
    ax.bar(x - width/2, random_means, width, yerr=random_stds,
           label="Random", color="#ef4444", alpha=0.8, capsize=4)
    ax.bar(x + width/2, agent_means, width, yerr=agent_stds,
           label="Multi-Agent", color="#06b6d4", alpha=0.8, capsize=4)
    ax.set_ylabel("Avg Reward", color="white", fontsize=12)
    ax.set_title("CivicAI: Agent vs Random Performance", color="white", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    # Look labels up per task so they always match the bars, whatever the
    # number or order of tasks in `results`.
    task_labels = [_TASK_LABELS.get(t, t) for t in tasks]
    ax.set_xticklabels(task_labels, color="white")
    _apply_dark_theme(ax)
    ax.set_ylim(0, 1)
    _save_and_close(fig, "assets/comparison_chart.png")

    # --- Reward Curve ---
    fig, ax = _new_dark_axes((10, 5))
    env = CivicAIEnv()
    orch = Orchestrator(env)
    result = orch.run_episode("stabilize_economy")
    curve = result["reward_curve"]
    ax.plot(curve, color="#06b6d4", linewidth=2, label="Multi-Agent Reward")
    ax.fill_between(range(len(curve)), curve, alpha=0.15, color="#06b6d4")
    if len(curve) > 0:
        # The mean of an empty curve would be NaN, so only draw the
        # reference line when there is data.
        mean_reward = np.mean(curve)
        ax.axhline(y=mean_reward, color="#a855f7", linestyle="--", alpha=0.7, label=f"Mean: {mean_reward:.3f}")
    ax.set_xlabel("Turn", color="white", fontsize=12)
    ax.set_ylabel("Reward", color="white", fontsize=12)
    ax.set_title("CivicAI: Reward Curve (Economic Stability)", color="white", fontsize=14, fontweight="bold")
    _apply_dark_theme(ax)
    _save_and_close(fig, "assets/reward_curve.png")
def print_results_table(results: dict) -> None:
    """Write a fixed-width summary of the evaluation metrics to stdout.

    One row is printed per entry of ``results``, in iteration order. Each
    value must provide ``random_mean``, ``agent_mean`` and ``improvement``.
    """
    border = "=" * 70
    row_template = " {:<25} {:>10.4f} {:>10.4f} {:>+10.4f}"

    print()
    print(border)
    print(" {:<25} {:>10} {:>10} {:>10}".format("Task", "Random", "Agent", "Improve"))
    print(" " + "-" * 55)
    for task_name, metrics in results.items():
        row = row_template.format(
            task_name,
            metrics["random_mean"],
            metrics["agent_mean"],
            metrics["improvement"],
        )
        print(row)
    print(border)
def main(argv: list[str] | None = None) -> None:
    """Run the evaluation suite from the command line.

    Runs the evaluation, prints the results table, generates the plots and
    writes the metrics to ``assets/evaluation_results.json``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``. An optional first argument sets the number of
            episodes per task (default 3).

    Raises:
        SystemExit: If the episode count is not an integer or is below 1.
            A count below 1 would otherwise lead to NaN metrics being
            written to the JSON file.
    """
    args = sys.argv if argv is None else argv
    episodes = 3
    if len(args) > 1:
        try:
            episodes = int(args[1])
        except ValueError:
            raise SystemExit(
                f"error: episode count must be an integer, got {args[1]!r}"
            ) from None
    if episodes < 1:
        raise SystemExit(f"error: episode count must be >= 1, got {episodes}")

    print("\n[CivicAI] Evaluation Suite\n")
    results = run_evaluation(episodes)
    print_results_table(results)
    generate_plots(results)

    # Save results
    os.makedirs("assets", exist_ok=True)
    with open("assets/evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("\n Results saved to assets/evaluation_results.json")


if __name__ == "__main__":
    main()