"""
Evaluation harness for comparing agents.
Runs multiple episodes per agent and computes comprehensive metrics.
"""
import sys
import os
import io

# Fix Windows console encoding so Unicode output (emoji in the report) prints safely.
if (sys.stdout.encoding or "").lower() != "utf-8":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Make the project root importable when this script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.assistant_env import ExecutiveAssistantEnv
from agents.random_agent import RandomAgent
from agents.rule_based_agent import RuleBasedAgent
from agents.rl_agent import RLAgent


def evaluate_agent(env, agent, num_episodes=50, label="Agent"):
    """Evaluate an agent over multiple episodes.

    Args:
        env: Environment exposing reset() and step(action).
        agent: Agent exposing act(state).
        num_episodes: Number of evaluation episodes to run.
        label: Human-readable name used in the results table.

    Returns:
        Dict with aggregated metrics.
    """
    all_rewards = []
    all_metrics = []
    all_steps = []

    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False
        steps = 0
        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            total_reward += reward
            steps += 1
        all_rewards.append(total_reward)
        all_steps.append(steps)
        # Per-episode metrics reported by the environment at episode end.
        metrics = info.get("metrics", {})
        all_metrics.append(metrics)

    # Aggregate across episodes.
    avg_reward = sum(all_rewards) / len(all_rewards)
    avg_steps = sum(all_steps) / len(all_steps)
    avg_metrics = {}
    if all_metrics:
        for key in all_metrics[0]:
            values = [m.get(key, 0) for m in all_metrics]
            avg_metrics[key] = sum(values) / len(values)

    return {
        "label": label,
        "avg_reward": round(avg_reward, 2),
        "avg_steps": round(avg_steps, 1),
        "best_reward": round(max(all_rewards), 2),
        "worst_reward": round(min(all_rewards), 2),
        "metrics": {k: round(v, 3) for k, v in avg_metrics.items()},
    }


def main():
    """Run full evaluation comparison."""
    NUM_EVAL = 100

    os.makedirs("logs", exist_ok=True)

    print("=" * 60)
    print("📊 AI Executive Assistant — Agent Evaluation")
    print("=" * 60)

    agents = {
        "Random": RandomAgent(seed=123),
        "Rule-Based": RuleBasedAgent(),
        "Q-Learning (untrained)": RLAgent(epsilon=0.0, seed=123),
    }

    results = []
    for name, agent in agents.items():
        print(f"\n Evaluating {name}...")
        env = ExecutiveAssistantEnv(difficulty="medium", seed=123)
        result = evaluate_agent(env, agent, NUM_EVAL, label=name)
        results.append(result)

    # Print results table
    print("\n" + "=" * 60)
    print("📈 EVALUATION RESULTS")
    print("=" * 60)
print(f"{'Agent':<25} {'Avg Reward':>10} {'Completion':>12} {'Efficiency':>12}")
print("-" * 60)
for r in results:
m = r["metrics"]
print(
f"{r['label']:<25} "
f"{r['avg_reward']:>10.1f} "
f"{m.get('task_completion_rate', 0):>11.1%} "
f"{m.get('efficiency_score', 0):>11.1f}"
)
print("\n" + "=" * 60)


if __name__ == "__main__":
    main()