"""
Evaluation harness for comparing agents.

Runs multiple episodes per agent and computes comprehensive metrics.
"""

import sys
import os
import io

# Fix Windows console encoding for Unicode output (encoding may be None or
# reported in uppercase, so normalize before comparing).
if (sys.stdout.encoding or "").lower() != "utf-8":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

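# Make the project root importable so env/ and agents/ resolve as packages
# (assumes this script sits one directory below the project root).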
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from env.assistant_env import ExecutiveAssistantEnv
from agents.random_agent import RandomAgent
from agents.rule_based_agent import RuleBasedAgent
from agents.rl_agent import RLAgent

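# The harness only assumes a Gym-style interface from the classes imported
# above (the sketch below is illustrative, not the actual definitions):
#
#     agent.act(state)   -> action
#     env.reset()        -> initial state
#     env.step(action)   -> (state, reward, done, info)
#     info["metrics"]    -> per-episode metrics dict (read on the final step)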

def evaluate_agent(env, agent, num_episodes=50, label="Agent"):
    """Evaluate an agent over multiple episodes.

    Returns:
        Dict with aggregated metrics.
    """
    all_rewards = []
    all_metrics = []
    all_steps = []

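    # Roll out each episode to completion, accumulating reward and step count.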
    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False
        steps = 0

        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            total_reward += reward
            steps += 1

        all_rewards.append(total_reward)
        all_steps.append(steps)

        metrics = info.get("metrics", {})
        all_metrics.append(metrics)

    # Aggregate
    avg_reward = sum(all_rewards) / len(all_rewards)
    avg_steps = sum(all_steps) / len(all_steps)

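    # Average each metric across episodes, keyed off the first episode's
    # metric names; episodes missing a key contribute 0.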
    avg_metrics = {}
    if all_metrics:
        for key in all_metrics[0]:
            values = [m.get(key, 0) for m in all_metrics]
            avg_metrics[key] = sum(values) / len(values)

    return {
        "label": label,
        "avg_reward": round(avg_reward, 2),
        "avg_steps": round(avg_steps, 1),
        "best_reward": round(max(all_rewards), 2),
        "worst_reward": round(min(all_rewards), 2),
        "metrics": {k: round(v, 3) for k, v in avg_metrics.items()},
    }


def main():
    """Run full evaluation comparison."""
    NUM_EVAL = 100  # evaluation episodes per agent
    os.makedirs("logs", exist_ok=True)  # ensure the log directory exists

    print("=" * 60)
    print("📊 AI Executive Assistant — Agent Evaluation")
    print("=" * 60)

    agents = {
        "Random": RandomAgent(seed=123),
        "Rule-Based": RuleBasedAgent(),
        "Q-Learning (untrained)": RLAgent(epsilon=0.0, seed=123),
    }

    results = []
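    # A fresh environment is built with the same seed for each agent, so
    # every agent is evaluated against the same task distribution.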
    for name, agent in agents.items():
        print(f"\n  Evaluating {name}...")
        env = ExecutiveAssistantEnv(difficulty="medium", seed=123)
        result = evaluate_agent(env, agent, NUM_EVAL, label=name)
        results.append(result)

    # Print results table
    print("\n" + "=" * 60)
    print("📈 EVALUATION RESULTS")
    print("=" * 60)
    print(f"{'Agent':<25} {'Avg Reward':>10} {'Completion':>12} {'Efficiency':>12}")
    print("-" * 60)

    for r in results:
        m = r["metrics"]
        print(
            f"{r['label']:<25} "
            f"{r['avg_reward']:>10.1f} "
            f"{m.get('task_completion_rate', 0):>12.1%} "
            f"{m.get('efficiency_score', 0):>12.1f}"
        )

    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()