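"""Baseline inference for the Email Triage OpenEnv environment.

Runs a simple LLM agent (gpt-4o-mini via the OpenAI API, or a rule-based
mock when OPENAI_API_KEY is unset) over the easy/medium/hard task levels
and prints each level's grade report.
"""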
import os
import json
from openai import OpenAI
from openenv.core.env import OpenEnv
from openenv.core.config import EnvConfig
from openenv.core.models import Action
from openenv.core.grader import create_grader

system_prompt = """You are an Email Triage AI Agent. You must decide the best action to take for an incoming email.
Possible actions:
0 = Ignore
1 = Reply
2 = Forward
3 = Archive (for newsletters/generic non-urgent internal updates)
4 = Delete (for spam)

Important Rules:
- If the email is clearly spam, Delete it (4).
- If the email is urgent or from a boss, NEVER Ignore (0), Archive (3), or Delete (4).
- If the email asks for a report/update and says "forward", Forward it (2).
- If the email is urgent and asks a question, Reply (1).
- If the email is a routine question from a colleague, Reply (1).

Return your answer strictly in the following JSON format:
{
    "action": <int>
}
"""

def get_agent_action(client: OpenAI, email) -> int:
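    """Return a triage action id (0-4) for the email, via the LLM or, without a client, simple rules."""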
    if client is None:
        # No API key: fall back to simple rules (spam -> Delete, urgent -> Reply, else Archive).
        if email.is_spam: return 4
        if email.is_urgent: return 1
        return 3

    human_prompt = f"Sender: {email.sender}\nSubject: {email.subject}\nBody:\n{email.body}"
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.0
        )
        data = json.loads(response.choices[0].message.content)
        action = int(data.get("action", 3))
        # Guard against out-of-range or non-numeric values from the model.
        return action if action in range(5) else 3
    except Exception as e:
        print(f"OpenAI error: {e}")
        return 3  # default to Archive on API error

def main():
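    """Evaluate the baseline agent on each task level and print its grade report."""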
    api_key = os.environ.get("OPENAI_API_KEY", "")
    client = OpenAI(api_key=api_key) if api_key else None
    if not client:
        print("Warning: OPENAI_API_KEY not set. Using mocked basic agent.")

    print("Running Baseline Inference on all Tasks...")
    
    # Per-level grading criteria (assumed to mirror the YAML task configs);
    # harder levels weight critical_safety more heavily.
    task_configs = {
        'easy': {'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}]},
        'medium': {'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}]},
        'hard': {'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}]}
    }
    
    for level in ["easy", "medium", "hard"]:
        print(f"\n--- Testing Level: {level.upper()} ---")
        config = EnvConfig(task_level=level, verbose=False)
        env = OpenEnv(config=config)
        grader = create_grader(level, task_configs[level])
        
        obs, info = env.reset(seed=42)
        grader.reset()
        
        while obs.current_email is not None:
            action_int = get_agent_action(client, obs.current_email)
            obs, reward, terminated, truncated, info = env.step(Action(action_type=action_int))
            grader.update(**info)
            if terminated or truncated:
                break
                
        report = grader.get_grade_report()
        print(f"Final Score: {report['final_score']:.2f} / 1.0 (Passed? {report['passed']})")
        print(f"Accuracy: {report['criteria_scores'].get('accuracy', 0):.2f}")
        print(f"Safety: {report['criteria_scores'].get('critical_safety', 0):.2f}")
        print(f"Metrics: Incorrect={report['episode_data']['incorrect_actions']}, Critical Failures={report['episode_data']['critical_failures']}")

if __name__ == "__main__":
    main()