File size: 4,703 Bytes
4b77608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import json
from typing import Optional, List
from openai import OpenAI
from openenv.core.env import OpenEnv
from openenv.core.config import EnvConfig
from openenv.core.models import Action
from openenv.core.grader import create_grader

# Hackathon Variables
# Each setting is read from the environment. Using `or` (instead of a getenv
# default) means an unset variable *and* an empty-string value both fall back
# to the hard-coded default.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://api.openai.com/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
# Passed to the OpenAI client as its api_key in main(). While it is left at
# "dummy", get_agent_action() skips the API call and uses its built-in rules.
HF_TOKEN = os.getenv("HF_TOKEN") or "dummy"
# Environment label printed in the [START] log line.
BENCHMARK = "OpenEnv Email Triage"

# System message sent with every chat-completion request. It lists the five
# action codes (0-4), the triage rules, and the JSON reply shape that
# get_agent_action() parses ({"action": <int>}).
system_prompt = """You are an Email Triage AI Agent. You must decide the best action to take for an incoming email.
Possible actions:
0 = Ignore
1 = Reply
2 = Forward
3 = Archive (for newsletters/generic non-urgent internal updates)
4 = Delete (for spam)

Important Rules:
- If the email is clearly spam, Delete it (4).
- If the email is urgent/from a boss, NEVER Ignore (0) or Delete (4) or Archive (3).
- If the email asks for a report/update and says "forward", Forward it (2).
- If the email is urgent and asks a question, Reply (1).
- If the email is a routine question from a colleague, Reply (1).

Return your answer strictly in the following JSON format:
{
    "action": <int>
}
"""

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task run.

    Args:
        task: Task/difficulty label for the run.
        env: Name of the benchmark environment.
        model: Name of the model driving the agent.

    The record is a single space-separated line written to stdout and
    flushed immediately.
    """
    parts = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    print(" ".join(parts), flush=True)

def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record describing a single environment step.

    Args:
        step: Step counter for the current run.
        action: Action taken, already rendered as text.
        reward: Reward for the step; printed with two decimals.
        done: Whether the episode finished; printed in lowercase.
        error: Error text for the step. ``None`` or an empty string is
            printed as ``null``.

    The record is a single line written to stdout and flushed immediately.
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)

def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task run.

    Args:
        success: Whether the run passed; printed in lowercase.
        steps: Number of steps taken.
        score: Final score; printed with three decimals.
        rewards: Per-step rewards; each is printed with two decimals and
            the values are joined by commas (an empty list prints nothing
            after ``rewards=``).

    The record is a single line written to stdout and flushed immediately.
    """
    formatted_rewards = [format(value, ".2f") for value in rewards]
    summary = "[END] success={} steps={} score={} rewards={}".format(
        str(success).lower(),
        steps,
        format(score, ".3f"),
        ",".join(formatted_rewards),
    )
    print(summary, flush=True)

def get_agent_action(client: OpenAI, email) -> int:
    """Choose a triage action code (0-4) for ``email``.

    When the client has no usable API key (empty, or the ``"dummy"``
    placeholder) a fixed rule set is applied without any network call:
    spam -> 4 (Delete), urgent -> 1 (Reply), anything else -> 3 (Archive).

    Otherwise the model is asked for a JSON object of the form
    ``{"action": <int>}``. This path is best-effort: any failure (request
    error, unparsable reply, non-integer or out-of-range action) yields
    3 (Archive), and the reason is written to stderr so it is not lost and
    the stdout log records stay untouched.

    Args:
        client: OpenAI client; only ``api_key`` and
            ``chat.completions.create`` are used.
        email: Object exposing ``is_spam`` / ``is_urgent`` (rule path) or
            ``sender`` / ``subject`` / ``body`` (model path).

    Returns:
        An integer action code in the range 0-4.
    """
    # Local import: only needed for the stderr diagnostics below.
    import sys

    fallback_action = 3  # Archive
    valid_actions = range(5)  # the codes 0..4 listed in the system prompt

    if not client.api_key or client.api_key == "dummy":
        if email.is_spam:
            return 4
        if email.is_urgent:
            return 1
        return fallback_action

    human_prompt = f"Sender: {email.sender}\nSubject: {email.subject}\nBody:\n{email.body}"
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        data = json.loads(response.choices[0].message.content)
        action = int(data.get("action", fallback_action))
    except Exception as exc:
        # Deliberate best-effort boundary: never let a model/API problem
        # abort the episode, but do report why the fallback was used.
        print(
            f"[DEBUG] get_agent_action fallback to {fallback_action}: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
        return fallback_action

    if action not in valid_actions:
        # The model answered with a code the environment does not define.
        print(
            f"[DEBUG] get_agent_action got out-of-range action {action}; "
            f"using {fallback_action}",
            file=sys.stderr,
            flush=True,
        )
        return fallback_action
    return action

def main():
    """Run the agent once on each task level and print the run log.

    For every level ("easy", "medium", "hard") this prints a [START]
    record, steps the environment until no email is left or the episode
    reports done, prints one [STEP] record per step, and finishes with an
    [END] record built from the grader's report.

    The [END] record is always printed and the environment is always
    closed, even if the episode fails part-way; in that case [END] reports
    success=false, score 0.000 and whatever steps/rewards were collected,
    and the original exception still propagates.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    # Grader criteria per level; insertion order is the order levels are run.
    task_configs = {
        'easy': {'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}]},
        'medium': {'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}]},
        'hard': {'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}]}
    }

    for level, grader_config in task_configs.items():
        log_start(task=level, env=BENCHMARK, model=MODEL_NAME)

        config = EnvConfig(task_level=level, verbose=False)
        env = OpenEnv(config=config)

        # Defaults reported by [END] if the episode fails before grading.
        rewards = []
        steps_taken = 0
        success = False
        score = 0.0

        try:
            # Created inside the try so a grader failure still closes env.
            grader = create_grader(level, grader_config)
            obs, info = env.reset(seed=42)
            grader.reset()

            while obs.current_email is not None:
                steps_taken += 1
                error = None

                try:
                    action_int = get_agent_action(client, obs.current_email)
                except Exception as e:
                    # Fall back to Archive (3) and surface the reason in the step log.
                    action_int = 3
                    error = str(e)

                obs, reward, terminated, truncated, info = env.step(Action(action_type=action_int))
                grader.update(**info)

                rewards.append(reward)
                done = terminated or truncated

                log_step(step=steps_taken, action=str(action_int), reward=reward, done=done, error=error)

                if done:
                    break

            report = grader.get_grade_report()
            success = report['passed']
            score = report['final_score']
        finally:
            # [END] is emitted before closing so the successful-run output
            # order is unchanged; the nested finally guarantees close()
            # runs even if logging itself fails.
            try:
                log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
            finally:
                try:
                    env.close()
                except Exception as e:
                    print(f"[DEBUG] env.close() error: {e}", flush=True)

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()