import json
import os

from openai import OpenAI

from openenv.core.config import EnvConfig
from openenv.core.env import OpenEnv
from openenv.core.grader import create_grader
from openenv.core.models import Action

# Fallback action used whenever the model's answer is missing or unusable.
# 3 = Archive: the safest neutral choice (never deletes, never ignores urgent
# mail outright in a way the grader penalizes hardest).
DEFAULT_ACTION = 3

# Valid action codes accepted by the environment (see prompt below).
MIN_ACTION, MAX_ACTION = 0, 4

system_prompt = """You are an Email Triage AI Agent. You must decide the best action to take for an incoming email.

Possible actions:
0 = Ignore
1 = Reply
2 = Forward
3 = Archive (for newsletters/generic non-urgent internal updates)
4 = Delete (for spam)

Important Rules:
- If the email is clearly spam, Delete it (4).
- If the email is urgent/from a boss, NEVER Ignore (0) or Delete (4) or Archive (3).
- If the email asks for a report/update and says "forward", Forward it (2).
- If the email is urgent and asks a question, Reply (1).
- If the email is a routine question from a colleague, Reply (1).

Return your answer strictly in the following JSON format:
{"action": <integer from 0 to 4>}
"""


def get_agent_action(client: OpenAI, email) -> int:
    """Decide the triage action (0-4) for *email*.

    Args:
        client: An initialized OpenAI client, or ``None`` to use a cheap
            rule-based mock (spam -> Delete, urgent -> Reply, else Archive).
        email: An email object exposing at least ``sender``, ``subject``,
            ``body``, ``is_spam`` and ``is_urgent`` attributes.
            # NOTE(review): attribute set inferred from usage here — confirm
            # against the environment's email model.

    Returns:
        An integer action code in [0, 4]. Falls back to ``DEFAULT_ACTION``
        (Archive) on API errors or malformed/out-of-range model output.
    """
    if client is None:
        # Mocking basic behavior if no API key is available.
        if email.is_spam:
            return 4
        if email.is_urgent:
            return 1
        return DEFAULT_ACTION

    human_prompt = f"Sender: {email.sender}\nSubject: {email.subject}\nBody:\n{email.body}"
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        data = json.loads(response.choices[0].message.content)
        # Don't trust the model blindly: coerce to int and range-check so a
        # stray string/float/out-of-range value can't reach env.step().
        action = int(data.get("action", DEFAULT_ACTION))
    except Exception as e:  # broad by design: any API/parse failure -> safe default
        print(f"OpenAI error: {e}")
        return DEFAULT_ACTION
    if MIN_ACTION <= action <= MAX_ACTION:
        return action
    return DEFAULT_ACTION


def main():
    """Run the baseline agent over every task level and print grade reports."""
    api_key = os.environ.get("OPENAI_API_KEY", "")
    client = OpenAI(api_key=api_key) if api_key else None
    if not client:
        print("Warning: OPENAI_API_KEY not set. Using mocked basic agent.")

    print("Running Baseline Inference on all Tasks...")

    # Grading criteria per difficulty level; safety weight grows with level.
    # (Assumed to mirror the project's yaml task configs.)
    task_configs = {
        'easy': {'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}]},
        'medium': {'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}]},
        'hard': {'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}]},
    }

    for level in ["easy", "medium", "hard"]:
        print(f"\n--- Testing Level: {level.upper()} ---")
        config = EnvConfig(task_level=level, verbose=False)
        env = OpenEnv(config=config)
        grader = create_grader(level, task_configs[level])

        obs, info = env.reset(seed=42)  # fixed seed for reproducible episodes
        grader.reset()

        # The episode ends when the inbox is exhausted or the env terminates.
        while obs.current_email is not None:
            action_int = get_agent_action(client, obs.current_email)
            obs, reward, terminated, truncated, info = env.step(Action(action_type=action_int))
            grader.update(**info)
            if terminated:
                break

        report = grader.get_grade_report()
        print(f"Final Score: {report['final_score']:.2f} / 1.0 (Passed? {report['passed']})")
        print(f"Accuracy: {report['criteria_scores'].get('accuracy', 0):.2f}")
        print(f"Safety: {report['criteria_scores'].get('critical_safety', 0):.2f}")
        print(f"Metrics: Incorrect={report['episode_data']['incorrect_actions']}, Critical Failures={report['episode_data']['critical_failures']}")


if __name__ == "__main__":
    main()