# OpenEnv / scripts / baseline.py
# Initial OpenEnv Email Triage submission (author: mahammadaftab, commit 4b77608).
import os
import json
from openai import OpenAI
from openenv.core.env import OpenEnv
from openenv.core.config import EnvConfig
from openenv.core.models import Action
from openenv.core.grader import create_grader
# System prompt sent verbatim to the chat API: defines the 0-4 action space
# and the hard triage rules, and instructs the model to answer as JSON with a
# single integer "action" key (parsed by get_agent_action below).
system_prompt = """You are an Email Triage AI Agent. You must decide the best action to take for an incoming email.
Possible actions:
0 = Ignore
1 = Reply
2 = Forward
3 = Archive (for newsletters/generic non-urgent internal updates)
4 = Delete (for spam)
Important Rules:
- If the email is clearly spam, Delete it (4).
- If the email is urgent/from a boss, NEVER Ignore (0) or Delete (4) or Archive (3).
- If the email asks for a report/update and says "forward", Forward it (2).
- If the email is urgent and asks a question, Reply (1).
- If the email is a routine question from a colleague, Reply (1).
Return your answer strictly in the following JSON format:
{
"action": <int>
}
"""
def get_agent_action(client: OpenAI, email) -> int:
    """Decide a triage action for *email*.

    Args:
        client: OpenAI client, or ``None`` to use a deterministic
            rule-based fallback (no API call).
        email: Object exposing ``sender``, ``subject``, ``body`` and the
            boolean flags ``is_spam`` / ``is_urgent``.

    Returns:
        Integer action code: 0=Ignore, 1=Reply, 2=Forward, 3=Archive,
        4=Delete. Falls back to 3 (Archive) on any API or parsing failure.
    """
    if client is None:
        # Rule-based fallback mirroring the prompt's hard rules:
        # spam -> Delete, urgent -> Reply, everything else -> Archive.
        if email.is_spam:
            return 4
        if email.is_urgent:
            return 1
        return 3

    human_prompt = f"Sender: {email.sender}\nSubject: {email.subject}\nBody:\n{email.body}"
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt},
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        data = json.loads(response.choices[0].message.content)
        # Validate the model's answer: coerce string-typed ints and clamp
        # to the 0-4 action space; int() failures fall through to except.
        action = int(data.get("action", 3))
        return action if 0 <= action <= 4 else 3
    except Exception as e:
        # Boundary catch: any API/JSON failure degrades to the safe default.
        print(f"OpenAI error: {e}")
        return 3  # default to archive
def main():
    """Run the baseline agent over every task level and print grade reports."""
    api_key = os.environ.get("OPENAI_API_KEY", "")
    client = OpenAI(api_key=api_key) if api_key else None
    if not client:
        print("Warning: OPENAI_API_KEY not set. Using mocked basic agent.")
    print("Running Baseline Inference on all Tasks...")
    # Grading criteria per difficulty level; mirrors the yaml task configs.
    # Harder levels weight critical_safety more heavily.
    task_configs = {
        'easy': {'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}]},
        'medium': {'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}]},
        'hard': {'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}]}
    }
    for level in ["easy", "medium", "hard"]:
        print(f"\n--- Testing Level: {level.upper()} ---")
        config = EnvConfig(task_level=level, verbose=False)
        env = OpenEnv(config=config)
        grader = create_grader(level, task_configs[level])
        obs, info = env.reset(seed=42)
        grader.reset()
        # Step until the env signals completion: no email left, or the
        # episode is terminated/truncated.
        while obs.current_email is not None:
            action_int = get_agent_action(client, obs.current_email)
            obs, reward, terminated, truncated, info = env.step(Action(action_type=action_int))
            grader.update(**info)
            # Stop on truncation too, not just termination, so we never
            # keep stepping an already-finished episode.
            if terminated or truncated:
                break
        report = grader.get_grade_report()
        print(f"Final Score: {report['final_score']:.2f} / 1.0 (Passed? {report['passed']})")
        print(f"Accuracy: {report['criteria_scores'].get('accuracy', 0):.2f}")
        print(f"Safety: {report['criteria_scores'].get('critical_safety', 0):.2f}")
        print(f"Metrics: Incorrect={report['episode_data']['incorrect_actions']}, Critical Failures={report['episode_data']['critical_failures']}")
if __name__ == "__main__":
    main()