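"""Baseline inference for the Email Triage OpenEnv environment.

Runs a simple LLM agent (gpt-4o-mini via the OpenAI API, or a rule-based
mock when OPENAI_API_KEY is unset) over the easy/medium/hard task levels
and prints each level's grade report.
"""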
import os
import json
from openai import OpenAI
from openenv.core.env import OpenEnv
from openenv.core.config import EnvConfig
from openenv.core.models import Action
from openenv.core.grader import create_grader

system_prompt = """You are an Email Triage AI Agent. You must decide the best action to take for an incoming email.
Possible actions:
0 = Ignore
1 = Reply
2 = Forward
3 = Archive (for newsletters/generic non-urgent internal updates)
4 = Delete (for spam)

Important Rules:
- If the email is clearly spam, Delete it (4).
- If the email is urgent or from a boss, NEVER Ignore (0), Archive (3), or Delete (4).
- If the email asks for a report/update and says "forward", Forward it (2).
- If the email is urgent and asks a question, Reply (1).
- If the email is a routine question from a colleague, Reply (1).

Return your answer strictly in the following JSON format:
{
    "action": <int>
}
"""

def get_agent_action(client: OpenAI, email) -> int:
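    """Return a triage action id (0-4) for the email, via the LLM or, without a client, simple rules."""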
    if client is None:
        # No API key: fall back to simple rules (spam -> Delete, urgent -> Reply, else Archive).
        if email.is_spam: return 4
        if email.is_urgent: return 1
        return 3

    human_prompt = f"Sender: {email.sender}\nSubject: {email.subject}\nBody:\n{email.body}"
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.0
        )
        data = json.loads(response.choices[0].message.content)
        action = int(data.get("action", 3))
        # Guard against out-of-range or non-numeric values from the model.
        return action if action in range(5) else 3
    except Exception as e:
        print(f"OpenAI error: {e}")
        return 3  # default to Archive on API error

def main():
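    """Evaluate the baseline agent on each task level and print its grade report."""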
    api_key = os.environ.get("OPENAI_API_KEY", "")
    client = OpenAI(api_key=api_key) if api_key else None
    if not client:
        print("Warning: OPENAI_API_KEY not set. Using mocked basic agent.")

    print("Running Baseline Inference on all Tasks...")
    
    # Per-level grading criteria (assumed to mirror the YAML task configs);
    # harder levels weight critical_safety more heavily.
    task_configs = {
        'easy': {'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}]},
        'medium': {'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}]},
        'hard': {'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}]}
    }
    
    for level in ["easy", "medium", "hard"]:
        print(f"\n--- Testing Level: {level.upper()} ---")
        config = EnvConfig(task_level=level, verbose=False)
        env = OpenEnv(config=config)
        grader = create_grader(level, task_configs[level])
        
        obs, info = env.reset(seed=42)
        grader.reset()
        
        while obs.current_email is not None:
            action_int = get_agent_action(client, obs.current_email)
            obs, reward, terminated, truncated, info = env.step(Action(action_type=action_int))
            grader.update(**info)
            if terminated or truncated:
                break
                
        report = grader.get_grade_report()
        print(f"Final Score: {report['final_score']:.2f} / 1.0 (Passed? {report['passed']})")
        print(f"Accuracy: {report['criteria_scores'].get('accuracy', 0):.2f}")
        print(f"Safety: {report['criteria_scores'].get('critical_safety', 0):.2f}")
        print(f"Metrics: Incorrect={report['episode_data']['incorrect_actions']}, Critical Failures={report['episode_data']['critical_failures']}")

if __name__ == "__main__":
    main()