""" Grader for Email Triage Environment. Two grading methods: 1. Programmatic check - exact match 2. LLM judge prompt - for evaluation panel """ import requests BASE = "http://localhost:8000" CATEGORIES = ["billing", "security", "order_update", "scheduling", "promotional"] def run_episode(agent_category: str) -> dict: """Run one full episode and return the result.""" requests.post(f"{BASE}/reset", json={}) email_resp = requests.post(f"{BASE}/step", json={ "action": {"type": "call_tool", "tool_name": "read_email", "arguments": {}} }).json() email = email_resp["observation"]["result"]["structured_content"] result_resp = requests.post(f"{BASE}/step", json={ "action": {"type": "call_tool", "tool_name": "classify_email", "arguments": {"category": agent_category}} }).json() result = result_resp["observation"]["result"]["structured_content"] return { "email_subject": email["subject"], "email_body": email["body"], "agent_answer": result["your_answer"], "correct_answer": result["correct_answer"], "correct": result["correct"], "reward": result_resp["reward"], } def programmatic_check(agent_answer: str, correct_answer: str) -> bool: """Hard check — exact string match.""" return agent_answer.strip().lower() == correct_answer.strip().lower() # LLM Judge prompt — paste this into your submission LLM_JUDGE_PROMPT = """ You are evaluating an AI email triage agent. Email Subject: {email_subject} Email Body: {email_body} Available categories: billing, security, order_update, scheduling, promotional Agent's answer: {agent_answer} Correct answer: {correct_answer} Score the agent: - 1.0 if the agent picked the correct category - 0.5 if the agent picked a reasonable but incorrect category - 0.0 if the agent picked a clearly wrong category Reply with ONLY a number: 0.0, 0.5, or 1.0 """ if __name__ == "__main__": print("=== Running 5 graded episodes ===\n") total_reward = 0 for i in range(5): # Simulate agent always guessing "billing" (replace with real agent later) result = run_episode(agent_category="billing") passed = programmatic_check(result["agent_answer"], result["correct_answer"]) total_reward += result["reward"] print(f"Episode {i+1}:") print(f" Subject: {result['email_subject']}") print(f" Agent: {result['agent_answer']} | Correct: {result['correct_answer']}") print(f" Passed: {passed} | Reward: {result['reward']}") print() print(f"Total reward: {total_reward}/5") print(f"Accuracy: {total_reward/5*100:.0f}%")