Spaces:
No application file
No application file
| """ | |
| Grader for Email Triage Environment. | |
| Two grading methods: | |
| 1. Programmatic check - exact match | |
| 2. LLM judge prompt - for evaluation panel | |
| """ | |
import requests

# Base URL of the locally running environment server.
BASE = "http://localhost:8000"

# Closed set of labels the environment's classify_email tool accepts.
CATEGORIES = ["billing", "security", "order_update", "scheduling", "promotional"]
def run_episode(agent_category: str, timeout: float = 10.0) -> dict:
    """Run one full episode against the environment and return the result.

    Resets the environment, reads the email via the ``read_email`` tool,
    classifies it with *agent_category* via ``classify_email``, and returns
    a summary of the graded outcome.

    Args:
        agent_category: Category the agent chooses (expected to be one of
            CATEGORIES — the server grades it either way).
        timeout: Per-request timeout in seconds, so a hung server cannot
            block the grader forever.

    Returns:
        dict with keys: ``email_subject``, ``email_body``, ``agent_answer``,
        ``correct_answer``, ``correct``, ``reward``.

    Raises:
        requests.HTTPError: if any environment call returns an error status
            (surfaced early instead of a confusing KeyError on the payload).
        requests.Timeout: if a call exceeds *timeout* seconds.
    """
    # Fresh episode: the server selects the email to classify on reset.
    reset_resp = requests.post(f"{BASE}/reset", json={}, timeout=timeout)
    reset_resp.raise_for_status()

    # Step 1: read the email the agent must classify.
    email_resp = requests.post(
        f"{BASE}/step",
        json={"action": {"type": "call_tool", "tool_name": "read_email",
                         "arguments": {}}},
        timeout=timeout,
    )
    email_resp.raise_for_status()
    email = email_resp.json()["observation"]["result"]["structured_content"]

    # Step 2: submit the classification and collect the graded result.
    result_resp = requests.post(
        f"{BASE}/step",
        json={"action": {"type": "call_tool", "tool_name": "classify_email",
                         "arguments": {"category": agent_category}}},
        timeout=timeout,
    )
    result_resp.raise_for_status()
    result_payload = result_resp.json()
    result = result_payload["observation"]["result"]["structured_content"]

    return {
        "email_subject": email["subject"],
        "email_body": email["body"],
        "agent_answer": result["your_answer"],
        "correct_answer": result["correct_answer"],
        "correct": result["correct"],
        "reward": result_payload["reward"],
    }
def programmatic_check(agent_answer: str, correct_answer: str) -> bool:
    """Return True iff the agent's label matches the gold label exactly.

    The comparison ignores surrounding whitespace and letter case; no
    partial credit is given.
    """
    given = agent_answer.strip().lower()
    expected = correct_answer.strip().lower()
    return given == expected
# LLM Judge prompt — paste this into your submission
# Usage: LLM_JUDGE_PROMPT.format(**run_episode(...)) — the placeholder names
# below match keys of the dict returned by run_episode(); extra keys in that
# dict ("correct", "reward") are ignored by str.format.
LLM_JUDGE_PROMPT = """
You are evaluating an AI email triage agent.
Email Subject: {email_subject}
Email Body: {email_body}
Available categories: billing, security, order_update, scheduling, promotional
Agent's answer: {agent_answer}
Correct answer: {correct_answer}
Score the agent:
- 1.0 if the agent picked the correct category
- 0.5 if the agent picked a reasonable but incorrect category
- 0.0 if the agent picked a clearly wrong category
Reply with ONLY a number: 0.0, 0.5, or 1.0
"""
| if __name__ == "__main__": | |
| print("=== Running 5 graded episodes ===\n") | |
| total_reward = 0 | |
| for i in range(5): | |
| # Simulate agent always guessing "billing" (replace with real agent later) | |
| result = run_episode(agent_category="billing") | |
| passed = programmatic_check(result["agent_answer"], result["correct_answer"]) | |
| total_reward += result["reward"] | |
| print(f"Episode {i+1}:") | |
| print(f" Subject: {result['email_subject']}") | |
| print(f" Agent: {result['agent_answer']} | Correct: {result['correct_answer']}") | |
| print(f" Passed: {passed} | Reward: {result['reward']}") | |
| print() | |
| print(f"Total reward: {total_reward}/5") | |
| print(f"Accuracy: {total_reward/5*100:.0f}%") | |