email-triage-env / grader.py
anugrah55's picture
Upload folder using huggingface_hub
520bc7a verified
"""
Grader for Email Triage Environment.
Two grading methods:
1. Programmatic check - case-insensitive match after trimming whitespace
2. LLM judge prompt - for evaluation panel
"""
import requests
# Root URL of the environment server that the grader sends requests to.
BASE = "http://localhost:8000"

# Category labels of the email triage task.
CATEGORIES = [
    "billing",
    "security",
    "order_update",
    "scheduling",
    "promotional",
]
def run_episode(agent_category: str, *, timeout: float = 30.0) -> dict:
    """Run one full episode against the environment server and return the result.

    An episode is three POST requests to ``BASE``: ``/reset``, then a ``/step``
    that calls the ``read_email`` tool, then a ``/step`` that calls
    ``classify_email`` with ``agent_category``.

    Args:
        agent_category: Category the agent assigns to the email.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout, ``requests`` would block indefinitely on a
            stalled server.

    Returns:
        A dict with the keys ``email_subject``, ``email_body``,
        ``agent_answer``, ``correct_answer``, ``correct`` and ``reward``,
        copied from the server's responses.

    Raises:
        requests.HTTPError: If the server answers a request with a 4xx/5xx status.
        requests.Timeout: If a request takes longer than ``timeout``.
    """

    def _call_tool(tool_name: str, arguments: dict) -> dict:
        """POST one ``call_tool`` action to ``/step`` and return the decoded JSON."""
        response = requests.post(
            f"{BASE}/step",
            json={
                "action": {
                    "type": "call_tool",
                    "tool_name": tool_name,
                    "arguments": arguments,
                }
            },
            timeout=timeout,
        )
        # Fail loudly on an HTTP error instead of hitting a confusing KeyError
        # (or JSON decode error) further down.
        response.raise_for_status()
        return response.json()

    # Start a fresh episode; the reset response body is not used.
    reset_resp = requests.post(f"{BASE}/reset", json={}, timeout=timeout)
    reset_resp.raise_for_status()

    email_resp = _call_tool("read_email", {})
    email = email_resp["observation"]["result"]["structured_content"]

    result_resp = _call_tool("classify_email", {"category": agent_category})
    result = result_resp["observation"]["result"]["structured_content"]

    return {
        "email_subject": email["subject"],
        "email_body": email["body"],
        "agent_answer": result["your_answer"],
        "correct_answer": result["correct_answer"],
        "correct": result["correct"],
        "reward": result_resp["reward"],
    }
def programmatic_check(agent_answer: str, correct_answer: str) -> bool:
    """Hard check: compare the two answers after normalization.

    Both strings have surrounding whitespace stripped and are lower-cased
    before being compared for equality.

    Args:
        agent_answer: Category produced by the agent.
        correct_answer: Expected category.

    Returns:
        True when the normalized strings are equal, False otherwise.
    """
    normalized_agent = agent_answer.strip().lower()
    normalized_expected = correct_answer.strip().lower()
    return normalized_agent == normalized_expected
# LLM judge prompt template — paste this into your submission.
# The brace placeholders ({email_subject}, {email_body}, {agent_answer},
# {correct_answer}) are presumably meant to be filled with str.format using
# the dict returned by run_episode — no caller is visible here, so confirm.
# NOTE: the category list is written out inline in the prompt and duplicates
# CATEGORIES; keep the two in sync if either changes.
LLM_JUDGE_PROMPT = """
You are evaluating an AI email triage agent.
Email Subject: {email_subject}
Email Body: {email_body}
Available categories: billing, security, order_update, scheduling, promotional
Agent's answer: {agent_answer}
Correct answer: {correct_answer}
Score the agent:
- 1.0 if the agent picked the correct category
- 0.5 if the agent picked a reasonable but incorrect category
- 0.0 if the agent picked a clearly wrong category
Reply with ONLY a number: 0.0, 0.5, or 1.0
"""
if __name__ == "__main__":
    # Demo driver: run a fixed number of episodes with a placeholder agent and
    # print a per-episode report followed by summary statistics.
    NUM_EPISODES = 5  # single source of truth for the episode count

    print(f"=== Running {NUM_EPISODES} graded episodes ===\n")
    total_reward = 0
    passed_count = 0
    for episode_number in range(1, NUM_EPISODES + 1):
        # Simulate agent always guessing "billing" (replace with real agent later)
        result = run_episode(agent_category="billing")
        passed = programmatic_check(result["agent_answer"], result["correct_answer"])
        total_reward += result["reward"]
        passed_count += passed  # bool counts as 0/1
        print(f"Episode {episode_number}:")
        print(f" Subject: {result['email_subject']}")
        print(f" Agent: {result['agent_answer']} | Correct: {result['correct_answer']}")
        print(f" Passed: {passed} | Reward: {result['reward']}")
        print()
    print(f"Total reward: {total_reward}/{NUM_EPISODES}")
    # Accuracy is the share of episodes that passed the programmatic check;
    # deriving it from the reward sum is only right if rewards are exactly 0/1.
    print(f"Accuracy: {passed_count / NUM_EPISODES * 100:.0f}%")