Spaces:
No application file
No application file
| """ | |
| Grader for Email Triage Environment. | |
| Two grading methods: | |
| 1. Programmatic check - exact match | |
| 2. LLM judge prompt - for evaluation panel | |
| """ | |
import requests

# Base URL of the locally running environment server.
BASE = "http://localhost:8000"

# Closed set of labels the environment's classify_email tool accepts.
CATEGORIES = ["billing", "security", "order_update", "scheduling", "promotional"]
def run_episode(agent_category: str, timeout: float = 10.0) -> dict:
    """Run one full episode against the environment and return the result.

    Resets the environment, reads the email via the ``read_email`` tool,
    classifies it with *agent_category* via ``classify_email``, and returns
    a summary of the graded outcome.

    Args:
        agent_category: Category the agent chooses (expected to be one of
            CATEGORIES — the server grades it either way).
        timeout: Per-request timeout in seconds, so a hung server cannot
            block the grader forever.

    Returns:
        dict with keys: ``email_subject``, ``email_body``, ``agent_answer``,
        ``correct_answer``, ``correct``, ``reward``.

    Raises:
        requests.HTTPError: if any environment call returns an error status
            (surfaced early instead of a confusing KeyError on the payload).
        requests.Timeout: if a call exceeds *timeout* seconds.
    """
    # Fresh episode: the server selects the email to classify on reset.
    reset_resp = requests.post(f"{BASE}/reset", json={}, timeout=timeout)
    reset_resp.raise_for_status()

    # Step 1: read the email the agent must classify.
    email_resp = requests.post(
        f"{BASE}/step",
        json={"action": {"type": "call_tool", "tool_name": "read_email",
                         "arguments": {}}},
        timeout=timeout,
    )
    email_resp.raise_for_status()
    email = email_resp.json()["observation"]["result"]["structured_content"]

    # Step 2: submit the classification and collect the graded result.
    result_resp = requests.post(
        f"{BASE}/step",
        json={"action": {"type": "call_tool", "tool_name": "classify_email",
                         "arguments": {"category": agent_category}}},
        timeout=timeout,
    )
    result_resp.raise_for_status()
    result_payload = result_resp.json()
    result = result_payload["observation"]["result"]["structured_content"]

    return {
        "email_subject": email["subject"],
        "email_body": email["body"],
        "agent_answer": result["your_answer"],
        "correct_answer": result["correct_answer"],
        "correct": result["correct"],
        "reward": result_payload["reward"],
    }
def programmatic_check(agent_answer: str, correct_answer: str) -> bool:
    """Return True iff the agent's label matches the gold label exactly.

    The comparison ignores surrounding whitespace and letter case; no
    partial credit is given.
    """
    given = agent_answer.strip().lower()
    expected = correct_answer.strip().lower()
    return given == expected
# LLM Judge prompt — paste this into your submission
# Usage: LLM_JUDGE_PROMPT.format(**run_episode(...)) — the placeholder names
# below match keys of the dict returned by run_episode(); extra keys in that
# dict ("correct", "reward") are ignored by str.format.
LLM_JUDGE_PROMPT = """
You are evaluating an AI email triage agent.
Email Subject: {email_subject}
Email Body: {email_body}
Available categories: billing, security, order_update, scheduling, promotional
Agent's answer: {agent_answer}
Correct answer: {correct_answer}
Score the agent:
- 1.0 if the agent picked the correct category
- 0.5 if the agent picked a reasonable but incorrect category
- 0.0 if the agent picked a clearly wrong category
Reply with ONLY a number: 0.0, 0.5, or 1.0
"""
| if __name__ == "__main__": | |
| print("=== Running 5 graded episodes ===\n") | |
| total_reward = 0 | |
| for i in range(5): | |
| # Simulate agent always guessing "billing" (replace with real agent later) | |
| result = run_episode(agent_category="billing") | |
| passed = programmatic_check(result["agent_answer"], result["correct_answer"]) | |
| total_reward += result["reward"] | |
| print(f"Episode {i+1}:") | |
| print(f" Subject: {result['email_subject']}") | |
| print(f" Agent: {result['agent_answer']} | Correct: {result['correct_answer']}") | |
| print(f" Passed: {passed} | Reward: {result['reward']}") | |
| print() | |
| print(f"Total reward: {total_reward}/5") | |
| print(f"Accuracy: {total_reward/5*100:.0f}%") | |