"""
Grader for Email Triage Environment.
Two grading methods:
1. Programmatic check - exact match (ignoring case and surrounding whitespace)
2. LLM judge prompt - for evaluation panel
"""
import requests

# Base URL of the environment server; the /reset and /step endpoints are
# requested relative to it.
BASE = "http://localhost:8000"
# Category labels for the triage task. The same five labels are also spelled
# out inside LLM_JUDGE_PROMPT, so keep the two in sync when editing either.
CATEGORIES = ["billing", "security", "order_update", "scheduling", "promotional"]


def _post(path: str, payload: dict, timeout: float) -> "requests.Response":
    """POST *payload* as JSON to ``BASE + path`` and return the response.

    Raises ``requests.HTTPError`` on a 4xx/5xx status so a server-side
    failure is reported as such instead of surfacing later as a ``KeyError``
    on a missing field.
    """
    response = requests.post(f"{BASE}{path}", json=payload, timeout=timeout)
    response.raise_for_status()
    return response


def run_episode(agent_category: str, timeout: float = 30.0) -> dict:
    """Run one full episode against the environment and return the result.

    The episode is three HTTP calls: reset the environment, call the
    ``read_email`` tool, then call the ``classify_email`` tool with
    *agent_category*.

    Args:
        agent_category: Category label submitted as the agent's answer.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` would block indefinitely on an
            unresponsive server.

    Returns:
        A dict with the keys ``email_subject``, ``email_body``,
        ``agent_answer``, ``correct_answer``, ``correct`` and ``reward``,
        copied from the environment's responses.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If a response lacks one of the expected fields.
    """
    # The reset body is not needed; only its success matters.
    _post("/reset", {}, timeout)

    email_resp = _post(
        "/step",
        {"action": {"type": "call_tool", "tool_name": "read_email", "arguments": {}}},
        timeout,
    ).json()
    email = email_resp["observation"]["result"]["structured_content"]

    result_resp = _post(
        "/step",
        {
            "action": {
                "type": "call_tool",
                "tool_name": "classify_email",
                "arguments": {"category": agent_category},
            }
        },
        timeout,
    ).json()
    result = result_resp["observation"]["result"]["structured_content"]

    return {
        "email_subject": email["subject"],
        "email_body": email["body"],
        "agent_answer": result["your_answer"],
        "correct_answer": result["correct_answer"],
        "correct": result["correct"],
        # The reward lives at the top level of the step response, not inside
        # the tool's structured content.
        "reward": result_resp["reward"],
    }


def programmatic_check(agent_answer: str, correct_answer: str) -> bool:
    """Return True when the two answers are equal after normalisation.

    Both strings have surrounding whitespace stripped and are lower-cased
    before being compared, so ``" Billing "`` matches ``"billing"``.
    """
    def _normalise(text: str) -> str:
        return text.strip().lower()

    submitted = _normalise(agent_answer)
    expected = _normalise(correct_answer)
    return submitted == expected


# LLM judge prompt template — paste this into your submission.
# The {email_subject}, {email_body}, {agent_answer} and {correct_answer}
# placeholders must be filled in before the prompt is sent (presumably via
# str.format; nothing in this file does the substitution).
# The category list is written out literally below and is not derived from
# CATEGORIES, so update both if the label set changes.
LLM_JUDGE_PROMPT = """
You are evaluating an AI email triage agent.

Email Subject: {email_subject}
Email Body: {email_body}

Available categories: billing, security, order_update, scheduling, promotional

Agent's answer: {agent_answer}
Correct answer: {correct_answer}

Score the agent:
- 1.0 if the agent picked the correct category
- 0.5 if the agent picked a reasonable but incorrect category
- 0.0 if the agent picked a clearly wrong category

Reply with ONLY a number: 0.0, 0.5, or 1.0
"""


if __name__ == "__main__":
    # Smoke-test driver: runs a fixed number of episodes with a constant
    # answer and prints per-episode results plus a summary.
    # Single source of truth for the episode count, used by the banner, the
    # loop and both summary lines.
    num_episodes = 5
    print(f"=== Running {num_episodes} graded episodes ===\n")
    total_reward = 0
    num_passed = 0

    for episode_number in range(1, num_episodes + 1):
        # Simulate agent always guessing "billing" (replace with real agent later)
        result = run_episode(agent_category="billing")
        passed = programmatic_check(result["agent_answer"], result["correct_answer"])
        num_passed += passed  # bool counts as 0/1
        total_reward += result["reward"]

        print(f"Episode {episode_number}:")
        print(f"  Subject: {result['email_subject']}")
        print(f"  Agent: {result['agent_answer']} | Correct: {result['correct_answer']}")
        print(f"  Passed: {passed} | Reward: {result['reward']}")
        print()

    print(f"Total reward: {total_reward}/{num_episodes}")
    # Accuracy is the share of episodes that passed the programmatic check;
    # deriving it from the reward sum is only right if the environment pays
    # exactly 1 per correct answer and 0 otherwise.
    print(f"Accuracy: {num_passed / num_episodes * 100:.0f}%")