Spaces:

Sayed223
/

customer_support

Runtime error

App Files Files Community

Sayed223 committed 19 days ago

Commit

f672bcf

verified ·

1 Parent(s): 5bf4915

Upload 8 files

Browse files

Files changed (8) hide show

env/__init__.py +0 -0
env/environment.py +366 -0
env/models.py +130 -0
env/tickets.py +153 -0
graders/__init__.py +0 -0
graders/graders.py +194 -0
tests/__init__.py +0 -0
tests/test_env.py +226 -0

env/__init__.py ADDED Viewed

File without changes

env/environment.py ADDED Viewed

	@@ -0,0 +1,366 @@

+"""
+CustomerSupportEnv — Core environment implementing the OpenEnv spec.
+step(action)  → StepResult(observation, reward, done, info)
+reset()       → Observation
+state()       → Observation
+"""
+from __future__ import annotations
+import random
+from typing import Any, Dict, List, Optional, Tuple
+from env.models import (
+    Action, ActionType, Category, Message, Observation,
+    Priority, Reward, Sentiment, StepResult, TaskSpec, TicketStatus
+)
+from env.tickets import TICKETS, get_ticket
+# ── Reward constants ──────────────────────────────────────────────────────────
+# Per-action rewards and penalties applied by CustomerSupportEnv.step().
+R_SEARCH_KB        =  2.0   # first search_kb of an episode
+R_EMPATHIZE        =  1.0   # first empathize of an episode
+R_ASK_CLARIFY      =  1.0   # first ask_clarify of an episode
+R_OFFER_SOLUTION   =  3.0   # scaled by the solution quality score (0.0-1.0)
+R_RESOLVE_GOOD     =  5.0   # resolve after a solution or an escalation; CSAT * 2.0 is added on top
+R_RESOLVE_BAD      = -3.0   # resolve with neither a solution nor an escalation
+R_ESCALATE         = -1.0   # first escalate of an episode
+R_DUPLICATE_ACTION = -1.0   # repeated search_kb; doubled for a repeated escalate
+R_SKIP_KB_PENALTY  = -1.0   # offer_solution before any search_kb
+R_TIMEOUT          = -2.0   # turn budget exhausted without resolve
+# Weights summed by _compute_csat() for each interaction flag that is set.
+CSAT_WEIGHTS = {
+    "empathized": 0.3,
+    "kb_searched": 0.3,
+    "solution_offered": 0.4,
+}
+# Optimal trajectory (used for efficiency scoring)
+OPTIMAL_STEPS = 4  # search_kb, empathize, offer_solution, resolve
+# ── Task definitions ──────────────────────────────────────────────────────────
+# Registry of graded tasks keyed by task_id. Each TaskSpec names the ticket
+# (from env.tickets) the task replays, its turn budget (max_turns) and the
+# action sequence regarded as optimal for it.
+TASKS: Dict[str, TaskSpec] = {
+    "task_1": TaskSpec(
+        task_id="task_1",
+        name="Resolve a Standard Auth Ticket",
+        description=(
+            "Handle a frustrated customer locked out of their account. "
+            "The agent must search the knowledge base, acknowledge the "
+            "customer's frustration, offer a concrete solution, and resolve the ticket. "
+            "EASY: single-step fix, KB articles directly address the issue."
+        ),
+        difficulty="easy",
+        ticket_id="TKT-001",
+        success_criteria=[
+            "search_kb called before offer_solution",
+            "empathize called at least once",
+            "offer_solution payload mentions unlock or reset",
+            "resolve called to close episode"
+        ],
+        max_turns=8,
+        optimal_actions=["search_kb", "empathize", "offer_solution", "resolve"]
+    ),
+    "task_2": TaskSpec(
+        task_id="task_2",
+        name="Handle a Multi-Step Billing Dispute",
+        description=(
+            "Resolve a billing discrepancy for a customer who was overcharged after "
+            "a plan downgrade. The agent must clarify details, check the KB, diagnose "
+            "the root cause, provide a specific dollar credit, and confirm the fix. "
+            "MEDIUM: requires clarification before diagnosis; generic solutions penalised."
+        ),
+        difficulty="medium",
+        ticket_id="TKT-003",
+        success_criteria=[
+            "ask_clarify called at least once",
+            "search_kb called",
+            "offer_solution mentions credit or refund amount",
+            "resolve called"
+        ],
+        max_turns=10,
+        optimal_actions=["search_kb", "ask_clarify", "empathize", "offer_solution", "resolve"]
+    ),
+    "task_3": TaskSpec(
+        task_id="task_3",
+        name="Triage a Critical Time-Sensitive Bug Report",
+        description=(
+            "An enterprise customer has a compliance deadline tomorrow and a data export "
+            "stuck at 12% for 6 hours. The agent must quickly diagnose the issue, "
+            "deploy an immediate workaround (priority queue), offer a backup strategy "
+            "(partial export), and close with a monitoring commitment. "
+            "HARD: time pressure, two-part solution required, escalation penalised, "
+            "generic solutions score low."
+        ),
+        difficulty="hard",
+        ticket_id="TKT-006",
+        success_criteria=[
+            "search_kb called",
+            "offer_solution mentions priority queue AND partial export",
+            "solution demonstrates urgency awareness",
+            "resolve called without escalation"
+        ],
+        max_turns=8,
+        optimal_actions=["search_kb", "empathize", "ask_clarify", "offer_solution", "resolve"]
+    )
+}
+# ── Environment ───────────────────────────────────────────────────────────────
+class CustomerSupportEnv:
+    """
+    OpenEnv-compatible customer support RL environment.
+    Each instance is bound to one task from TASKS and replays that task's
+    ticket. The Observation held by the environment is mutated in place by
+    step(); reset(), step() and state() all hand back that same object.
+    Usage:
+        env = CustomerSupportEnv(task_id="task_1")
+        obs = env.reset()
+        result = env.step(Action(action_type="search_kb"))
+        current = env.state()
+    """
+    VERSION = "1.0.0"
+    def __init__(self, task_id: str = "task_1", seed: Optional[int] = None):
+        """
+        Bind the environment to a task and create its private RNG.
+        Args:
+            task_id: Key into TASKS selecting the task to run.
+            seed: Seed for the RNG that picks the canned agent/customer replies.
+        Raises:
+            ValueError: If task_id is not a key of TASKS.
+        """
+        if task_id not in TASKS:
+            raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(TASKS.keys())}")
+        self.task_id = task_id
+        self.task = TASKS[task_id]
+        self._seed = seed
+        self._rng = random.Random(seed)
+        # Placeholder observation (status IDLE) until reset() is called.
+        self._obs: Observation = self._make_idle_obs()
+    # ── OpenEnv API ───────────────────────────────────────────────────────────
+    def reset(self) -> Observation:
+        """
+        Reset the environment and return the initial observation.
+        The ticket's opening history is copied into fresh Message objects and
+        every progress flag is cleared. The RNG is not re-seeded here.
+        """
+        ticket_data = get_ticket(self.task.ticket_id)
+        history = [
+            Message(role=m["role"], text=m["text"], turn=m.get("turn", 0))
+            for m in ticket_data["history"]
+        ]
+        self._obs = Observation(
+            ticket_id=self.task.ticket_id,
+            task_id=self.task_id,
+            status=TicketStatus.OPEN,
+            sentiment=ticket_data["sentiment"],
+            priority=ticket_data["priority"],
+            category=ticket_data["category"],
+            turn=0,
+            max_turns=self.task.max_turns,
+            history=history,
+            kb_results=[],
+            kb_searched=False,
+            empathized=False,
+            clarified=False,
+            solution_offered=False,
+            escalated=False,
+            cumulative_reward=0.0,
+            done=False,
+            info={"task_name": self.task.name, "difficulty": self.task.difficulty}
+        )
+        return self._obs
+    def step(self, action: Action) -> StepResult:
+        """
+        Advance the environment by one step.
+        Args:
+            action: The agent's action; payload is used as the message text for
+                ask_clarify, offer_solution and send_message when provided.
+        Returns:
+            StepResult(observation, reward, done, info).
+        Raises:
+            RuntimeError: If reset() has not been called or the episode is done.
+            ValueError: If action.action_type is not a valid ActionType value.
+        """
+        if self._obs.status == TicketStatus.IDLE:
+            raise RuntimeError("Call reset() before step().")
+        if self._obs.done:
+            raise RuntimeError("Episode is done. Call reset() to start a new episode.")
+        obs = self._obs
+        ticket = get_ticket(obs.ticket_id)
+        action_type = ActionType(action.action_type)
+        step_reward, reason, penalty = 0.0, "", 0.0
+        done = False
+        info: Dict[str, Any] = {}
+        obs.turn += 1
+        # ── Dispatch action ────────────────────────────────────────────────
+        if action_type == ActionType.SEARCH_KB:
+            if obs.kb_searched:
+                penalty = R_DUPLICATE_ACTION
+                reason = "Duplicate search_kb — no new information."
+            else:
+                obs.kb_searched = True
+                # Copy so callers mutating the observation cannot corrupt the
+                # shared ticket database.
+                obs.kb_results = list(ticket["kb_articles"])
+                step_reward = R_SEARCH_KB
+                reason = f"Retrieved {len(obs.kb_results)} KB articles."
+        elif action_type == ActionType.EMPATHIZE:
+            if obs.empathized:
+                reason = "Already empathized — no incremental reward."
+            else:
+                obs.empathized = True
+                step_reward = R_EMPATHIZE
+                reason = "Empathy acknowledged by customer."
+            # The exchange is logged even when no reward is granted.
+            obs.history.append(Message(
+                role="agent",
+                text=self._rng.choice([
+                    "I completely understand how frustrating this situation must be. Let me help you immediately.",
+                    "I'm sorry you're going through this — that sounds really stressful. Let's fix it right away.",
+                    "Thank you for reaching out. I can see why this is a concern and I want to resolve it for you."
+                ]),
+                turn=obs.turn
+            ))
+            obs.history.append(Message(
+                role="customer",
+                text=self._rng.choice(["I appreciate that, thank you.", "Ok, let's get this sorted.", "Thank you."]),
+                turn=obs.turn
+            ))
+        elif action_type == ActionType.ASK_CLARIFY:
+            if obs.clarified:
+                reason = "Already clarified — no incremental reward."
+            else:
+                obs.clarified = True
+                step_reward = R_ASK_CLARIFY
+                reason = "Clarifying question logged."
+            clarify_q = action.payload or "Could you share your account email and any relevant reference numbers?"
+            obs.history.append(Message(role="agent", text=clarify_q, turn=obs.turn))
+            obs.history.append(Message(
+                role="customer",
+                text=self._rng.choice([
+                    "My account email is user@example.com. Order reference #482923.",
+                    "Sure — account email user@example.com, invoice #8821.",
+                    "My email is user@example.com. It started 3 days ago."
+                ]),
+                turn=obs.turn
+            ))
+        elif action_type == ActionType.OFFER_SOLUTION:
+            # An empty payload falls back to the ticket's canonical solution.
+            solution_text = action.payload or ticket["canonical_solution"]
+            quality = self._score_solution(solution_text, ticket)
+            obs.solution_offered = True
+            step_reward = R_OFFER_SOLUTION * quality
+            reason = f"Solution offered. Quality score: {quality:.2f}."
+            if not obs.kb_searched:
+                penalty = R_SKIP_KB_PENALTY
+                # Appended rather than assigned so the penalty explanation is
+                # not lost behind the quality message.
+                reason += " | Penalty: solution offered without consulting the knowledge base."
+            info["solution_quality"] = quality
+            obs.history.append(Message(role="agent", text=solution_text, turn=obs.turn))
+            obs.history.append(Message(
+                role="customer",
+                text=self._rng.choice(ticket["customer_followups"]),
+                turn=obs.turn
+            ))
+        elif action_type == ActionType.ESCALATE:
+            if obs.escalated:
+                penalty = R_DUPLICATE_ACTION * 2
+                reason = "Double escalation penalty."
+            else:
+                obs.escalated = True
+                penalty = R_ESCALATE
+                reason = "Escalated to tier-2. In-tier resolution preferred."
+            obs.history.append(Message(
+                role="system",
+                text="Ticket escalated to tier-2 specialist team.",
+                turn=obs.turn
+            ))
+        elif action_type == ActionType.RESOLVE:
+            # resolve always ends the episode; an earlier escalation decides
+            # whether the final status is RESOLVED or ESCALATED.
+            done = True
+            obs.status = TicketStatus.RESOLVED if not obs.escalated else TicketStatus.ESCALATED
+            if obs.solution_offered or obs.escalated:
+                csat = self._compute_csat(obs)
+                step_reward = R_RESOLVE_GOOD + csat * 2.0
+                reason = f"Resolved. CSAT: {csat:.2f}/1.0"
+                info["csat"] = csat
+            else:
+                step_reward = R_RESOLVE_BAD
+                reason = "Penalty: resolved without offering a solution."
+            obs.history.append(Message(
+                role="agent",
+                text="Thank you for your patience. I'm marking this ticket as resolved. Please don't hesitate to reach out if you need further help.",
+                turn=obs.turn
+            ))
+        elif action_type == ActionType.SEND_MESSAGE:
+            # Free-form message — small reward for engagement
+            msg = action.payload or "I'm looking into this for you."
+            obs.history.append(Message(role="agent", text=msg, turn=obs.turn))
+            step_reward = 0.5
+            reason = "Message sent."
+        # ── Timeout check ─────────────────────────────────────────────────
+        if obs.turn >= obs.max_turns and not done:
+            penalty += R_TIMEOUT
+            done = True
+            obs.status = TicketStatus.TIMEOUT
+            reason += " | Episode timed out."
+        # ── Build reward ──────────────────────────────────────────────────
+        net = step_reward + penalty
+        # Efficiency is measured against this task's own optimal trajectory
+        # (tasks differ in length); OPTIMAL_STEPS is only a fallback. The score
+        # stays at 1.0 up to that length, then drops 0.1 per extra turn.
+        optimal_steps = len(self.task.optimal_actions) or OPTIMAL_STEPS
+        efficiency = max(0.0, 1.0 - max(0, obs.turn - optimal_steps) * 0.1)
+        process = min(1.0, (
+            (0.25 if obs.kb_searched else 0) +
+            (0.25 if obs.empathized else 0) +
+            (0.25 if obs.solution_offered else 0) +
+            (0.25 if done and obs.status == TicketStatus.RESOLVED else 0)
+        ))
+        reward = Reward(
+            total=round(net, 3),
+            process_score=round(process, 3),
+            quality_score=round(info.get("solution_quality", 0.0), 3),
+            efficiency_score=round(efficiency, 3),
+            csat_score=round(info.get("csat", 0.0), 3),
+            penalties=round(penalty, 3),
+            reason=reason
+        )
+        obs.cumulative_reward = round(obs.cumulative_reward + net, 3)
+        obs.done = done
+        info["turn"] = obs.turn
+        info["cumulative_reward"] = obs.cumulative_reward
+        obs.info = info
+        self._obs = obs
+        return StepResult(observation=obs, reward=reward, done=done, info=info)
+    def state(self) -> Observation:
+        """Return current observation without advancing the environment."""
+        return self._obs
+    # ── Helpers ───────────────────────────────────────────────────────────────
+    def _make_idle_obs(self) -> Observation:
+        """Build the pre-reset placeholder observation (model defaults plus task_id)."""
+        return Observation(task_id=self.task_id)
+    def _score_solution(self, solution_text: str, ticket: dict) -> float:
+        """
+        Score solution quality against expected keywords (0.0–1.0).
+        The score is the fraction of the ticket's "solution_keywords" found as
+        case-insensitive substrings of solution_text; a ticket without
+        keywords scores a neutral 0.5.
+        """
+        text_lower = solution_text.lower()
+        keywords = ticket.get("solution_keywords", [])
+        if not keywords:
+            return 0.5
+        hits = sum(1 for kw in keywords if kw.lower() in text_lower)
+        return min(1.0, hits / max(1, len(keywords)))
+    def _compute_csat(self, obs: Observation) -> float:
+        """Synthetic CSAT score (0.0–1.0): sum of CSAT_WEIGHTS for each flag set on obs."""
+        score = 0.0
+        if obs.empathized:
+            score += CSAT_WEIGHTS["empathized"]
+        if obs.kb_searched:
+            score += CSAT_WEIGHTS["kb_searched"]
+        if obs.solution_offered:
+            score += CSAT_WEIGHTS["solution_offered"]
+        return round(score, 3)
+    @staticmethod
+    def list_tasks() -> List[str]:
+        """Return the ids of all registered tasks."""
+        return list(TASKS.keys())
+    @staticmethod
+    def get_task_spec(task_id: str) -> TaskSpec:
+        """Return the TaskSpec for task_id; raises KeyError for an unknown id."""
+        return TASKS[task_id]

env/models.py ADDED Viewed

	@@ -0,0 +1,130 @@

+"""
+Typed Pydantic models for CustomerSupportEnv (OpenEnv spec).
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field
+from enum import Enum
+# ── Enumerations ──────────────────────────────────────────────────────────────
+class TicketStatus(str, Enum):
+    """Lifecycle state of a ticket; IDLE is the Observation default before a ticket is loaded."""
+    IDLE = "idle"
+    OPEN = "open"
+    RESOLVED = "resolved"
+    ESCALATED = "escalated"
+    TIMEOUT = "timeout"
+class Sentiment(str, Enum):
+    """Customer sentiment labels used by Observation.sentiment."""
+    POSITIVE = "positive"
+    NEUTRAL = "neutral"
+    FRUSTRATED = "frustrated"
+    ANGRY = "angry"
+class Priority(str, Enum):
+    """Ticket priority labels used by Observation.priority."""
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+    URGENT = "urgent"
+class Category(str, Enum):
+    """Ticket topic labels used by Observation.category."""
+    AUTH = "auth"
+    BILLING = "billing"
+    FULFILLMENT = "fulfillment"
+    BUG = "bug"
+    SALES = "sales"
+    GENERAL = "general"
+class ActionType(str, Enum):
+    """The closed set of action kinds accepted in Action.action_type."""
+    SEARCH_KB = "search_kb"
+    EMPATHIZE = "empathize"
+    ASK_CLARIFY = "ask_clarify"
+    OFFER_SOLUTION = "offer_solution"
+    ESCALATE = "escalate"
+    RESOLVE = "resolve"
+    SEND_MESSAGE = "send_message"
+# ── Core Typed Models ─────────────────────────────────────────────────────────
+class Message(BaseModel):
+    """One entry of a ticket's conversation history."""
+    role: str  # "customer" | "agent" | "system"
+    text: str
+    turn: int = 0  # turn number the message belongs to; defaults to 0
+class Observation(BaseModel):
+    """Full typed observation returned by reset() and step()."""
+    ticket_id: Optional[str] = None
+    task_id: str = "task_1"
+    status: TicketStatus = TicketStatus.IDLE
+    sentiment: Optional[Sentiment] = None
+    priority: Optional[Priority] = None
+    category: Optional[Category] = None
+    turn: int = 0
+    max_turns: int = 10
+    # Mutable defaults use default_factory so instances never share a list/dict.
+    history: List[Message] = Field(default_factory=list)
+    kb_results: List[str] = Field(default_factory=list)
+    # Progress flags, all False by default.
+    kb_searched: bool = False
+    empathized: bool = False
+    clarified: bool = False
+    solution_offered: bool = False
+    escalated: bool = False
+    cumulative_reward: float = 0.0
+    done: bool = False
+    info: Dict[str, Any] = Field(default_factory=dict)
+    class Config:
+        # Pydantic option: populate enum-typed fields with the member's value
+        # rather than the Enum member itself.
+        # NOTE(review): presumably this only applies on validation, so plain
+        # attribute assignment can still leave an Enum member in place — confirm.
+        use_enum_values = True
+class Action(BaseModel):
+    """Typed action submitted by the agent via step()."""
+    action_type: ActionType
+    payload: Optional[str] = None  # free-text for send_message / offer_solution
+    class Config:
+        # Pydantic option: store action_type as the enum member's value.
+        use_enum_values = True
+class Reward(BaseModel):
+    """Typed reward with decomposed components."""
+    total: float                 # the only required field; the rest default to 0.0 / ""
+    process_score: float = 0.0   # correct action sequencing
+    quality_score: float = 0.0   # solution quality / empathy
+    efficiency_score: float = 0.0  # steps taken vs optimal
+    csat_score: float = 0.0      # synthetic customer satisfaction (0–1)
+    penalties: float = 0.0
+    reason: str = ""             # human-readable explanation of the reward
+class StepResult(BaseModel):
+    """Bundle of (observation, reward, done, info) describing the outcome of one step."""
+    observation: Observation
+    reward: Reward
+    done: bool
+    info: Dict[str, Any] = Field(default_factory=dict)
+class TaskSpec(BaseModel):
+    """Defines one graded task within the environment."""
+    task_id: str
+    name: str
+    description: str
+    difficulty: str           # easy | medium | hard
+    ticket_id: str
+    success_criteria: List[str]  # human-readable criteria, one string each
+    max_turns: int
+    optimal_actions: List[str]   # action names, in order
+class GraderResult(BaseModel):
+    """Outcome of grading one task: overall score, per-criterion breakdown and pass flag."""
+    task_id: str
+    score: float              # 0.0 – 1.0
+    breakdown: Dict[str, float]  # criterion name -> points
+    passed: bool
+    reason: str

env/tickets.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""
+Ticket scenario database for CustomerSupportEnv.
+Each ticket includes: metadata, customer history, KB articles,
+canonical solution, and keyword-based solution validator.
+"""
+from __future__ import annotations
+from typing import Dict, List, Any
+# Scenario database keyed by ticket id. Every entry carries the same keys:
+# subject, customer, priority, category, sentiment, history (opening messages),
+# kb_articles, canonical_solution, solution_keywords and customer_followups.
+TICKETS: Dict[str, Dict[str, Any]] = {
+    "TKT-001": {
+        "subject": "Cannot log in to my account",
+        "customer": "Aria Shah",
+        "priority": "high",
+        "category": "auth",
+        "sentiment": "frustrated",
+        "history": [
+            {"role": "customer", "text": "I've been locked out for 2 days! I tried resetting my password three times and nothing works. This is extremely urgent.", "turn": 0}
+        ],
+        "kb_articles": [
+            "Password reset: Visit /forgot-password and enter your registered email. Reset links expire in 15 minutes.",
+            "Account lockout policy: Accounts lock after 5 failed attempts. Auto-unlock after 30 minutes, or contact support for manual unlock.",
+            "2FA issues: If locked out due to 2FA, an admin can bypass the second factor temporarily via the admin console."
+        ],
+        "canonical_solution": "I have manually unlocked your account and sent a fresh password reset link to your registered email. The link will expire in 15 minutes. If 2FA is causing issues I can temporarily bypass it.",
+        "solution_keywords": ["unlock", "reset", "link", "email", "password"],
+        "customer_followups": [
+            "Thank you! I got the email and it worked.",
+            "That fixed it, appreciate your help.",
+            "Great, I'm back in now."
+        ]
+    },
+    "TKT-002": {
+        "subject": "Wrong item shipped — order #482923",
+        "customer": "Bryce Lee",
+        "priority": "urgent",
+        "category": "fulfillment",
+        "sentiment": "angry",
+        "history": [
+            {"role": "customer", "text": "This is unacceptable. I ordered a Red T-Shirt size L but you sent me a Blue size M. Order #482923. I need the right item immediately.", "turn": 0}
+        ],
+        "kb_articles": [
+            "Return policy: Customers have 30 days to initiate a return. Use the portal at /returns. We cover return shipping for our errors.",
+            "Priority re-ship: For fulfilment errors on orders >$25, approve a priority reship within 24h after return label is issued. No need to wait for return arrival.",
+            "Compensation policy: For urgent orders or repeat fulfilment errors, issue a 15% discount code on next purchase."
+        ],
+        "canonical_solution": "I sincerely apologise. I've raised a priority reship for the Red T-Shirt size L — it will ship within 24 hours. I've emailed a pre-paid return label for the incorrect item, and added a 15% discount code to your account for the inconvenience.",
+        "solution_keywords": ["reship", "return", "label", "correct", "apologise", "apologi", "discount"],
+        "customer_followups": [
+            "OK, as long as it ships today I'm fine with that.",
+            "Got the email with the label. Thank you.",
+            "Alright, I appreciate the quick response."
+        ]
+    },
+    "TKT-003": {
+        "subject": "Invoice #8821 shows wrong amount",
+        "customer": "Cleo Park",
+        "priority": "medium",
+        "category": "billing",
+        "sentiment": "neutral",
+        "history": [
+            {"role": "customer", "text": "Hello, invoice #8821 shows $49 but I'm on the $29/month Basic plan. I downgraded last month. Can you check?", "turn": 0}
+        ],
+        "kb_articles": [
+            "Plan changes: Downgrades take effect at the start of the next billing cycle. The current period is charged at the old rate.",
+            "Prorate credits: If a downgrade was confirmed before the cycle closed, a manual credit can be issued for the difference.",
+            "Billing disputes: Finance team can adjust invoices within 60 days of issue date. Requires the invoice number and account email."
+        ],
+        "canonical_solution": "I've reviewed your account. Your downgrade was confirmed before the billing cycle closed, so I'm issuing a $20 credit to your account which will appear on your next invoice. Going forward you will be billed $29/month.",
+        "solution_keywords": ["credit", "$20", "twenty", "correct", "billing", "downgrade", "refund"],
+        "customer_followups": [
+            "Perfect, that makes sense. Thanks.",
+            "Great, I can see the credit on my account.",
+            "Thanks for sorting that out quickly."
+        ]
+    },
+    "TKT-004": {
+        "subject": "App crashes on iOS 17 during PDF export",
+        "customer": "Dev Okonkwo",
+        "priority": "medium",
+        "category": "bug",
+        "sentiment": "neutral",
+        "history": [
+            {"role": "customer", "text": "Every time I tap 'Export PDF' the app force-quits. iPhone 14 Pro, iOS 17.4.1. Started after the last app update.", "turn": 0}
+        ],
+        "kb_articles": [
+            "Known iOS 17 crash: The PDF export feature has a memory issue on iOS 17.3 and above introduced in app v4.1.0. Fix is in v4.2.1.",
+            "Workaround: Use the web app at app.example.com/export for PDF exports until v4.2.1 is released (ETA: 5 business days).",
+            "Bug reporting: Collect crash logs from Settings > Privacy > Analytics & Improvements > Analytics Data and share with devs@example.com."
+        ],
+        "canonical_solution": "This is a known bug in v4.1.0 on iOS 17.3+ — our engineering team has a fix ready in v4.2.1, releasing in 5 days. In the meantime, use our web app at app.example.com/export. I've also flagged your report to the engineering team.",
+        "solution_keywords": ["known", "bug", "v4.2", "workaround", "web", "fix", "engineering"],
+        "customer_followups": [
+            "Good to know it's being fixed. I'll use the web app for now.",
+            "Thanks for the workaround, that works.",
+            "OK, I'll wait for the update."
+        ]
+    },
+    "TKT-005": {
+        "subject": "Bulk licence pricing for 50 seats",
+        "customer": "Emma Ng",
+        "priority": "low",
+        "category": "sales",
+        "sentiment": "positive",
+        "history": [
+            {"role": "customer", "text": "Hi! We're a team of about 50 and are considering your Pro plan. Do you offer bulk discounts? Also, is there an enterprise contract option?", "turn": 0}
+        ],
+        "kb_articles": [
+            "Volume discounts: 10–24 seats: 10% off. 25–49 seats: 15% off. 50+ seats: 25% off annual plan.",
+            "Enterprise contracts: Custom SLA, SSO, dedicated support, and invoice billing. Contact sales@example.com. Average deal closes in 2 weeks.",
+            "Trial: Teams of 5+ can get a 30-day free trial of the Pro plan. No credit card required."
+        ],
+        "canonical_solution": "Great news — 50 seats qualifies for our 25% volume discount on the annual Pro plan. We also offer enterprise contracts with SSO, dedicated support, and custom SLA. I'd love to connect you with our enterprise team at sales@example.com, or I can have an account executive reach out directly.",
+        "solution_keywords": ["25%", "twenty-five", "enterprise", "volume", "discount", "sales@", "executive"],
+        "customer_followups": [
+            "That sounds great, please have someone reach out.",
+            "25% is better than I expected! I'll email sales.",
+            "Perfect, we'll set up a call with the enterprise team."
+        ]
+    },
+    "TKT-006": {
+        "subject": "Data export taking over 6 hours",
+        "customer": "Felix Martín",
+        "priority": "high",
+        "category": "bug",
+        "sentiment": "frustrated",
+        "history": [
+            {"role": "customer", "text": "I started a full data export 6 hours ago and it's still at 12%. I have a compliance deadline tomorrow. This is critical.", "turn": 0}
+        ],
+        "kb_articles": [
+            "Export timeouts: Large exports (>10GB) can time out. The system retries automatically but may take 8-12 hours total.",
+            "Priority export queue: Support can manually move a job to the priority queue, cutting estimated time to 1-2 hours.",
+            "Partial exports: Users can export data by date range to reduce file size. Recommended for compliance: export by quarter."
+        ],
+        "canonical_solution": "I've moved your export job to the priority queue — it should complete within 1-2 hours. As a backup, I recommend also starting a partial export by date range which will be much faster. I'll monitor and send you a confirmation email when the full export completes.",
+        "solution_keywords": ["priority", "queue", "1-2 hour", "partial", "monitor", "email"],
+        "customer_followups": [
+            "Thank you! I'll start the partial export as backup.",
+            "OK, I can see the progress picked up. Thanks.",
+            "The priority queue worked, it's done now."
+        ]
+    }
+}
+def get_ticket(ticket_id: str) -> Dict[str, Any]:
+    """Return the scenario dict stored under ticket_id; raise ValueError if the id is unknown."""
+    try:
+        return TICKETS[ticket_id]
+    except KeyError:
+        raise ValueError(f"Unknown ticket_id: {ticket_id}") from None
+def all_ticket_ids() -> List[str]:
+    """Return every known ticket id, in the order the tickets are defined."""
+    return [ticket_id for ticket_id in TICKETS]

graders/__init__.py ADDED Viewed

File without changes

graders/graders.py ADDED Viewed

	@@ -0,0 +1,194 @@

+"""
+Programmatic graders for CustomerSupportEnv tasks.
+Each grader accepts a completed Observation and returns a GraderResult
+with a score in [0.0, 1.0] and a detailed breakdown.
+Graders are deterministic — same inputs always produce same outputs.
+"""
+from __future__ import annotations
+from typing import Dict
+from env.models import GraderResult, Observation, TicketStatus
+from env.tickets import get_ticket
+# ── Grader registry ───────────────────────────────────────────────────────────
+def grade_task_1(obs: Observation) -> GraderResult:
+    """
+    Task 1 (EASY): Resolve a standard auth ticket.
+    Scoring:
+      - 0.30  kb_searched (ordering relative to offer_solution is not checked)
+      - 0.25  empathize called at least once
+      - 0.25  fraction of TKT-001 solution keywords found in the agent's messages
+      - 0.20  resolve called (status == RESOLVED)
+    Keyword matching is case-insensitive and runs over the text of every
+    agent message in obs.history, not only the offer_solution payload.
+    The task passes when the rounded score is at least 0.70.
+    """
+    ticket = get_ticket("TKT-001")
+    breakdown: Dict[str, float] = {}
+    # Check conversation history for evidence of each required action
+    all_agent_text = " ".join(m.text.lower() for m in obs.history if m.role == "agent")
+    # 1. KB searched
+    breakdown["kb_searched"] = 0.30 if obs.kb_searched else 0.0
+    # 2. Empathy expressed
+    breakdown["empathized"] = 0.25 if obs.empathized else 0.0
+    # 3. Solution quality — unlock/reset keywords (lower-cased on both sides,
+    #    matching how the environment scores solutions)
+    solution_keywords = ticket["solution_keywords"]
+    kw_hits = sum(1 for kw in solution_keywords if kw.lower() in all_agent_text)
+    sol_score = 0.25 * min(1.0, kw_hits / max(1, len(solution_keywords)))
+    breakdown["solution_quality"] = round(sol_score, 3)
+    # 4. Resolved cleanly (not timeout, not just escalated). status may hold
+    #    either the enum member or its string value, so accept both.
+    resolved = obs.status in (TicketStatus.RESOLVED.value, TicketStatus.RESOLVED)
+    breakdown["resolved"] = 0.20 if resolved else 0.0
+    # Decide pass/fail on the same rounded number that is reported, so float
+    # summation noise cannot make score and passed disagree at the threshold.
+    score = round(sum(breakdown.values()), 3)
+    passed = score >= 0.70
+    return GraderResult(
+        task_id="task_1",
+        score=score,
+        breakdown=breakdown,
+        passed=passed,
+        reason=_build_reason(breakdown, passed)
+    )
+def grade_task_2(obs: Observation) -> GraderResult:
+    """
+    Task 2 (MEDIUM): Multi-step billing dispute.
+    Scoring:
+      - 0.20  ask_clarify called
+      - 0.20  kb_searched
+      - 0.30  offer_solution mentions a specific credit/refund (amount or keyword)
+      - 0.15  empathize called
+      - 0.15  resolve called
+    Solution quality is the fraction of TKT-003 solution keywords found
+    (case-insensitively) in the agent's messages, plus a 0.3 bonus, capped at
+    1.0, when one of the amount phrases appears.
+    The task passes when the rounded score is at least 0.70.
+    """
+    ticket = get_ticket("TKT-003")
+    breakdown: Dict[str, float] = {}
+    all_agent_text = " ".join(m.text.lower() for m in obs.history if m.role == "agent")
+    # 1. Clarification step
+    breakdown["ask_clarify"] = 0.20 if obs.clarified else 0.0
+    # 2. KB searched
+    breakdown["kb_searched"] = 0.20 if obs.kb_searched else 0.0
+    # 3. Specific solution with $ amount or keywords (lower-cased on both
+    #    sides, matching how the environment scores solutions)
+    solution_keywords = ticket["solution_keywords"]
+    kw_hits = sum(1 for kw in solution_keywords if kw.lower() in all_agent_text)
+    # Extra check: requires a numeric/specific value, not just generic words
+    has_amount = any(x in all_agent_text for x in ["$20", "twenty", "20 credit", "credit of"])
+    quality = min(1.0, kw_hits / max(1, len(solution_keywords)))
+    if has_amount:
+        quality = min(1.0, quality + 0.3)
+    breakdown["solution_quality"] = round(0.30 * quality, 3)
+    # 4. Empathy
+    breakdown["empathized"] = 0.15 if obs.empathized else 0.0
+    # 5. Resolved — status may hold the enum member or its string value
+    resolved = obs.status in (TicketStatus.RESOLVED.value, TicketStatus.RESOLVED)
+    breakdown["resolved"] = 0.15 if resolved else 0.0
+    # Decide pass/fail on the same rounded number that is reported, so float
+    # summation noise cannot make score and passed disagree at the threshold.
+    score = round(sum(breakdown.values()), 3)
+    passed = score >= 0.70
+    return GraderResult(
+        task_id="task_2",
+        score=score,
+        breakdown=breakdown,
+        passed=passed,
+        reason=_build_reason(breakdown, passed)
+    )
+def grade_task_3(obs: Observation) -> GraderResult:
+    """
+    Task 3 (HARD): Critical time-sensitive bug — data export stuck.
+    Scoring:
+      - 0.20  kb_searched
+      - 0.15  empathize called
+      - 0.35  solution mentions BOTH priority queue AND partial export (two-part solution)
+      - 0.15  NOT escalated (in-tier resolution required for full score)
+      - 0.15  resolve called
+    Bonus deduction: -0.10 if escalated (overrides the 0.15 no-escalation credit)
+    """
+    ticket = get_ticket("TKT-006")
+    breakdown: Dict[str, float] = {}
+    all_agent_text = " ".join(m.text.lower() for m in obs.history if m.role == "agent")
+    # 1. KB searched
+    breakdown["kb_searched"] = 0.20 if obs.kb_searched else 0.0
+    # 2. Empathy
+    breakdown["empathized"] = 0.15 if obs.empathized else 0.0
+    # 3. Two-part solution: priority queue + partial export
+    has_priority_queue = any(x in all_agent_text for x in ["priority queue", "priority export", "move your", "moved your"])
+    has_partial = any(x in all_agent_text for x in ["partial", "date range", "by quarter", "partial export"])
+    has_urgency = any(x in all_agent_text for x in ["deadline", "1-2 hour", "urgent", "compliance", "monitor", "email you"])
+    sol_quality = 0.0
+    if has_priority_queue and has_partial:
+        sol_quality = 1.0
+    elif has_priority_queue or has_partial:
+        sol_quality = 0.5
+    if has_urgency:
+        sol_quality = min(1.0, sol_quality + 0.2)
+    breakdown["solution_quality"] = round(0.35 * sol_quality, 3)
+    # 4. No escalation
+    breakdown["no_escalation"] = 0.0 if obs.escalated else 0.15
+    # 5. Resolved
+    resolved = obs.status in (TicketStatus.RESOLVED.value, TicketStatus.RESOLVED)
+    breakdown["resolved"] = 0.15 if resolved else 0.0
+    total = sum(breakdown.values())
+    # Hard cap at 0.85 if escalated (escalation shows poor judgment on this task)
+    if obs.escalated:
+        total = min(total, 0.55)
+    passed = total >= 0.70
+    return GraderResult(
+        task_id="task_3",
+        score=round(total, 3),
+        breakdown=breakdown,
+        passed=passed,
+        reason=_build_reason(breakdown, passed)
+    )
+# Registry mapping each task id to its grading function; grade() dispatches
+# through this table and rejects ids that are not listed here.
+GRADERS = {
+    "task_1": grade_task_1,
+    "task_2": grade_task_2,
+    "task_3": grade_task_3,
+}
+def grade(task_id: str, obs: Observation) -> GraderResult:
+    """Grade a completed observation for the given task."""
+    if task_id not in GRADERS:
+        raise ValueError(f"No grader for task_id '{task_id}'. Valid: {list(GRADERS.keys())}")
+    return GRADERS[task_id](obs)
+def _build_reason(breakdown: Dict[str, float], passed: bool) -> str:
+    hits = [k for k, v in breakdown.items() if v > 0]
+    misses = [k for k, v in breakdown.items() if v == 0]
+    status = "PASS" if passed else "FAIL"
+    msg = f"[{status}] Score components present: {hits}."
+    if misses:
+        msg += f" Missing: {misses}."
+    return msg

tests/__init__.py ADDED Viewed

File without changes

tests/test_env.py ADDED Viewed

	@@ -0,0 +1,226 @@

+"""
+Tests for CustomerSupportEnv.
+Run: python -m pytest tests/ -v
+"""
+import sys, os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+import pytest
+from env.environment import CustomerSupportEnv, TASKS
+from env.models import Action, ActionType, TicketStatus
+from graders.graders import grade
+# ── Fixtures ──────────────────────────────────────────────────────────────────
+@pytest.fixture
+def env1():
+    e = CustomerSupportEnv(task_id="task_1", seed=0)
+    e.reset()
+    return e
+@pytest.fixture
+def env2():
+    e = CustomerSupportEnv(task_id="task_2", seed=0)
+    e.reset()
+    return e
+@pytest.fixture
+def env3():
+    e = CustomerSupportEnv(task_id="task_3", seed=0)
+    e.reset()
+    return e
+# ── reset() ───────────────────────────────────────────────────────────────────
+def test_reset_returns_observation():
+    env = CustomerSupportEnv(task_id="task_1", seed=0)
+    obs = env.reset()
+    assert obs.ticket_id == "TKT-001"
+    assert obs.done is False
+    assert obs.turn == 0
+    assert obs.status == TicketStatus.OPEN.value or obs.status == TicketStatus.OPEN
+def test_reset_clears_state(env1):
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    obs = env1.reset()
+    assert obs.kb_searched is False
+    assert obs.turn == 0
+    assert obs.cumulative_reward == 0.0
+def test_reset_loads_history(env1):
+    obs = env1.state()
+    assert len(obs.history) >= 1
+    assert obs.history[0].role == "customer"
+# ── state() ───────────────────────────────────────────────────────────────────
+def test_state_does_not_advance(env1):
+    obs_before = env1.state()
+    env1.state()
+    obs_after = env1.state()
+    assert obs_before.turn == obs_after.turn
+# ── step() ────────────────────────────────────────────────────────────────────
+def test_step_search_kb(env1):
+    result = env1.step(Action(action_type=ActionType.SEARCH_KB))
+    assert result.reward.total == 2.0
+    assert result.observation.kb_searched is True
+    assert len(result.observation.kb_results) > 0
+def test_step_search_kb_duplicate_penalised(env1):
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    result = env1.step(Action(action_type=ActionType.SEARCH_KB))
+    assert result.reward.total < 0
+def test_step_empathize(env1):
+    result = env1.step(Action(action_type=ActionType.EMPATHIZE))
+    assert result.reward.total == 1.0
+    assert result.observation.empathized is True
+def test_step_empathize_no_double_reward(env1):
+    env1.step(Action(action_type=ActionType.EMPATHIZE))
+    result = env1.step(Action(action_type=ActionType.EMPATHIZE))
+    assert result.reward.total == 0.0
+def test_step_offer_solution_without_kb_penalised(env1):
+    result = env1.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have unlocked your account and sent a reset link."
+    ))
+    assert result.reward.penalties == -1.0
+def test_step_offer_solution_with_kb_rewarded(env1):
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    result = env1.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have unlocked your account and sent a password reset link."
+    ))
+    assert result.reward.total > 0
+def test_step_resolve_without_solution_penalised(env1):
+    result = env1.step(Action(action_type=ActionType.RESOLVE))
+    assert result.reward.total == -3.0
+    assert result.done is True
+def test_step_resolve_good(env1):
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    env1.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="Account unlocked and reset email sent."
+    ))
+    result = env1.step(Action(action_type=ActionType.RESOLVE))
+    assert result.reward.total >= 5.0
+    assert result.done is True
+def test_step_raises_before_reset():
+    env = CustomerSupportEnv(task_id="task_1")
+    with pytest.raises(RuntimeError):
+        env.step(Action(action_type=ActionType.SEARCH_KB))
+def test_step_raises_after_done(env1):
+    env1.step(Action(action_type=ActionType.RESOLVE))
+    with pytest.raises(RuntimeError):
+        env1.step(Action(action_type=ActionType.SEARCH_KB))
+def test_timeout_penalty(env1):
+    """Exceeding max_turns gives timeout penalty."""
+    for _ in range(env1._obs.max_turns - 1):
+        env1.step(Action(action_type=ActionType.EMPATHIZE))
+    obs = env1.state()
+    assert obs.turn >= obs.max_turns - 1
+# ── Graders ───────────────────────────────────────────────────────────────────
+def test_grader_task1_optimal(env1):
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    env1.step(Action(action_type=ActionType.EMPATHIZE))
+    env1.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have unlocked your account and sent a password reset link to your email."
+    ))
+    env1.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_1", env1.state())
+    assert result.score >= 0.90
+    assert result.passed is True
+def test_grader_task1_minimal(env1):
+    """Just resolve with no steps — should fail."""
+    env1.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_1", env1.state())
+    assert result.score < 0.40
+    assert result.passed is False
+def test_grader_task1_score_in_range(env1):
+    result = grade("task_1", env1.state())
+    assert 0.0 <= result.score <= 1.0
+def test_grader_task2_requires_clarify(env2):
+    """Medium task: no clarify → lower score."""
+    env2.step(Action(action_type=ActionType.SEARCH_KB))
+    env2.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have applied a $20 credit to your account."
+    ))
+    env2.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_2", env2.state())
+    assert result.breakdown.get("ask_clarify", 0) == 0.0
+def test_grader_task2_full_score(env2):
+    env2.step(Action(action_type=ActionType.SEARCH_KB))
+    env2.step(Action(action_type=ActionType.ASK_CLARIFY, payload="Can you confirm your account email and invoice number?"))
+    env2.step(Action(action_type=ActionType.EMPATHIZE))
+    env2.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have issued a $20 credit to your account. Your plan is now corrected to $29/month."
+    ))
+    env2.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_2", env2.state())
+    assert result.score >= 0.70
+def test_grader_task3_two_part_solution(env3):
+    env3.step(Action(action_type=ActionType.SEARCH_KB))
+    env3.step(Action(action_type=ActionType.EMPATHIZE))
+    env3.step(Action(
+        action_type=ActionType.OFFER_SOLUTION,
+        payload="I have moved your export job to the priority queue — it will complete in 1-2 hours. "
+                "As a backup, please start a partial export by date range which will be much faster. "
+                "I will email you when the full export completes."
+    ))
+    env3.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_3", env3.state())
+    assert result.score >= 0.70
+    assert result.passed is True
+def test_grader_task3_escalation_capped(env3):
+    env3.step(Action(action_type=ActionType.SEARCH_KB))
+    env3.step(Action(action_type=ActionType.ESCALATE))
+    env3.step(Action(action_type=ActionType.RESOLVE))
+    result = grade("task_3", env3.state())
+    assert result.score <= 0.55
+def test_grader_deterministic(env1):
+    """Same inputs → same grader output every time."""
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    env1.step(Action(action_type=ActionType.RESOLVE))
+    r1 = grade("task_1", env1.state())
+    env1.reset()
+    env1.step(Action(action_type=ActionType.SEARCH_KB))
+    env1.step(Action(action_type=ActionType.RESOLVE))
+    r2 = grade("task_1", env1.state())
+    assert r1.score == r2.score
+# ── Task specs ────────────────────────────────────────────────────────────────
+def test_task_list():
+    assert set(CustomerSupportEnv.list_tasks()) == {"task_1", "task_2", "task_3"}
+def test_task_difficulty_progression():
+    diffs = [TASKS[tid].difficulty for tid in ["task_1", "task_2", "task_3"]]
+    assert diffs == ["easy", "medium", "hard"]