File size: 8,415 Bytes
181758b
 
44686b8
 
726cf7a
44686b8
726cf7a
 
 
 
 
 
4f129c9
 
 
 
181758b
 
 
 
 
 
 
9d47369
181758b
 
 
 
 
 
 
9d47369
 
 
181758b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220e9f3
d33da97
 
 
 
 
 
 
 
 
 
 
 
9d47369
d33da97
 
 
55db2c6
726cf7a
 
220e9f3
4f129c9
220e9f3
 
 
 
 
4f129c9
220e9f3
 
 
 
 
 
 
 
 
 
 
 
44686b8
 
 
 
 
 
 
6f6f46e
551c5bc
 
 
44686b8
 
726cf7a
 
 
 
9d47369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726cf7a
 
 
 
4f129c9
726cf7a
 
 
 
 
1305932
 
 
726cf7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1305932
 
 
726cf7a
 
 
 
 
 
 
 
 
 
55db2c6
 
 
 
4f129c9
55db2c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1305932
 
 
55db2c6
 
 
 
 
 
 
 
 
 
85333b3
 
 
 
4f129c9
85333b3
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""Smoke tests for the SupportDesk environment."""

import importlib

import pytest
import yaml

try:
    from fastapi.testclient import TestClient
except RuntimeError:
    TestClient = None  # type: ignore[assignment]

from graders import grade_case
from models import SupportCaseProgress, SupportDeskAction
from server.supportdesk_environment import SupportDeskEnvironment
from tasks import get_task, list_task_ids


def test_all_tasks_are_registered():
    """The task registry returns exactly the four known task ids, in order."""
    expected_ids = [
        "billing_refund_easy",
        "account_takeover_medium",
        "api_incident_hard",
        "regulated_export_exception_hard",
    ]
    registered_ids = list_task_ids()
    assert registered_ids == expected_ids


def test_environment_reset_and_state():
    """Resetting the easy billing task yields the expected first observation.

    Checks the observation returned by ``reset()`` (task id, workflow stage,
    required next actions, SLA minutes) and then the environment state
    (step count and current score).
    """
    environment = SupportDeskEnvironment(task_id="billing_refund_easy")
    first_observation = environment.reset()

    # Fields reported on the observation returned by reset().
    assert first_observation.task_id == "billing_refund_easy"
    assert first_observation.workflow_stage == "intake"
    assert "classify" in first_observation.required_next_actions
    assert first_observation.current_sla_minutes_remaining == 240

    # Fields reported on the environment state right after reset().
    assert environment.state.step_count == 0
    assert environment.state.current_score == 0.15


def test_perfect_solution_grades_full_score():
    """Replaying the gold actions for the easy billing task grades at 0.99.

    The episode classifies with the task's gold queue, priority and issue
    type, drafts a customer reply, adds an internal note, and submits with
    the gold status and resolution code. The resulting case is then graded.
    """
    gold_task = get_task("billing_refund_easy")
    environment = SupportDeskEnvironment(task_id=gold_task.task_id)
    environment.reset()

    # Step 1: classify using the task's gold labels.
    classify_action = SupportDeskAction(
        operation="classify",
        queue=gold_task.gold_queue,
        priority=gold_task.gold_priority,
        issue_type=gold_task.gold_issue_type,
    )
    environment.step(classify_action)

    # Step 2: draft the customer-facing reply.
    reply_action = SupportDeskAction(
        operation="draft_reply",
        reply="Refund approved for the duplicate charge and it should arrive within 5-7 business days.",
    )
    environment.step(reply_action)

    # Step 3: record the internal note.
    note_action = SupportDeskAction(
        operation="add_internal_note",
        internal_note="Duplicate charge verified and refund approved.",
    )
    environment.step(note_action)

    # Step 4: submit with the gold status and resolution code.
    submit_action = SupportDeskAction(
        operation="submit",
        status=gold_task.gold_status,
        resolution_code=gold_task.gold_resolution_code,
    )
    environment.step(submit_action)

    result = grade_case(gold_task, environment.state.case)
    assert result.total_score == 0.99


def test_max_steps_ends_episode():
    """After six bare ``classify`` steps the episode reports ``done``."""
    environment = SupportDeskEnvironment(task_id="billing_refund_easy")
    latest_observation = environment.reset()

    steps_taken = 0
    while steps_taken < 6:
        latest_observation = environment.step(SupportDeskAction(operation="classify"))
        steps_taken += 1

    assert latest_observation.done is True
    assert environment.state.step_count == 6


def test_grade_is_bounded_between_zero_and_one():
    """Grading a freshly reset hard task gives a score strictly inside (0, 1)."""
    hard_task = get_task("regulated_export_exception_hard")
    environment = SupportDeskEnvironment(task_id=hard_task.task_id)
    environment.reset()

    score = grade_case(hard_task, environment.state.case).total_score
    assert score > 0.0
    assert score < 1.0


def test_task_specific_graders_are_importable_and_clamped():
    """Each per-task grader class imports and grades a default case.

    A default-constructed ``SupportCaseProgress`` is graded by every
    task-specific grader; the scores are compared against the expected list.
    """
    from graders import (
        AccountTakeoverMediumGrader,
        ApiIncidentHardGrader,
        BillingRefundEasyGrader,
        RegulatedExportExceptionHardGrader,
    )
    from models import SupportCaseProgress

    blank_case = SupportCaseProgress()
    # Order matters: it lines up with the expected scores below.
    grader_classes = (
        BillingRefundEasyGrader,
        AccountTakeoverMediumGrader,
        ApiIncidentHardGrader,
        RegulatedExportExceptionHardGrader,
    )
    scores = [grader_cls().grade(blank_case) for grader_cls in grader_classes]

    expected_scores = [0.15, 0.01, 0.01, 0.01]
    assert scores == expected_scores


def test_openenv_manifest_graders_are_importable():
    """Check that every task in ``openenv.yaml`` declares an LLM grader block.

    Despite the test name, nothing is imported here: the manifest is parsed
    and each task's ``grader`` entry must be a mapping whose ``type`` is
    ``"llm"`` and whose ``prompt_template`` is a string. The manifest must
    also contain a ``tasks`` key with at least four entries.

    The manifest path is relative, so it is resolved against the current
    working directory of the test run.
    """
    # Use a context manager so the file handle is closed deterministically
    # rather than being left open until garbage collection.
    with open("openenv.yaml", encoding="utf-8") as manifest_file:
        manifest = yaml.safe_load(manifest_file)

    assert "tasks" in manifest
    assert len(manifest["tasks"]) >= 4

    for task in manifest["tasks"]:
        grader_block = task["grader"]
        assert isinstance(grader_block, dict)
        assert grader_block.get("type") == "llm"
        assert isinstance(grader_block.get("prompt_template"), str)


def test_state_includes_episode_id_after_reset():
    """An explicit ``episode_id`` passed to ``reset()`` shows up on the state.

    Also checks the state's workflow stage and that the
    ``finance_close_risk`` flag is present after reset.
    """
    environment = SupportDeskEnvironment(task_id="billing_refund_easy")
    environment.reset(episode_id="episode-123")

    assert environment.state.episode_id == "episode-123"
    assert environment.state.workflow_stage == "intake"
    assert "finance_close_risk" in environment.state.risk_flags


def test_premature_submit_gets_penalized():
    """Submitting as the very first action gives negative reward and ends the episode."""
    environment = SupportDeskEnvironment(task_id="api_incident_hard")
    environment.reset()

    early_submit = SupportDeskAction(
        operation="submit",
        status="resolved",
        resolution_code="incident_opened",
    )
    result = environment.step(early_submit)

    assert result.reward < 0
    assert result.done is True


def test_follow_up_arrives_after_wait():
    """A requested customer follow-up goes from pending to partial after a wait.

    Sequence: classify the account-takeover case, request three fields from
    the customer, then wait one step. After the wait the follow-up status is
    ``"partial"`` and the ``customer_reply_incomplete`` risk flag is set.
    """
    environment = SupportDeskEnvironment(task_id="account_takeover_medium")
    environment.reset()

    classify_action = SupportDeskAction(
        operation="classify",
        queue="trust_and_safety",
        priority="urgent",
        issue_type="account_compromise",
    )
    environment.step(classify_action)

    info_request = SupportDeskAction(
        operation="request_info",
        requested_fields=["workspace_id", "last_successful_login", "billing_email"],
    )
    after_request = environment.step(info_request)
    # Immediately after the request the follow-up has not arrived yet.
    assert after_request.case.customer_follow_up.status == "pending"

    after_wait = environment.step(SupportDeskAction(operation="wait"))
    assert after_wait.case.customer_follow_up.status == "partial"
    assert "customer_reply_incomplete" in after_wait.risk_flags


@pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
def test_http_reset_step_state_are_session_consistent():
    """``/reset``, ``/step`` and ``/state`` operate on the same episode.

    Resets with an explicit episode id, posts one classify action, then
    reads the state back and checks it reflects that id, one step taken,
    and the classification that was sent.
    """
    from server.app import app

    http = TestClient(app)

    # Reset: must succeed and report a score strictly inside (0, 1).
    reset_result = http.post("/reset", json={"episode_id": "http-episode"})
    assert reset_result.status_code == 200
    reset_body = reset_result.json()
    assert "score" in reset_body
    assert 0.0 < reset_body["score"] < 1.0

    # Step: send a fully spelled-out classify action.
    classify_action = {
        "operation": "classify",
        "queue": "billing_ops",
        "priority": "high",
        "issue_type": "duplicate_charge",
        "status": "new",
        "requested_fields": [],
        "reply": "",
        "internal_note": "",
    }
    step_result = http.post("/step", json={"action": classify_action})
    assert step_result.status_code == 200
    step_body = step_result.json()
    assert "score" in step_body
    assert 0.0 < step_body["score"] < 1.0

    # State: must reflect the episode id and the classification just applied.
    state_result = http.get("/state")
    assert state_result.status_code == 200
    state_body = state_result.json()

    assert state_body["episode_id"] == "http-episode"
    assert state_body["step_count"] == 1
    case_body = state_body["case"]
    assert case_body["queue"] == "billing_ops"
    assert case_body["priority"] == "high"
    assert case_body["issue_type"] == "duplicate_charge"


@pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
def test_http_explicit_episode_helpers_work():
    """The ``/episodes/{id}/step`` and ``/episodes/{id}/state`` routes work.

    Resets with an explicit episode id, steps through the episode-scoped
    route, and checks the episode-scoped state reflects the action sent.
    """
    from server.app import app

    http = TestClient(app)
    episode_id = "explicit-http-episode"

    reset_result = http.post("/reset", json={"episode_id": episode_id})
    assert reset_result.status_code == 200

    # Step through the episode-scoped route with a minimal classify action.
    classify_action = {
        "operation": "classify",
        "queue": "billing_ops",
        "priority": "high",
        "issue_type": "duplicate_charge",
    }
    step_result = http.post(
        f"/episodes/{episode_id}/step",
        json={"action": classify_action},
    )
    assert step_result.status_code == 200
    step_body = step_result.json()
    assert "score" in step_body
    assert 0.0 < step_body["score"] < 1.0

    # Read the state back through the episode-scoped route.
    state_result = http.get(f"/episodes/{episode_id}/state")
    assert state_result.status_code == 200
    state_body = state_result.json()

    assert state_body["episode_id"] == episode_id
    assert state_body["step_count"] == 1
    case_body = state_body["case"]
    assert case_body["queue"] == "billing_ops"
    assert case_body["priority"] == "high"
    assert case_body["issue_type"] == "duplicate_charge"


@pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
def test_http_tasks_include_truthy_grader_field():
    """``GET /tasks`` lists at least four tasks, each with a ``graders:`` reference."""
    from server.app import app

    http = TestClient(app)

    listing = http.get("/tasks")
    assert listing.status_code == 200
    body = listing.json()

    assert body["total_tasks"] >= 4
    task_entries = body["tasks"]
    assert len(task_entries) >= 4
    for entry in task_entries:
        # Every task must point at a grader using the "graders:" prefix.
        assert entry["grader"].startswith("graders:")