Spaces:
Sleeping
Sleeping
| """Smoke tests for the SupportDesk environment.""" | |
| import importlib | |
| import pytest | |
| import yaml | |
| try: | |
| from fastapi.testclient import TestClient | |
| except RuntimeError: | |
| TestClient = None # type: ignore[assignment] | |
| from graders import grade_case | |
| from models import SupportCaseProgress, SupportDeskAction | |
| from server.supportdesk_environment import SupportDeskEnvironment | |
| from tasks import get_task, list_task_ids | |
def test_all_tasks_are_registered():
    """The task registry lists exactly the four expected task ids, in order."""
    expected_ids = [
        "billing_refund_easy",
        "account_takeover_medium",
        "api_incident_hard",
        "regulated_export_exception_hard",
    ]
    registered_ids = list_task_ids()
    assert registered_ids == expected_ids
def test_environment_reset_and_state():
    """A fresh reset returns the intake observation and leaves state at step 0."""
    task_id = "billing_refund_easy"
    env = SupportDeskEnvironment(task_id=task_id)
    initial = env.reset()

    # Checks on the observation handed back by reset().
    assert initial.task_id == task_id
    assert initial.workflow_stage == "intake"
    assert "classify" in initial.required_next_actions
    assert initial.current_sla_minutes_remaining == 240

    # Checks on the environment's own state after reset().
    assert env.state.step_count == 0
    assert env.state.current_score == 0.15
def test_perfect_solution_grades_full_score():
    """Playing the gold classify/reply/note/submit sequence grades at 0.99."""
    task = get_task("billing_refund_easy")
    env = SupportDeskEnvironment(task_id=task.task_id)
    env.reset()

    # Keyword arguments for each action, in the order they are played.
    action_plan = (
        {
            "operation": "classify",
            "queue": task.gold_queue,
            "priority": task.gold_priority,
            "issue_type": task.gold_issue_type,
        },
        {
            "operation": "draft_reply",
            "reply": "Refund approved for the duplicate charge and it should arrive within 5-7 business days.",
        },
        {
            "operation": "add_internal_note",
            "internal_note": "Duplicate charge verified and refund approved.",
        },
        {
            "operation": "submit",
            "status": task.gold_status,
            "resolution_code": task.gold_resolution_code,
        },
    )
    for action_kwargs in action_plan:
        # Build each action immediately before stepping with it.
        env.step(SupportDeskAction(**action_kwargs))

    result = grade_case(task, env.state.case)
    assert result.total_score == 0.99
def test_max_steps_ends_episode():
    """After six classify steps the episode is done and six steps are counted."""
    step_total = 6
    env = SupportDeskEnvironment(task_id="billing_refund_easy")
    latest = env.reset()

    taken = 0
    while taken < step_total:
        latest = env.step(SupportDeskAction(operation="classify"))
        taken += 1

    assert latest.done is True
    assert env.state.step_count == 6
def test_grade_is_bounded_between_zero_and_one():
    """Grading an untouched case gives a score strictly between 0 and 1."""
    hard_task = get_task("regulated_export_exception_hard")
    env = SupportDeskEnvironment(task_id=hard_task.task_id)
    env.reset()

    score = grade_case(hard_task, env.state.case).total_score
    assert 0.0 < score < 1.0
def test_task_specific_graders_are_importable_and_clamped():
    """Each per-task grader imports and scores a default case as expected."""
    from graders import (
        AccountTakeoverMediumGrader,
        ApiIncidentHardGrader,
        BillingRefundEasyGrader,
        RegulatedExportExceptionHardGrader,
    )
    from models import SupportCaseProgress

    blank_case = SupportCaseProgress()
    # Grader classes in the same order as the expected scores below.
    grader_classes = (
        BillingRefundEasyGrader,
        AccountTakeoverMediumGrader,
        ApiIncidentHardGrader,
        RegulatedExportExceptionHardGrader,
    )
    scores = [grader_cls().grade(blank_case) for grader_cls in grader_classes]
    assert scores == [0.15, 0.01, 0.01, 0.01]
def test_openenv_manifest_graders_are_importable():
    """Check that ``openenv.yaml`` declares at least four tasks with LLM graders.

    The manifest is read relative to the current working directory. Each task
    entry must carry a ``grader`` mapping whose ``type`` is ``"llm"`` and whose
    ``prompt_template`` is a string.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("openenv.yaml", encoding="utf-8") as manifest_file:
        manifest = yaml.safe_load(manifest_file)

    assert "tasks" in manifest
    assert len(manifest["tasks"]) >= 4
    for task in manifest["tasks"]:
        grader_block = task["grader"]
        assert isinstance(grader_block, dict)
        assert grader_block.get("type") == "llm"
        assert isinstance(grader_block.get("prompt_template"), str)
def test_state_includes_episode_id_after_reset():
    """An explicit episode id passed to reset() is reflected in the state."""
    requested_id = "episode-123"
    env = SupportDeskEnvironment(task_id="billing_refund_easy")
    env.reset(episode_id=requested_id)

    assert env.state.episode_id == requested_id
    assert env.state.workflow_stage == "intake"
    assert "finance_close_risk" in env.state.risk_flags
def test_premature_submit_gets_penalized():
    """Submitting as the very first action is penalized and ends the episode."""
    env = SupportDeskEnvironment(task_id="api_incident_hard")
    env.reset()

    early_submit = SupportDeskAction(
        operation="submit",
        status="resolved",
        resolution_code="incident_opened",
    )
    outcome = env.step(early_submit)

    assert outcome.reward < 0
    assert outcome.done is True
def test_follow_up_arrives_after_wait():
    """A requested follow-up is pending, then partial once the agent waits."""
    env = SupportDeskEnvironment(task_id="account_takeover_medium")
    env.reset()

    classify_action = SupportDeskAction(
        operation="classify",
        queue="trust_and_safety",
        priority="urgent",
        issue_type="account_compromise",
    )
    env.step(classify_action)

    info_request = SupportDeskAction(
        operation="request_info",
        requested_fields=["workspace_id", "last_successful_login", "billing_email"],
    )
    after_request = env.step(info_request)
    assert after_request.case.customer_follow_up.status == "pending"

    after_wait = env.step(SupportDeskAction(operation="wait"))
    assert after_wait.case.customer_follow_up.status == "partial"
    assert "customer_reply_incomplete" in after_wait.risk_flags
def test_http_reset_step_state_are_session_consistent():
    """POST /reset, POST /step and GET /state all act on the same episode.

    Resets with an explicit episode id, sends one classify action, then checks
    that the state endpoint reports that episode id, one step, and the
    classification fields just submitted. Both the reset and step payloads
    must expose a ``score`` strictly between 0 and 1.
    """
    # The module-level import falls back to ``TestClient = None``; skip rather
    # than fail with "'NoneType' object is not callable".
    if TestClient is None:
        pytest.skip("fastapi TestClient is unavailable")

    from server.app import app

    client = TestClient(app)

    reset_response = client.post("/reset", json={"episode_id": "http-episode"})
    assert reset_response.status_code == 200
    reset_payload = reset_response.json()
    assert "score" in reset_payload
    assert 0.0 < reset_payload["score"] < 1.0

    step_response = client.post(
        "/step",
        json={
            "action": {
                "operation": "classify",
                "queue": "billing_ops",
                "priority": "high",
                "issue_type": "duplicate_charge",
                "status": "new",
                "requested_fields": [],
                "reply": "",
                "internal_note": "",
            }
        },
    )
    assert step_response.status_code == 200
    step_payload = step_response.json()
    assert "score" in step_payload
    assert 0.0 < step_payload["score"] < 1.0

    state_response = client.get("/state")
    assert state_response.status_code == 200
    state_payload = state_response.json()
    assert state_payload["episode_id"] == "http-episode"
    assert state_payload["step_count"] == 1
    assert state_payload["case"]["queue"] == "billing_ops"
    assert state_payload["case"]["priority"] == "high"
    assert state_payload["case"]["issue_type"] == "duplicate_charge"
def test_http_explicit_episode_helpers_work():
    """The ``/episodes/{episode_id}/...`` routes address an episode by id.

    Resets with an explicit episode id, steps through the per-episode step
    route, then reads the per-episode state route and checks the episode id,
    step count and the classification fields just submitted.
    """
    # The module-level import falls back to ``TestClient = None``; skip rather
    # than fail with "'NoneType' object is not callable".
    if TestClient is None:
        pytest.skip("fastapi TestClient is unavailable")

    from server.app import app

    client = TestClient(app)
    episode_id = "explicit-http-episode"

    reset_response = client.post("/reset", json={"episode_id": episode_id})
    assert reset_response.status_code == 200

    step_response = client.post(
        f"/episodes/{episode_id}/step",
        json={
            "action": {
                "operation": "classify",
                "queue": "billing_ops",
                "priority": "high",
                "issue_type": "duplicate_charge",
            }
        },
    )
    assert step_response.status_code == 200
    step_payload = step_response.json()
    assert "score" in step_payload
    assert 0.0 < step_payload["score"] < 1.0

    state_response = client.get(f"/episodes/{episode_id}/state")
    assert state_response.status_code == 200
    state_payload = state_response.json()
    assert state_payload["episode_id"] == episode_id
    assert state_payload["step_count"] == 1
    assert state_payload["case"]["queue"] == "billing_ops"
    assert state_payload["case"]["priority"] == "high"
    assert state_payload["case"]["issue_type"] == "duplicate_charge"
def test_http_tasks_include_truthy_grader_field():
    """GET /tasks lists at least four tasks, each with a ``graders:`` reference.

    Checks both the reported ``total_tasks`` count and the length of the
    ``tasks`` list, then requires every task's ``grader`` string to start
    with ``"graders:"``.
    """
    # The module-level import falls back to ``TestClient = None``; skip rather
    # than fail with "'NoneType' object is not callable".
    if TestClient is None:
        pytest.skip("fastapi TestClient is unavailable")

    from server.app import app

    client = TestClient(app)
    tasks_response = client.get("/tasks")
    assert tasks_response.status_code == 200

    payload = tasks_response.json()
    assert payload["total_tasks"] >= 4
    assert len(payload["tasks"]) >= 4
    for task in payload["tasks"]:
        assert task["grader"].startswith("graders:")