Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket | |
# Task specification for the "api_incident_hard" support scenario: a
# production API latency/5xx incident reported by an enterprise customer.
# Everything below is literal data; how each field is interpreted is defined
# by SupportTaskSpec in tasks.base (not visible from this module).
TASK = SupportTaskSpec(
    # --- Identity and goal -------------------------------------------------
    task_id="api_incident_hard",
    difficulty="hard",
    title="Production API incident triage",
    objective=(
        "Triage a production API latency/5xx incident "
        "affecting multiple customers; collect diagnostics, "
        "apply runbook mitigations, and escalate to platform "
        "engineering appropriately."
    ),
    # --- Inbound customer ticket -------------------------------------------
    ticket=SupportTicket(
        customer_name="Marco Alvarez",
        customer_tier="enterprise",
        company="Northwind Labs",
        subject="API timeouts for createOrder",
        body=(
            "Since 3 hours ago, createOrder calls are timing out "
            "or returning 500s across regions. We rolled back our "
            "last deploy and still see issues. "
            "Need RCA and mitigation ASAP."
        ),
        region="us-west-2",
        affected_users=4200,
        sla_minutes_remaining=90,
    ),
    # --- Knowledge-base articles attached to the task ----------------------
    knowledge_base=(
        KnowledgeSnippet(
            article_id="kb-api-runbook",
            title="API latency/5xx runbook",
            content=(
                "Capture request IDs, time window, regions, "
                "and payload samples. Check current status page "
                "and incident channel. If multiple regions impacted, "
                "escalate to platform_engineering and set customer expectations."
            ),
        ),
        KnowledgeSnippet(
            article_id="kb-status-page",
            title="Status page policy",
            content=(
                "If 2+ enterprise customers report API errors, "
                "post a preliminary status within 15 minutes."
            ),
        ),
    ),
    # --- gold_* values ------------------------------------------------------
    # NOTE(review): presumably the reference answers the agent's final ticket
    # state is compared against; confirm against the grader in tasks.base.
    gold_queue="platform_engineering",
    gold_priority="urgent",
    gold_issue_type="production_incident",
    gold_status="escalated",
    gold_resolution_code="runbook_investigation",
    # --- Required content ---------------------------------------------------
    # The four requested fields mirror the items the "kb-api-runbook" article
    # says to capture.
    required_requested_fields=(
        "request_ids",
        "time_window",
        "regions",
        "payload_sample",
    ),
    # Each marker is a tuple of lowercase terms.
    # NOTE(review): whether the terms in one tuple are alternatives or must
    # all appear is decided by the consumer; verify before editing.
    required_reply_markers=(
        ("acknowledge", "incident"),
        ("collect", "request ids"),
        ("status", "page"),
        ("escalate", "platform"),
    ),
    required_note_markers=(
        ("status page",),
        ("platform escalation",),
        ("request ids",),
    ),
    risk_flags=("sla_breach", "p1_incident"),
    # --- Follow-up data -----------------------------------------------------
    # Only two of the four requested fields are listed as provided, and
    # "payload_sample" is listed under the wrong fields.
    follow_up_outcome="partial",
    follow_up_message=(
        "Platform team investigating elevated DB latency; "
        "ETA 20 minutes."
    ),
    follow_up_provided_fields=("request_ids", "time_window"),
    follow_up_wrong_fields=("payload_sample",),
    # --- Step / SLA parameters ----------------------------------------------
    # NOTE(review): units and exact semantics of these numbers are defined by
    # the environment, not here; confirm before tuning.
    sla_step_cost=20,
    over_escalation_queues=(),  # empty: no queue is listed as over-escalation
    under_escalation_deadline_step=3,
    max_steps=8,
)