# HyperBrickCaseOps / tasks/api_incident_hard.py
# Uploaded by modelbuilderhq: "Upload folder using huggingface_hub" (commit 2ade2c6, verified).
from __future__ import annotations
from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
# Static task definition for the "hard" production-API-incident scenario.
# SupportTaskSpec, SupportTicket and KnowledgeSnippet are imported from
# tasks.base; the exact meaning of each field is defined there, so the
# section comments below only describe what the values in this file show.
TASK = SupportTaskSpec(
    # --- Identity and goal -------------------------------------------------
    task_id="api_incident_hard",
    difficulty="hard",
    title="Production API incident triage",
    objective=(
        "Triage a production API latency/5xx incident affecting multiple customers; "
        "collect diagnostics, apply runbook mitigations, and escalate to platform engineering appropriately."
    ),

    # --- Inbound customer ticket -------------------------------------------
    ticket=SupportTicket(
        customer_name="Marco Alvarez",
        customer_tier="enterprise",
        company="Northwind Labs",
        subject="API timeouts for createOrder",
        body=(
            "Since 3 hours ago, createOrder calls are timing out or returning 500s across regions. "
            "We rolled back our last deploy and still see issues. Need RCA and mitigation ASAP."
        ),
        region="us-west-2",
        affected_users=4200,
        sla_minutes_remaining=90,
    ),

    # --- Knowledge-base articles attached to this task ---------------------
    knowledge_base=(
        KnowledgeSnippet(
            article_id="kb-api-runbook",
            title="API latency/5xx runbook",
            content=(
                "Capture request IDs, time window, regions, and payload samples. "
                "Check current status page and incident channel. "
                "If multiple regions impacted, escalate to platform_engineering and set customer expectations."
            ),
        ),
        KnowledgeSnippet(
            article_id="kb-status-page",
            title="Status page policy",
            content="If 2+ enterprise customers report API errors, post a preliminary status within 15 minutes.",
        ),
    ),

    # --- "Gold" target values ----------------------------------------------
    # NOTE(review): presumably the reference answers a grader compares the
    # final ticket state against -- confirm against the consumer of this spec.
    gold_queue="platform_engineering",
    gold_priority="urgent",
    gold_issue_type="production_incident",
    gold_status="escalated",
    gold_resolution_code="runbook_investigation",

    # --- Required content --------------------------------------------------
    # The four field names mirror the diagnostics listed in the
    # "kb-api-runbook" article above.
    required_requested_fields=(
        "request_ids",
        "time_window",
        "regions",
        "payload_sample",
    ),
    # Each inner tuple is one group of marker strings.
    # NOTE(review): how a group is matched (all-of vs. any-of, case handling)
    # is decided by the code that reads this spec -- TODO confirm.
    required_reply_markers=(
        ("acknowledge", "incident"),
        ("collect", "request ids"),
        ("status", "page"),
        ("escalate", "platform"),
    ),
    # Single-element groups; the trailing commas keep each one a tuple.
    required_note_markers=(("status page",), ("platform escalation",), ("request ids",)),
    risk_flags=("sla_breach", "p1_incident"),

    # --- Follow-up data ----------------------------------------------------
    # Only two of the four required fields are listed as provided, and
    # "payload_sample" is listed under the wrong fields; "regions" appears
    # in neither tuple.
    follow_up_outcome="partial",
    follow_up_message="Platform team investigating elevated DB latency; ETA 20 minutes.",
    follow_up_provided_fields=("request_ids", "time_window"),
    follow_up_wrong_fields=("payload_sample",),

    # --- Step / escalation limits ------------------------------------------
    # NOTE(review): units and exact semantics of these numbers are not
    # visible in this file -- verify in tasks.base before changing them.
    sla_step_cost=20,
    over_escalation_queues=(),  # empty: no queues are listed for this task
    under_escalation_deadline_step=3,
    max_steps=8,
)