Pramod Basavaraj Menasi committed on
Commit
66ae73a
·
1 Parent(s): 2668702

fixed errors

Browse files
__pycache__/client.cpython-313.pyc CHANGED
Binary files a/__pycache__/client.cpython-313.pyc and b/__pycache__/client.cpython-313.pyc differ
 
__pycache__/graders.cpython-313.pyc ADDED
Binary file (3.65 kB). View file
 
__pycache__/inference.cpython-313.pyc ADDED
Binary file (10.1 kB). View file
 
__pycache__/models.cpython-313.pyc CHANGED
Binary files a/__pycache__/models.cpython-313.pyc and b/__pycache__/models.cpython-313.pyc differ
 
client.py CHANGED
@@ -1,27 +1,27 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
-
3
- # All rights reserved.
4
-
5
- #
6
-
7
- # This source code is licensed under the BSD-style license found in the
8
-
9
- # LICENSE file in the root directory of this source tree.
10
-
11
- """Incidentops Env Environment Client."""
12
  from __future__ import annotations
13
- from typing import Dict
14
  from openenv.core import EnvClient
15
  from openenv.core.client_types import StepResult
16
  from openenv.core.env_server.types import State
17
  from models import IncidentopsAction, IncidentopsObservation
18
 
 
19
  class IncidentopsEnv(EnvClient[IncidentopsAction, IncidentopsObservation, State]):
 
20
  def _step_payload(self, action: IncidentopsAction) -> Dict:
21
- return {"action": action.action}
 
 
 
 
 
22
 
23
  def _parse_result(self, payload: Dict) -> StepResult[IncidentopsObservation]:
24
- obs_data = payload.get("observation", {})
 
 
 
 
25
  observation = IncidentopsObservation(
26
  alert_summary=obs_data.get("alert_summary", ""),
27
  severity=obs_data.get("severity", "low"),
@@ -38,8 +38,8 @@ class IncidentopsEnv(EnvClient[IncidentopsAction, IncidentopsObservation, State]
38
  incident_resolved=obs_data.get("incident_resolved", False),
39
  wrong_escalations=obs_data.get("wrong_escalations", 0),
40
  metadata=obs_data.get("metadata", {}),
41
- reward=payload.get("reward", 0.0),
42
- done=payload.get("done", False),
43
  )
44
  return StepResult(
45
  observation=observation,
@@ -51,5 +51,4 @@ class IncidentopsEnv(EnvClient[IncidentopsAction, IncidentopsObservation, State]
51
  return State(
52
  episode_id=payload.get("episode_id"),
53
  step_count=payload.get("step_count", 0),
54
- )
55
-
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
+ from typing import Any, Dict
3
  from openenv.core import EnvClient
4
  from openenv.core.client_types import StepResult
5
  from openenv.core.env_server.types import State
6
  from models import IncidentopsAction, IncidentopsObservation
7
 
8
+
9
  class IncidentopsEnv(EnvClient[IncidentopsAction, IncidentopsObservation, State]):
10
+
11
  def _step_payload(self, action: IncidentopsAction) -> Dict:
12
+ # Return just the model_dump — NOT wrapped in {"action": ...}
13
+ # The server will do IncidentopsAction.model_validate(payload)
14
+ return action.model_dump()
15
+
16
+ def _reset_payload(self, **kwargs: Any) -> Dict:
17
+ return kwargs
18
 
19
  def _parse_result(self, payload: Dict) -> StepResult[IncidentopsObservation]:
20
+ obs_data = payload.get("observation", payload)
21
+ if isinstance(obs_data, str):
22
+ import json
23
+ obs_data = json.loads(obs_data)
24
+
25
  observation = IncidentopsObservation(
26
  alert_summary=obs_data.get("alert_summary", ""),
27
  severity=obs_data.get("severity", "low"),
 
38
  incident_resolved=obs_data.get("incident_resolved", False),
39
  wrong_escalations=obs_data.get("wrong_escalations", 0),
40
  metadata=obs_data.get("metadata", {}),
41
+ reward=payload.get("reward", obs_data.get("reward", 0.0)),
42
+ done=payload.get("done", obs_data.get("done", False)),
43
  )
44
  return StepResult(
45
  observation=observation,
 
51
  return State(
52
  episode_id=payload.get("episode_id"),
53
  step_count=payload.get("step_count", 0),
54
+ )
 
debug.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ print("1. Script started", flush=True)
3
+
4
+ print("2. Testing imports...", flush=True)
5
+
6
+ try:
7
+ from dotenv import load_dotenv
8
+ load_dotenv()
9
+ print("3. dotenv OK", flush=True)
10
+ except Exception as e:
11
+ print(f"3. dotenv error: {e}", flush=True)
12
+
13
+ try:
14
+ import httpx
15
+ print("4. httpx OK", flush=True)
16
+ except Exception as e:
17
+ print(f"4. httpx FAILED: {e}", flush=True)
18
+ sys.exit(1)
19
+
20
+ try:
21
+ from openai import OpenAI
22
+ print("5. openai OK", flush=True)
23
+ except Exception as e:
24
+ print(f"5. openai FAILED: {e}", flush=True)
25
+ sys.exit(1)
26
+
27
+ import os
28
+ print(f"6. HF_TOKEN={'set' if os.getenv('HF_TOKEN') else 'missing'}", flush=True)
29
+
30
+ print("7. Testing server...", flush=True)
31
+ try:
32
+ r = httpx.get("http://localhost:8000/tasks", timeout=5.0)
33
+ print(f"8. Server response: {r.status_code}", flush=True)
34
+ except Exception as e:
35
+ print(f"8. Server error: {e}", flush=True)
36
+ sys.exit(1)
37
+
38
+ print("9. Testing reset...", flush=True)
39
+ try:
40
+ r = httpx.post("http://localhost:8000/reset", json={"task_id": "incident_easy"}, timeout=5.0)
41
+ print(f"10. Reset status: {r.status_code}", flush=True)
42
+ data = r.json()
43
+ obs = data.get("observation", data)
44
+ print(f"11. Alert: {obs.get('alert_summary', 'N/A')[:50]}", flush=True)
45
+ except Exception as e:
46
+ print(f"10. Reset error: {e}", flush=True)
47
+ sys.exit(1)
48
+
49
+ print("12. Testing step...", flush=True)
50
+ try:
51
+ r = httpx.post(
52
+ "http://localhost:8000/step",
53
+ json={"action": {"action": "rollback_deploy"}},
54
+ timeout=5.0,
55
+ )
56
+ print(f"13. Step status: {r.status_code}", flush=True)
57
+ print(f"14. Step body: {r.text[:200]}", flush=True)
58
+ except Exception as e:
59
+ print(f"13. Step error: {e}", flush=True)
60
+
61
+ print("15. Testing grade...", flush=True)
62
+ try:
63
+ r = httpx.get("http://localhost:8000/grade", params={"task_id": "incident_easy"}, timeout=5.0)
64
+ print(f"16. Grade status: {r.status_code}", flush=True)
65
+ print(f"17. Grade body: {r.text[:200]}", flush=True)
66
+ except Exception as e:
67
+ print(f"16. Grade error: {e}", flush=True)
68
+
69
+ print("18. ALL DONE", flush=True)
graders.py CHANGED
@@ -1,82 +1,86 @@
 
 
1
  from __future__ import annotations
2
- from typing import Any, Dict
3
 
4
 
5
- class IncidentEasyGrader:
6
- """Grader for easy task: single_service_outage"""
7
-
8
- task_id = "incident_easy"
9
-
10
- def grade(self, trajectory: list[dict]) -> float:
11
- """Score an episode trajectory. Returns 0.0 - 1.0"""
 
 
 
 
 
12
  if not trajectory:
13
  return 0.0
14
-
15
- actions = [s.get("action", "") for s in trajectory if s.get("action")]
16
- resolved = any(s.get("observation", {}).get("incident_resolved", False) for s in trajectory)
17
- steps = len(trajectory)
18
- sla = 5 # easy SLA
19
-
20
- if not resolved:
21
- return max(0.0, 0.1 * (len([a for a in actions if a == "rollback_deploy"]) / max(len(actions), 1)))
22
-
23
- sla_ok = steps <= sla
24
- correctness = sum(1 for a in actions if a in ["rollback_deploy", "resolve_incident"]) / 2.0
25
-
26
- if sla_ok:
27
- return round(min(1.0, 0.5 + 0.5 * correctness), 4)
28
- return round(min(0.6, 0.3 + 0.3 * correctness), 4)
29
-
30
-
31
- class IncidentMediumGrader:
32
- """Grader for medium task: dependency_failure"""
33
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  task_id = "incident_medium"
35
-
36
- def grade(self, trajectory: list[dict]) -> float:
37
- if not trajectory:
38
- return 0.0
39
-
40
- actions = [s.get("action", "") for s in trajectory if s.get("action")]
41
- resolved = any(s.get("observation", {}).get("incident_resolved", False) for s in trajectory)
42
- steps = len(trajectory)
43
- sla = 8
44
-
45
- correct_seq = ["request_logs", "query_dependencies", "escalate_db_team", "restart_service", "resolve_incident"]
46
-
47
- if not resolved:
48
- return max(0.0, 0.1 * (len([a for a in actions if a in correct_seq]) / max(len(correct_seq), 1)))
49
-
50
- sla_ok = steps <= sla
51
- correctness = sum(1 for a in actions if a in correct_seq) / len(correct_seq)
52
-
53
- if sla_ok:
54
- return round(min(1.0, 0.5 + 0.5 * correctness), 4)
55
- return round(min(0.6, 0.3 + 0.3 * correctness), 4)
56
-
57
-
58
- class IncidentHardGrader:
59
- """Grader for hard task: multi_service_root_cause"""
60
-
61
  task_id = "incident_hard"
62
-
63
- def grade(self, trajectory: list[dict]) -> float:
64
- if not trajectory:
65
- return 0.0
66
-
67
- actions = [s.get("action", "") for s in trajectory if s.get("action")]
68
- resolved = any(s.get("observation", {}).get("incident_resolved", False) for s in trajectory)
69
- steps = len(trajectory)
70
- sla = 12
71
-
72
- correct_seq = ["query_region_health", "query_dns_status", "escalate_network_team", "broadcast_status_page", "resolve_incident"]
73
-
74
- if not resolved:
75
- return max(0.0, 0.1 * (len([a for a in actions if a in correct_seq]) / max(len(correct_seq), 1)))
76
-
77
- sla_ok = steps <= sla
78
- correctness = sum(1 for a in actions if a in correct_seq) / len(correct_seq)
79
-
80
- if sla_ok:
81
- return round(min(1.0, 0.5 + 0.5 * correctness), 4)
82
- return round(min(0.6, 0.3 + 0.3 * correctness), 4)
 
1
+ """Graders for the Incidentops environment."""
2
+
3
  from __future__ import annotations
4
+ from typing import Any, Dict, List
5
 
6
 
7
+ class BaseIncidentGrader:
8
+ """Base grader with shared logic."""
9
+
10
+ task_id: str = ""
11
+ expected_actions: List[str] = []
12
+ sla_steps: int = 10
13
+
14
+ def grade(self, trajectory: List[Dict[str, Any]]) -> float:
15
+ """
16
+ Grade a trajectory of (action, observation) pairs.
17
+ Returns a score in [0.0, 1.0].
18
+ """
19
  if not trajectory:
20
  return 0.0
21
+
22
+ actions_taken = []
23
+ resolved = False
24
+ wrong_escalations = 0
25
+
26
+ for entry in trajectory:
27
+ action = entry.get("action", "")
28
+ if isinstance(action, dict):
29
+ action = action.get("action", "")
30
+ actions_taken.append(action)
31
+
32
+ obs = entry.get("observation", {})
33
+ if isinstance(obs, dict):
34
+ resolved = obs.get("incident_resolved", False)
35
+
36
+ total_steps = len(actions_taken)
37
+
38
+ # Correctness: how many correct actions were taken
39
+ correct_actions = sum(
40
+ 1 for a in actions_taken if a in self.expected_actions
41
+ )
42
+ correctness_ratio = correct_actions / max(len(self.expected_actions), 1)
43
+
44
+ # Efficiency bonus
45
+ efficiency_bonus = max(0.0, (self.sla_steps - total_steps) / self.sla_steps)
46
+
47
+ sla_ok = total_steps <= self.sla_steps
48
+
49
+ if resolved and sla_ok:
50
+ score = min(1.0, 0.5 + 0.3 * correctness_ratio + 0.2 * efficiency_bonus)
51
+ elif resolved:
52
+ score = min(0.6, 0.3 + 0.3 * correctness_ratio)
53
+ else:
54
+ score = max(0.0, 0.1 * correctness_ratio)
55
+
56
+ return round(score, 4)
57
+
58
+
59
+ class IncidentEasyGrader(BaseIncidentGrader):
60
+ task_id = "incident_easy"
61
+ expected_actions = ["rollback_deploy", "resolve_incident"]
62
+ sla_steps = 5
63
+
64
+
65
+ class IncidentMediumGrader(BaseIncidentGrader):
66
  task_id = "incident_medium"
67
+ expected_actions = [
68
+ "request_logs",
69
+ "query_dependencies",
70
+ "escalate_db_team",
71
+ "restart_service",
72
+ "resolve_incident",
73
+ ]
74
+ sla_steps = 8
75
+
76
+
77
+ class IncidentHardGrader(BaseIncidentGrader):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  task_id = "incident_hard"
79
+ expected_actions = [
80
+ "query_region_health",
81
+ "query_dns_status",
82
+ "escalate_network_team",
83
+ "broadcast_status_page",
84
+ "resolve_incident",
85
+ ]
86
+ sla_steps = 12
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -1,149 +1,181 @@
1
- from __future__ import annotations
2
- from dotenv import load_dotenv
3
- import os
4
-
5
- load_dotenv()
6
- import asyncio
7
  import json
8
  import os
9
- from typing import List, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  from openai import OpenAI
12
 
13
- from client import IncidentopsEnv
14
- from models import IncidentopsAction
15
 
16
  API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
17
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
18
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
19
- TASK_NAME = os.getenv("INCIDENTOPS_TASK", "incidentops")
20
- BENCHMARK = os.getenv("INCIDENTOPS_BENCHMARK", "incidentops_env")
21
- MAX_STEPS = int(os.getenv("MAX_STEPS", "12"))
22
- TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
23
  ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
24
- TASK_ID = os.getenv("TASK_ID", "incident_easy")
 
25
 
26
- SYSTEM_PROMPT = """
27
- You are an incident-response policy.
28
- Choose exactly one action from the environment's available actions.
29
- Prefer investigation when confidence is low.
30
- Prefer mitigation or escalation when evidence points to a cause.
31
- Return only the action string.
32
- """.strip()
33
 
34
 
35
- def log_start(task: str, env: str, model: str) -> None:
36
  print(f"[START] task={task} env={env} model={model}", flush=True)
37
 
38
 
39
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
40
- print(
41
- f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error if error else 'null'}",
42
- flush=True,
43
- )
44
 
45
 
46
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
47
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
48
- print(
49
- f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}",
50
- flush=True,
51
- )
52
 
53
 
54
- def choose_action(client: OpenAI, obs) -> str:
55
- available = obs.available_actions or []
 
 
 
56
  if not available:
57
  return "resolve_incident"
58
-
59
- prompt = {
60
- "alert_summary": obs.alert_summary,
61
- "severity": obs.severity,
62
- "likely_cause": obs.likely_cause,
63
- "hf_confidence": obs.hf_confidence,
64
- "logs_available": obs.logs_available,
65
- "log_snippet": obs.log_snippet,
66
- "services_affected": obs.services_affected,
67
- "elapsed_steps": obs.elapsed_steps,
68
- "sla_steps_remaining": obs.sla_steps_remaining,
69
- "action_history": obs.action_history,
70
- "available_actions": available,
71
- }
72
-
73
- response = client.chat.completions.create(
74
- model=MODEL_NAME,
75
- messages=[
76
- {"role": "system", "content": SYSTEM_PROMPT},
77
- {"role": "user", "content": json.dumps(prompt)},
78
- ],
79
- temperature=TEMPERATURE,
80
- max_tokens=20,
81
- )
82
- text = (response.choices[0].message.content or "").strip().splitlines()[0].strip()
83
-
84
- if text in available:
85
- return text
86
-
87
- # fallback heuristics
88
- if not obs.logs_available and "request_logs" in available:
89
  return "request_logs"
90
- if obs.likely_cause == "dns_issue" and "query_dns_status" in available:
91
- return "query_dns_status"
92
- if obs.likely_cause == "dependency_issue" and "query_dependencies" in available:
93
  return "query_dependencies"
94
- if obs.hf_confidence < 0.7 and "query_region_health" in available:
95
  return "query_region_health"
96
- if "resolve_incident" in available and (obs.service_healthy or obs.incident_resolved):
 
 
 
 
 
 
 
 
 
 
97
  return "resolve_incident"
98
- return available[0]
99
 
100
 
101
- async def main() -> None:
102
- if not API_KEY:
103
- raise RuntimeError("Missing HF_TOKEN/API_KEY/OPENAI_API_KEY")
 
 
 
 
 
104
 
105
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
106
- env = await IncidentopsEnv.from_docker_image(os.getenv("IMAGE_NAME")) if os.getenv("IMAGE_NAME") else IncidentopsEnv(base_url=ENV_URL)
107
 
108
- rewards: List[float] = []
 
 
109
  steps_taken = 0
110
  success = False
111
  score = 0.0
112
 
113
- log_start(TASK_NAME, BENCHMARK, MODEL_NAME)
114
 
115
  try:
116
- result = await env.reset(task_id=TASK_ID)
117
- obs = result.observation
 
 
 
 
118
 
119
  for step in range(1, MAX_STEPS + 1):
120
- if result.done:
121
  break
122
 
123
- action_name = choose_action(client, obs)
124
- result = await env.step(IncidentopsAction(action=action_name))
125
- obs = result.observation
126
- reward = float(result.reward or 0.0)
127
- done = bool(result.done)
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  rewards.append(reward)
130
  steps_taken = step
131
- log_step(step, action_name, reward, done, None)
132
 
133
- if done:
134
- break
 
 
 
 
135
 
136
- total_reward = sum(rewards)
137
- score = max(0.0, min(1.0, total_reward / 5.0))
138
- success = bool(obs.incident_resolved) and score >= 0.1
139
 
140
  finally:
141
- try:
142
- await env.close()
143
- except Exception:
144
- pass
145
  log_end(success, steps_taken, score, rewards)
146
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  if __name__ == "__main__":
149
- asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
+ import sys
4
+ import traceback
5
+
6
+ print("[DEBUG] line 6", flush=True)
7
+
8
+ try:
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+ except ImportError:
12
+ pass
13
+
14
+ print("[DEBUG] line 14", flush=True)
15
+
16
+ import httpx
17
+
18
+ print("[DEBUG] line 18", flush=True)
19
 
20
  from openai import OpenAI
21
 
22
+ print("[DEBUG] line 22", flush=True)
 
23
 
24
  API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
25
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
26
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
27
+ BENCHMARK = "incidentops_env"
28
+ TASK_IDS = ["incident_easy", "incident_medium", "incident_hard"]
 
 
29
  ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
30
+ MAX_STEPS = 12
31
+ TEMPERATURE = 0.2
32
 
33
+ print("[DEBUG] line 33", flush=True)
 
 
 
 
 
 
34
 
35
 
36
+ def log_start(task, env, model):
37
  print(f"[START] task={task} env={env} model={model}", flush=True)
38
 
39
 
40
+ def log_step(step, action, reward, done, error):
41
+ err = error if error else "null"
42
+ d = str(done).lower()
43
+ print(f"[STEP] step={step} action={action} reward={reward:.2f} done={d} error={err}", flush=True)
 
44
 
45
 
46
+ def log_end(success, steps, score, rewards):
47
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
48
+ print(f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}", flush=True)
 
 
 
49
 
50
 
51
+ def choose_action(obs):
52
+ available = obs.get("available_actions", [])
53
+ logs_available = obs.get("logs_available", False)
54
+ likely_cause = obs.get("likely_cause", "unknown")
55
+
56
  if not available:
57
  return "resolve_incident"
58
+ if not logs_available and "request_logs" in available:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  return "request_logs"
60
+ if likely_cause == "bad_deployment" and "rollback_deploy" in available:
61
+ return "rollback_deploy"
62
+ if likely_cause == "dependency_issue" and "query_dependencies" in available:
63
  return "query_dependencies"
64
+ if likely_cause == "ambiguous" and "query_region_health" in available:
65
  return "query_region_health"
66
+ if likely_cause == "dns_issue" and "query_dns_status" in available:
67
+ return "query_dns_status"
68
+ if likely_cause == "db_timeout" and "escalate_db_team" in available:
69
+ return "escalate_db_team"
70
+ if likely_cause == "dns_issue" and "escalate_network_team" in available:
71
+ return "escalate_network_team"
72
+ if likely_cause == "dns_issue" and "broadcast_status_page" in available:
73
+ return "broadcast_status_page"
74
+ if "restart_service" in available and likely_cause in ("db_timeout", "bad_deployment"):
75
+ return "restart_service"
76
+ if "resolve_incident" in available:
77
  return "resolve_incident"
78
+ return available[0] if available else "resolve_incident"
79
 
80
 
81
+ def extract_obs(data):
82
+ if "observation" in data:
83
+ obs = data["observation"]
84
+ else:
85
+ obs = data
86
+ if isinstance(obs, str):
87
+ obs = json.loads(obs)
88
+ return obs
89
 
 
 
90
 
91
+ def run_task(http, task_id):
92
+ print(f"[DEBUG] Starting task: {task_id}", flush=True)
93
+ rewards = []
94
  steps_taken = 0
95
  success = False
96
  score = 0.0
97
 
98
+ log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
99
 
100
  try:
101
+ r = http.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30.0)
102
+ r.raise_for_status()
103
+ obs = extract_obs(r.json())
104
+ print(f"[DEBUG] Reset OK: cause={obs.get('likely_cause')}", flush=True)
105
+
106
+ finished = obs.get("done", False) or obs.get("incident_resolved", False)
107
 
108
  for step in range(1, MAX_STEPS + 1):
109
+ if finished:
110
  break
111
 
112
+ action_name = choose_action(obs)
113
+ print(f"[DEBUG] Step {step}: {action_name}", flush=True)
114
+
115
+ r = http.post(
116
+ f"{ENV_URL}/step",
117
+ json={"action": {"action": action_name}},
118
+ timeout=30.0,
119
+ )
120
+ r.raise_for_status()
121
+ step_data = r.json()
122
+ obs = extract_obs(step_data)
123
+
124
+ reward = float(step_data.get("reward", obs.get("reward", 0.0)))
125
+ finished = bool(
126
+ step_data.get("done", obs.get("done", False))
127
+ or obs.get("incident_resolved", False)
128
+ )
129
 
130
  rewards.append(reward)
131
  steps_taken = step
132
+ log_step(step, action_name, reward, finished, None)
133
 
134
+ r = http.get(f"{ENV_URL}/grade", params={"task_id": task_id}, timeout=30.0)
135
+ r.raise_for_status()
136
+ grade = r.json()
137
+ score = float(grade.get("score", 0.0))
138
+ success = bool(grade.get("success", False))
139
+ print(f"[DEBUG] Grade: {grade}", flush=True)
140
 
141
+ except Exception as e:
142
+ print(f"[DEBUG] Error: {e}", flush=True)
143
+ traceback.print_exc()
144
 
145
  finally:
 
 
 
 
146
  log_end(success, steps_taken, score, rewards)
147
 
148
 
149
+ print("[DEBUG] line 137 - about to define main", flush=True)
150
+
151
+
152
+ def main():
153
+ print(f"[DEBUG] main() called", flush=True)
154
+ print(f"[DEBUG] ENV_URL={ENV_URL}", flush=True)
155
+
156
+ http = httpx.Client()
157
+
158
+ try:
159
+ r = http.get(f"{ENV_URL}/tasks", timeout=10.0)
160
+ print(f"[DEBUG] Server OK: {r.status_code}", flush=True)
161
+ except Exception as e:
162
+ print(f"[ERROR] Server not running: {e}", flush=True)
163
+ return
164
+
165
+ for task_id in TASK_IDS:
166
+ run_task(http, task_id)
167
+
168
+ http.close()
169
+ print("[DEBUG] Done!", flush=True)
170
+
171
+
172
+ print("[DEBUG] line 160 - about to check name", flush=True)
173
+ print(f"[DEBUG] name = {__name__}", flush=True)
174
+
175
  if __name__ == "__main__":
176
+ print("[DEBUG] entering main()", flush=True)
177
+ try:
178
+ main()
179
+ except Exception as e:
180
+ print(f"[FATAL] {e}", flush=True)
181
+ traceback.print_exc()
openenv.yaml CHANGED
@@ -3,7 +3,7 @@ name: incidentops_env
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
- port: 8000
7
 
8
  tasks:
9
  - id: incident_easy
@@ -18,6 +18,7 @@ tasks:
18
 
19
  - id: incident_medium
20
  name: "Dependency Failure (Medium)"
 
21
  reset_kwargs:
22
  task_id: incident_medium
23
  grader:
@@ -27,6 +28,7 @@ tasks:
27
 
28
  - id: incident_hard
29
  name: "Multi-Service Root Cause (Hard)"
 
30
  reset_kwargs:
31
  task_id: incident_hard
32
  grader:
 
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
+ port: 8000
7
 
8
  tasks:
9
  - id: incident_easy
 
18
 
19
  - id: incident_medium
20
  name: "Dependency Failure (Medium)"
21
+ description: "Investigate and resolve cascading failures caused by database timeouts affecting multiple services."
22
  reset_kwargs:
23
  task_id: incident_medium
24
  grader:
 
28
 
29
  - id: incident_hard
30
  name: "Multi-Service Root Cause (Hard)"
31
+ description: "Diagnose EU checkout failures with ambiguous signals across auth, payment, and checkout services caused by DNS issues."
32
  reset_kwargs:
33
  task_id: incident_hard
34
  grader:
server/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (326 Bytes). View file
 
server/__pycache__/app.cpython-313.pyc ADDED
Binary file (3.82 kB). View file
 
server/__pycache__/incidentops_env_environment.cpython-313.pyc ADDED
Binary file (13.3 kB). View file
 
server/incidentops_env_environment.py CHANGED
@@ -1,16 +1,3 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- Incidentops Env Environment Implementation.
9
-
10
- A simple test environment that echoes back messages sent to it.
11
- Perfect for testing HTTP server infrastructure.
12
- """
13
-
14
  from __future__ import annotations
15
 
16
  from dataclasses import dataclass, field
@@ -21,8 +8,8 @@ from openenv.core.env_server.interfaces import Environment
21
  from openenv.core.env_server.types import State
22
 
23
  try:
24
- from ..models import IncidentopsAction, IncidentopsObservation
25
- except Exception:
26
  from models import IncidentopsAction, IncidentopsObservation
27
 
28
 
@@ -66,16 +53,15 @@ SCENARIOS: Dict[str, List[Dict[str, Any]]] = {
66
  "request_logs",
67
  "rollback_deploy",
68
  "restart_service",
69
- "resolve_incident"
70
  ],
71
  "correct_action_sequence": [
72
  "rollback_deploy",
73
- "resolve_incident"
74
  ],
75
  "sla_steps": 5,
76
  }
77
  ],
78
-
79
  "incident_medium": [
80
  {
81
  "scenario_id": "medium_001",
@@ -101,12 +87,11 @@ SCENARIOS: Dict[str, List[Dict[str, Any]]] = {
101
  "query_dependencies",
102
  "escalate_db_team",
103
  "restart_service",
104
- "resolve_incident"
105
  ],
106
  "sla_steps": 8,
107
  }
108
  ],
109
-
110
  "incident_hard": [
111
  {
112
  "scenario_id": "hard_001",
@@ -146,16 +131,12 @@ SCENARIOS: Dict[str, List[Dict[str, Any]]] = {
146
  class IncidentopsEnvironment(Environment):
147
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
148
 
149
- def __init__(self):
150
  self._state = State(episode_id=str(uuid4()), step_count=0)
151
  self._snapshot: Optional[IncidentSnapshot] = None
152
  self._difficulty = "easy"
153
  self._last_observation: Optional[IncidentopsObservation] = None
154
 
155
- def _pick_scenario(self, difficulty: str) -> Dict[str, Any]:
156
- scenarios = SCENARIOS.get(difficulty, SCENARIOS["easy"])
157
- return scenarios[0]
158
-
159
  def _build_observation(self) -> IncidentopsObservation:
160
  assert self._snapshot is not None
161
  remaining = max(self._snapshot.sla_steps - self._snapshot.step_count, 0)
@@ -189,7 +170,7 @@ class IncidentopsEnvironment(Environment):
189
  assert self._snapshot is not None
190
  s = self._snapshot
191
 
192
- reward = -0.05 # small step cost
193
 
194
  if s.action_history.count(action) > 1:
195
  reward -= 0.2
@@ -245,7 +226,9 @@ class IncidentopsEnvironment(Environment):
245
 
246
  if action == "resolve_incident":
247
  if s.resolved or s.hidden_truth in {"bad_deployment", "db_timeout", "dns_issue"}:
248
- if s.step_count <= s.sla_steps and (s.evidence_collected or s.team_engaged is not None or s.hidden_truth == "bad_deployment"):
 
 
249
  reward += 1.5
250
  s.resolved = True
251
  else:
@@ -258,38 +241,43 @@ class IncidentopsEnvironment(Environment):
258
 
259
  return reward
260
 
261
- def reset(
262
- self,
263
- episode_id: str = None,
264
- task_id: str = "incident_easy",
265
- **kwargs
266
- ) -> IncidentopsObservation:
267
-
268
- # ✅ Pick scenario based on task_id (not difficulty)
269
  scenarios = SCENARIOS.get(task_id, SCENARIOS["incident_easy"])
270
  scenario = scenarios[0]
271
 
272
- # Initialize state
273
- self._state = State(
274
- episode_id=episode_id or str(uuid4()),
275
- step_count=0
276
- )
277
-
278
- # ✅ Load scenario into snapshot
279
  self._snapshot = IncidentSnapshot(**scenario)
280
  self._snapshot.action_history = []
281
 
282
- # ✅ Build first observation
283
  self._last_observation = self._build_observation()
284
-
285
  return self._last_observation
286
 
287
- def step(self, action: IncidentopsAction) -> IncidentopsObservation: # type: ignore[override]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  assert self._snapshot is not None
 
289
  self._snapshot.step_count += 1
290
  self._state.step_count = self._snapshot.step_count
291
 
292
- action_name = action.action
293
  self._snapshot.action_history.append(action_name)
294
 
295
  reward = self._calc_reward(action_name)
@@ -303,25 +291,25 @@ class IncidentopsEnvironment(Environment):
303
  "last_action": action_name,
304
  "last_reward": reward,
305
  }
 
306
  if done:
307
  grade_result = self.grade()
308
- obs.grader_score = grade_result["score"]
309
 
310
  self._last_observation = obs
 
311
  return obs
 
312
  def grade(self) -> dict:
313
- """Called by the OpenEnv validator to score a completed episode."""
314
  assert self._snapshot is not None
315
  s = self._snapshot
316
 
317
- total_steps = max(s.step_count, 1) # ✅ used below
318
  sla_ok = s.step_count <= s.sla_steps
319
  correct_actions = sum(
320
  1 for a in s.action_history if a in s.correct_action_sequence
321
  )
322
  correctness_ratio = correct_actions / max(len(s.correct_action_sequence), 1)
323
-
324
- # ✅ efficiency bonus — fewer steps = better score
325
  efficiency_bonus = max(0.0, (s.sla_steps - total_steps) / s.sla_steps)
326
 
327
  if s.resolved and sla_ok:
@@ -341,6 +329,7 @@ class IncidentopsEnvironment(Environment):
341
  "wrong_escalations": s.wrong_escalations,
342
  "evidence_collected": s.evidence_collected,
343
  }
 
344
  @property
345
  def state(self) -> State:
346
- return self._state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass, field
 
8
  from openenv.core.env_server.types import State
9
 
10
  try:
11
+ from models import IncidentopsAction, IncidentopsObservation
12
+ except ImportError:
13
  from models import IncidentopsAction, IncidentopsObservation
14
 
15
 
 
53
  "request_logs",
54
  "rollback_deploy",
55
  "restart_service",
56
+ "resolve_incident",
57
  ],
58
  "correct_action_sequence": [
59
  "rollback_deploy",
60
+ "resolve_incident",
61
  ],
62
  "sla_steps": 5,
63
  }
64
  ],
 
65
  "incident_medium": [
66
  {
67
  "scenario_id": "medium_001",
 
87
  "query_dependencies",
88
  "escalate_db_team",
89
  "restart_service",
90
+ "resolve_incident",
91
  ],
92
  "sla_steps": 8,
93
  }
94
  ],
 
95
  "incident_hard": [
96
  {
97
  "scenario_id": "hard_001",
 
131
  class IncidentopsEnvironment(Environment):
132
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
133
 
134
+ def init(self):
135
  self._state = State(episode_id=str(uuid4()), step_count=0)
136
  self._snapshot: Optional[IncidentSnapshot] = None
137
  self._difficulty = "easy"
138
  self._last_observation: Optional[IncidentopsObservation] = None
139
 
 
 
 
 
140
  def _build_observation(self) -> IncidentopsObservation:
141
  assert self._snapshot is not None
142
  remaining = max(self._snapshot.sla_steps - self._snapshot.step_count, 0)
 
170
  assert self._snapshot is not None
171
  s = self._snapshot
172
 
173
+ reward = -0.05
174
 
175
  if s.action_history.count(action) > 1:
176
  reward -= 0.2
 
226
 
227
  if action == "resolve_incident":
228
  if s.resolved or s.hidden_truth in {"bad_deployment", "db_timeout", "dns_issue"}:
229
+ if s.step_count <= s.sla_steps and (
230
+ s.evidence_collected or s.team_engaged is not None or s.hidden_truth == "bad_deployment"
231
+ ):
232
  reward += 1.5
233
  s.resolved = True
234
  else:
 
241
 
242
  return reward
243
 
244
+ def reset(self, episode_id=None, task_id="incident_easy", **kwargs):
245
+ print(f"[ENV] reset called: task_id={task_id}", flush=True)
 
 
 
 
 
 
246
  scenarios = SCENARIOS.get(task_id, SCENARIOS["incident_easy"])
247
  scenario = scenarios[0]
248
 
249
+ self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
 
 
 
 
 
 
250
  self._snapshot = IncidentSnapshot(**scenario)
251
  self._snapshot.action_history = []
252
 
 
253
  self._last_observation = self._build_observation()
 
254
  return self._last_observation
255
 
256
+ def step(self, action) -> IncidentopsObservation:
257
+ """Handle step - accept both IncidentopsAction objects and dicts."""
258
+ print(f"[ENV] step called: action={action}, type={type(action)}", flush=True)
259
+
260
+ # Extract action string from whatever format we receive
261
+ if isinstance(action, IncidentopsAction):
262
+ action_name = action.action
263
+ elif isinstance(action, dict):
264
+ action_name = action.get("action", "resolve_incident")
265
+ elif isinstance(action, str):
266
+ action_name = action
267
+ else:
268
+ action_name = str(action)
269
+
270
+ print(f"[ENV] action_name={action_name}", flush=True)
271
+
272
+ if self._snapshot is None:
273
+ print("[ENV] ERROR: No snapshot! Calling reset first.", flush=True)
274
+ self.reset()
275
+
276
  assert self._snapshot is not None
277
+
278
  self._snapshot.step_count += 1
279
  self._state.step_count = self._snapshot.step_count
280
 
 
281
  self._snapshot.action_history.append(action_name)
282
 
283
  reward = self._calc_reward(action_name)
 
291
  "last_action": action_name,
292
  "last_reward": reward,
293
  }
294
+
295
  if done:
296
  grade_result = self.grade()
297
+ obs.metadata["grader_score"] = grade_result["score"]
298
 
299
  self._last_observation = obs
300
+ print(f"[ENV] step done: reward={reward:.2f}, done={done}", flush=True)
301
  return obs
302
+
303
  def grade(self) -> dict:
 
304
  assert self._snapshot is not None
305
  s = self._snapshot
306
 
307
+ total_steps = max(s.step_count, 1)
308
  sla_ok = s.step_count <= s.sla_steps
309
  correct_actions = sum(
310
  1 for a in s.action_history if a in s.correct_action_sequence
311
  )
312
  correctness_ratio = correct_actions / max(len(s.correct_action_sequence), 1)
 
 
313
  efficiency_bonus = max(0.0, (s.sla_steps - total_steps) / s.sla_steps)
314
 
315
  if s.resolved and sla_ok:
 
329
  "wrong_escalations": s.wrong_escalations,
330
  "evidence_collected": s.evidence_collected,
331
  }
332
+
333
  @property
334
  def state(self) -> State:
335
+ return self._state
test_inference.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-test the local inference setup.

Run as a script. It reports, in order:

1. which optional dependencies (httpx, openai, models, dotenv) import,
2. whether the HF_TOKEN / API_KEY environment variables are set,
3. the responses of the locally running server's /tasks, /reset and
   /step endpoints.

Every check is best-effort: a failure is printed and the remaining checks
still run.  Nothing is executed on import, so test collectors that pick the
file up by its ``test_`` prefix do not trigger network calls.
"""
import importlib
import os
import sys

BASE_URL = "http://localhost:8000"
REQUEST_TIMEOUT = 5.0
BODY_PREVIEW_CHARS = 300


def probe_import(module_name, names=()):
    """Try to import *module_name* and report the outcome on stdout.

    Args:
        module_name: Dotted name of the module to import.
        names: Attribute names that must exist on the module; a missing one
            is reported as an import failure, mirroring ``from m import x``.

    Returns:
        The imported module, or ``None`` when the import failed.
    """
    try:
        module = importlib.import_module(module_name)
        for name in names:
            if not hasattr(module, name):
                raise ImportError(
                    f"cannot import name {name!r} from {module_name!r}"
                )
    except ImportError as exc:
        print(f"{module_name} import FAILED: {exc}", flush=True)
        return None
    print(f"{module_name} imported OK", flush=True)
    return module


def load_env_file():
    """Load variables from a .env file when python-dotenv is installed."""
    try:
        from dotenv import load_dotenv
    except ImportError:
        # dotenv is optional; the variables may already be exported.
        print("dotenv not available (OK)", flush=True)
        return
    load_dotenv()
    print("dotenv loaded OK", flush=True)


def check_endpoint(http, method, path, error_label, *, base_url=BASE_URL,
                   payload=None, body_limit=None):
    """Call one server endpoint and print its status code and body.

    Args:
        http: The ``httpx`` module (or any object exposing ``get``/``post``
            with the same keyword arguments).
        method: ``"get"`` or ``"post"``.
        path: Endpoint path, e.g. ``"/tasks"``.
        error_label: Prefix for the failure message, e.g. ``"Server"``.
        base_url: Server root the path is appended to.
        payload: JSON body to send; omitted from the request when ``None``.
        body_limit: Maximum number of body characters to print; ``None``
            prints the whole body.

    Returns:
        ``True`` when a response was received, ``False`` otherwise.
    """
    request_kwargs = {"timeout": REQUEST_TIMEOUT}
    if payload is not None:
        request_kwargs["json"] = payload
    try:
        response = getattr(http, method)(base_url + path, **request_kwargs)
        status = response.status_code
        body = response.text[:body_limit]
    except Exception as exc:  # diagnostic boundary: report anything, keep going
        print(f" {error_label} error: {exc}", flush=True)
        return False
    print(f" {path} status: {status}", flush=True)
    print(f" {path} body: {body}", flush=True)
    return True


def main(base_url=BASE_URL):
    """Run every diagnostic check in order, printing progress as it goes."""
    print("Script started", flush=True)
    print(f"Python: {sys.executable}", flush=True)

    http = probe_import("httpx")
    probe_import("openai", names=("OpenAI",))
    probe_import("models", names=("IncidentopsAction", "IncidentopsObservation"))
    load_env_file()

    for variable in ("HF_TOKEN", "API_KEY"):
        print(f"{variable} set: {bool(os.getenv(variable))}", flush=True)

    print("\nTesting server connection...", flush=True)
    if http is None:
        # Without httpx there is nothing to send requests with; the import
        # failure was already reported above, so skip instead of crashing.
        print(" Skipped: httpx is not installed", flush=True)
    else:
        check_endpoint(http, "get", "/tasks", "Server", base_url=base_url)
        check_endpoint(
            http, "post", "/reset", "Reset",
            base_url=base_url,
            payload={"task_id": "incident_easy"},
            body_limit=BODY_PREVIEW_CHARS,
        )
        check_endpoint(
            http, "post", "/step", "Step",
            base_url=base_url,
            payload={"action": "rollback_deploy"},
            body_limit=BODY_PREVIEW_CHARS,
        )

    print("\nAll checks done!", flush=True)


if __name__ == "__main__":
    main()