Jayant-Kernel committed on
Commit
cbfd883
·
0 Parent(s):

initial: cicd diagnosis env for openenv hackathon

Browse files
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile
FROM python:3.11-slim

WORKDIR /app

# install deps first for better layer caching
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# copy the package and the inference script
COPY cicd_diagnosis_env/ /app/cicd_diagnosis_env/
COPY inference.py /app/inference.py

# make `import cicd_diagnosis_env` resolve regardless of working directory
ENV PYTHONPATH=/app
EXPOSE 8000

# probe the FastAPI /health route with stdlib only (no curl in the slim image)
HEALTHCHECK --interval=10s --timeout=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

CMD ["uvicorn", "cicd_diagnosis_env.server.app:app", \
     "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CI/CD Failure Diagnosis Environment
2
+
3
+ An OpenEnv RL environment where an agent reads broken CI/CD pipeline logs and diagnoses what went wrong.
4
+
5
+ ## What the Agent Does
6
+
7
+ Given a synthetic pipeline failure log, the agent must:
8
+ 1. Classify the failure category (`dependency`, `config`, `flaky`, `code_bug`, `infra`)
9
+ 2. Identify the root cause
10
+ 3. Suggest a concrete fix
11
+
12
+ ## Three Task Tiers
13
+
14
+ | Task | Difficulty | Failure Type |
15
+ |------|-----------|-------------|
16
+ | 1 | Easy | Single `ModuleNotFoundError` — one missing package |
17
+ | 2 | Medium | Misconfigured env var causes 3 cascading test failures |
18
+ | 3 | Hard | Async timing flaky test (looks like a code bug, isn't) |
19
+
20
+ ## Reward Function
21
+
22
+ | Component | Points |
23
+ |-----------|--------|
24
+ | Correct failure category | +0.20 |
25
+ | Correct root cause | +0.30 |
26
+ | Valid fix suggested | +0.30 |
27
+ | Confidence calibration | +0.20 |
28
+ | Per irrelevant section mentioned | -0.10 |
29
+
30
+ ## Quick Start
31
+
32
+ ```bash
33
+ # Build and run the server
34
+ docker build -t cicd-env .
35
+ docker run -p 8000:8000 cicd-env
36
+
37
+ # Run the LLM agent
38
+ export API_BASE_URL=https://api.openai.com/v1
39
+ export MODEL_NAME=gpt-4o-mini
40
+ export OPENAI_API_KEY=sk-...
41
+ export ENV_URL=http://localhost:8000
42
+ python inference.py
43
+ ```
44
+
45
+ ## API Endpoints
46
+
47
+ | Endpoint | Method | Description |
48
+ |----------|--------|-------------|
49
+ | `/reset` | POST | Start new episode, returns initial observation with pipeline log |
50
+ | `/step` | POST | Submit `DiagnoseAction`, returns scored observation |
51
+ | `/state` | GET | Current episode metadata |
52
+ | `/health` | GET | Health check |
53
+
54
+ ## Action Schema
55
+
56
+ ```json
57
+ {
58
+ "action": {
59
+ "failure_category": "dependency",
60
+ "root_cause": "missing pytest-cov package",
61
+ "suggested_fix": "add pytest-cov to requirements.txt",
62
+ "confidence": 0.9
63
+ }
64
+ }
65
+ ```
__pycache__/inference.cpython-314.pyc ADDED
Binary file (5.93 kB). View file
 
cicd_diagnosis_env/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Public surface of the CI/CD diagnosis environment package."""
from .models import DiagnoseAction, PipelineObservation, PipelineState

# client import is here so callers can do: from cicd_diagnosis_env import CICDEnv
try:
    from .client import CICDEnv
except ImportError:
    # client.py may pull in optional deps (openenv SDK / requests);
    # keep the models importable even when those are absent
    CICDEnv = None

__all__ = ["DiagnoseAction", "PipelineObservation", "PipelineState", "CICDEnv"]
cicd_diagnosis_env/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (442 Bytes). View file
 
cicd_diagnosis_env/__pycache__/client.cpython-314.pyc ADDED
Binary file (5 kB). View file
 
cicd_diagnosis_env/__pycache__/models.cpython-314.pyc ADDED
Binary file (2.88 kB). View file
 
cicd_diagnosis_env/client.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cicd_diagnosis_env/client.py
2
+ # HTTP client wrapper — tries openenv SDK first, falls back to raw requests
3
+ from __future__ import annotations
4
+
5
+ try:
6
+ from openenv.core.client_types import StepResult
7
+ from openenv.core.env_client import EnvClient
8
+ _sdk = True
9
+ except ImportError:
10
+ _sdk = False
11
+
12
+ from cicd_diagnosis_env.models import DiagnoseAction, PipelineObservation, PipelineState
13
+
14
if _sdk:
    class CICDEnv(EnvClient[DiagnoseAction, PipelineObservation, PipelineState]):
        """Typed OpenEnv client for the CI/CD diagnosis server."""

        # --- EnvClient abstract hooks ---

        def _step_payload(self, action: DiagnoseAction) -> dict:
            # wire format expected by /step endpoint under "action" key
            return {
                "failure_category": action.failure_category,
                "root_cause": action.root_cause,
                "suggested_fix": action.suggested_fix,
                "confidence": action.confidence,
            }

        def _parse_result(self, payload: dict) -> StepResult[PipelineObservation]:
            # Expecting: { "observation": {...}, "reward": float|null, "done": bool }
            obs = PipelineObservation(**payload["observation"])
            return StepResult(
                observation=obs,
                reward=payload.get("reward"),
                done=bool(payload.get("done", False)),
            )

        def _parse_state(self, payload: dict) -> PipelineState:
            # defaults mirror PipelineState's field defaults so partial payloads parse
            return PipelineState(
                episode_id=payload.get("episode_id", ""),
                step_count=payload.get("step_count", 0),
                last_score=payload.get("last_score", 0.0),
                task_id=payload.get("task_id", 0),
                pipeline_name=payload.get("pipeline_name", ""),
            )

else:
    # fallback when openenv SDK is not installed — plain requests, sync only
    import requests

    class CICDEnv:  # type: ignore[no-redef]
        """Minimal synchronous HTTP client mirroring the SDK client's reset/step surface."""

        def __init__(self, base_url: str = "http://localhost:8000"):
            # normalise so f"{base_url}/step" never produces a double slash
            self.base_url = base_url.rstrip("/")

        def reset(self) -> PipelineObservation:
            """POST /reset and return the initial observation."""
            r = requests.post(f"{self.base_url}/reset", timeout=30)
            r.raise_for_status()
            return PipelineObservation(**r.json()["observation"])

        def step(self, action: DiagnoseAction) -> PipelineObservation:
            """POST /step with the action payload; return the scored observation."""
            payload = {
                "action": {
                    "failure_category": action.failure_category,
                    "root_cause": action.root_cause,
                    "suggested_fix": action.suggested_fix,
                    "confidence": action.confidence,
                }
            }
            r = requests.post(f"{self.base_url}/step", json=payload, timeout=30)
            r.raise_for_status()
            return PipelineObservation(**r.json()["observation"])
cicd_diagnosis_env/models.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ try:
6
+ from openenv.core.env_server.interfaces import Action, Observation, State
7
+ from dataclasses import dataclass, field
8
+ except ImportError:
9
+ # running outside the openenv SDK (local dev, tests) — define minimal base classes
10
+ from dataclasses import dataclass, field
11
+
12
+ @dataclass(kw_only=True)
13
+ class Action:
14
+ pass
15
+
16
+ @dataclass(kw_only=True)
17
+ class Observation:
18
+ done: bool = False
19
+ reward: Optional[float] = None
20
+ metadata: Dict[str, Any] = field(default_factory=dict)
21
+
22
+ @dataclass(kw_only=True)
23
+ class State:
24
+ episode_id: str = ""
25
+ step_count: int = 0
26
+
27
+
28
+ @dataclass(kw_only=True)
29
+ class DiagnoseAction(Action):
30
+ # one of: dependency, config, flaky, code_bug, infra
31
+ failure_category: str
32
+ root_cause: str
33
+ suggested_fix: str
34
+ # 0-1 self-reported confidence; grader scales the 0.20 bonus by this
35
+ confidence: float = 0.8
36
+
37
+
38
+ @dataclass(kw_only=True)
39
+ class PipelineObservation(Observation):
40
+ pipeline_log: str = ""
41
+ error_summary: str = "" # one-liner pulled from the log
42
+ pipeline_stage: str = ""
43
+ task_id: int = 0 # 1=easy 2=medium 3=hard
44
+ attempt: int = 0
45
+ feedback: str = ""
46
+ score: float = 0.0
47
+ # TODO: add structured fields for log sections once we have more task types
48
+
49
+
50
+ @dataclass(kw_only=True)
51
+ class PipelineState(State):
52
+ last_score: float = 0.0
53
+ task_id: int = 0
54
+ pipeline_name: str = ""
cicd_diagnosis_env/server/__init__.py ADDED
File without changes
cicd_diagnosis_env/server/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (174 Bytes). View file
 
cicd_diagnosis_env/server/__pycache__/app.cpython-314.pyc ADDED
Binary file (3.92 kB). View file
 
cicd_diagnosis_env/server/__pycache__/environment.cpython-314.pyc ADDED
Binary file (4.67 kB). View file
 
cicd_diagnosis_env/server/__pycache__/graders.cpython-314.pyc ADDED
Binary file (7.08 kB). View file
 
cicd_diagnosis_env/server/__pycache__/log_generator.cpython-314.pyc ADDED
Binary file (10 kB). View file
 
cicd_diagnosis_env/server/app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# cicd_diagnosis_env/server/app.py
# FastAPI wiring for the OpenEnv-style HTTP surface: /reset /step /state /health

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Any, Dict

from cicd_diagnosis_env.models import DiagnoseAction
from cicd_diagnosis_env.server.environment import CICDEnvironment

app = FastAPI(title="cicd_diagnosis_env", version="0.1.0")

# one shared env instance — fine for hackathon scale
# TODO: add per-session map if we need concurrent multi-agent training
_env = CICDEnvironment()
15
+
16
+
17
class StepRequest(BaseModel):
    """Request body for /step."""

    # raw action dict from the client; validated into DiagnoseAction inside /step
    action: Dict[str, Any]
19
+
20
+
21
@app.post("/reset")
def reset():
    """Start a new episode and return its initial observation payload."""
    return _obs_dict(_env.reset())
25
+
26
+
27
@app.post("/step")
def step(req: StepRequest):
    """Validate the submitted action, apply it, and return the scored observation."""
    try:
        action = DiagnoseAction(**req.action)
    except Exception as e:
        # malformed/missing action fields -> 422 so clients can tell
        # validation errors apart from server-side failures
        raise HTTPException(status_code=422, detail=str(e))
    return _obs_dict(_env.step(action))
35
+
36
+
37
@app.get("/state")
def get_state():
    """Expose the current episode metadata as plain JSON."""
    current = _env.state
    fields = ("episode_id", "step_count", "last_score", "task_id", "pipeline_name")
    return {name: getattr(current, name) for name in fields}
47
+
48
+
49
@app.get("/health")
def health():
    """Liveness probe — used by the Docker HEALTHCHECK."""
    return {"status": "healthy"}
52
+
53
+
54
def _obs_dict(obs):
    """Serialize an observation into the OpenEnv wire format.

    The observation payload is kept separate from reward/done so clients
    can parse responses uniformly.
    """
    obs_fields = (
        "pipeline_log",
        "error_summary",
        "pipeline_stage",
        "task_id",
        "attempt",
        "feedback",
        "score",
    )
    return {
        "observation": {name: getattr(obs, name) for name in obs_fields},
        "reward": obs.reward,
        "done": obs.done,
        "info": {},
    }
70
+
71
+
72
def main():
    """Run the API server directly (outside Docker, for local development)."""
    # imported lazily so importing app.py never requires uvicorn
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
cicd_diagnosis_env/server/environment.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# cicd_diagnosis_env/server/environment.py
import uuid

try:
    from openenv.core.env_server.interfaces import Environment
except ImportError:
    # stand-in base class when the openenv SDK isn't installed (local dev, tests)
    class Environment:
        pass

from cicd_diagnosis_env.models import DiagnoseAction, PipelineObservation, PipelineState
from cicd_diagnosis_env.server.log_generator import generate_log
from cicd_diagnosis_env.server.graders import grade

MAX_STEPS = 3  # agent gets up to 3 attempts per episode
15
+
16
+
17
class CICDEnvironment(Environment):
    """
    CI/CD failure diagnosis environment.
    Agent sees a broken pipeline log and must identify category, root cause, and fix.
    Episode ends on perfect score or after MAX_STEPS attempts.
    """

    def __init__(self):
        # live episode state, exposed via the `state` property / the /state route
        self._state = PipelineState()
        # ground-truth metadata from the log generator; consumed by graders.grade
        self._meta = {}
        # raw synthetic pipeline log shown to the agent
        self._log = ""

    def reset(self):
        """Start a new episode: generate a fresh log, reset state, return the first observation."""
        self._log, self._meta = generate_log()
        self._state = PipelineState(episode_id=str(uuid.uuid4()), step_count=0)
        self._state.task_id = self._meta["task_id"]
        self._state.pipeline_name = self._meta.get("failed_stage", "unknown")
        self._state.last_score = 0.0
        summary = _extract_summary(self._log)
        # initial observation: reward 0, attempt 0, no feedback yet
        obs = PipelineObservation(done=False, reward=0.0)
        obs.pipeline_log = self._log
        obs.error_summary = summary
        obs.pipeline_stage = self._meta["failed_stage"]
        obs.task_id = self._meta["task_id"]
        obs.attempt = 0
        obs.feedback = ""
        obs.score = 0.0
        return obs

    def step(self, action):
        """Grade one diagnosis attempt; the observation's reward is the grader score.

        Raises ValueError when given anything other than a DiagnoseAction.
        """
        if not isinstance(action, DiagnoseAction):
            raise ValueError(f"expected DiagnoseAction, got {type(action)}")

        self._state.step_count += 1
        step_num = self._state.step_count

        score, feedback = grade(action, self._meta)
        self._state.last_score = score

        # TODO: track per-episode score history for better feedback
        # episode terminates on a perfect score or when attempts are exhausted
        done = score >= 1.0 or step_num >= MAX_STEPS

        obs = PipelineObservation(done=done, reward=score)
        obs.pipeline_log = self._log
        obs.error_summary = _extract_summary(self._log)
        obs.pipeline_stage = self._meta["failed_stage"]
        obs.task_id = self._meta["task_id"]
        obs.attempt = step_num
        obs.feedback = feedback
        obs.score = score
        return obs

    @property
    def state(self):
        # read-only view of the current episode state
        return self._state
72
+
73
+
74
def _extract_summary(log):
    """Return the most informative one-line error summary from a pipeline log.

    Prefers the actual exception line (e.g. ``ModuleNotFoundError: ...`` or
    ``asyncio.exceptions.TimeoutError``); falls back to the first non-noise
    ``[ERROR]`` line, then ``"unknown error"``.

    BUG FIX: the old version returned the *first* non-noise line, which for a
    traceback is the source line (``import pkg``, ``result = await ...``) —
    not the "actual error class line" this helper is documented to grab.
    """
    fallback = ""
    for line in log.splitlines():
        if "[ERROR]" not in line:
            continue
        tail = line.split("[ERROR]")[-1].strip()
        # skip frame lines and assertion lines — not useful as a summary
        if tail.startswith(('File "', "assert ", "Traceback")):
            continue
        # skip pass/fail tallies and stage status lines
        if "FAILED" in tail or "failed" in tail:
            continue
        # an exception line names the error class before any message text
        head = tail.split(":", 1)[0]
        if "Error" in head or "Exception" in head:
            return tail
        if not fallback:
            fallback = tail
    return fallback or "unknown error"
cicd_diagnosis_env/server/graders.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cicd_diagnosis_env/server/graders.py
2
+ from __future__ import annotations
3
+
4
+
5
def _cat_score(predicted, expected):
    """0.20 when the predicted category equals the expected one (case- and whitespace-insensitive on the prediction)."""
    matched = predicted.strip().lower() == expected.lower()
    return 0.20 if matched else 0.0
7
+
8
+
9
def _cause_score(predicted, keywords):
    """0.30 when any accepted keyword appears in the predicted root cause.

    Matching is case-insensitive substring — agents phrase things differently.
    """
    haystack = predicted.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return 0.30
    return 0.0
13
+
14
+
15
def _fix_score(predicted, keywords):
    """0.30 when the suggested fix mentions any accepted keyword (case-insensitive substring)."""
    text = predicted.lower()
    hit = any(kw.lower() in text for kw in keywords)
    return 0.30 if hit else 0.0
18
+
19
+
20
def _conf_score(conf):
    """Scale the 0.20 calibration bonus by self-reported confidence.

    Confidence is clamped to [0, 1]. Only called when the base score > 0,
    so a confident wrong answer earns nothing here. Real calibration would
    need a held-out set; this is good enough for now.
    """
    clamped = min(1.0, max(0.0, float(conf)))
    return round(clamped * 0.20, 4)
24
+
25
+
26
def grade_task1(action, meta):
    """Score the easy tier: a single ModuleNotFoundError.

    Correct category is "dependency"; root cause / fix must mention the
    missing package or an equivalent phrasing. Mentioning stages that
    passed fine costs 0.10 each.
    """
    pkg = meta["pkg"]
    notes = []
    total = 0.0

    category_pts = _cat_score(action.failure_category, "dependency")
    total += category_pts
    notes.append(f"cat={'OK' if category_pts else 'MISS'}")

    cause_pts = _cause_score(action.root_cause, [pkg, "ModuleNotFoundError", "missing"])
    total += cause_pts
    notes.append(f"cause={'OK' if cause_pts else 'MISS'}")

    fix_pts = _fix_score(action.suggested_fix, ["requirements", "pyproject", "install", pkg])
    total += fix_pts
    notes.append(f"fix={'OK' if fix_pts else 'MISS'}")

    if total > 0:
        bonus = _conf_score(action.confidence)
        total += bonus
        notes.append(f"conf={bonus:.2f}")

    # penalise mentioning stages that passed fine
    mentioned = (action.root_cause + action.suggested_fix).lower()
    for stage in ("checkout", "lint", "setup-python"):
        if stage in mentioned:
            total -= 0.10
            notes.append(f"penalty(irrelevant:{stage})")

    total = max(0.0, min(1.0, round(total, 4)))
    return total, f"task1: {', '.join(notes)} => {total}"
57
+
58
+
59
def grade_task2(action, meta):
    """Score the medium tier: a missing env var causing cascading test failures.

    Correct category is "config". Blaming the tests themselves (a symptom)
    without naming the variable costs 0.10.
    """
    var = meta["var_name"]
    notes = []
    total = 0.0

    category_pts = _cat_score(action.failure_category, "config")
    total += category_pts
    notes.append(f"cat={'OK' if category_pts else 'MISS'}")

    cause_pts = _cause_score(
        action.root_cause, [var, "environment variable", "config", "missing var"]
    )
    total += cause_pts
    notes.append(f"cause={'OK' if cause_pts else 'MISS'}")

    fix_pts = _fix_score(action.suggested_fix, [var, "secret", ".env", "CI", "environment"])
    total += fix_pts
    notes.append(f"fix={'OK' if fix_pts else 'MISS'}")

    if total > 0:
        bonus = _conf_score(action.confidence)
        total += bonus
        notes.append(f"conf={bonus:.2f}")

    # blaming the test code is wrong — that's a symptom
    cause_text = action.root_cause.lower()
    if "test" in cause_text and var.lower() not in cause_text:
        total -= 0.10
        notes.append("penalty(blamed tests not config)")

    total = max(0.0, min(1.0, round(total, 4)))
    return total, f"task2: {', '.join(notes)} => {total}"
89
+
90
+
91
def grade_task3(action, meta):
    """Score the hard tier: an async-timeout flaky test.

    The whole point of this task is distinguishing "flaky" from "code_bug";
    misclassifying as code_bug — or citing a logic error with no mention of
    the timeout — costs a heavy 0.20.
    """
    notes = []
    total = 0.0

    category_pts = _cat_score(action.failure_category, "flaky")
    total += category_pts
    notes.append(f"cat={'OK' if category_pts else 'MISS'}")

    cause_pts = _cause_score(
        action.root_cause,
        ["timeout", "timing", "async", "intermittent", "flaky", "race"],
    )
    total += cause_pts
    notes.append(f"cause={'OK' if cause_pts else 'MISS'}")

    fix_pts = _fix_score(
        action.suggested_fix,
        ["timeout", "retry", "increase", "skip", "xfail", "flaky marker"],
    )
    total += fix_pts
    notes.append(f"fix={'OK' if fix_pts else 'MISS'}")

    if total > 0:
        bonus = _conf_score(action.confidence)
        total += bonus
        notes.append(f"conf={bonus:.2f}")

    cause_text = action.root_cause.lower()
    blamed_code = "code_bug" in action.failure_category.lower()
    blamed_logic = "logic" in cause_text and "timeout" not in cause_text
    if blamed_code or blamed_logic:
        total -= 0.20
        notes.append("penalty(misclassified as code_bug)")

    total = max(0.0, min(1.0, round(total, 4)))
    return total, f"task3: {', '.join(notes)} => {total}"
128
+
129
+
130
+ _GRADERS = {1: grade_task1, 2: grade_task2, 3: grade_task3}
131
+
132
+
133
def grade(action, meta):
    """Dispatch to the per-task grader; returns (0.0, message) for unknown task ids."""
    tid = meta.get("task_id")
    grader = _GRADERS.get(tid)
    if grader is None:
        return 0.0, f"unknown task_id {tid}"
    return grader(action, meta)
cicd_diagnosis_env/server/log_generator.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# cicd_diagnosis_env/server/log_generator.py
# Synthetic pipeline-log templates: each tier returns (log_text, ground-truth meta).
import random
from datetime import datetime, timedelta

# packages that show up as missing deps in task1
_PACKAGES = [
    "pytest-cov", "httpx", "pydantic", "sqlalchemy",
    "celery", "redis", "boto3", "fastapi", "uvicorn",
    "alembic", "mypy", "black", "ruff", "anyio",
]

# repository names used only for log flavour
_REPOS = [
    "api-gateway", "user-service", "payment-service",
    "notification-worker", "data-pipeline", "auth-service",
]

# branch names used only for log flavour
_BRANCHES = ["main", "develop", "feat/auth-refactor", "fix/retry-logic", "release/v2.1"]
18
+
19
+
20
def _ts(base, offset_s):
    """ISO-8601-style UTC timestamp *offset_s* seconds after *base*."""
    stamp = base + timedelta(seconds=offset_s)
    return stamp.strftime("%Y-%m-%dT%H:%M:%SZ")
22
+
23
+
24
def _rand_line():
    """A plausible source-file line number for synthetic tracebacks (12-340 inclusive)."""
    return random.randrange(12, 341)
26
+
27
+
28
def _task1_log():
    # easy - one obvious ModuleNotFoundError, single stage fails
    # NOTE(review): datetime.utcnow() is deprecated on 3.12+ — consider datetime.now(timezone.utc)
    base = datetime.utcnow().replace(microsecond=0)
    pkg = random.choice(_PACKAGES)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    line = _rand_line()
    run_id = random.randint(1000, 9999)

    # template lines are flush-left so the rendered log has no stray indentation
    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout - OK
{_ts(base, 4)} [INFO ] Stage: setup-python - OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies - OK
{_ts(base, 8)} [INFO ] Stage: lint - OK
{_ts(base, 10)} [INFO ] Stage: test - RUNNING
{_ts(base, 11)} [ERROR] Traceback (most recent call last):
{_ts(base, 11)} [ERROR] File "tests/test_main.py", line {line}, in test_endpoint
{_ts(base, 11)} [ERROR] import {pkg}
{_ts(base, 11)} [ERROR] ModuleNotFoundError: No module named '{pkg}'
{_ts(base, 12)} [ERROR] Stage: test - FAILED (exit code 1)
{_ts(base, 13)} [INFO ] Stage: deploy - SKIPPED
{_ts(base, 13)} [ERROR] Pipeline FAILED"""

    # ground truth consumed by graders.grade_task1 — never shown to the agent
    meta = {
        "task_id": 1,
        "failure_category": "dependency",
        "root_cause": f"missing package: {pkg}",
        "expected_fix": f"add {pkg} to requirements.txt or pyproject.toml",
        "failed_stage": "test",
        "pkg": pkg,
    }
    return log, meta
67
+
68
+
69
def _task2_log():
    # medium - missing env var causes 3 downstream test failures
    # the tricky part: symptoms look like test failures but root cause is config
    # NOTE(review): datetime.utcnow() is deprecated on 3.12+ — consider datetime.now(timezone.utc)
    base = datetime.utcnow().replace(microsecond=0)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    run_id = random.randint(1000, 9999)
    var_name = random.choice(["DATABASE_URL", "REDIS_URL", "SECRET_KEY", "API_BASE_URL"])
    # three distinct failing tests make the cascade look like a test problem
    test_names = random.sample(
        ["test_create_user", "test_login", "test_refresh_token",
         "test_get_profile", "test_update_settings", "test_delete_account"],
        3,
    )
    lines = [_rand_line() for _ in range(3)]

    # each failing test contributes a FAILED header plus a frame/assert/AssertionError trio
    failing_tests = ""
    for i, (t, ln) in enumerate(zip(test_names, lines)):
        failing_tests += (
            f"{_ts(base, 14 + i)} [ERROR] FAILED tests/test_api.py::{t}\n"
            f"{_ts(base, 14 + i)} [ERROR] File \"tests/test_api.py\", line {ln}, in {t}\n"
            f"{_ts(base, 14 + i)} [ERROR] assert response.status_code == 200\n"
            f"{_ts(base, 14 + i)} [ERROR] AssertionError: assert 500 == 200\n"
        )

    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout - OK
{_ts(base, 4)} [INFO ] Stage: setup-python - OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies - OK
{_ts(base, 8)} [INFO ] Stage: lint - OK
{_ts(base, 10)} [INFO ] Stage: test - RUNNING
{_ts(base, 12)} [WARN ] Environment variable {var_name} not set, using fallback value ''
{_ts(base, 13)} [INFO ] Connecting to service... using config: {var_name}=''
{failing_tests.rstrip()}
{_ts(base, 17)} [ERROR] 3 failed, 12 passed in 4.21s
{_ts(base, 18)} [ERROR] Stage: test - FAILED (exit code 1)
{_ts(base, 19)} [INFO ] Stage: deploy - SKIPPED
{_ts(base, 19)} [ERROR] Pipeline FAILED"""

    # ground truth consumed by graders.grade_task2 — never shown to the agent
    meta = {
        "task_id": 2,
        "failure_category": "config",
        "root_cause": f"missing environment variable: {var_name}",
        "expected_fix": f"set {var_name} in CI/CD secrets or .env file",
        "failed_stage": "test",
        "var_name": var_name,
        "failing_tests": test_names,
    }
    return log, meta
124
+
125
+
126
def _task3_log():
    """Hard tier: an async timeout that *looks* like a code bug but is flaky timing.

    The discriminating hint — "This test passed on the last 4 runs" — is buried
    near the end of the log. Graders expect category "flaky".
    """
    base = datetime.utcnow().replace(microsecond=0)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    run_id = random.randint(1000, 9999)
    timeout_ms = random.choice([50, 100, 150, 200])
    test_name = random.choice(
        ["test_async_handler", "test_concurrent_requests",
         "test_background_task", "test_websocket_ping"]
    )
    line = _rand_line()
    elapsed_ms = random.randint(10, 99)  # for the "2.XX s" runtime line

    # BUG FIX: the pytest CLI line previously formatted the timeout with :.1f
    # while the wait_for line used :.2f, so timeout_ms=50 rendered as
    # "--timeout=0.1" but "timeout=0.05" — an internally inconsistent log.
    # Both lines now share one :.2f rendering.
    timeout_s = f"{timeout_ms / 1000:.2f}"

    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout - OK
{_ts(base, 4)} [INFO ] Stage: setup-python - OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies - OK
{_ts(base, 8)} [INFO ] Stage: lint - OK
{_ts(base, 10)} [INFO ] Stage: test - RUNNING
{_ts(base, 11)} [INFO ] pytest -x tests/ --timeout={timeout_s}
{_ts(base, 13)} [ERROR] FAILED tests/test_handlers.py::{test_name}
{_ts(base, 13)} [ERROR] File "tests/test_handlers.py", line {line}, in {test_name}
{_ts(base, 13)} [ERROR] result = await asyncio.wait_for(handler(), timeout={timeout_s})
{_ts(base, 13)} [ERROR] asyncio.exceptions.TimeoutError
{_ts(base, 14)} [ERROR] 1 failed, 27 passed in 2.{elapsed_ms}s
{_ts(base, 15)} [WARN ] Note: This test passed on the last 4 runs
{_ts(base, 15)} [ERROR] Stage: test - FAILED (exit code 1)
{_ts(base, 16)} [INFO ] Stage: deploy - SKIPPED
{_ts(base, 16)} [ERROR] Pipeline FAILED"""

    # ground truth consumed by graders.grade_task3 — never shown to the agent
    meta = {
        "task_id": 3,
        "failure_category": "flaky",
        "root_cause": f"async timeout in {test_name} - timing-sensitive, not a logic bug",
        "expected_fix": "increase timeout or add retry logic for this test; do not modify handler code",
        "failed_stage": "test",
        "test_name": test_name,
        "timeout_ms": timeout_ms,
    }
    return log, meta
175
+
176
+
177
+ _GENERATORS = {1: _task1_log, 2: _task2_log, 3: _task3_log}
178
+
179
+
180
def generate_log(task_id=None):
    """Produce a synthetic (log_text, metadata) pair for the given tier.

    With task_id=None a tier is picked uniformly at random — useful during
    training. Raises ValueError for any id outside {1, 2, 3}.
    """
    if task_id is None:
        task_id = random.choice([1, 2, 3])
    generator = _GENERATORS.get(task_id)
    if generator is None:
        raise ValueError(f"task_id must be 1, 2, or 3 - got {task_id}")
    return generator()
docs/superpowers/plans/2026-04-05-cicd-diagnosis-env.md ADDED
@@ -0,0 +1,1148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CI/CD Failure Diagnosis Environment Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Build a complete OpenEnv-compliant RL environment for CI/CD failure diagnosis with hybrid log generation, deterministic grading, and an LLM inference script.
6
+
7
+ **Architecture:** FastAPI server inside Docker exposes `/reset` and `/step` endpoints. `log_generator.py` produces synthetic logs by randomizing seed templates — the injected failure metadata is stored in server state so `graders.py` can score deterministically. Three difficulty tiers map to three grader functions.
8
+
9
+ **Tech Stack:** Python 3.11, FastAPI, Pydantic v2, OpenEnv core SDK, OpenAI-compatible client (for inference.py), Docker, uvicorn
10
+
11
+ ---
12
+
13
+ ## File Map
14
+
15
+ | File | Role |
16
+ |------|------|
17
+ | `cicd_diagnosis_env/server/log_generator.py` | Hybrid template engine — seed templates + randomization, returns (log_str, metadata) |
18
+ | `cicd_diagnosis_env/models.py` | DiagnoseAction, PipelineObservation, PipelineState Pydantic dataclasses |
19
+ | `cicd_diagnosis_env/server/graders.py` | grade_task1, grade_task2, grade_task3 — return (score, feedback) |
20
+ | `cicd_diagnosis_env/server/environment.py` | CICDEnvironment(Environment) — reset/step/state property |
21
+ | `cicd_diagnosis_env/server/app.py` | FastAPI app — manual routes /reset /step /state /health |
22
+ | `cicd_diagnosis_env/client.py` | CICDEnv(EnvClient) — _step_payload, _parse_result, _parse_state |
23
+ | `inference.py` | LLM agent loop — [START]/[STEP]/[END] structured logs |
24
+ | `cicd_diagnosis_env/Dockerfile` | Server container — port 8000 |
25
+ | `cicd_diagnosis_env/openenv.yaml` | OpenEnv manifest |
26
+ | `cicd_diagnosis_env/README.md` | Environment documentation |
27
+
28
+ ---
29
+
30
+ ## Task 1: log_generator.py
31
+
32
+ **Files:**
33
+ - Create: `cicd_diagnosis_env/server/log_generator.py`
34
+
35
+ - [ ] **Step 1: Create the file with imports and helpers**
36
+
37
+ ```python
38
+ # cicd_diagnosis_env/server/log_generator.py
39
+ import random
40
+ import uuid
41
+ from datetime import datetime, timedelta
42
+
43
+ # packages that can appear in task1 dependency errors
44
+ _PACKAGES = [
45
+ "pytest-cov", "httpx", "pydantic", "sqlalchemy",
46
+ "celery", "redis", "boto3", "fastapi", "uvicorn",
47
+ "alembic", "mypy", "black", "ruff", "anyio",
48
+ ]
49
+
50
+ _REPOS = [
51
+ "api-gateway", "user-service", "payment-service",
52
+ "notification-worker", "data-pipeline", "auth-service",
53
+ ]
54
+
55
+ _BRANCHES = ["main", "develop", "feat/auth-refactor", "fix/retry-logic", "release/v2.1"]
56
+
57
+ def _ts(base: datetime, offset_s: int) -> str:
58
+ return (base + timedelta(seconds=offset_s)).strftime("%Y-%m-%dT%H:%M:%SZ")
59
+
60
+ def _rand_line() -> int:
61
+ return random.randint(12, 340)
62
+ ```
63
+
64
+ - [ ] **Step 2: Add Task 1 generator — single ModuleNotFoundError**
65
+
66
+ ```python
67
def _task1_log() -> tuple[str, dict]:
    """Easy tier: a single ModuleNotFoundError fails the test stage.

    Returns (log_text, metadata); metadata carries the ground truth the
    graders compare against (category, root cause, expected fix).
    """
    # truncate to whole seconds so every rendered timestamp lines up
    base = datetime.utcnow().replace(microsecond=0)
    pkg = random.choice(_PACKAGES)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    line = _rand_line()
    run_id = random.randint(1000, 9999)

    # every prior stage reports OK so the missing package is the only signal
    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout — OK
{_ts(base, 4)} [INFO ] Stage: setup-python — OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies — OK
{_ts(base, 8)} [INFO ] Stage: lint — OK
{_ts(base, 10)} [INFO ] Stage: test — RUNNING
{_ts(base, 11)} [ERROR] Traceback (most recent call last):
{_ts(base, 11)} [ERROR] File "tests/test_main.py", line {line}, in test_endpoint
{_ts(base, 11)} [ERROR] import {pkg}
{_ts(base, 11)} [ERROR] ModuleNotFoundError: No module named '{pkg}'
{_ts(base, 12)} [ERROR] Stage: test — FAILED (exit code 1)
{_ts(base, 13)} [INFO ] Stage: deploy — SKIPPED
{_ts(base, 13)} [ERROR] Pipeline FAILED
"""

    # ground truth consumed by graders.grade_task1
    meta = {
        "task_id": 1,
        "failure_category": "dependency",
        "root_cause": f"missing package: {pkg}",
        "expected_fix": f"add {pkg} to requirements.txt or pyproject.toml",
        "failed_stage": "test",
        "pkg": pkg,
    }
    return log.strip(), meta
107
+ ```
108
+
109
+ - [ ] **Step 3: Add Task 2 generator — config error causing cascading test failures**
110
+
111
+ ```python
112
def _task2_log() -> tuple[str, dict]:
    """Medium tier: a bad env-var config causes 3 tests to fail downstream.

    The WARN line about the unset variable is the real signal; the three
    AssertionErrors that follow are symptoms. Returns (log_text, metadata).
    """
    # truncate to whole seconds so every rendered timestamp lines up
    base = datetime.utcnow().replace(microsecond=0)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    run_id = random.randint(1000, 9999)
    var_name = random.choice(["DATABASE_URL", "REDIS_URL", "SECRET_KEY", "API_BASE_URL"])
    # pick 3 distinct test names to fail in cascade
    test_names = random.sample(
        ["test_create_user", "test_login", "test_refresh_token",
         "test_get_profile", "test_update_settings", "test_delete_account"],
        3,
    )
    lines = [_rand_line() for _ in range(3)]

    # build the per-test failure section; each failing test is stamped one
    # second after the previous one
    failing_tests = ""
    for i, (t, ln) in enumerate(zip(test_names, lines)):
        failing_tests += f"""{_ts(base, 14 + i)} [ERROR] FAILED tests/test_api.py::{t}
{_ts(base, 14 + i)} [ERROR] File "tests/test_api.py", line {ln}, in {t}
{_ts(base, 14 + i)} [ERROR] assert response.status_code == 200
{_ts(base, 14 + i)} [ERROR] AssertionError: assert 500 == 200
"""

    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout — OK
{_ts(base, 4)} [INFO ] Stage: setup-python — OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies — OK
{_ts(base, 8)} [INFO ] Stage: lint — OK
{_ts(base, 10)} [INFO ] Stage: test — RUNNING
{_ts(base, 12)} [WARN ] Environment variable {var_name} not set, using fallback value ''
{_ts(base, 13)} [INFO ] Connecting to service... using config: {var_name}=''
{failing_tests.rstrip()}
{_ts(base, 17)} [ERROR] 3 failed, 12 passed in 4.21s
{_ts(base, 18)} [ERROR] Stage: test — FAILED (exit code 1)
{_ts(base, 19)} [INFO ] Stage: deploy — SKIPPED
{_ts(base, 19)} [ERROR] Pipeline FAILED
"""

    # ground truth consumed by graders.grade_task2
    meta = {
        "task_id": 2,
        "failure_category": "config",
        "root_cause": f"missing environment variable: {var_name}",
        "expected_fix": f"set {var_name} in CI/CD secrets or .env file",
        "failed_stage": "test",
        "var_name": var_name,
        "failing_tests": test_names,
    }
    return log.strip(), meta
166
+ ```
167
+
168
+ - [ ] **Step 4: Add Task 3 generator — async timing / flaky test**
169
+
170
+ ```python
171
def _task3_log() -> tuple[str, dict]:
    """Hard tier: async timing failure that looks like a code bug but is flaky.

    The WARN line ("passed on the last 4 runs") is the tell that this is
    intermittent, not a logic bug. Returns (log_text, metadata).
    """
    # truncate to whole seconds so every rendered timestamp lines up
    base = datetime.utcnow().replace(microsecond=0)
    repo = random.choice(_REPOS)
    branch = random.choice(_BRANCHES)
    run_id = random.randint(1000, 9999)
    timeout_ms = random.choice([50, 100, 150, 200])
    test_name = random.choice(
        ["test_async_handler", "test_concurrent_requests",
         "test_background_task", "test_websocket_ping"]
    )
    line = _rand_line()

    # FIX: format the pytest --timeout with :.2f to match the wait_for
    # traceback line. The old :.1f rendered 50 ms as "0.1" while the
    # traceback said "0.05" — the log contradicted itself.
    log = f"""##[group]Run details
Repository: {repo}
Branch: {branch}
Run ID: {run_id}
Triggered: push
##[endgroup]

{_ts(base, 0)} [INFO ] Pipeline started
{_ts(base, 2)} [INFO ] Stage: checkout — OK
{_ts(base, 4)} [INFO ] Stage: setup-python — OK
{_ts(base, 6)} [INFO ] Stage: install-dependencies — OK
{_ts(base, 8)} [INFO ] Stage: lint — OK
{_ts(base, 10)} [INFO ] Stage: test — RUNNING
{_ts(base, 11)} [INFO ] pytest -x tests/ --timeout={timeout_ms / 1000:.2f}
{_ts(base, 13)} [ERROR] FAILED tests/test_handlers.py::{test_name}
{_ts(base, 13)} [ERROR] File "tests/test_handlers.py", line {line}, in {test_name}
{_ts(base, 13)} [ERROR] result = await asyncio.wait_for(handler(), timeout={timeout_ms / 1000:.2f})
{_ts(base, 13)} [ERROR] asyncio.exceptions.TimeoutError
{_ts(base, 14)} [ERROR] 1 failed, 27 passed in 2.{random.randint(10,99)}s
{_ts(base, 15)} [WARN ] Note: This test passed on the last 4 runs
{_ts(base, 15)} [ERROR] Stage: test — FAILED (exit code 1)
{_ts(base, 16)} [INFO ] Stage: deploy — SKIPPED
{_ts(base, 16)} [ERROR] Pipeline FAILED
"""

    # ground truth consumed by graders.grade_task3
    meta = {
        "task_id": 3,
        "failure_category": "flaky",
        "root_cause": f"async timeout in {test_name} — timing-sensitive, not a logic bug",
        "expected_fix": "increase timeout or add retry logic for this test; do not modify handler code",
        "failed_stage": "test",
        "test_name": test_name,
        "timeout_ms": timeout_ms,
    }
    return log.strip(), meta
219
+ ```
220
+
221
+ - [ ] **Step 5: Add the public `generate_log()` entry point**
222
+
223
+ ```python
224
# registry: task id -> generator function
_GENERATORS = {1: _task1_log, 2: _task2_log, 3: _task3_log}


def generate_log(task_id: int | None = None) -> tuple[str, dict]:
    """Produce one synthetic pipeline failure as (log_text, metadata).

    The metadata dict always contains task_id, failure_category,
    root_cause, expected_fix and failed_stage. A task_id of None selects
    one of the three tiers uniformly at random.
    """
    if task_id is None:
        task_id = random.choice([1, 2, 3])
    generator = _GENERATORS.get(task_id)
    if generator is None:
        raise ValueError(f"task_id must be 1, 2, or 3 — got {task_id}")
    return generator()
237
+ ```
238
+
239
+ - [ ] **Step 6: Quick smoke test in terminal**
240
+
241
+ ```bash
242
+ cd "c:/Users/jayan/Desktop/New folder (2)"
243
+ python -c "
244
+ import sys; sys.path.insert(0, '.')
245
+ from cicd_diagnosis_env.server.log_generator import generate_log
246
+ for tid in [1,2,3]:
247
+ log, meta = generate_log(tid)
248
+ print(f'--- Task {tid} ---')
249
+ print(log[:300])
250
+ print(meta)
251
+ print()
252
+ "
253
+ ```
254
+ Expected: 3 different log blocks printed, each with a meta dict containing `failure_category`.
255
+
256
+ ---
257
+
258
+ ## Task 2: models.py
259
+
260
+ **Files:**
261
+ - Create: `cicd_diagnosis_env/models.py`
262
+ - Create: `cicd_diagnosis_env/__init__.py`
263
+
264
+ - [ ] **Step 1: Write models.py**
265
+
266
+ ```python
267
+ # cicd_diagnosis_env/models.py
268
+ from __future__ import annotations
269
+
270
+ from typing import Any, Dict, List, Optional
271
+
272
+ try:
273
+ from openenv.core.env_server.interfaces import Action, Observation, State
274
+ except ImportError:
275
+ from dataclasses import dataclass, field
276
+
277
+ @dataclass(kw_only=True)
278
+ class Action:
279
+ pass
280
+
281
+ @dataclass(kw_only=True)
282
+ class Observation:
283
+ done: bool = False
284
+ reward: Optional[float] = None
285
+ metadata: Dict[str, Any] = field(default_factory=dict)
286
+
287
+ @dataclass(kw_only=True)
288
+ class State:
289
+ episode_id: str = ""
290
+ step_count: int = 0
291
+
292
+
293
+ class DiagnoseAction(Action):
294
+ """Agent's diagnosis for a CI/CD pipeline failure."""
295
+ failure_category: str # "dependency" | "config" | "flaky" | "code_bug" | "infra"
296
+ root_cause: str # free-text explanation
297
+ suggested_fix: str # what the agent recommends
298
+ confidence: float = 0.8 # 0.0–1.0, used for calibration score
299
+
300
+
301
+ class PipelineObservation(Observation):
302
+ pipeline_log: str = ""
303
+ error_summary: str = "" # one-line extracted error
304
+ pipeline_stage: str = "" # which stage failed
305
+ task_id: int = 0 # 1=easy, 2=medium, 3=hard
306
+ attempt: int = 0
307
+ feedback: str = ""
308
+ score: float = 0.0
309
+
310
+
311
+ class PipelineState(State):
312
+ last_score: float = 0.0
313
+ task_id: int = 0
314
+ pipeline_name: str = ""
315
+ ```
316
+
317
+ - [ ] **Step 2: Write `__init__.py`**
318
+
319
+ ```python
320
+ # cicd_diagnosis_env/__init__.py
321
+ from .models import DiagnoseAction, PipelineObservation, PipelineState
322
+ from .client import CICDEnv
323
+
324
+ __all__ = ["DiagnoseAction", "PipelineObservation", "PipelineState", "CICDEnv"]
325
+ ```
326
+
327
+ - [ ] **Step 3: Create server `__init__.py`**
328
+
329
+ ```python
330
+ # cicd_diagnosis_env/server/__init__.py
331
+ ```
332
+ (empty, just makes it a package)
333
+
334
+ ---
335
+
336
+ ## Task 3: graders.py
337
+
338
+ **Files:**
339
+ - Create: `cicd_diagnosis_env/server/graders.py`
340
+
341
+ - [ ] **Step 1: Write graders.py with all three grader functions**
342
+
343
+ ```python
344
+ # cicd_diagnosis_env/server/graders.py
345
+ """
346
+ Grading logic for all three CI/CD diagnosis tasks.
347
+ Each grader returns (score: float, feedback: str).
348
+ Score is in [0.0, 1.0].
349
+ """
350
+
351
+ from __future__ import annotations
352
+
353
+
354
+ def _category_score(predicted: str, expected: str) -> float:
355
+ return 0.20 if predicted.lower().strip() == expected.lower().strip() else 0.0
356
+
357
+
358
+ def _cause_score(predicted: str, expected_keywords: list[str]) -> float:
359
+ pred = predicted.lower()
360
+ # at least one keyword must appear
361
+ hit = any(kw.lower() in pred for kw in expected_keywords)
362
+ return 0.30 if hit else 0.0
363
+
364
+
365
+ def _fix_score(predicted: str, expected_keywords: list[str]) -> float:
366
+ pred = predicted.lower()
367
+ hit = any(kw.lower() in pred for kw in expected_keywords)
368
+ return 0.30 if hit else 0.0
369
+
370
+
371
+ def _confidence_score(predicted: float) -> float:
372
+ # calibration: reward confident-and-correct answers
373
+ # called only when base score > 0; penalise extreme miscalibration
374
+ clamped = max(0.0, min(1.0, float(predicted)))
375
+ # simple linear: 0.8 confidence = full 0.20, lower = proportional
376
+ return round(clamped * 0.20, 4)
377
+
378
+
379
+ def _relevance_penalty(mentioned_sections: list[str], irrelevant: list[str]) -> float:
380
+ hits = sum(1 for s in mentioned_sections if any(irr in s.lower() for irr in irrelevant))
381
+ return -0.10 * hits
382
+
383
+
384
+ def grade_task1(action, meta: dict) -> tuple[float, str]:
385
+ """Task 1: single ModuleNotFoundError — dependency failure."""
386
+ pkg = meta["pkg"]
387
+ score = 0.0
388
+ parts = []
389
+
390
+ cat = _category_score(action.failure_category, "dependency")
391
+ score += cat
392
+ parts.append(f"category={'OK' if cat else 'MISS'}")
393
+
394
+ cause = _cause_score(action.root_cause, [pkg, "ModuleNotFoundError", "missing"])
395
+ score += cause
396
+ parts.append(f"root_cause={'OK' if cause else 'MISS'}")
397
+
398
+ fix = _fix_score(action.suggested_fix, ["requirements", "pyproject", "install", pkg])
399
+ score += fix
400
+ parts.append(f"fix={'OK' if fix else 'MISS'}")
401
+
402
+ if score > 0:
403
+ conf = _confidence_score(action.confidence)
404
+ score += conf
405
+ parts.append(f"confidence={conf:.2f}")
406
+
407
+ # penalise if agent mentions unrelated stages (checkout/lint passed fine)
408
+ mentioned = action.root_cause.lower() + action.suggested_fix.lower()
409
+ irrelevant_stages = ["checkout", "lint", "setup-python"]
410
+ for stage in irrelevant_stages:
411
+ if stage in mentioned:
412
+ score -= 0.10
413
+ parts.append(f"penalty(-0.10 for {stage})")
414
+
415
+ score = max(0.0, min(1.0, round(score, 4)))
416
+ feedback = f"Task1 grader: {', '.join(parts)}. Final={score}"
417
+ return score, feedback
418
+
419
+
420
+ def grade_task2(action, meta: dict) -> tuple[float, str]:
421
+ """Task 2: config env var causes 3 cascading test failures."""
422
+ var = meta["var_name"]
423
+ score = 0.0
424
+ parts = []
425
+
426
+ cat = _category_score(action.failure_category, "config")
427
+ score += cat
428
+ parts.append(f"category={'OK' if cat else 'MISS'}")
429
+
430
+ # root cause must mention the var name or "environment variable" / "config"
431
+ cause = _cause_score(action.root_cause, [var, "environment variable", "config", "missing var"])
432
+ score += cause
433
+ parts.append(f"root_cause={'OK' if cause else 'MISS'}")
434
+
435
+ fix = _fix_score(action.suggested_fix, [var, "secret", ".env", "CI", "environment"])
436
+ score += fix
437
+ parts.append(f"fix={'OK' if fix else 'MISS'}")
438
+
439
+ if score > 0:
440
+ conf = _confidence_score(action.confidence)
441
+ score += conf
442
+ parts.append(f"confidence={conf:.2f}")
443
+
444
+ # penalise if agent blames the test code (that's a symptom, not root cause)
445
+ if "test" in action.root_cause.lower() and var.lower() not in action.root_cause.lower():
446
+ score -= 0.10
447
+ parts.append("penalty(-0.10 for blaming tests instead of config)")
448
+
449
+ score = max(0.0, min(1.0, round(score, 4)))
450
+ feedback = f"Task2 grader: {', '.join(parts)}. Final={score}"
451
+ return score, feedback
452
+
453
+
454
+ def grade_task3(action, meta: dict) -> tuple[float, str]:
455
+ """Task 3: async timing flaky test — must classify as flaky not code_bug."""
456
+ test_name = meta["test_name"]
457
+ score = 0.0
458
+ parts = []
459
+
460
+ cat = _category_score(action.failure_category, "flaky")
461
+ score += cat
462
+ parts.append(f"category={'OK' if cat else 'MISS'}")
463
+
464
+ # root cause: must mention timing/timeout/async/intermittent — NOT logic bug
465
+ cause = _cause_score(
466
+ action.root_cause,
467
+ ["timeout", "timing", "async", "intermittent", "flaky", "race"],
468
+ )
469
+ score += cause
470
+ parts.append(f"root_cause={'OK' if cause else 'MISS'}")
471
+
472
+ fix = _fix_score(
473
+ action.suggested_fix,
474
+ ["timeout", "retry", "increase", "skip", "xfail", "flaky marker"],
475
+ )
476
+ score += fix
477
+ parts.append(f"fix={'OK' if fix else 'MISS'}")
478
+
479
+ if score > 0:
480
+ conf = _confidence_score(action.confidence)
481
+ score += conf
482
+ parts.append(f"confidence={conf:.2f}")
483
+
484
+ # heavy penalise if agent says it's a code bug
485
+ if "code_bug" in action.failure_category.lower() or (
486
+ "logic" in action.root_cause.lower() and "timeout" not in action.root_cause.lower()
487
+ ):
488
+ score -= 0.20
489
+ parts.append("penalty(-0.20 for misclassifying as code_bug)")
490
+
491
+ score = max(0.0, min(1.0, round(score, 4)))
492
+ feedback = f"Task3 grader: {', '.join(parts)}. Final={score}"
493
+ return score, feedback
494
+
495
+
496
+ GRADERS = {1: grade_task1, 2: grade_task2, 3: grade_task3}
497
+
498
+
499
+ def grade(action, meta: dict) -> tuple[float, str]:
500
+ """Dispatch to the right grader based on meta['task_id']."""
501
+ tid = meta.get("task_id")
502
+ if tid not in GRADERS:
503
+ return 0.0, f"Unknown task_id {tid}"
504
+ return GRADERS[tid](action, meta)
505
+ ```
506
+
507
+ ---
508
+
509
+ ## Task 4: environment.py
510
+
511
+ **Files:**
512
+ - Create: `cicd_diagnosis_env/server/environment.py`
513
+
514
+ - [ ] **Step 1: Write environment.py**
515
+
516
+ ```python
517
# cicd_diagnosis_env/server/environment.py
"""CICDEnvironment: episode lifecycle around the log generator and graders."""

import uuid

try:
    from openenv.core.env_server.interfaces import Action, Environment, Observation
except ImportError:
    from cicd_diagnosis_env.models import Action, Observation

    class Environment:
        pass

from cicd_diagnosis_env.models import DiagnoseAction, PipelineObservation, PipelineState
from cicd_diagnosis_env.server.log_generator import generate_log
from cicd_diagnosis_env.server.graders import grade

MAX_STEPS = 3  # agent gets at most 3 attempts per episode


class CICDEnvironment(Environment):
    """
    RL environment for CI/CD failure diagnosis.

    Each episode: a random pipeline failure log is shown and the agent
    diagnoses up to MAX_STEPS times. The episode ends when a diagnosis
    scores 1.0 or the attempt budget is exhausted.
    """

    def __init__(self):
        self._state = PipelineState()
        self._meta = {}   # ground-truth failure metadata for the current episode
        self._log = ""    # full pipeline log for the current episode

    def reset(self) -> Observation:
        """Start a new episode with a fresh pipeline failure."""
        self._log, self._meta = generate_log()
        self._state = PipelineState(
            episode_id=str(uuid.uuid4()),
            step_count=0,
            task_id=self._meta["task_id"],
            pipeline_name=self._meta.get("failed_stage", "unknown"),
        )

        return PipelineObservation(
            pipeline_log=self._log,
            error_summary=_extract_summary(self._log),
            pipeline_stage=self._meta["failed_stage"],
            task_id=self._meta["task_id"],
            attempt=0,
            feedback="",
            score=0.0,
            done=False,
            reward=0.0,
        )

    def step(self, action: Action) -> Observation:
        """Grade one diagnosis attempt and return the resulting observation.

        Raises:
            ValueError: if action is not a DiagnoseAction.
            RuntimeError: if called before reset() (no active episode).
        """
        if not isinstance(action, DiagnoseAction):
            raise ValueError(f"Expected DiagnoseAction, got {type(action)}")
        # FIX: without this guard a pre-reset step() would KeyError deep
        # inside the grader when it reads the empty metadata dict.
        if not self._meta:
            raise RuntimeError("step() called before reset() — no active episode")

        self._state.step_count += 1
        step_num = self._state.step_count

        score, feedback = grade(action, self._meta)
        self._state.last_score = score

        # episode ends on a perfect diagnosis or when attempts run out
        done = score >= 1.0 or step_num >= MAX_STEPS

        return PipelineObservation(
            pipeline_log=self._log,
            error_summary=_extract_summary(self._log),
            pipeline_stage=self._meta["failed_stage"],
            task_id=self._meta["task_id"],
            attempt=step_num,
            feedback=feedback,
            score=score,
            done=done,
            reward=score,
        )

    @property
    def state(self) -> PipelineState:
        """Current episode state (read-only view)."""
        return self._state


def _extract_summary(log: str) -> str:
    """Pull the first ERROR line (skipping traceback headers) as a one-liner."""
    for line in log.splitlines():
        if "[ERROR]" in line and "Traceback" not in line:
            return line.split("[ERROR]")[-1].strip()
    return "Unknown error"
608
+ ```
609
+
610
+ ---
611
+
612
+ ## Task 5: app.py
613
+
614
+ **Files:**
615
+ - Create: `cicd_diagnosis_env/server/app.py`
616
+
617
+ - [ ] **Step 1: Write app.py with manual FastAPI routes (no create_app helper)**
618
+
619
+ ```python
620
# cicd_diagnosis_env/server/app.py
"""
FastAPI server for the CI/CD Failure Diagnosis environment.

Usage:
    uvicorn cicd_diagnosis_env.server.app:app --reload --host 0.0.0.0 --port 8000

NOTE: keep --workers 1. The environment state lives in process memory
(_env below), so multiple workers would each hold an independent,
inconsistent episode.
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Any, Dict

from cicd_diagnosis_env.models import DiagnoseAction, PipelineObservation
from cicd_diagnosis_env.server.environment import CICDEnvironment

app = FastAPI(title="CI/CD Diagnosis Environment", version="0.1.0")

# one shared env instance — fine for single-user / hackathon scale
# TODO: add per-session isolation if needed for multi-agent training
_env = CICDEnvironment()


class StepRequest(BaseModel):
    """Wire format for POST /step: diagnosis fields nested under "action"."""
    action: Dict[str, Any]


@app.post("/reset")
def reset():
    """Start a new episode; returns the initial observation."""
    obs = _env.reset()
    return _obs_to_dict(obs)


@app.post("/step")
def step(req: StepRequest):
    """Grade one diagnosis attempt against the current episode."""
    # FIX: reject a step before any episode exists instead of letting the
    # environment fail internally (which surfaced as an opaque 500)
    if not _env.state.episode_id:
        raise HTTPException(status_code=409, detail="No active episode — call /reset first")
    try:
        action = DiagnoseAction(**req.action)
    except Exception as e:
        # malformed or missing fields in the action payload
        raise HTTPException(status_code=422, detail=str(e)) from e

    obs = _env.step(action)
    return _obs_to_dict(obs)


@app.get("/state")
def get_state():
    """Expose the server-side episode state for clients/debugging."""
    s = _env.state
    return {
        "episode_id": s.episode_id,
        "step_count": s.step_count,
        "last_score": s.last_score,
        "task_id": s.task_id,
        "pipeline_name": s.pipeline_name,
    }


@app.get("/health")
def health():
    """Liveness probe used by the Docker HEALTHCHECK."""
    return {"status": "healthy"}


def _obs_to_dict(obs: PipelineObservation) -> dict:
    """Serialize an observation into the wire format shared with clients."""
    return {
        "observation": {
            "pipeline_log": obs.pipeline_log,
            "error_summary": obs.error_summary,
            "pipeline_stage": obs.pipeline_stage,
            "task_id": obs.task_id,
            "attempt": obs.attempt,
            "feedback": obs.feedback,
            "score": obs.score,
        },
        "reward": obs.reward,
        "done": obs.done,
        "info": {},
    }


def main():
    """Run the server directly (python -m ...); mirrors the Docker CMD."""
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    main()
705
+ ```
706
+
707
+ - [ ] **Step 2: Smoke-test the server locally**
708
+
709
+ ```bash
710
+ cd "c:/Users/jayan/Desktop/New folder (2)"
711
+ # Start server in background
712
+ python -m uvicorn cicd_diagnosis_env.server.app:app --port 8000 &
713
+ sleep 2
714
+
715
+ # Test health
716
+ curl -s http://localhost:8000/health
717
+
718
+ # Test reset
719
+ curl -s -X POST http://localhost:8000/reset | python -m json.tool | head -20
720
+
721
+ # Test step
722
+ curl -s -X POST http://localhost:8000/step \
723
+ -H "Content-Type: application/json" \
724
+ -d '{"action": {"failure_category": "dependency", "root_cause": "missing package", "suggested_fix": "add to requirements.txt", "confidence": 0.9}}' \
725
+ | python -m json.tool
726
+ ```
727
+ Expected: `/health` → `{"status":"healthy"}`, `/reset` → JSON with `pipeline_log`, `/step` → JSON with `score` and `feedback`.
728
+
729
+ ---
730
+
731
+ ## Task 6: client.py
732
+
733
+ **Files:**
734
+ - Create: `cicd_diagnosis_env/client.py`
735
+
736
+ - [ ] **Step 1: Write client.py**
737
+
738
+ ```python
739
# cicd_diagnosis_env/client.py
"""
CICDEnv
-------
HTTP client for the CI/CD Failure Diagnosis environment.

Instantiate with base_url pointing to a running server:
    env = CICDEnv(base_url="http://localhost:8000")
    obs = env.reset()
    result = env.step(DiagnoseAction(...))
"""

from __future__ import annotations

try:
    from openenv.core.client_types import StepResult
    from openenv.core.env_client import EnvClient
    _has_openenv = True
except ImportError:
    _has_openenv = False

from cicd_diagnosis_env.models import DiagnoseAction, PipelineObservation, PipelineState

if _has_openenv:
    class CICDEnv(EnvClient[DiagnoseAction, PipelineObservation, PipelineState]):
        # --- EnvClient hooks ---

        def _step_payload(self, action: DiagnoseAction) -> dict:
            """Wire format expected by /step under the "action" key."""
            return {
                "failure_category": action.failure_category,
                "root_cause": action.root_cause,
                "suggested_fix": action.suggested_fix,
                "confidence": action.confidence,
            }

        def _parse_result(self, payload: dict) -> StepResult[PipelineObservation]:
            """Expecting: { "observation": {...}, "reward": float|null, "done": bool }."""
            obs = PipelineObservation(**payload["observation"])
            return StepResult(
                observation=obs,
                reward=payload.get("reward"),
                done=bool(payload.get("done", False)),
            )

        def _parse_state(self, payload: dict) -> PipelineState:
            """Map the /state JSON onto PipelineState, tolerating missing keys."""
            return PipelineState(
                episode_id=payload.get("episode_id", ""),
                step_count=payload.get("step_count", 0),
                last_score=payload.get("last_score", 0.0),
                task_id=payload.get("task_id", 0),
                pipeline_name=payload.get("pipeline_name", ""),
            )
else:
    # fallback HTTP client when openenv SDK is not installed
    import requests

    _TIMEOUT = 30  # seconds; avoid hanging forever on a dead server

    class CICDEnv:  # type: ignore[no-redef]
        def __init__(self, base_url: str = "http://localhost:8000"):
            self.base_url = base_url.rstrip("/")

        def reset(self) -> PipelineObservation:
            """POST /reset and return the initial observation."""
            r = requests.post(f"{self.base_url}/reset", timeout=_TIMEOUT)
            r.raise_for_status()
            return PipelineObservation(**r.json()["observation"])

        def step(self, action: DiagnoseAction) -> PipelineObservation:
            """POST /step and return the graded observation.

            FIX: "done" and "reward" live at the TOP level of the response,
            not inside "observation" — the old code dropped them, so
            obs.done was always False and callers could never detect that
            the episode had ended.
            """
            payload = {
                "action": {
                    "failure_category": action.failure_category,
                    "root_cause": action.root_cause,
                    "suggested_fix": action.suggested_fix,
                    "confidence": action.confidence,
                }
            }
            r = requests.post(f"{self.base_url}/step", json=payload, timeout=_TIMEOUT)
            r.raise_for_status()
            data = r.json()
            return PipelineObservation(
                **data["observation"],
                done=bool(data.get("done", False)),
                reward=data.get("reward"),
            )

        def state(self) -> PipelineState:
            """GET /state — parity with the EnvClient code path."""
            r = requests.get(f"{self.base_url}/state", timeout=_TIMEOUT)
            r.raise_for_status()
            return PipelineState(**r.json())
819
+ ```
820
+
821
+ ---
822
+
823
+ ## Task 7: inference.py (ROOT level)
824
+
825
+ **Files:**
826
+ - Create: `inference.py` (at repo root, NOT inside cicd_diagnosis_env/)
827
+
828
+ - [ ] **Step 1: Write inference.py**
829
+
830
+ ```python
831
# inference.py
"""
LLM agent for CI/CD Failure Diagnosis environment.

Required env vars:
    API_BASE_URL — OpenAI-compatible API base (e.g. https://api.openai.com/v1)
    MODEL_NAME — model to use (e.g. gpt-4o-mini)
    HF_TOKEN — HuggingFace token (used if env runs on HF Spaces)
    ENV_URL — base URL of the running cicd_diagnosis_env server
              (default: http://localhost:8000)

Structured log format: [START], [STEP n], [END]
Runtime: well under 20 minutes for 10 episodes
"""

import json
import os
import sys
import time

from openai import OpenAI

from cicd_diagnosis_env.client import CICDEnv
from cicd_diagnosis_env.models import DiagnoseAction

API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
ENV_URL = os.environ.get("ENV_URL", "http://localhost:8000")
NUM_EPISODES = int(os.environ.get("NUM_EPISODES", "10"))

client = OpenAI(api_key=HF_TOKEN or os.environ.get("OPENAI_API_KEY", ""), base_url=API_BASE_URL)

SYSTEM_PROMPT = """You are an expert CI/CD engineer. You will be shown a pipeline failure log.
Diagnose the failure by providing:
1. failure_category: one of [dependency, config, flaky, code_bug, infra]
2. root_cause: concise explanation (1-2 sentences)
3. suggested_fix: concrete action to resolve it
4. confidence: float 0.0-1.0

Respond ONLY with valid JSON matching this schema:
{"failure_category": "...", "root_cause": "...", "suggested_fix": "...", "confidence": 0.9}"""


def _strip_fences(raw: str) -> str:
    """Strip an enclosing markdown code fence (``` or ```json) if present.

    FIX over the old inline version: whitespace around the fragments is
    stripped and the "json" language tag is matched case-insensitively,
    so " ```JSON\\n{...}``` " parses instead of crashing json.loads.
    """
    raw = raw.strip()
    if raw.startswith("```"):
        raw = raw.split("```")[1].strip()
        if raw.lower().startswith("json"):
            raw = raw[4:]
    return raw.strip()


def diagnose(log: str, error_summary: str) -> DiagnoseAction:
    """Ask the LLM for a structured diagnosis of one pipeline log.

    Raises json.JSONDecodeError / KeyError on an unparseable completion;
    main() treats that as a zero-score episode.
    """
    user_msg = f"Error summary: {error_summary}\n\nFull log:\n{log}"
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
        temperature=0.2,
        max_tokens=300,
    )
    # FIX: content can be None (e.g. refusal/filtered) — guard before strip
    raw = _strip_fences(resp.choices[0].message.content or "")
    parsed = json.loads(raw)
    return DiagnoseAction(
        failure_category=parsed["failure_category"],
        root_cause=parsed["root_cause"],
        suggested_fix=parsed["suggested_fix"],
        confidence=float(parsed.get("confidence", 0.8)),
    )


def run_episode(env: CICDEnv, ep: int) -> float:
    """Play one episode; returns the score of the final attempt."""
    obs = env.reset()
    print(f"[STEP {ep}.0] reset task_id={obs.task_id} stage={obs.pipeline_stage}")

    last_score = 0.0  # score of the most recent attempt (episode result)
    for step_num in range(1, 4):  # max 3 attempts, matching server MAX_STEPS
        action = diagnose(obs.pipeline_log, obs.error_summary)
        obs = env.step(action)
        last_score = obs.score
        print(
            f"[STEP {ep}.{step_num}] "
            f"cat={action.failure_category} score={obs.score:.3f} "
            f"feedback={obs.feedback[:80]}"
        )
        if obs.done:
            break

    return last_score


def main():
    """Run NUM_EPISODES episodes and report the average score."""
    print(f"[START] cicd_diagnosis_env inference | model={MODEL_NAME} episodes={NUM_EPISODES}")
    env = CICDEnv(base_url=ENV_URL)

    scores = []
    start = time.time()

    for ep in range(1, NUM_EPISODES + 1):
        try:
            score = run_episode(env, ep)
            scores.append(score)
        except Exception as e:
            # one bad episode (API hiccup, unparseable JSON) scores 0 but
            # must not kill the whole run
            print(f"[STEP {ep}] ERROR: {e}", file=sys.stderr)
            scores.append(0.0)

    elapsed = time.time() - start
    avg = sum(scores) / len(scores) if scores else 0.0
    print(f"[END] episodes={NUM_EPISODES} avg_score={avg:.4f} elapsed={elapsed:.1f}s")


if __name__ == "__main__":
    main()
943
+ ```
944
+
945
+ ---
946
+
947
+ ## Task 8: Dockerfile
948
+
949
+ **Files:**
950
+ - Create: `cicd_diagnosis_env/Dockerfile`
951
+ - Create: `cicd_diagnosis_env/requirements.txt`
952
+
953
+ - [ ] **Step 1: Write requirements.txt**
954
+
955
+ ```
956
+ fastapi>=0.110.0
957
+ uvicorn[standard]>=0.29.0
958
+ pydantic>=2.0.0
959
+ openai>=1.0.0
960
+ requests>=2.31.0
961
+ ```
962
+
963
+ - [ ] **Step 2: Write Dockerfile**
964
+
965
+ ```dockerfile
966
# cicd_diagnosis_env/Dockerfile
# NOTE: built with the REPO ROOT as context (see the build command:
#   docker build -f cicd_diagnosis_env/Dockerfile .
# ) so every COPY path below is relative to the root, not this directory.
FROM python:3.11-slim

WORKDIR /app

# install deps first (better layer caching)
# FIX: copy the package's own requirements file; a bare "requirements.txt"
# resolved against the root context and picked up the wrong file.
COPY cicd_diagnosis_env/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# copy only the package — "COPY . ..." pulled the entire build context
# (including unrelated root files) into the image and made the extra
# server/ COPY redundant.
COPY cicd_diagnosis_env/ /app/cicd_diagnosis_env/

ENV PYTHONPATH=/app
EXPOSE 8000

HEALTHCHECK --interval=10s --timeout=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"

# keep --workers 1: the env holds episode state in process memory
CMD ["uvicorn", "cicd_diagnosis_env.server.app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
986
+ ```
987
+
988
+ - [ ] **Step 3: Test Docker build**
989
+
990
+ ```bash
991
+ cd "c:/Users/jayan/Desktop/New folder (2)"
992
+ docker build -t cicd-diagnosis-env:latest -f cicd_diagnosis_env/Dockerfile .
993
+ ```
994
+ Expected: build completes without errors.
995
+
996
+ - [ ] **Step 4: Test Docker run**
997
+
998
+ ```bash
999
+ docker run -d -p 8000:8000 --name cicd-test cicd-diagnosis-env:latest
1000
+ sleep 3
1001
+ curl -s http://localhost:8000/health
1002
+ curl -s -X POST http://localhost:8000/reset | python -m json.tool | head -10
1003
+ docker stop cicd-test && docker rm cicd-test
1004
+ ```
1005
+ Expected: health check passes, reset returns a log.
1006
+
1007
+ ---
1008
+
1009
+ ## Task 9: openenv.yaml + README
1010
+
1011
+ **Files:**
1012
+ - Create: `cicd_diagnosis_env/openenv.yaml`
1013
+ - Create: `cicd_diagnosis_env/README.md`
1014
+
1015
+ - [ ] **Step 1: Write openenv.yaml**
1016
+
1017
+ ```yaml
1018
+ spec_version: 1
1019
+ name: cicd_diagnosis_env
1020
+ version: "0.1.0"
1021
+ description: "RL environment for diagnosing CI/CD pipeline failures"
1022
+ type: space
1023
+ runtime: fastapi
1024
+ app: cicd_diagnosis_env.server.app:app
1025
+ port: 8000
1026
+ action: DiagnoseAction
1027
+ observation: PipelineObservation
1028
+ ```
1029
+
1030
+ - [ ] **Step 2: Write README.md**
1031
+
1032
+ ```markdown
1033
+ # CI/CD Failure Diagnosis Environment
1034
+
1035
+ An OpenEnv RL environment where an agent diagnoses broken CI/CD pipelines.
1036
+
1037
+ ## What the Agent Does
1038
+
1039
+ Given a synthetic pipeline log, the agent must:
1040
+ 1. Classify the failure category (`dependency`, `config`, `flaky`, `code_bug`, `infra`)
1041
+ 2. Identify the root cause
1042
+ 3. Suggest a concrete fix
1043
+
1044
+ ## Three Task Tiers
1045
+
1046
+ | Task | Difficulty | Failure Type |
1047
+ |------|-----------|-------------|
1048
+ | 1 | Easy | Single `ModuleNotFoundError` — one missing package |
1049
+ | 2 | Medium | Misconfigured env var → 3 cascading test failures |
1050
+ | 3 | Hard | Async timing flaky test (looks like code bug) |
1051
+
1052
+ ## Reward Function
1053
+
1054
+ | Component | Points |
1055
+ |-----------|--------|
1056
+ | Correct failure category | +0.20 |
1057
+ | Correct root cause | +0.30 |
1058
+ | Valid fix suggested | +0.30 |
1059
+ | Confidence calibration | +0.20 |
1060
+ | Per irrelevant section mentioned | -0.10 |
1061
+
1062
+ ## Quick Start
1063
+
1064
+ ```bash
1065
+ # Run server
1066
+ docker build -t cicd-env -f cicd_diagnosis_env/Dockerfile .
1067
+ docker run -p 8000:8000 cicd-env
1068
+
1069
+ # Run LLM agent
1070
+ export API_BASE_URL=https://api.openai.com/v1
1071
+ export MODEL_NAME=gpt-4o-mini
1072
+ export OPENAI_API_KEY=sk-...
1073
+ export ENV_URL=http://localhost:8000
1074
+ python inference.py
1075
+ ```
1076
+
1077
+ ## API
1078
+
1079
+ | Endpoint | Method | Description |
1080
+ |----------|--------|-------------|
1081
+ | `/reset` | POST | Start new episode, returns initial observation |
1082
+ | `/step` | POST | Submit diagnosis action, returns scored observation |
1083
+ | `/state` | GET | Current episode metadata |
1084
+ | `/health` | GET | Health check |
1085
+ ```
1086
+ ```
1087
+
1088
+ ---
1089
+
1090
+ ## Task 10: Final Validation
1091
+
1092
+ - [ ] **Step 1: Verify all files exist**
1093
+
1094
+ ```bash
1095
+ find "c:/Users/jayan/Desktop/New folder (2)/cicd_diagnosis_env" -type f | sort
1096
+ ls "c:/Users/jayan/Desktop/New folder (2)/inference.py"
1097
+ ```
1098
+
1099
+ Expected files:
1100
+ ```
1101
+ cicd_diagnosis_env/__init__.py
1102
+ cicd_diagnosis_env/models.py
1103
+ cicd_diagnosis_env/client.py
1104
+ cicd_diagnosis_env/openenv.yaml
1105
+ cicd_diagnosis_env/Dockerfile
1106
+ cicd_diagnosis_env/README.md
1107
+ cicd_diagnosis_env/requirements.txt
1108
+ cicd_diagnosis_env/server/__init__.py
1109
+ cicd_diagnosis_env/server/app.py
1110
+ cicd_diagnosis_env/server/environment.py
1111
+ cicd_diagnosis_env/server/log_generator.py
1112
+ cicd_diagnosis_env/server/graders.py
1113
+ inference.py
1114
+ ```
1115
+
1116
+ - [ ] **Step 2: Full integration test**
1117
+
1118
+ ```bash
1119
+ cd "c:/Users/jayan/Desktop/New folder (2)"
1120
+ # start server
1121
+ python -m uvicorn cicd_diagnosis_env.server.app:app --port 8000 &
1122
+ sleep 2
1123
+
1124
+ # hit all endpoints
1125
+ curl -s http://localhost:8000/health
1126
+ curl -s -X POST http://localhost:8000/reset > /tmp/reset_out.json
1127
+ cat /tmp/reset_out.json | python -m json.tool
1128
+
1129
+ curl -s -X POST http://localhost:8000/step \
1130
+ -H "Content-Type: application/json" \
1131
+ -d '{"action":{"failure_category":"dependency","root_cause":"missing pytest-cov package","suggested_fix":"add pytest-cov to requirements.txt","confidence":0.9}}' \
1132
+ | python -m json.tool
1133
+
1134
+ curl -s http://localhost:8000/state | python -m json.tool
1135
+ ```
1136
+
1137
+ - [ ] **Step 3: Run openenv validate (if SDK installed)**
1138
+
1139
+ ```bash
1140
+ cd "c:/Users/jayan/Desktop/New folder (2)/cicd_diagnosis_env"
1141
+ openenv validate || echo "SDK not installed — skip"
1142
+ ```
1143
+
1144
+ - [ ] **Step 4: Kill background server**
1145
+
1146
+ ```bash
1147
+ pkill -f "uvicorn cicd_diagnosis_env" || true
1148
+ ```
docs/superpowers/specs/2026-04-05-cicd-diagnosis-env-design.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CI/CD Failure Diagnosis Environment — Design Spec
2
+
3
+ **Team:** Technical Irony
4
+ **Hackathon:** Meta PyTorch x Scaler OpenEnv
5
+ **Deadline:** April 8, 2026 11:59 PM
6
+
7
+ ---
8
+
9
+ ## What We're Building
10
+
11
+ An OpenEnv-compliant RL environment where an agent reads synthetic CI/CD
12
+ pipeline failure logs and must diagnose: failure category, root cause, and
13
+ suggested fix. Three difficulty tiers. Dense reward function. 100%
14
+ deterministic grading via injected failure metadata.
15
+
16
+ ---
17
+
18
+ ## Hybrid Log Generation (Approved Approach)
19
+
20
+ - 3 "seed" failure types (easy / medium / hard), each with a hardcoded template
21
+ - Each `reset()` randomizes: package names, line numbers, repo names, branch names,
22
+ timestamp offsets within the template
23
+ - Grader always knows the injected failure (passed via internal state), so grading
24
+ is deterministic regardless of cosmetic variation
25
+
26
+ ---
27
+
28
+ ## Three Tasks
29
+
30
+ | Task | Difficulty | Failure Type | Key Signal | Max Reward |
31
+ |------|-----------|--------------|------------|-----------|
32
+ | 1 | Easy | Single `ModuleNotFoundError` | One package, one stage | 1.0 |
33
+ | 2 | Medium | Config error → 3 cascading test failures | Root cause is config, not tests | 1.0 |
34
+ | 3 | Hard | Async timing flaky failure | Must classify "flaky" not "code_bug" | 1.0 |
35
+
36
+ ---
37
+
38
+ ## Reward Function
39
+
40
+ | Component | Points |
41
+ |-----------|--------|
42
+ | Correct failure category | +0.20 |
43
+ | Correct root cause identified | +0.30 |
44
+ | Valid fix suggested | +0.30 |
45
+ | Confidence calibration (0.0–1.0, scored by proximity) | +0.20 |
46
+ | Per irrelevant log section mentioned (penalty) | -0.10 |
47
+
48
+ ---
49
+
50
+ ## File Structure
51
+
52
+ ```
53
+ cicd_diagnosis_env/ ← env package root
54
+ ├── __init__.py
55
+ ├── models.py ← Action, Observation, State (Pydantic dataclasses)
56
+ ├── client.py ← CICDEnv(EnvClient) subclass
57
+ ├── openenv.yaml ← manifest
58
+ ├── Dockerfile ← server container
59
+ ├── README.md
60
+ └── server/
61
+ ├── __init__.py
62
+ ├── environment.py ← CICDEnvironment(Environment) — reset/step/state
63
+ ├── app.py ← FastAPI routes (manual, not scaffolded)
64
+ ├── log_generator.py ← hybrid template + randomization
65
+ └── graders.py ← grader functions for all 3 tasks
66
+ inference.py ← ROOT level, LLM agent
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Judging Alignment
72
+
73
+ - **Real-world utility (30%):** CI/CD diagnosis is a real pain point; dense reward encourages partial-credit learning
74
+ - **Task & grader quality (25%):** 3 difficulty tiers, deterministic grading, calibrated reward
75
+ - **Environment design (20%):** Clean OpenEnv spec compliance, async client, Docker-ready
76
+ - **Code quality (15%):** Human-written style, under 30 lines/function, sparse comments
77
+ - **Creativity (10%):** Hybrid log generation, confidence calibration penalty, flaky-test hard task
78
+
79
+ ---
80
+
81
+ ## Key Constraints
82
+
83
+ - Runtime under 20 minutes
84
+ - 2 vCPU / 8GB RAM
85
+ - `openenv validate` must pass
86
+ - Docker build + run must work
87
+ - HF Space deploys and responds to `reset()`
88
+ - `inference.py` uses `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` env vars
89
+ - Structured logs: `[START]`, `[STEP]`, `[END]` format
inference.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference.py — run this from the project root
2
+ """
3
+ LLM agent for the CI/CD Failure Diagnosis environment.
4
+
5
+ Env vars:
6
+ API_BASE_URL OpenAI-compatible base URL (e.g. https://api.openai.com/v1)
7
+ MODEL_NAME model to call (e.g. gpt-4o-mini)
8
+ HF_TOKEN HuggingFace token — used as API key when running on HF Spaces
9
+ ENV_URL running server URL (default: http://localhost:8000)
10
+ NUM_EPISODES how many episodes to run (default: 10)
11
+ """
12
+ import json
13
+ import os
14
+ import sys
15
+ import time
16
+
17
+ from openai import OpenAI
18
+
19
+ from cicd_diagnosis_env.client import CICDEnv
20
+ from cicd_diagnosis_env.models import DiagnoseAction
21
+
22
# Runtime configuration — every value is overridable via environment variables
# so the same script works locally and on a HF Space.
API_BASE_URL: str = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME: str = os.environ.get("MODEL_NAME", "gpt-4o-mini")
HF_TOKEN: str = os.environ.get("HF_TOKEN", "")
ENV_URL: str = os.environ.get("ENV_URL", "http://localhost:8000")
NUM_EPISODES: int = int(os.environ.get("NUM_EPISODES", "10"))

# HF_TOKEN doubles as API key when running on Spaces — fall back to OPENAI_API_KEY locally
_api_key = HF_TOKEN or os.environ.get("OPENAI_API_KEY", "no-key")
# OpenAI-compatible client; base_url lets this point at any compatible endpoint.
llm = OpenAI(api_key=_api_key, base_url=API_BASE_URL)

# System prompt: demands a bare-JSON reply so diagnose() can json.loads() it.
_SYSTEM = """You are an expert CI/CD engineer diagnosing pipeline failures.
You will receive a failure log. Respond ONLY with valid JSON (no markdown):
{
"failure_category": "<dependency|config|flaky|code_bug|infra>",
"root_cause": "<concise 1-2 sentence explanation>",
"suggested_fix": "<concrete action>",
"confidence": <float 0.0-1.0>
}"""
40
+
41
+
42
def diagnose(log, summary):
    """Ask the LLM to diagnose one pipeline failure log.

    Args:
        log: full pipeline failure log text.
        summary: short error summary, shown to the model ahead of the log.

    Returns:
        DiagnoseAction built from the model's JSON reply.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON after fence stripping.
        KeyError: if a required field is missing from the reply.
    """
    msg = f"Error summary: {summary}\n\nFull log:\n{log}"
    resp = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": _SYSTEM}, {"role": "user", "content": msg}],
        temperature=0.2,
        max_tokens=300,
    )
    raw = resp.choices[0].message.content.strip()
    # strip markdown fences — some models add them even when told not to
    if raw.startswith("```"):
        lines = raw.splitlines()[1:]  # drop the opening ``` / ```json line
        # drop the closing fence only when it is actually present, so a
        # truncated reply does not lose its last content line
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        raw = "\n".join(lines).strip()
    parsed = json.loads(raw)
    # clamp confidence into the [0, 1] range the reward function expects
    conf = float(parsed.get("confidence", 0.8))
    conf = min(1.0, max(0.0, conf))
    return DiagnoseAction(
        failure_category=parsed["failure_category"],
        root_cause=parsed["root_cause"],
        suggested_fix=parsed["suggested_fix"],
        confidence=conf,
    )
64
+
65
+
66
def run_episode(env, ep):
    """Play one episode: reset, then submit up to three diagnoses.

    Returns the score of the last step taken (0.0 if the LLM call fails
    before any step is made).
    """
    obs = env.reset()
    print(f"[STEP {ep}.0] reset task_id={obs.task_id} stage={obs.pipeline_stage}")

    score = 0.0
    attempt = 0
    while attempt < 3:  # cap at 3 attempts per episode
        attempt += 1
        try:
            action = diagnose(obs.pipeline_log, obs.error_summary)
        except Exception as e:
            # model call or parsing failed — abandon this episode
            print(f"[STEP {ep}.{attempt}] LLM error: {e}", file=sys.stderr)
            return score

        obs = env.step(action)
        score = obs.score
        step_line = (
            f"[STEP {ep}.{attempt}] "
            f"cat={action.failure_category} "
            f"score={obs.score:.3f} "
            f"done={obs.done}"
        )
        print(step_line)
        if obs.done:
            return score

    return score
90
+
91
+
92
def main():
    """Entry point: run NUM_EPISODES episodes and print the average score."""
    print(f"[START] model={MODEL_NAME} episodes={NUM_EPISODES} env={ENV_URL}")
    env = CICDEnv(base_url=ENV_URL)
    started = time.time()
    scores = []

    for ep in range(1, NUM_EPISODES + 1):
        try:
            scores.append(run_episode(env, ep))
        except Exception as e:
            # a broken episode counts as zero instead of aborting the run
            print(f"[STEP {ep}] episode error: {e}", file=sys.stderr)
            scores.append(0.0)

    elapsed = time.time() - started
    avg = sum(scores) / len(scores) if scores else 0.0
    print(f"[END] episodes={NUM_EPISODES} avg_score={avg:.4f} elapsed={elapsed:.1f}s")
109
+
110
+
111
# Script entry point — run the full evaluation loop.
if __name__ == "__main__":
    main()
openenv.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# OpenEnv manifest for the CI/CD failure diagnosis environment.
spec_version: 1
name: cicd_diagnosis_env
version: "0.1.0"
description: "RL environment for diagnosing CI/CD pipeline failures"
type: space        # deployed as a HuggingFace Space
runtime: fastapi
app: cicd_diagnosis_env.server.app:app  # uvicorn import path to the ASGI app
port: 8000
action: DiagnoseAction              # agent -> env payload model
observation: PipelineObservation    # env -> agent payload model
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Server (fastapi/uvicorn/pydantic) + agent (openai/requests) dependencies.
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
pydantic>=2.0.0
openai>=1.52.0
requests>=2.31.0