hirann committed on
Commit
97e0833
·
verified ·
1 Parent(s): 5867928

Upload env\core.py

Browse files
Files changed (1) hide show
  1. env//core.py +268 -0
env//core.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import random
3
+ import re
4
+ from typing import Any, Dict, Optional, Tuple
5
+ from uuid import uuid4
6
+ from dataclasses import dataclass, field
7
+
8
+ from models import (
9
+ Observation as ObsModel,
10
+ Action as ActModel,
11
+ Reward as RewModel,
12
+ Resource,
13
+ Metrics,
14
+ SLA,
15
+ )
16
+
17
+
18
# Catalogue of instance types a resource can be switched to.
# Each row is (type name, cost, capacity):
#   * "cost" is copied into a resource's ``monthly_cost`` when it is resized.
#   * "capacity" is summed over the fleet and divided into the task load to
#     obtain utilization, so it is expressed in the same units as the load.
INSTANCE_DATA = {
    type_name: {"cost": cost, "capacity": capacity}
    for type_name, cost, capacity in (
        ("t3.nano", 3.6, 1.0),
        ("t3.small", 11.5, 2.0),
        ("t3.medium", 23.0, 4.0),
        ("m5.large", 70.0, 8.0),
        ("m5.xlarge", 140.0, 16.0),
    )
}
25
+
26
+
27
@dataclass
class TaskConfig:
    """Static description of one optimization scenario.

    Instances are plain records; the environment reads them when an episode
    is reset and never mutates them.
    """

    # Identifier reported to the agent in observations and in the state dict.
    task_id: str
    # Human-readable scenario title.
    name: str
    # Difficulty label reported in observations.
    difficulty: str
    # One-sentence statement of the goal.
    description: str
    # Keyword-argument dicts, one per resource, used to build the starting fleet.
    initial_resources: list
    # Keyword arguments for the SLA model (latency limit, budget, uptime).
    sla: dict
    # Demand placed on the fleet, in the same units as instance capacity.
    load: float
36
+
37
+
38
# Built-in scenarios, keyed by the short name accepted by ``reset(task_id=...)``.
# Insertion order is easy -> medium -> hard.
TASKS = dict(
    easy=TaskConfig(
        task_id="easy_right_sizing",
        name="Right-Sizing",
        difficulty="easy",
        description="Reduce an overpriced server without breaking the SLA",
        initial_resources=[
            {
                "id": "srv-1",
                "type": "m5.xlarge",
                "cpu_usage": 2.0,
                "mem_usage": 2.0,
                "monthly_cost": 140.0,
            },
        ],
        sla={"max_latency_ms": 200.0, "max_budget": 30.0, "min_uptime_pct": 99.0},
        load=2.0,
    ),
    medium=TaskConfig(
        task_id="medium_latency_fix",
        name="Latency Fix",
        difficulty="medium",
        description="Resolve performance bottleneck while staying under budget",
        initial_resources=[
            {
                "id": "srv-1",
                "type": "t3.nano",
                "cpu_usage": 98.0,
                "mem_usage": 90.0,
                "monthly_cost": 3.6,
            },
        ],
        sla={"max_latency_ms": 100.0, "max_budget": 60.0, "min_uptime_pct": 99.9},
        load=12.0,
    ),
    hard=TaskConfig(
        task_id="hard_balance",
        name="Balance Optimization",
        difficulty="hard",
        description="Optimize a mixed cluster under tight budget constraints",
        initial_resources=[
            {
                "id": "srv-1",
                "type": "m5.large",
                "cpu_usage": 40.0,
                "mem_usage": 30.0,
                "monthly_cost": 70.0,
            },
            {
                "id": "srv-2",
                "type": "t3.nano",
                "cpu_usage": 90.0,
                "mem_usage": 80.0,
                "monthly_cost": 3.6,
            },
        ],
        sla={"max_latency_ms": 150.0, "max_budget": 35.0, "min_uptime_pct": 99.9},
        load=25.0,
    ),
)
74
+
75
+
76
@dataclass
class EpisodeState:
    """Mutable bookkeeping for the episode currently being played."""

    # Scenario this episode was created from.
    task_config: TaskConfig
    # Live fleet; entries are modified in place when the agent resizes them.
    resources: list
    # Demand currently applied to the fleet.
    current_load: float
    # Fleet cost and latency captured at reset, used as the baseline for the
    # percentage changes reported with each reward.
    initial_cost: float
    initial_latency: float
    # Number of actions taken so far.
    steps: int = 0
    # Set once utilization exceeds the crash threshold.
    crashed: bool = False
    # Random UUID string unless the caller supplies an id.
    episode_id: str = field(default_factory=lambda: str(uuid4()))
86
+
87
+
88
class CloudOpsEnvironment:
    """Cloud Infrastructure Optimization Environment.

    The agent acts as a Cloud SRE optimizing cost and performance. Each
    episode loads one scenario from ``TASKS``; the agent then sends text
    commands of the form ``change <resource_id> to <instance_type>`` and is
    rewarded according to how well the resulting fleet meets the scenario's
    budget and latency limits.
    """

    # The only command the environment understands. It is matched against the
    # lower-cased agent message, hence the lower-case character classes.
    _CHANGE_COMMAND = re.compile(r"change\s+([a-z0-9-]+)\s+to\s+([a-z0-9.]+)")

    def __init__(self, max_steps: int = 12):
        """Create an environment whose episodes end after ``max_steps`` actions."""
        self._max_steps = max_steps
        self._ep: Optional[EpisodeState] = None

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_id: Optional[str] = None,
        **kwargs: Any,
    ) -> ObsModel:
        """Start a new episode and return its first observation.

        Args:
            seed: When given, seeds the module-level ``random`` generator
                before a task is picked.
            episode_id: Identifier to use for the episode; a random UUID
                string is generated when omitted.
            task_id: Either a ``TASKS`` key (``"easy"``, ``"medium"``,
                ``"hard"``) or the ``task_id`` of one of its entries (the
                value reported in observations). A random task is chosen when
                omitted; an unknown value falls back to ``"easy"``.
            **kwargs: Accepted and ignored.
        """
        if seed is not None:
            random.seed(seed)

        task_key = task_id or random.choice(["easy", "medium", "hard"])
        if task_key not in TASKS:
            # Observations expose the long id (e.g. "medium_latency_fix"), so
            # accept it too before falling back to the default scenario.
            task_key = next(
                (key for key, cfg in TASKS.items() if cfg.task_id == task_key),
                "easy",
            )

        task = TASKS[task_key]

        # Fresh Resource objects per episode so in-place resizing never leaks
        # into the shared task definition.
        resources = [Resource(**spec) for spec in task.initial_resources]

        initial_cost = sum(r.monthly_cost for r in resources)
        initial_latency, _, _ = self._calculate_metrics(task.load, resources)

        self._ep = EpisodeState(
            task_config=task,
            resources=resources,
            current_load=task.load,
            initial_cost=initial_cost,
            initial_latency=initial_latency,
            steps=0,
            crashed=False,
            episode_id=episode_id or str(uuid4()),
        )

        return self._build_observation("Environment ready. Analyze and optimize.")

    def step(self, action: ActModel, **kwargs: Any) -> Tuple[ObsModel, RewModel, bool, Dict]:
        """Apply one agent action.

        Returns:
            ``(observation, reward, done, info)``. ``done`` is true when the
            fleet crashes (utilization above 1.1), when the reward reaches
            0.98, or when ``max_steps`` actions have been taken. If
            ``reset()`` was never called, an error observation with zero
            reward and ``done=True`` is returned.
        """
        if self._ep is None:
            # Honour the 4-tuple contract so callers that unpack the result
            # get a clean terminal transition instead of an unpacking error.
            reason = "Environment not reset"
            return (
                self._error_obs(reason),
                RewModel(value=0.0, reason=reason),
                True,
                {"reason": "not_reset"},
            )

        self._ep.steps += 1
        msg = action.message.lower()

        message = self._parse_and_execute(msg)
        latency, error_rate, utilization = self._calculate_metrics(
            self._ep.current_load,
            self._ep.resources,
        )

        if utilization > 1.1:
            self._ep.crashed = True
            obs = self._build_observation("SYSTEM CRASH: Resource exhaustion!")
            reward = RewModel(value=0.0, reason="System crashed due to resource exhaustion")
            return obs, reward, True, {"reason": "crash"}

        reward = self._calculate_reward(latency, error_rate)
        done = reward.value >= 0.98 or self._ep.steps >= self._max_steps

        obs = self._build_observation(message)
        return obs, reward, done, {}

    def _parse_and_execute(self, msg: str) -> str:
        """Execute a lower-cased agent command and return a feedback message.

        A recognised ``change <id> to <type>`` command updates the matching
        resource's type and monthly cost in place; anything else only
        produces an explanatory message.
        """
        match = self._CHANGE_COMMAND.search(msg)
        if match:
            res_id, new_type = match.groups()
            if new_type not in INSTANCE_DATA:
                return f"Error: Unknown instance type '{new_type}'. Available: {', '.join(INSTANCE_DATA)}"

            for r in self._ep.resources:
                if r.id == res_id:
                    r.type = new_type
                    r.monthly_cost = INSTANCE_DATA[new_type]["cost"]
                    return f"Changed {res_id} to {new_type}"

            return f"Error: Resource '{res_id}' not found"

        # The agent used a plausible verb but not the supported grammar.
        if any(verb in msg for verb in ("resize", "scale", "upgrade", "downgrade")):
            return "Use format: 'change [resource_id] to [instance_type]'"

        return "Command not recognized. Use 'change [resource_id] to [instance_type]'"

    def _calculate_metrics(self, load: float, resources: list) -> Tuple[float, float, float]:
        """Return ``(latency_ms, error_rate, utilization)`` for a fleet.

        Utilization is the load divided by the summed instance capacity.
        Latency grows exponentially with utilization (100 ms at utilization
        1.0); the error rate is zero below 0.9 utilization and rises
        linearly above it.
        """
        total_cap = sum(INSTANCE_DATA[r.type]["capacity"] for r in resources)
        # The epsilon keeps the division defined for a zero-capacity fleet.
        utilization = load / (total_cap + 1e-6)

        latency = 50 * (1 + math.exp(utilization * 2 - 2))
        error_rate = 0.0 if utilization < 0.9 else (utilization - 0.9) * 2.0

        return latency, error_rate, utilization

    @staticmethod
    def _sla_score(actual: float, limit: float) -> float:
        """Score one SLA dimension in ``[0, 0.5]``.

        The full 0.5 is awarded while ``actual`` is within ``limit``; beyond
        it the score decays as ``0.5 / (1 + relative overshoot)``.
        """
        if limit <= 0:
            # A non-positive limit cannot be divided by; only a zero-valued
            # metric can be considered within it.
            return 0.5 if actual <= 0 else 0.0
        overshoot = max(0, actual / limit - 1)
        return 0.5 * (1.0 / (1.0 + overshoot))

    def _calculate_reward(self, latency: float, error_rate: float) -> RewModel:
        """Build the reward for the current fleet.

        Half of the reward comes from the monthly cost measured against the
        SLA's ``max_budget`` and half from ``latency`` measured against
        ``max_latency_ms``. ``error_rate`` is accepted for interface
        compatibility but does not influence the score.
        """
        total_cost = sum(r.monthly_cost for r in self._ep.resources)
        sla = self._ep.task_config.sla

        # Each dimension is compared with its own limit: cost against the
        # budget, latency against the latency ceiling.
        cost_reward = self._sla_score(total_cost, sla["max_budget"])
        perf_reward = self._sla_score(latency, sla["max_latency_ms"])
        total_reward = cost_reward + perf_reward

        # Percentage change relative to the state captured at reset.
        initial_latency = self._ep.initial_latency
        initial_cost = self._ep.initial_cost
        cost_change = ((total_cost - initial_cost) / initial_cost) * 100 if initial_cost > 0 else 0
        lat_change = ((latency - initial_latency) / initial_latency) * 100 if initial_latency > 0 else 0

        return RewModel(
            value=min(1.0, max(0.0, total_reward)),
            reason=f"Cost: ${total_cost:.1f}/mo, Latency: {latency:.1f}ms",
            cost_change_pct=cost_change,
            latency_change_pct=lat_change,
        )

    def _build_observation(self, message: str) -> ObsModel:
        """Assemble the observation for the current episode.

        Also refreshes each resource's ``cpu_usage`` / ``mem_usage`` from the
        current load. Returns the error observation when no episode exists.
        """
        if self._ep is None:
            return self._error_obs()

        latency, error_rate, _ = self._calculate_metrics(
            self._ep.current_load,
            self._ep.resources,
        )

        for r in self._ep.resources:
            # Usage is derived from the whole load against this resource's
            # own capacity, capped at 100%.
            r.cpu_usage = min(100.0, self._ep.current_load / INSTANCE_DATA[r.type]["capacity"] * 100)
            r.mem_usage = min(100.0, r.cpu_usage * 0.9)

        metrics = Metrics(
            avg_latency_ms=latency,
            error_rate=error_rate,
            throughput_rps=100.0,
        )

        sla = SLA(**self._ep.task_config.sla)

        return ObsModel(
            inventory=self._ep.resources,
            metrics=metrics,
            sla=sla,
            echoed_message=message,
            task_id=self._ep.task_config.task_id,
            task_name=self._ep.task_config.name,
            difficulty=self._ep.task_config.difficulty,
            step=self._ep.steps,
        )

    def _error_obs(self, message: str = "Error: Environment not initialized") -> ObsModel:
        """Return an empty observation that carries only ``message``."""
        return ObsModel(
            inventory=[],
            metrics=Metrics(avg_latency_ms=0, error_rate=0, throughput_rps=0),
            sla=SLA(max_latency_ms=0, max_budget=0, min_uptime_pct=0),
            echoed_message=message,
        )

    @property
    def state(self) -> Dict[str, Any]:
        """Summary of the current episode, or an empty dict before ``reset()``."""
        if self._ep is None:
            return {}
        return {
            "episode_id": self._ep.episode_id,
            "task_id": self._ep.task_config.task_id,
            "steps": self._ep.steps,
            "crashed": self._ep.crashed,
        }
266
+
267
+
268
# Expose the environment class under the generic name ``Environment`` as well.
Environment = CloudOpsEnvironment