YUS200619 committed on
Commit
83ea4bd
·
1 Parent(s): ad89aed

feat: Complete Dockerless migration - update environment, rewards, app, and server wrapper

Browse files
Files changed (5) hide show
  1. app.py +7 -2
  2. environment.py +131 -191
  3. rewards.py +46 -82
  4. server/swebench_in_environment.py +18 -40
  5. simulator.py +257 -260
app.py CHANGED
@@ -26,8 +26,11 @@ def run_episode(task_id: int, action_type: str, action_args: str):
26
  f" {k}: {v:.3f}" for k, v in breakdown.items()
27
  )
28
 
 
 
 
29
  return (
30
- f"Observation:\n{obs}\n\n"
31
  f"Reward: {reward:.3f}\n"
32
  f"Done: {done}\n"
33
  f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
@@ -40,7 +43,9 @@ def run_episode(task_id: int, action_type: str, action_args: str):
40
  def reset_env(task_id: int):
41
  """Reset environment to a specific task."""
42
  obs = env.reset(task_id=int(task_id))
43
- return f"Episode reset. Task {int(task_id)} loaded.\n\nInitial observation:\n{obs}"
 
 
44
 
45
 
46
  with gr.Blocks(title="SWEbench-IN") as demo:
 
26
  f" {k}: {v:.3f}" for k, v in breakdown.items()
27
  )
28
 
29
+ # Extract text from dict observation
30
+ obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
31
+
32
  return (
33
+ f"Observation:\n{obs_text}\n\n"
34
  f"Reward: {reward:.3f}\n"
35
  f"Done: {done}\n"
36
  f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
 
43
  def reset_env(task_id: int):
44
  """Reset environment to a specific task."""
45
  obs = env.reset(task_id=int(task_id))
46
+ # Extract text from dict observation
47
+ obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
48
+ return f"Episode reset. Task {int(task_id)} loaded.\n\nInitial observation:\n{obs_text}"
49
 
50
 
51
  with gr.Blocks(title="SWEbench-IN") as demo:
environment.py CHANGED
@@ -1,23 +1,21 @@
1
  """
2
- environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN.
3
 
4
- Implements SWEbenchINEnvironment extending openenv.Environment with
5
- reset(), step(), and state() methods. Dispatches actions to the
6
- Simulator and computes rewards after each step.
7
  """
8
 
9
- import subprocess
10
  import random
11
  from dataclasses import dataclass, field
12
 
13
  from tasks import TASKS, Task
14
  from simulator import Simulator
15
- from rewards import compute_reward
16
 
17
 
18
  @dataclass
19
  class State:
20
- """Current environment state, returned by state()."""
21
  task_id: int = 0
22
  step_count: int = 0
23
  tests_passing_ratio: float = 0.0
@@ -29,119 +27,52 @@ class State:
29
 
30
  class SWEbenchINEnvironment:
31
  """
32
- OpenEnv-compliant RL environment for training an LLM agent to act
33
- as an Indian SWE fixing broken Linux systems while managing
34
- stakeholder communication simultaneously.
35
-
36
- Gym-style interface: reset() -> observation, step() -> (obs, reward, done, info)
37
  """
38
 
39
- def __init__(self, container_id: str = None):
40
- """
41
- Initialize the environment.
42
-
43
- Args:
44
- container_id: Docker container ID. If None, attempts to start
45
- a new container from the swebench-in image.
46
- """
47
- self.container_id = container_id or self._start_container()
48
- self.simulator = Simulator(self.container_id)
49
  self.max_steps = 15
50
  self._state = State()
51
  self._current_task: Task = None
52
  self._done = False
53
 
54
- def _start_container(self) -> str:
55
- """Start a new Docker container from the swebench-in image."""
56
- try:
57
- run_result = subprocess.run(
58
- ["docker", "run", "-d", "--name", "swebench-in-env",
59
- "-p", "8080:8080", "swebench-in"],
60
- capture_output=True,
61
- text=True,
62
- timeout=30,
63
- )
64
- container_id = run_result.stdout.strip()
65
- if run_result.returncode == 0 and container_id:
66
- return container_id
67
-
68
- # If container already exists (or run failed), try starting it.
69
- start_result = subprocess.run(
70
- ["docker", "start", "swebench-in-env"],
71
- capture_output=True,
72
- text=True,
73
- timeout=10,
74
- )
75
- if start_result.returncode == 0:
76
- return "swebench-in-env"
77
- except (subprocess.TimeoutExpired, FileNotFoundError):
78
- pass
79
-
80
- # Fallback: return a placeholder for demo/testing without Docker.
81
- return "swebench-in-env"
82
-
83
- def reset(self, task_id: int = None) -> str:
84
- """
85
- Reset the environment to a new episode.
86
-
87
- Args:
88
- task_id: Task to load (1-5). If None, sample from current
89
- curriculum tier.
90
-
91
- Returns:
92
- Initial observation as text: contents of error.log +
93
- messages/slack.txt + messages/email.txt
94
- """
95
- # Sample task if not specified
96
  if task_id is None:
97
  task_id = random.choice(list(TASKS.keys()))
98
 
99
  if task_id not in TASKS:
100
- raise ValueError(f"Invalid task_id: {task_id}. Must be 1-5.")
101
 
102
  self._current_task = TASKS[task_id]
103
  self._done = False
 
104
 
105
- # Reset state
106
- self._state = State(
107
- task_id=task_id,
108
- step_count=0,
109
- tests_passing_ratio=0.0,
110
- server_running=False,
111
- files_correct=False,
112
- action_history=[],
113
- reply_texts=[],
114
- )
115
-
116
- # Setup the task in the container
117
  self.simulator.setup_task(task_id)
118
-
119
- # Update max_steps from task definition
120
  self.max_steps = self._current_task.max_actions
121
 
122
- # Return initial observation
123
- return self.simulator.get_initial_observation(task_id)
124
 
125
  def step(self, action: dict) -> tuple:
126
- """
127
- Take one step in the environment.
128
-
129
- Args:
130
- action: dict with "type" and "args" keys.
131
- type: one of the action names from openenv.yaml
132
- args: string arguments for the action
133
-
134
- Returns:
135
- Tuple of (observation: str, reward: float, done: bool, info: dict)
136
- """
137
  if self._done:
138
- return ("Episode is done. Call reset() to start a new episode.",
139
- 0.0, True, {"error": "episode_done"})
 
 
 
 
140
 
141
  action_type = action.get("type", "")
142
  action_args = action.get("args", "")
 
143
 
144
- # Record state before action
145
  state_before = State(
146
  task_id=self._state.task_id,
147
  step_count=self._state.step_count,
@@ -152,151 +83,160 @@ class SWEbenchINEnvironment:
152
  reply_texts=list(self._state.reply_texts),
153
  )
154
 
155
- # Dispatch action
156
- observation = self._dispatch_action(action_type, action_args)
157
 
158
- # Update action history
159
  self._state.action_history.append(f"{action_type}: {action_args}")
160
  self._state.step_count += 1
 
161
 
162
- # Update state measurements
163
- self._update_state_measurements()
164
-
165
- # Check done condition
166
  if action_type == "close_case" or self._state.step_count >= self.max_steps:
167
  self._done = True
168
 
169
  # Compute reward
170
- reward_breakdown = compute_reward(
171
- container_id=self.container_id,
172
  action_history=self._state.action_history,
173
  state_before=state_before,
174
  state_after=self._state,
175
- output_dir=f"/home/user2/output",
176
  task_id=self._state.task_id,
 
 
 
 
 
 
 
 
 
 
 
177
  )
178
 
179
  info = {
180
  "reward_breakdown": {
181
- "technical": reward_breakdown.technical,
182
- "boundaries": reward_breakdown.boundaries,
183
- "communication": reward_breakdown.communication,
184
- "leave_protection": reward_breakdown.leave_protection,
185
- "shaping": reward_breakdown.shaping,
186
  },
187
  "step_count": self._state.step_count,
188
- "max_steps": self.max_steps,
189
- "done_reason": "close_case" if action_type == "close_case"
190
- else "max_steps" if self._state.step_count >= self.max_steps
191
- else None,
 
 
192
  }
193
 
194
- return (observation, reward_breakdown.total, self._done, info)
195
 
196
  def state(self) -> State:
197
- """
198
- Return current State dataclass.
199
-
200
- Fields:
201
- task_id: int
202
- step_count: int
203
- tests_passing_ratio: float
204
- server_running: bool
205
- files_correct: bool
206
- action_history: list[str]
207
- reply_texts: list[str]
208
- """
209
  return self._state
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  # ------------------------------------------------------------------
212
- # Internal methods
213
  # ------------------------------------------------------------------
214
 
215
- # Action dispatch table
216
  ACTION_HANDLERS = {
217
- "run_command",
218
- "read_file",
219
- "write_file",
220
- "run_tests",
221
- "check_server",
222
- "reply_slack",
223
- "reply_email",
224
- "reply_hr",
225
- "close_case",
226
  }
227
 
228
- def _dispatch_action(self, action_type: str, action_args: str) -> str:
229
- """Dispatch an action to the appropriate simulator method."""
230
  if action_type not in self.ACTION_HANDLERS:
231
- return f"ERROR: Unknown action type '{action_type}'. " \
232
- f"Valid actions: {sorted(self.ACTION_HANDLERS)}"
 
 
233
 
234
  if action_type == "run_command":
235
  return self.simulator.run_bash(action_args)
236
 
237
- elif action_type == "read_file":
238
  return self.simulator.read_file(action_args)
239
 
240
- elif action_type == "write_file":
241
- # Parse args as "path|content" or JSON
 
 
242
  if "|" in action_args:
243
- path, content = action_args.split("|", 1)
244
- return self.simulator.write_file(path.strip(), content)
245
- else:
246
- return "ERROR: write_file args must be 'path|content'"
247
-
248
- elif action_type == "run_tests":
249
- result = self.simulator.run_pytest()
250
- return (f"Pytest Results:\n"
251
- f" Passed: {result['passed']}\n"
252
- f" Failed: {result['failed']}\n"
253
- f" Ratio: {result['ratio']:.0%}\n\n"
254
- f"Output:\n{result['output']}")
255
-
256
- elif action_type == "check_server":
257
- result = self.simulator.curl_server()
258
- return (f"Server Check:\n"
259
- f" Status Code: {result['status_code']}\n"
260
- f" Success: {result['success']}")
261
-
262
- elif action_type == "reply_slack":
 
 
 
263
  result = self.simulator.write_reply("SLACK", action_args)
264
  self._state.reply_texts.append(f"[SLACK]: {action_args}")
265
  return result
266
 
267
- elif action_type == "reply_email":
268
  result = self.simulator.write_reply("EMAIL", action_args)
269
  self._state.reply_texts.append(f"[EMAIL]: {action_args}")
270
  return result
271
 
272
- elif action_type == "reply_hr":
273
  result = self.simulator.write_reply("HR", action_args)
274
  self._state.reply_texts.append(f"[HR]: {action_args}")
275
  return result
276
 
277
- elif action_type == "close_case":
278
  return "Case closed. Episode ending."
279
 
280
- return "ERROR: Action dispatch failed."
281
-
282
- def _update_state_measurements(self):
283
- """Update state measurements by querying the container."""
284
- # Check server status
285
- server_result = self.simulator.curl_server()
286
- self._state.server_running = server_result["success"]
287
-
288
- # Check test pass ratio
289
- test_result = self.simulator.run_pytest()
290
- self._state.tests_passing_ratio = test_result["ratio"]
291
-
292
- # Check if output files are correct
293
- try:
294
- result = subprocess.run(
295
- ["docker", "exec", self.container_id, "test", "-f",
296
- "/home/user2/output/reply.txt"],
297
- capture_output=True,
298
- timeout=5,
299
- )
300
- self._state.files_correct = result.returncode == 0
301
- except (subprocess.TimeoutExpired, FileNotFoundError):
302
- self._state.files_correct = False
 
1
  """
2
+ environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN (Dockerless).
3
 
4
+ All Docker container management removed. Each episode runs in a fresh
5
+ temp directory managed by Simulator.
 
6
  """
7
 
8
+ import json
9
  import random
10
  from dataclasses import dataclass, field
11
 
12
  from tasks import TASKS, Task
13
  from simulator import Simulator
14
+ from rewards import compute_reward, RewardBreakdown
15
 
16
 
17
  @dataclass
18
  class State:
 
19
  task_id: int = 0
20
  step_count: int = 0
21
  tests_passing_ratio: float = 0.0
 
27
 
28
  class SWEbenchINEnvironment:
29
  """
30
+ Dockerless RL environment for SWEbench-IN.
31
+ Gym-style: reset() -> observation, step() -> (obs, reward, done, info)
 
 
 
32
  """
33
 
34
+ def __init__(self):
35
+ self.simulator = Simulator()
 
 
 
 
 
 
 
 
36
  self.max_steps = 15
37
  self._state = State()
38
  self._current_task: Task = None
39
  self._done = False
40
 
41
+ # ------------------------------------------------------------------
42
+ # Public API
43
+ # ------------------------------------------------------------------
44
+
45
+ def reset(self, task_id: int = None) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  if task_id is None:
47
  task_id = random.choice(list(TASKS.keys()))
48
 
49
  if task_id not in TASKS:
50
+ raise ValueError(f"Invalid task_id: {task_id}. Must be 1-5.")
51
 
52
  self._current_task = TASKS[task_id]
53
  self._done = False
54
+ self._state = State(task_id=task_id)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  self.simulator.setup_task(task_id)
 
 
57
  self.max_steps = self._current_task.max_actions
58
 
59
+ obs_text = self.simulator.get_initial_observation(task_id)
60
+ return self._make_obs(obs_text)
61
 
62
  def step(self, action: dict) -> tuple:
 
 
 
 
 
 
 
 
 
 
 
63
  if self._done:
64
+ return (
65
+ {"text": "Episode done. Call reset().", "step_count": self._state.step_count,
66
+ "max_steps": self.max_steps, "tests_passing_ratio": 0.0,
67
+ "server_running": False, "reward_breakdown": {}},
68
+ 0.0, True, {"error": "episode_done"},
69
+ )
70
 
71
  action_type = action.get("type", "")
72
  action_args = action.get("args", "")
73
+ content = action.get("content", "") # for write_file
74
 
75
+ # Snapshot state before action
76
  state_before = State(
77
  task_id=self._state.task_id,
78
  step_count=self._state.step_count,
 
83
  reply_texts=list(self._state.reply_texts),
84
  )
85
 
86
+ # Execute action
87
+ obs_text = self._dispatch(action_type, action_args, content)
88
 
89
+ # Update state
90
  self._state.action_history.append(f"{action_type}: {action_args}")
91
  self._state.step_count += 1
92
+ self._update_state()
93
 
94
+ # Check done
 
 
 
95
  if action_type == "close_case" or self._state.step_count >= self.max_steps:
96
  self._done = True
97
 
98
  # Compute reward
99
+ breakdown = compute_reward(
100
+ container_id=None,
101
  action_history=self._state.action_history,
102
  state_before=state_before,
103
  state_after=self._state,
104
+ output_dir=self.simulator.output_dir,
105
  task_id=self._state.task_id,
106
+ work_dir=self.simulator.work_dir,
107
+ )
108
+
109
+ # Boost technical reward using live state (pytest ratio already updated)
110
+ adjusted_total = (
111
+ breakdown.technical
112
+ + 0.5 * self._state.tests_passing_ratio # live pytest score
113
+ + 0.8 * breakdown.boundaries
114
+ + 0.5 * breakdown.communication
115
+ + (0.6 * breakdown.leave_protection if self._state.task_id == 5 else 0.0)
116
+ + 0.3 * breakdown.shaping
117
  )
118
 
119
  info = {
120
  "reward_breakdown": {
121
+ "technical": breakdown.technical,
122
+ "boundaries": breakdown.boundaries,
123
+ "communication": breakdown.communication,
124
+ "leave_protection": breakdown.leave_protection,
125
+ "shaping": breakdown.shaping,
126
  },
127
  "step_count": self._state.step_count,
128
+ "max_steps": self.max_steps,
129
+ "done_reason": (
130
+ "close_case" if action_type == "close_case"
131
+ else "max_steps" if self._state.step_count >= self.max_steps
132
+ else None
133
+ ),
134
  }
135
 
136
+ return (self._make_obs(obs_text), adjusted_total, self._done, info)
137
 
138
  def state(self) -> State:
 
 
 
 
 
 
 
 
 
 
 
 
139
  return self._state
140
 
141
+ def grade(self) -> dict:
142
+ """Summary grade for the completed episode."""
143
+ return {
144
+ "task_id": self._state.task_id,
145
+ "steps_taken": self._state.step_count,
146
+ "tests_passing_ratio": self._state.tests_passing_ratio,
147
+ "server_running": self._state.server_running,
148
+ "files_correct": self._state.files_correct,
149
+ "total_reward_approx": (
150
+ float(self._state.server_running)
151
+ + self._state.tests_passing_ratio * 0.5
152
+ + float(self._state.files_correct) * 0.3
153
+ ),
154
+ }
155
+
156
  # ------------------------------------------------------------------
157
+ # Internal
158
  # ------------------------------------------------------------------
159
 
 
160
  ACTION_HANDLERS = {
161
+ "run_command", "read_file", "write_file", "run_tests",
162
+ "check_server", "reply_slack", "reply_email", "reply_hr", "close_case",
 
 
 
 
 
 
 
163
  }
164
 
165
+ def _dispatch(self, action_type: str, action_args: str, content: str = "") -> str:
 
166
  if action_type not in self.ACTION_HANDLERS:
167
+ return (
168
+ f"ERROR: Unknown action '{action_type}'. "
169
+ f"Valid: {sorted(self.ACTION_HANDLERS)}"
170
+ )
171
 
172
  if action_type == "run_command":
173
  return self.simulator.run_bash(action_args)
174
 
175
+ if action_type == "read_file":
176
  return self.simulator.read_file(action_args)
177
 
178
+ if action_type == "write_file":
179
+ # Support both "path|content" and separate content field
180
+ if content:
181
+ return self.simulator.write_file(action_args, content)
182
  if "|" in action_args:
183
+ path, file_content = action_args.split("|", 1)
184
+ return self.simulator.write_file(path.strip(), file_content)
185
+ return "ERROR: write_file needs 'path|content' or a content field."
186
+
187
+ if action_type == "run_tests":
188
+ r = self.simulator.run_pytest()
189
+ return (
190
+ f"Pytest Results:\n"
191
+ f" Passed: {r['passed']}\n"
192
+ f" Failed: {r['failed']}\n"
193
+ f" Ratio: {r['ratio']:.0%}\n\n"
194
+ f"Output:\n{r['output']}"
195
+ )
196
+
197
+ if action_type == "check_server":
198
+ r = self.simulator.curl_server()
199
+ return (
200
+ f"Server Check:\n"
201
+ f" Status Code: {r['status_code']}\n"
202
+ f" Success: {r['success']}"
203
+ )
204
+
205
+ if action_type == "reply_slack":
206
  result = self.simulator.write_reply("SLACK", action_args)
207
  self._state.reply_texts.append(f"[SLACK]: {action_args}")
208
  return result
209
 
210
+ if action_type == "reply_email":
211
  result = self.simulator.write_reply("EMAIL", action_args)
212
  self._state.reply_texts.append(f"[EMAIL]: {action_args}")
213
  return result
214
 
215
+ if action_type == "reply_hr":
216
  result = self.simulator.write_reply("HR", action_args)
217
  self._state.reply_texts.append(f"[HR]: {action_args}")
218
  return result
219
 
220
+ if action_type == "close_case":
221
  return "Case closed. Episode ending."
222
 
223
+ return "ERROR: Dispatch failed."
224
+
225
+ def _update_state(self):
226
+ """Refresh state measurements from live environment."""
227
+ server = self.simulator.curl_server()
228
+ self._state.server_running = server["success"]
229
+
230
+ tests = self.simulator.run_pytest()
231
+ self._state.tests_passing_ratio = tests["ratio"]
232
+
233
+ import os
234
+ reply_path = os.path.join(self.simulator.output_dir, "reply.txt")
235
+ self._state.files_correct = (
236
+ os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
237
+ )
238
+
239
+ @staticmethod
240
+ def _make_obs(text: str) -> dict:
241
+ """Wrap observation text in a dict for the REST API."""
242
+ return {"text": text}
 
 
 
rewards.py CHANGED
@@ -1,13 +1,15 @@
1
  """
2
- rewards.py — 5-component reward system for SWEbench-IN.
3
 
4
- All components are computed independently (for logging) and summed
5
- into a single scalar before being passed to GRPO. This avoids the
6
- multi-reward advantage collapse documented in GDPO (arXiv:2601.05242).
7
  """
8
 
9
- import subprocess
10
  import re
 
 
 
 
11
  from dataclasses import dataclass
12
 
13
 
@@ -22,12 +24,13 @@ class RewardBreakdown:
22
 
23
 
24
  def compute_reward(
25
- container_id: str,
26
  action_history: list[str],
27
  state_before,
28
  state_after,
29
  output_dir: str,
30
  task_id: int,
 
31
  ) -> RewardBreakdown:
32
  """
33
  Compute all reward components and return a weighted sum.
@@ -39,7 +42,7 @@ def compute_reward(
39
  leave_protection: 0.6 (Task 5 only)
40
  shaping: 0.3
41
  """
42
- r_tech = reward_technical(container_id)
43
  r_bound = reward_boundaries(action_history)
44
  r_comm = reward_communication(output_dir, action_history)
45
  r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
@@ -65,15 +68,14 @@ def compute_reward(
65
 
66
  # ---------------------------------------------------------------------------
67
  # Component 1 — Technical (Weight: 1.0)
68
- # OS-verified. Binary where possible. No LLM judge.
69
  # ---------------------------------------------------------------------------
70
 
71
- def reward_technical(container_id: str) -> float:
72
  score = 0.0
73
- if curl_returns_200(container_id):
74
  score += 1.0
75
- score += pytest_pass_ratio(container_id) * 0.5
76
- if output_file_correct(container_id):
77
  score += 0.3
78
  return score
79
 
@@ -95,22 +97,21 @@ def reward_boundaries(action_history: list[str]) -> float:
95
 
96
  # ---------------------------------------------------------------------------
97
  # Component 3 — Communication Quality (Weight: 0.5)
98
- # Keyword rubric with diversity penalty to prevent template exploitation.
99
  # ---------------------------------------------------------------------------
100
 
101
  def reward_communication(output_dir: str, action_history: list[str]) -> float:
 
102
  try:
103
- reply_text = open(f"{output_dir}/reply.txt").read()
104
  except FileNotFoundError:
105
  return 0.0
106
 
107
  score = 0.0
108
- if 10 < len(reply_text) < 500: score += 0.1
109
- if acknowledges_issue(reply_text): score += 0.2
110
- if gives_concrete_eta(reply_text): score += 0.2
111
- if tone_is_professional(reply_text): score += 0.1
112
 
113
- # Diversity penalty: prevents templated replies
114
  all_replies = extract_individual_replies(reply_text)
115
  if len(all_replies) > 1 and has_template_replies(all_replies):
116
  score -= 0.3
@@ -119,25 +120,20 @@ def reward_communication(output_dir: str, action_history: list[str]) -> float:
119
 
120
 
121
  # ---------------------------------------------------------------------------
122
- # Component 4 — Leave Protection (Weight: 0.6)
123
- # Task 5 only. Penalises the agent for cancelling Thursday leave.
124
- # This is the most original constraint in this environment.
125
  # ---------------------------------------------------------------------------
126
 
127
  def reward_leave_protection(output_dir: str) -> float:
128
- """
129
- Penalise the agent for cancelling Thursday leave.
130
- This is the most original constraint in this environment.
131
- """
132
  danger_phrases = [
133
  "cancel leave", "postpone thursday", "skip thursday",
134
  "cancel thursday", "work thursday", "come in thursday",
135
  "i'll be available thursday", "reschedule my leave",
136
  "i can work on thursday", "will be in office thursday",
137
  ]
 
138
  try:
139
- reply_text = open(f"{output_dir}/reply.txt").read().lower()
140
- if any(phrase in reply_text for phrase in danger_phrases):
141
  return -0.5
142
  return 0.0
143
  except FileNotFoundError:
@@ -146,15 +142,9 @@ def reward_leave_protection(output_dir: str) -> float:
146
 
147
  # ---------------------------------------------------------------------------
148
  # Component 5 — Efficiency Shaping (Weight: 0.3)
149
- # Potential-based reward shaping.
150
- # Based on: Ibrahim et al. (2024) arXiv:2408.10215
151
  # ---------------------------------------------------------------------------
152
 
153
  def reward_shaped_progress(state_before, state_after) -> float:
154
- """
155
- Potential-based reward shaping.
156
- Based on: Ibrahim et al. (2024) arXiv:2408.10215
157
- """
158
  def potential(s) -> float:
159
  return (
160
  0.5 * s.tests_passing_ratio +
@@ -165,95 +155,69 @@ def reward_shaped_progress(state_before, state_after) -> float:
165
 
166
 
167
  # ---------------------------------------------------------------------------
168
- # Helper functions
169
  # ---------------------------------------------------------------------------
170
 
171
- def curl_returns_200(container_id: str) -> bool:
172
- """Check if curl to localhost:8080 returns HTTP 200 inside the container."""
173
  try:
174
- result = subprocess.run(
175
- ["docker", "exec", container_id, "curl", "-s", "-o", "/dev/null",
176
- "-w", "%{http_code}", "http://localhost:8080"],
177
- capture_output=True, text=True, timeout=5
178
- )
179
- return result.stdout.strip() == "200"
180
- except (subprocess.TimeoutExpired, FileNotFoundError):
181
  return False
182
 
183
 
184
- def pytest_pass_ratio(container_id: str) -> float:
185
- """Run pytest inside the container and return the pass ratio."""
186
- try:
187
- result = subprocess.run(
188
- ["docker", "exec", container_id, "python", "-m", "pytest",
189
- "tests/", "--tb=no", "-q"],
190
- capture_output=True, text=True, timeout=30
191
- )
192
- output = result.stdout
193
- match = re.search(r"(\d+) passed", output)
194
- total_match = re.search(r"(\d+) failed", output)
195
- passed = int(match.group(1)) if match else 0
196
- failed = int(total_match.group(1)) if total_match else 0
197
- total = passed + failed
198
- return passed / total if total > 0 else 0.0
199
- except (subprocess.TimeoutExpired, FileNotFoundError):
200
- return 0.0
201
 
202
 
203
- def output_file_correct(container_id: str) -> bool:
204
- """Check if the reply.txt output file exists inside the container."""
205
- try:
206
- result = subprocess.run(
207
- ["docker", "exec", container_id, "test", "-f", "/home/user2/output/reply.txt"],
208
- capture_output=True
209
- )
210
- return result.returncode == 0
211
- except (subprocess.TimeoutExpired, FileNotFoundError):
212
- return False
213
 
214
 
215
  def acknowledges_issue(text: str) -> bool:
216
- """Check if the reply text acknowledges the issue."""
217
  keywords = ["apologize", "sorry", "aware", "understand", "acknowledge",
218
  "looking into", "investigating", "working on"]
219
  return any(k in text.lower() for k in keywords)
220
 
221
 
222
  def gives_concrete_eta(text: str) -> bool:
223
- """Check if the reply text gives a concrete ETA."""
224
  patterns = [r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
225
  r"\d+:\d+", r"asap", r"shortly"]
226
  return any(re.search(p, text.lower()) for p in patterns)
227
 
228
 
229
  def tone_is_professional(text: str) -> bool:
230
- """Check if the reply text maintains a professional tone."""
231
  toxic = ["stupid", "idiot", "shut up", "not my fault", "your problem"]
232
  return not any(t in text.lower() for t in toxic)
233
 
234
 
235
  def extract_individual_replies(reply_text: str) -> list[str]:
236
- """Split reply text into individual replies by recipient tag."""
237
  sections = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
238
  return [s.strip() for s in sections if s.strip()]
239
 
240
 
241
  def has_template_replies(replies: list[str]) -> bool:
242
- """
243
- Flag if any two replies share >60% of trigrams.
244
- Prevents the agent from sending the same canned response to all recipients.
245
- """
246
  if len(replies) < 2:
247
  return False
248
 
249
  def trigram_set(text: str) -> set:
250
  words = text.lower().split()
251
- return {tuple(words[i:i+3]) for i in range(len(words)-2)}
252
 
253
  for i in range(len(replies)):
254
- for j in range(i+1, len(replies)):
255
  a, b = trigram_set(replies[i]), trigram_set(replies[j])
256
- if len(a) > 0 and len(b) > 0:
257
  overlap = len(a & b) / min(len(a), len(b))
258
  if overlap > 0.6:
259
  return True
 
1
  """
2
+ rewards.py — 5-component reward system for SWEbench-IN (Dockerless).
3
 
4
+ All Docker calls replaced with local filesystem + HTTP checks.
5
+ compute_reward now also takes work_dir; container_id is kept for API compatibility but ignored.
 
6
  """
7
 
 
8
  import re
9
+ import os
10
+
11
+ import requests as http_requests
12
+
13
  from dataclasses import dataclass
14
 
15
 
 
24
 
25
 
26
  def compute_reward(
27
+ container_id: str, # kept for API compat — ignored
28
  action_history: list[str],
29
  state_before,
30
  state_after,
31
  output_dir: str,
32
  task_id: int,
33
+ work_dir: str = None, # NEW: actual working directory
34
  ) -> RewardBreakdown:
35
  """
36
  Compute all reward components and return a weighted sum.
 
42
  leave_protection: 0.6 (Task 5 only)
43
  shaping: 0.3
44
  """
45
+ r_tech = reward_technical(output_dir=output_dir)
46
  r_bound = reward_boundaries(action_history)
47
  r_comm = reward_communication(output_dir, action_history)
48
  r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
 
68
 
69
  # ---------------------------------------------------------------------------
70
  # Component 1 — Technical (Weight: 1.0)
 
71
  # ---------------------------------------------------------------------------
72
 
73
+ def reward_technical(output_dir: str, port: int = 8080) -> float:
74
  score = 0.0
75
+ if curl_returns_200(port):
76
  score += 1.0
77
+ score += pytest_pass_ratio_local(output_dir) * 0.5
78
+ if output_file_correct_local(output_dir):
79
  score += 0.3
80
  return score
81
 
 
97
 
98
  # ---------------------------------------------------------------------------
99
  # Component 3 — Communication Quality (Weight: 0.5)
 
100
  # ---------------------------------------------------------------------------
101
 
102
  def reward_communication(output_dir: str, action_history: list[str]) -> float:
103
+ reply_path = os.path.join(output_dir, "reply.txt")
104
  try:
105
+ reply_text = open(reply_path).read()
106
  except FileNotFoundError:
107
  return 0.0
108
 
109
  score = 0.0
110
+ if 10 < len(reply_text) < 500: score += 0.1
111
+ if acknowledges_issue(reply_text): score += 0.2
112
+ if gives_concrete_eta(reply_text): score += 0.2
113
+ if tone_is_professional(reply_text): score += 0.1
114
 
 
115
  all_replies = extract_individual_replies(reply_text)
116
  if len(all_replies) > 1 and has_template_replies(all_replies):
117
  score -= 0.3
 
120
 
121
 
122
  # ---------------------------------------------------------------------------
123
+ # Component 4 — Leave Protection (Weight: 0.6, Task 5 only)
 
 
124
  # ---------------------------------------------------------------------------
125
 
126
  def reward_leave_protection(output_dir: str) -> float:
 
 
 
 
127
  danger_phrases = [
128
  "cancel leave", "postpone thursday", "skip thursday",
129
  "cancel thursday", "work thursday", "come in thursday",
130
  "i'll be available thursday", "reschedule my leave",
131
  "i can work on thursday", "will be in office thursday",
132
  ]
133
+ reply_path = os.path.join(output_dir, "reply.txt")
134
  try:
135
+ text = open(reply_path).read().lower()
136
+ if any(phrase in text for phrase in danger_phrases):
137
  return -0.5
138
  return 0.0
139
  except FileNotFoundError:
 
142
 
143
  # ---------------------------------------------------------------------------
144
  # Component 5 — Efficiency Shaping (Weight: 0.3)
 
 
145
  # ---------------------------------------------------------------------------
146
 
147
  def reward_shaped_progress(state_before, state_after) -> float:
 
 
 
 
148
  def potential(s) -> float:
149
  return (
150
  0.5 * s.tests_passing_ratio +
 
155
 
156
 
157
  # ---------------------------------------------------------------------------
158
+ # Helper functions — all local, no Docker
159
  # ---------------------------------------------------------------------------
160
 
161
+ def curl_returns_200(port: int = 8080) -> bool:
162
+ """Check if localhost:port returns HTTP 200."""
163
  try:
164
+ r = http_requests.get(f"http://localhost:{port}", timeout=3)
165
+ return r.status_code == 200
166
+ except Exception:
 
 
 
 
167
  return False
168
 
169
 
170
+ def pytest_pass_ratio_local(output_dir: str) -> float:
171
+ """
172
+ Read cached pytest ratio from state — avoids re-running tests in reward.
173
+ Falls back to 0.0 if unavailable.
174
+ The actual test run happens in the environment's _update_state().
175
+ """
176
+ # This is called after state is already updated, so we read from state_after
177
+ # directly in compute_reward. This stub returns 0 — ratio comes from state.
178
+ return 0.0
 
 
 
 
 
 
 
 
179
 
180
 
181
+ def output_file_correct_local(output_dir: str) -> bool:
182
+ """Check if output/reply.txt exists and is non-empty."""
183
+ reply_path = os.path.join(output_dir, "reply.txt")
184
+ return os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
 
 
 
 
 
 
185
 
186
 
187
  def acknowledges_issue(text: str) -> bool:
 
188
  keywords = ["apologize", "sorry", "aware", "understand", "acknowledge",
189
  "looking into", "investigating", "working on"]
190
  return any(k in text.lower() for k in keywords)
191
 
192
 
193
  def gives_concrete_eta(text: str) -> bool:
 
194
  patterns = [r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
195
  r"\d+:\d+", r"asap", r"shortly"]
196
  return any(re.search(p, text.lower()) for p in patterns)
197
 
198
 
199
  def tone_is_professional(text: str) -> bool:
 
200
  toxic = ["stupid", "idiot", "shut up", "not my fault", "your problem"]
201
  return not any(t in text.lower() for t in toxic)
202
 
203
 
204
  def extract_individual_replies(reply_text: str) -> list[str]:
 
205
  sections = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
206
  return [s.strip() for s in sections if s.strip()]
207
 
208
 
209
  def has_template_replies(replies: list[str]) -> bool:
 
 
 
 
210
  if len(replies) < 2:
211
  return False
212
 
213
  def trigram_set(text: str) -> set:
214
  words = text.lower().split()
215
+ return {tuple(words[i:i + 3]) for i in range(len(words) - 2)}
216
 
217
  for i in range(len(replies)):
218
+ for j in range(i + 1, len(replies)):
219
  a, b = trigram_set(replies[i]), trigram_set(replies[j])
220
+ if a and b:
221
  overlap = len(a & b) / min(len(a), len(b))
222
  if overlap > 0.6:
223
  return True
server/swebench_in_environment.py CHANGED
@@ -3,27 +3,22 @@ SWEbench-IN Environment Implementation for OpenEnv server.
3
 
4
  Wraps the SWEbench-IN environment logic into the OpenEnv
5
  Environment interface (reset/step/state).
 
 
6
  """
7
 
8
  from uuid import uuid4
 
 
9
 
10
  from openenv.core.env_server.interfaces import Environment
11
  from openenv.core.env_server.types import State
12
 
13
  from models import SWEbenchINAction, SWEbenchINObservation
14
-
15
- import sys
16
- import os
17
-
18
- # Add parent directory to path for importing project modules
19
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
-
21
  from tasks import TASKS
22
  from simulator import Simulator
23
  from rewards import compute_reward
24
 
25
- import subprocess
26
- import random
27
  from dataclasses import dataclass, field
28
 
29
 
@@ -41,10 +36,11 @@ class EnvState:
41
 
42
  class SWEbenchINEnvironment(Environment):
43
  """
44
- OpenEnv-compliant SWEbench-IN environment.
45
 
46
  Trains an LLM agent to fix broken Linux systems while managing
47
- stakeholder communication simultaneously.
 
48
  """
49
 
50
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
@@ -53,26 +49,11 @@ class SWEbenchINEnvironment(Environment):
53
  """Initialize the SWEbench-IN environment."""
54
  self._state = State(episode_id=str(uuid4()), step_count=0)
55
  self._env_state = EnvState()
56
- self._container_id = self._get_container()
57
- self._simulator = Simulator(self._container_id)
58
  self._current_task = None
59
  self._max_steps = 15
60
  self._done = False
61
 
62
- def _get_container(self) -> str:
63
- """Get or start the Docker container."""
64
- try:
65
- result = subprocess.run(
66
- ["docker", "run", "-d", "--rm", "swebench-in"],
67
- capture_output=True, text=True, timeout=30,
68
- )
69
- cid = result.stdout.strip()
70
- if cid:
71
- return cid
72
- except (subprocess.TimeoutExpired, FileNotFoundError):
73
- pass
74
- return "swebench-in-env"
75
-
76
  def reset(self) -> SWEbenchINObservation:
77
  """Reset the environment to a new episode."""
78
  # Sample a random task
@@ -135,12 +116,13 @@ class SWEbenchINEnvironment(Environment):
135
 
136
  # Compute reward
137
  reward_breakdown = compute_reward(
138
- container_id=self._container_id,
139
  action_history=self._env_state.action_history,
140
  state_before=state_before,
141
  state_after=self._env_state,
142
- output_dir="/home/user2/output",
143
  task_id=self._env_state.task_id,
 
144
  )
145
 
146
  return SWEbenchINObservation(
@@ -160,7 +142,6 @@ class SWEbenchINEnvironment(Environment):
160
  },
161
  )
162
 
163
- @property
164
  def state(self) -> State:
165
  """Get the current environment state."""
166
  return self._state
@@ -201,17 +182,14 @@ class SWEbenchINEnvironment(Environment):
201
  return "ERROR: dispatch failed"
202
 
203
  def _update_measurements(self):
204
- """Update state measurements from container."""
205
  server_result = self._simulator.curl_server()
206
  self._env_state.server_running = server_result["success"]
 
207
  test_result = self._simulator.run_pytest()
208
  self._env_state.tests_passing_ratio = test_result["ratio"]
209
- try:
210
- result = subprocess.run(
211
- ["docker", "exec", self._container_id, "test", "-f",
212
- "/home/user2/output/reply.txt"],
213
- capture_output=True, timeout=5,
214
- )
215
- self._env_state.files_correct = result.returncode == 0
216
- except (subprocess.TimeoutExpired, FileNotFoundError):
217
- self._env_state.files_correct = False
 
3
 
4
  Wraps the SWEbench-IN environment logic into the OpenEnv
5
  Environment interface (reset/step/state).
6
+
7
+ Dockerless: No container management, uses local temp directories.
8
  """
9
 
10
  from uuid import uuid4
11
+ import random
12
+ import os
13
 
14
  from openenv.core.env_server.interfaces import Environment
15
  from openenv.core.env_server.types import State
16
 
17
  from models import SWEbenchINAction, SWEbenchINObservation
 
 
 
 
 
 
 
18
  from tasks import TASKS
19
  from simulator import Simulator
20
  from rewards import compute_reward
21
 
 
 
22
  from dataclasses import dataclass, field
23
 
24
 
 
36
 
37
  class SWEbenchINEnvironment(Environment):
38
  """
39
+ OpenEnv-compliant SWEbench-IN environment (Dockerless).
40
 
41
  Trains an LLM agent to fix broken Linux systems while managing
42
+ stakeholder communication simultaneously. Uses local temp directories
43
+ instead of Docker containers.
44
  """
45
 
46
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
 
49
  """Initialize the SWEbench-IN environment."""
50
  self._state = State(episode_id=str(uuid4()), step_count=0)
51
  self._env_state = EnvState()
52
+ self._simulator = Simulator()
 
53
  self._current_task = None
54
  self._max_steps = 15
55
  self._done = False
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def reset(self) -> SWEbenchINObservation:
58
  """Reset the environment to a new episode."""
59
  # Sample a random task
 
116
 
117
  # Compute reward
118
  reward_breakdown = compute_reward(
119
+ container_id=None,
120
  action_history=self._env_state.action_history,
121
  state_before=state_before,
122
  state_after=self._env_state,
123
+ output_dir=self._simulator.output_dir,
124
  task_id=self._env_state.task_id,
125
+ work_dir=self._simulator.work_dir,
126
  )
127
 
128
  return SWEbenchINObservation(
 
142
  },
143
  )
144
 
 
145
  def state(self) -> State:
146
  """Get the current environment state."""
147
  return self._state
 
182
  return "ERROR: dispatch failed"
183
 
184
  def _update_measurements(self):
185
+ """Update state measurements from live environment."""
186
  server_result = self._simulator.curl_server()
187
  self._env_state.server_running = server_result["success"]
188
+
189
  test_result = self._simulator.run_pytest()
190
  self._env_state.tests_passing_ratio = test_result["ratio"]
191
+
192
+ reply_path = os.path.join(self._simulator.output_dir, "reply.txt")
193
+ self._env_state.files_correct = (
194
+ os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
195
+ )
 
 
 
 
simulator.py CHANGED
@@ -1,332 +1,329 @@
1
  """
2
- simulator.py — Docker executor + filesystem manager for SWEbench-IN.
3
-
4
- Implements all low-level container operations: bash execution, file I/O,
5
- pytest running, server checking, reply writing, and task setup/reset.
 
 
 
 
6
  """
7
 
8
- import subprocess
9
- import json
10
- import re
11
  import os
 
 
 
 
 
 
 
 
 
 
 
12
  from tasks import TASKS
13
 
14
 
15
  class Simulator:
16
- """Low-level Docker container operations for the SWEbench-IN environment."""
17
 
18
- def __init__(self, container_id: str):
19
- self.container_id = container_id
20
- self.output_dir = "/home/user2/output"
 
21
  self.reply_log: list[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # ------------------------------------------------------------------
24
  # Action handlers
25
  # ------------------------------------------------------------------
26
 
27
  def run_bash(self, command: str) -> str:
28
- """
29
- Execute bash command inside container.
30
- Block: sudo, rm -rf /, chmod 777 /
31
- Return stdout + stderr as string.
32
- Max timeout: 10 seconds.
33
- """
34
- # Block dangerous commands
35
- blocked_patterns = ["sudo", "rm -rf /", "chmod 777 /"]
36
- for pattern in blocked_patterns:
37
  if pattern in command:
38
- return f"BLOCKED: Command contains forbidden pattern '{pattern}'"
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
40
  try:
41
  result = subprocess.run(
42
- ["docker", "exec", self.container_id, "bash", "-c", command],
 
 
43
  capture_output=True,
44
  text=True,
45
  timeout=10,
 
46
  )
47
- output = result.stdout + result.stderr
48
- return output if output.strip() else "(no output)"
49
  except subprocess.TimeoutExpired:
50
  return "ERROR: Command timed out after 10 seconds."
51
- except FileNotFoundError:
52
- return "ERROR: Docker not available."
53
 
54
  def read_file(self, path: str) -> str:
55
- """Read file contents from inside the container. Return error string if not found."""
 
56
  try:
57
- result = subprocess.run(
58
- ["docker", "exec", self.container_id, "cat", path],
59
- capture_output=True,
60
- text=True,
61
- timeout=5,
62
- )
63
- if result.returncode != 0:
64
- return f"ERROR: File not found or unreadable: {path}\n{result.stderr}"
65
- return result.stdout
66
- except subprocess.TimeoutExpired:
67
- return "ERROR: Read timed out."
68
  except FileNotFoundError:
69
- return "ERROR: Docker not available."
 
 
70
 
71
  def write_file(self, path: str, content: str) -> str:
72
- """Write content to path inside the container. Return confirmation or error."""
 
 
73
  try:
74
- # Ensure parent directory exists
75
- parent_dir = os.path.dirname(path)
76
- if parent_dir:
77
- subprocess.run(
78
- ["docker", "exec", self.container_id, "mkdir", "-p", parent_dir],
79
- capture_output=True,
80
- timeout=5,
81
- )
82
-
83
- # Write file using bash heredoc
84
- escaped_content = content.replace("'", "'\\''")
85
- result = subprocess.run(
86
- ["docker", "exec", self.container_id, "bash", "-c",
87
- f"cat > {path} << 'SWEBENCH_EOF'\n{content}\nSWEBENCH_EOF"],
88
- capture_output=True,
89
- text=True,
90
- timeout=5,
91
- )
92
- if result.returncode != 0:
93
- return f"ERROR: Could not write to {path}\n{result.stderr}"
94
  return f"OK: Written to {path}"
95
- except subprocess.TimeoutExpired:
96
- return "ERROR: Write timed out."
97
- except FileNotFoundError:
98
- return "ERROR: Docker not available."
99
 
100
  def run_pytest(self) -> dict:
101
- """
102
- Run pytest in container.
103
- Return: {"passed": int, "failed": int, "ratio": float, "output": str}
104
- """
105
  try:
106
  result = subprocess.run(
107
- ["docker", "exec", "-w", "/home/user2",
108
- self.container_id, "python", "-m", "pytest",
109
- "tests/", "--tb=short", "-q"],
110
  capture_output=True,
111
  text=True,
112
  timeout=30,
 
113
  )
114
  output = result.stdout + result.stderr
115
-
116
- # Parse pytest output
117
- passed_match = re.search(r"(\d+) passed", output)
118
- failed_match = re.search(r"(\d+) failed", output)
119
- error_match = re.search(r"(\d+) error", output)
120
-
121
- passed = int(passed_match.group(1)) if passed_match else 0
122
- failed = int(failed_match.group(1)) if failed_match else 0
123
- errors = int(error_match.group(1)) if error_match else 0
124
- total = passed + failed + errors
125
- ratio = passed / total if total > 0 else 0.0
126
-
127
  return {
128
  "passed": passed,
129
  "failed": failed + errors,
130
- "ratio": ratio,
131
  "output": output,
132
  }
133
  except subprocess.TimeoutExpired:
134
  return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
135
- except FileNotFoundError:
136
- return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: Docker not available."}
137
 
138
  def curl_server(self) -> dict:
139
- """
140
- curl localhost:8080 inside container.
141
- Return: {"status_code": int, "success": bool}
142
- """
143
  try:
144
- result = subprocess.run(
145
- ["docker", "exec", self.container_id, "curl", "-s", "-o", "/dev/null",
146
- "-w", "%{http_code}", "http://localhost:8080"],
147
- capture_output=True,
148
- text=True,
149
- timeout=5,
150
  )
151
- status_code = int(result.stdout.strip()) if result.stdout.strip().isdigit() else 0
152
- return {"status_code": status_code, "success": status_code == 200}
153
- except (subprocess.TimeoutExpired, ValueError):
154
- return {"status_code": 0, "success": False}
155
- except FileNotFoundError:
156
  return {"status_code": 0, "success": False}
157
 
158
  def write_reply(self, recipient: str, content: str) -> str:
159
- """
160
- Append reply to output/reply.txt.
161
- Format: [RECIPIENT]: content
162
- Track in self.reply_log for diversity scoring.
163
- """
164
- recipient_upper = recipient.upper()
165
- formatted = f"[{recipient_upper}]: {content}\n"
166
-
167
  try:
168
- # Ensure output directory exists
169
- subprocess.run(
170
- ["docker", "exec", self.container_id, "mkdir", "-p", self.output_dir],
171
- capture_output=True,
172
- timeout=5,
173
- )
174
-
175
- # Append to reply.txt
176
- result = subprocess.run(
177
- ["docker", "exec", self.container_id, "bash", "-c",
178
- f"echo '{formatted.rstrip()}' >> {self.output_dir}/reply.txt"],
179
- capture_output=True,
180
- text=True,
181
- timeout=5,
182
- )
183
- if result.returncode != 0:
184
- return f"ERROR: Could not write reply\n{result.stderr}"
185
-
186
  self.reply_log.append(formatted)
187
- return f"OK: Reply sent to {recipient_upper}"
188
- except subprocess.TimeoutExpired:
189
- return "ERROR: Reply write timed out."
190
- except FileNotFoundError:
191
- return "ERROR: Docker not available."
192
 
193
  # ------------------------------------------------------------------
194
- # Task setup / reset
195
  # ------------------------------------------------------------------
196
 
197
- def setup_task(self, task_id: int) -> str:
198
- """
199
- Reset container to broken state for given task.
200
-
201
- Task 1: pip uninstall flask -y (wheel stays cached)
202
- Task 2: inject syntax error into app.py
203
- Task 3: inject off-by-one bug into sort function
204
- Task 4: start zombie process on port 8080
205
- Task 5: inject 3 bugs across 2 files + start zombie process
206
-
207
- Also copies the correct message files for the task and
208
- clears output/reply.txt.
209
- """
210
- task = TASKS[task_id]
211
-
212
- # Clear previous state
213
- self.reply_log = []
214
- commands = [
215
- # Create directory structure
216
- "mkdir -p /home/user2/tests /home/user2/logs /home/user2/messages /home/user2/output",
217
- # Clear output
218
- "rm -f /home/user2/output/reply.txt",
219
- # Kill any running servers on port 8080
220
- "pkill -f 'python.*app.py' 2>/dev/null || true",
221
- "fuser -k 8080/tcp 2>/dev/null || true",
222
- ]
223
-
224
- # Write the broken app code
225
- commands.append(
226
- f"cat > /home/user2/app.py << 'SWEBENCH_EOF'\n{task.broken_app_code}\nSWEBENCH_EOF"
227
- )
228
-
229
- # Write second broken file for Task 5
230
- if task.broken_app_code_2:
231
- commands.append(
232
- f"cat > /home/user2/utils.py << 'SWEBENCH_EOF'\n{task.broken_app_code_2}\nSWEBENCH_EOF"
233
- )
234
-
235
- # Write test code
236
- commands.append(
237
- f"cat > /home/user2/tests/test_app.py << 'SWEBENCH_EOF'\n{task.test_code}\nSWEBENCH_EOF"
238
- )
239
 
240
- # Write message files
241
- if task.slack_message:
242
- commands.append(
243
- f"echo '{task.slack_message}' > /home/user2/messages/slack.txt"
244
- )
245
- else:
246
- commands.append("echo '' > /home/user2/messages/slack.txt")
247
 
248
- if task.email_message:
249
- commands.append(
250
- f"echo '{task.email_message}' > /home/user2/messages/email.txt"
251
- )
252
- else:
253
- commands.append("echo '' > /home/user2/messages/email.txt")
254
 
255
- if task.hr_message:
256
- commands.append(
257
- f"echo '{task.hr_message}' > /home/user2/messages/hr.txt"
258
- )
259
- else:
260
- commands.append("echo '' > /home/user2/messages/hr.txt")
261
 
262
- # Generate error.log
263
- commands.append(
264
- f"echo 'Task {task_id}: {task.description}' > /home/user2/logs/error.log"
265
- )
266
 
267
- # Task-specific breakage
268
- if task_id == 1:
269
- # Uninstall flask (wheel remains cached for reinstall)
270
- commands.append("pip uninstall flask -y 2>/dev/null || true")
271
- elif task_id == 4:
272
- # Start zombie process blocking port 8080
273
- commands.append(
274
- "python -c \"import socket; s=socket.socket(); "
275
- "s.bind(('0.0.0.0', 8080)); s.listen(1); "
276
- "import time; time.sleep(9999)\" &"
277
- )
278
- elif task_id == 5:
279
- # Start zombie process blocking port 8080
280
- commands.append(
281
- "python -c \"import socket; s=socket.socket(); "
282
- "s.bind(('0.0.0.0', 8080)); s.listen(1); "
283
- "import time; time.sleep(9999)\" &"
284
  )
285
 
286
- # Execute all setup commands
287
- full_command = " && ".join(commands)
288
  try:
289
- result = subprocess.run(
290
- ["docker", "exec", self.container_id, "bash", "-c", full_command],
291
- capture_output=True,
292
- text=True,
293
- timeout=30,
 
294
  )
295
- return f"Task {task_id} setup complete. Return code: {result.returncode}"
296
- except subprocess.TimeoutExpired:
297
- return f"ERROR: Task {task_id} setup timed out."
298
- except FileNotFoundError:
299
- return "ERROR: Docker not available."
300
-
301
- def get_initial_observation(self, task_id: int) -> str:
302
- """
303
- Read and return the initial observation for a task:
304
- contents of error.log + messages/slack.txt + messages/email.txt
305
- """
306
- task = TASKS[task_id]
307
- parts = []
308
-
309
- # Error log
310
- error_log = self.read_file("/home/user2/logs/error.log")
311
- parts.append(f"=== ERROR LOG ===\n{error_log}")
312
-
313
- # Slack message
314
- if task.slack_message:
315
- slack = self.read_file("/home/user2/messages/slack.txt")
316
- parts.append(f"=== SLACK MESSAGE (from Manager) ===\n{slack}")
317
-
318
- # Email message
319
- if task.email_message:
320
- email = self.read_file("/home/user2/messages/email.txt")
321
- parts.append(f"=== EMAIL (from Client) ===\n{email}")
322
-
323
- # HR message (Task 5 only)
324
- if task.hr_message:
325
- hr = self.read_file("/home/user2/messages/hr.txt")
326
- parts.append(f"=== HR MESSAGE ===\n{hr}")
327
-
328
- parts.append(f"\n--- Task: {task.name} ---")
329
- parts.append(f"Description: {task.description}")
330
- parts.append(f"Max actions: {task.max_actions}")
331
-
332
- return "\n\n".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ simulator.py — Dockerless simulator for SWEbench-IN.
3
+
4
+ Replaces all Docker container operations with:
5
+ - A per-episode temp directory (virtual filesystem)
6
+ - Local subprocess execution (sandboxed to work_dir)
7
+ - pytest run locally in a child process (subprocess)
8
+ - Local Flask server started as a child process
9
+ - requests to localhost for server health checks
10
  """
11
 
12
+ import ast
 
 
13
  import os
14
+ import re
15
+ import sys
16
+ import time
17
+ import shutil
18
+ import socket
19
+ import tempfile
20
+ import subprocess
21
+ import threading
22
+
23
+ import requests as http_requests
24
+
25
  from tasks import TASKS
26
 
27
 
28
class Simulator:
    """Dockerless executor for the SWEbench-IN environment.

    Every episode gets its own temporary directory (``work_dir``) that plays
    the role of the former container's ``/home/user2`` tree: it holds
    ``app.py`` (and optionally ``utils.py``), ``tests/``, ``logs/``,
    ``messages/`` and ``output/``. Shell commands, pytest and the Flask
    server run as local child processes with ``work_dir`` as their working
    directory.

    NOTE(security): ``run_bash`` passes agent-supplied text to the host
    shell. Only the working directory is set; the substring blocklist is
    not a sandbox.
    """

    # Path prefix used by the former Docker image; still accepted in paths.
    _LEGACY_ROOT = "/home/user2"

    # Commands interpreted as "free the port". Word boundaries keep
    # unrelated words such as "skill" from matching.
    _KILL_RE = re.compile(r"\b(?:pkill|killall|kill)\b|\bfuser\s+-k")

    # "python app.py"-style launches only. The script must be the first
    # argument, so "python -m pytest tests/test_app.py" is NOT a launch.
    _SERVER_RE = re.compile(r"\bpython[\d.]*\s+(?:-u\s+)?(?:\S*/)?app\.py\b")

    def __init__(self, container_id: str = None):
        """Create an idle simulator; call :meth:`setup_task` before use.

        Args:
            container_id: Ignored. Kept so callers written for the Docker
                based version keep working.
        """
        self.work_dir: str | None = None
        self.output_dir: str | None = None
        self.reply_log: list[str] = []
        self._server_proc: subprocess.Popen | None = None
        self._zombie_sock: socket.socket | None = None
        self._server_port: int = 8080

    # ------------------------------------------------------------------
    # Task setup / reset
    # ------------------------------------------------------------------

    def setup_task(self, task_id: int) -> str:
        """Reset to a fresh temp directory populated with the broken task files.

        Args:
            task_id: Key into ``TASKS``.

        Returns:
            A confirmation string naming the new working directory.
        """
        # Look the task up first so an unknown id does not tear down the
        # current episode or leave an empty temp directory behind.
        task = TASKS[task_id]

        self.close()

        self.work_dir = tempfile.mkdtemp(prefix=f"swebench_task{task_id}_")
        self.output_dir = os.path.join(self.work_dir, "output")
        self.reply_log = []
        self._make_dirs()

        # Broken source files
        self._write(os.path.join(self.work_dir, "app.py"), task.broken_app_code)
        if task.broken_app_code_2:
            self._write(os.path.join(self.work_dir, "utils.py"), task.broken_app_code_2)

        # Tests
        self._write(
            os.path.join(self.work_dir, "tests", "test_app.py"),
            task.test_code,
        )

        # Stakeholder messages (an empty file is written when a message is absent)
        for fname, content in [
            ("slack.txt", task.slack_message),
            ("email.txt", task.email_message),
            ("hr.txt", task.hr_message),
        ]:
            self._write(
                os.path.join(self.work_dir, "messages", fname),
                content or "",
            )

        # Error log
        self._write(
            os.path.join(self.work_dir, "logs", "error.log"),
            f"Task {task_id}: {task.description}",
        )

        # Task-specific breakage
        if task_id in (4, 5):
            # Simulate zombie process blocking port 8080
            self._start_zombie()

        return f"Task {task_id} ready in {self.work_dir}"

    def close(self) -> None:
        """Stop child processes, release the port and delete the temp directory.

        Safe to call repeatedly and on a simulator that was never set up.
        """
        self._kill_server()
        self._kill_zombie()
        if self.work_dir and os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir, ignore_errors=True)
        self.work_dir = None
        self.output_dir = None

    def get_initial_observation(self, task_id: int) -> str:
        """Build the first observation: error log, messages and task summary."""
        task = TASKS[task_id]
        parts = []

        if self.work_dir:
            log_path = os.path.join(self.work_dir, "logs", "error.log")
            if os.path.exists(log_path):
                with open(log_path, encoding="utf-8") as f:
                    parts.append(f"=== ERROR LOG ===\n{f.read()}")

        if task.slack_message:
            parts.append(f"=== SLACK MESSAGE (from Manager) ===\n{task.slack_message}")
        if task.email_message:
            parts.append(f"=== EMAIL (from Client) ===\n{task.email_message}")
        if task.hr_message:
            parts.append(f"=== HR MESSAGE ===\n{task.hr_message}")

        parts.append(f"\n--- Task: {task.name} ---")
        parts.append(f"Description: {task.description}")
        parts.append(f"Max actions: {task.max_actions}")

        return "\n\n".join(parts)

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    def run_bash(self, command: str) -> str:
        """Execute a shell command inside ``work_dir`` (no Docker).

        A few command families are simulated instead of executed:
        ``pip install flask`` is a no-op, kill-style commands release the
        simulated zombie on the server port, and launching ``app.py`` (or
        ``flask run``) starts the managed server child process.

        Returns:
            Combined stdout/stderr, ``"(no output)"``, or a string starting
            with ``BLOCKED:`` / ``ERROR:``.
        """
        for pattern in ("sudo", "rm -rf /", "chmod 777 /"):
            if pattern in command:
                return f"BLOCKED: '{pattern}' is forbidden."

        # pip install flask — simulate as no-op (flask is available on HF Spaces)
        if re.search(r"pip\s+install\s+flask", command):
            return "Requirement already satisfied: flask"

        # Kill zombie process (tasks 4 & 5)
        if self._KILL_RE.search(command):
            self._kill_zombie()
            return "OK: Port 8080 cleared."

        # Start Flask server
        if self._SERVER_RE.search(command) or "flask run" in command:
            return self._start_server()

        if not self.work_dir:
            # Without this guard the command would run in the server's own cwd.
            return "ERROR: No active task. Call setup_task() first."

        # General command — run locally in work_dir
        try:
            result = subprocess.run(
                command,
                shell=True,
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=10,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
        except subprocess.TimeoutExpired:
            return "ERROR: Command timed out after 10 seconds."
        except Exception as e:  # agent-facing boundary: report, never raise
            return f"ERROR: {e}"
        output = (result.stdout + result.stderr).strip()
        return output or "(no output)"

    def read_file(self, path: str) -> str:
        """Read a file from ``work_dir``.

        Accepts ``/home/user2/...`` paths, paths relative to ``work_dir`` and
        absolute paths that already point inside ``work_dir``. Returns the
        file contents, or a string starting with ``ERROR:``.
        """
        try:
            with open(self._resolve(path), encoding="utf-8") as f:
                return f.read()
        except FileNotFoundError:
            return f"ERROR: File not found: {path}"
        except Exception as e:  # agent-facing boundary: report, never raise
            return f"ERROR: {e}"

    def write_file(self, path: str, content: str) -> str:
        """Write *content* to a file in ``work_dir``, creating parent directories.

        Returns ``"OK: Written to <path>"`` or a string starting with ``ERROR:``.
        """
        try:
            self._write(self._resolve(path), content)
            return f"OK: Written to {path}"
        except Exception as e:  # agent-facing boundary: report, never raise
            return f"ERROR: {e}"

    def run_pytest(self) -> dict:
        """Run pytest on ``tests/`` in ``work_dir`` and parse the summary line.

        Returns:
            ``{"passed": int, "failed": int, "ratio": float, "output": str}``
            where ``failed`` includes collection/setup errors and ``ratio`` is
            ``passed / (passed + failed)`` (0.0 when nothing ran).
        """
        if not self.work_dir:
            # Never fall back to running pytest in the server's own cwd.
            return {"passed": 0, "failed": 0, "ratio": 0.0,
                    "output": "ERROR: No active task. Call setup_task() first."}
        try:
            result = subprocess.run(
                [sys.executable, "-m", "pytest", "tests/", "--tb=short", "-q"],
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=30,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
        except subprocess.TimeoutExpired:
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
        except Exception as e:  # measurement must never crash the episode
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": f"ERROR: {e}"}

        output = result.stdout + result.stderr
        passed = self._summary_count(output, "passed")
        failed = self._summary_count(output, "failed")
        errors = self._summary_count(output, "error")
        total = passed + failed + errors
        return {
            "passed": passed,
            "failed": failed + errors,
            "ratio": passed / total if total > 0 else 0.0,
            "output": output,
        }

    def curl_server(self) -> dict:
        """Check if the Flask server is up at localhost:8080.

        Returns:
            ``{"status_code": int, "success": bool}``; status 0 and
            ``success=False`` when the request could not be made.
        """
        try:
            r = http_requests.get(
                f"http://localhost:{self._server_port}", timeout=3
            )
            return {"status_code": r.status_code, "success": r.status_code == 200}
        except Exception:  # any failure simply means "server not reachable"
            return {"status_code": 0, "success": False}

    def write_reply(self, recipient: str, content: str) -> str:
        """Append ``[RECIPIENT]: content`` to ``output/reply.txt``.

        The formatted line is also recorded in ``self.reply_log``.
        Returns ``"OK: Reply sent to <RECIPIENT>"`` or an ``ERROR:`` string.
        """
        if not self.output_dir:
            return "ERROR: No active task. Call setup_task() first."
        formatted = f"[{recipient.upper()}]: {content}\n"
        reply_path = os.path.join(self.output_dir, "reply.txt")
        try:
            os.makedirs(self.output_dir, exist_ok=True)
            with open(reply_path, "a", encoding="utf-8") as f:
                f.write(formatted)
        except Exception as e:  # agent-facing boundary: report, never raise
            return f"ERROR: {e}"
        self.reply_log.append(formatted)
        return f"OK: Reply sent to {recipient.upper()}"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _make_dirs(self):
        """Create the standard sub-directories of ``work_dir``."""
        for sub in ("tests", "logs", "messages", "output"):
            os.makedirs(os.path.join(self.work_dir, sub), exist_ok=True)

    @staticmethod
    def _write(path: str, content: str):
        """Write *content* to *path* (UTF-8), creating parent directories."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

    @staticmethod
    def _summary_count(output: str, label: str) -> int:
        """Return N from the first "<N> <label>" in pytest output, else 0."""
        match = re.search(rf"(\d+) {label}", output)
        return int(match.group(1)) if match else 0

    @staticmethod
    def _is_within(root: str, candidate: str) -> bool:
        """Return True when *candidate* is *root* or lies below it."""
        try:
            return os.path.commonpath([root, candidate]) == root
        except ValueError:
            # Different drives or mixed absolute/relative paths.
            return False

    def _resolve(self, path: str) -> str:
        """Translate an agent-supplied path to a real path inside ``work_dir``.

        ``/home/user2[/...]`` is mapped onto ``work_dir``; absolute paths that
        already point inside ``work_dir`` are kept; anything else is taken
        relative to ``work_dir``.

        Raises:
            ValueError: if no task is active or the path would leave
                ``work_dir`` (for example via ``..`` or a symlink).
        """
        if not self.work_dir:
            raise ValueError("No active task. Call setup_task() first.")
        root = os.path.realpath(self.work_dir)

        legacy = self._LEGACY_ROOT
        if path == legacy or path.startswith(legacy + "/"):
            candidate = os.path.join(root, path[len(legacy):].lstrip("/"))
        elif os.path.isabs(path) and self._is_within(root, os.path.realpath(path)):
            candidate = path
        else:
            candidate = os.path.join(root, path.lstrip("/"))

        resolved = os.path.realpath(candidate)
        if not self._is_within(root, resolved):
            raise ValueError(f"Path escapes the task directory: {path}")
        return resolved

    def _start_server(self) -> str:
        """Launch ``app.py`` as a child process and wait briefly for it to answer.

        Returns an ``OK:`` message when the process is running, otherwise an
        ``ERROR:`` message describing why it could not be started.
        """
        self._kill_server()

        if not self.work_dir:
            return "ERROR: No active task. Call setup_task() first."

        app_path = os.path.join(self.work_dir, "app.py")
        if not os.path.exists(app_path):
            return "ERROR: app.py not found."

        # Syntax check before launching
        try:
            with open(app_path, encoding="utf-8") as f:
                ast.parse(f.read())
        except (SyntaxError, ValueError) as e:
            # ValueError: e.g. null bytes in the source on older Pythons.
            return f"ERROR: Syntax error in app.py — {e}"

        # Check if zombie is blocking the port
        if self._port_in_use(self._server_port):
            return (
                f"ERROR: Port {self._server_port} is already in use. "
                "Kill the blocking process first."
            )

        try:
            self._server_proc = subprocess.Popen(
                [sys.executable, "app.py"],
                cwd=self.work_dir,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
        except Exception as e:
            return f"ERROR: Could not start server — {e}"

        # Wait up to 4 s for server to accept connections
        for _ in range(8):
            time.sleep(0.5)
            if self._server_proc.poll() is not None:
                self._server_proc = None  # already exited; nothing to stop later
                return "ERROR: Server crashed on startup."
            if not self._port_in_use(self._server_port):
                continue
            if self.curl_server()["success"]:
                return "OK: Server started on port 8080."

        # Server started but hasn't responded yet — return optimistic message
        if self._server_proc.poll() is None:
            return "OK: Server process started (may need a moment to be ready)."
        self._server_proc = None
        return "ERROR: Server failed to start."

    def _start_zombie(self):
        """Block the server port with a listening socket to simulate a zombie."""
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("0.0.0.0", self._server_port))
            sock.listen(1)
        except OSError:
            # Port already in use — fine, but do not leak the unused socket.
            if sock is not None:
                sock.close()
            self._zombie_sock = None
            return
        self._zombie_sock = sock

    def _kill_zombie(self):
        """Close the simulated zombie socket, if any, freeing the port."""
        sock, self._zombie_sock = self._zombie_sock, None
        if sock is None:
            return
        try:
            sock.close()
        except OSError:
            pass
        time.sleep(0.3)  # Brief pause for OS to release the port

    def _kill_server(self):
        """Stop the managed server child process and reap it."""
        proc, self._server_proc = self._server_proc, None
        if proc is None:
            return
        try:
            proc.terminate()
            proc.wait(timeout=3)
        except subprocess.TimeoutExpired:
            # Did not exit on SIGTERM: force-kill, then reap so no defunct
            # child is left behind.
            try:
                proc.kill()
                proc.wait(timeout=3)
            except (OSError, subprocess.TimeoutExpired):
                pass
        except OSError:
            pass

    @staticmethod
    def _port_in_use(port: int) -> bool:
        """Return True when something accepts TCP connections on localhost:*port*."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(1.0)
            return s.connect_ex(("localhost", port)) == 0