Spaces:
Sleeping
Sleeping
feat: Complete Dockerless migration - update environment, rewards, app, and server wrapper
Browse files- app.py +7 -2
- environment.py +131 -191
- rewards.py +46 -82
- server/swebench_in_environment.py +18 -40
- simulator.py +257 -260
app.py
CHANGED
|
@@ -26,8 +26,11 @@ def run_episode(task_id: int, action_type: str, action_args: str):
|
|
| 26 |
f" {k}: {v:.3f}" for k, v in breakdown.items()
|
| 27 |
)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
return (
|
| 30 |
-
f"Observation:\n{
|
| 31 |
f"Reward: {reward:.3f}\n"
|
| 32 |
f"Done: {done}\n"
|
| 33 |
f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
|
|
@@ -40,7 +43,9 @@ def run_episode(task_id: int, action_type: str, action_args: str):
|
|
| 40 |
def reset_env(task_id: int):
|
| 41 |
"""Reset environment to a specific task."""
|
| 42 |
obs = env.reset(task_id=int(task_id))
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
with gr.Blocks(title="SWEbench-IN") as demo:
|
|
|
|
| 26 |
f" {k}: {v:.3f}" for k, v in breakdown.items()
|
| 27 |
)
|
| 28 |
|
| 29 |
+
# Extract text from dict observation
|
| 30 |
+
obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
|
| 31 |
+
|
| 32 |
return (
|
| 33 |
+
f"Observation:\n{obs_text}\n\n"
|
| 34 |
f"Reward: {reward:.3f}\n"
|
| 35 |
f"Done: {done}\n"
|
| 36 |
f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
|
|
|
|
| 43 |
def reset_env(task_id: int):
|
| 44 |
"""Reset environment to a specific task."""
|
| 45 |
obs = env.reset(task_id=int(task_id))
|
| 46 |
+
# Extract text from dict observation
|
| 47 |
+
obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
|
| 48 |
+
return f"Episode reset. Task {int(task_id)} loaded.\n\nInitial observation:\n{obs_text}"
|
| 49 |
|
| 50 |
|
| 51 |
with gr.Blocks(title="SWEbench-IN") as demo:
|
environment.py
CHANGED
|
@@ -1,23 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
-
environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
Simulator and computes rewards after each step.
|
| 7 |
"""
|
| 8 |
|
| 9 |
-
import
|
| 10 |
import random
|
| 11 |
from dataclasses import dataclass, field
|
| 12 |
|
| 13 |
from tasks import TASKS, Task
|
| 14 |
from simulator import Simulator
|
| 15 |
-
from rewards import compute_reward
|
| 16 |
|
| 17 |
|
| 18 |
@dataclass
|
| 19 |
class State:
|
| 20 |
-
"""Current environment state, returned by state()."""
|
| 21 |
task_id: int = 0
|
| 22 |
step_count: int = 0
|
| 23 |
tests_passing_ratio: float = 0.0
|
|
@@ -29,119 +27,52 @@ class State:
|
|
| 29 |
|
| 30 |
class SWEbenchINEnvironment:
|
| 31 |
"""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
stakeholder communication simultaneously.
|
| 35 |
-
|
| 36 |
-
Gym-style interface: reset() -> observation, step() -> (obs, reward, done, info)
|
| 37 |
"""
|
| 38 |
|
| 39 |
-
def __init__(self
|
| 40 |
-
|
| 41 |
-
Initialize the environment.
|
| 42 |
-
|
| 43 |
-
Args:
|
| 44 |
-
container_id: Docker container ID. If None, attempts to start
|
| 45 |
-
a new container from the swebench-in image.
|
| 46 |
-
"""
|
| 47 |
-
self.container_id = container_id or self._start_container()
|
| 48 |
-
self.simulator = Simulator(self.container_id)
|
| 49 |
self.max_steps = 15
|
| 50 |
self._state = State()
|
| 51 |
self._current_task: Task = None
|
| 52 |
self._done = False
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
"-p", "8080:8080", "swebench-in"],
|
| 60 |
-
capture_output=True,
|
| 61 |
-
text=True,
|
| 62 |
-
timeout=30,
|
| 63 |
-
)
|
| 64 |
-
container_id = run_result.stdout.strip()
|
| 65 |
-
if run_result.returncode == 0 and container_id:
|
| 66 |
-
return container_id
|
| 67 |
-
|
| 68 |
-
# If container already exists (or run failed), try starting it.
|
| 69 |
-
start_result = subprocess.run(
|
| 70 |
-
["docker", "start", "swebench-in-env"],
|
| 71 |
-
capture_output=True,
|
| 72 |
-
text=True,
|
| 73 |
-
timeout=10,
|
| 74 |
-
)
|
| 75 |
-
if start_result.returncode == 0:
|
| 76 |
-
return "swebench-in-env"
|
| 77 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 78 |
-
pass
|
| 79 |
-
|
| 80 |
-
# Fallback: return a placeholder for demo/testing without Docker.
|
| 81 |
-
return "swebench-in-env"
|
| 82 |
-
|
| 83 |
-
def reset(self, task_id: int = None) -> str:
|
| 84 |
-
"""
|
| 85 |
-
Reset the environment to a new episode.
|
| 86 |
-
|
| 87 |
-
Args:
|
| 88 |
-
task_id: Task to load (1-5). If None, sample from current
|
| 89 |
-
curriculum tier.
|
| 90 |
-
|
| 91 |
-
Returns:
|
| 92 |
-
Initial observation as text: contents of error.log +
|
| 93 |
-
messages/slack.txt + messages/email.txt
|
| 94 |
-
"""
|
| 95 |
-
# Sample task if not specified
|
| 96 |
if task_id is None:
|
| 97 |
task_id = random.choice(list(TASKS.keys()))
|
| 98 |
|
| 99 |
if task_id not in TASKS:
|
| 100 |
-
raise ValueError(f"Invalid task_id: {task_id}. Must be 1
|
| 101 |
|
| 102 |
self._current_task = TASKS[task_id]
|
| 103 |
self._done = False
|
|
|
|
| 104 |
|
| 105 |
-
# Reset state
|
| 106 |
-
self._state = State(
|
| 107 |
-
task_id=task_id,
|
| 108 |
-
step_count=0,
|
| 109 |
-
tests_passing_ratio=0.0,
|
| 110 |
-
server_running=False,
|
| 111 |
-
files_correct=False,
|
| 112 |
-
action_history=[],
|
| 113 |
-
reply_texts=[],
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
# Setup the task in the container
|
| 117 |
self.simulator.setup_task(task_id)
|
| 118 |
-
|
| 119 |
-
# Update max_steps from task definition
|
| 120 |
self.max_steps = self._current_task.max_actions
|
| 121 |
|
| 122 |
-
|
| 123 |
-
return self.
|
| 124 |
|
| 125 |
def step(self, action: dict) -> tuple:
|
| 126 |
-
"""
|
| 127 |
-
Take one step in the environment.
|
| 128 |
-
|
| 129 |
-
Args:
|
| 130 |
-
action: dict with "type" and "args" keys.
|
| 131 |
-
type: one of the action names from openenv.yaml
|
| 132 |
-
args: string arguments for the action
|
| 133 |
-
|
| 134 |
-
Returns:
|
| 135 |
-
Tuple of (observation: str, reward: float, done: bool, info: dict)
|
| 136 |
-
"""
|
| 137 |
if self._done:
|
| 138 |
-
return (
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
action_type = action.get("type", "")
|
| 142 |
action_args = action.get("args", "")
|
|
|
|
| 143 |
|
| 144 |
-
#
|
| 145 |
state_before = State(
|
| 146 |
task_id=self._state.task_id,
|
| 147 |
step_count=self._state.step_count,
|
|
@@ -152,151 +83,160 @@ class SWEbenchINEnvironment:
|
|
| 152 |
reply_texts=list(self._state.reply_texts),
|
| 153 |
)
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
|
| 158 |
-
# Update
|
| 159 |
self._state.action_history.append(f"{action_type}: {action_args}")
|
| 160 |
self._state.step_count += 1
|
|
|
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
self._update_state_measurements()
|
| 164 |
-
|
| 165 |
-
# Check done condition
|
| 166 |
if action_type == "close_case" or self._state.step_count >= self.max_steps:
|
| 167 |
self._done = True
|
| 168 |
|
| 169 |
# Compute reward
|
| 170 |
-
|
| 171 |
-
container_id=
|
| 172 |
action_history=self._state.action_history,
|
| 173 |
state_before=state_before,
|
| 174 |
state_after=self._state,
|
| 175 |
-
output_dir=
|
| 176 |
task_id=self._state.task_id,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
)
|
| 178 |
|
| 179 |
info = {
|
| 180 |
"reward_breakdown": {
|
| 181 |
-
"technical":
|
| 182 |
-
"boundaries":
|
| 183 |
-
"communication":
|
| 184 |
-
"leave_protection":
|
| 185 |
-
"shaping":
|
| 186 |
},
|
| 187 |
"step_count": self._state.step_count,
|
| 188 |
-
"max_steps":
|
| 189 |
-
"done_reason":
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
}
|
| 193 |
|
| 194 |
-
return (
|
| 195 |
|
| 196 |
def state(self) -> State:
|
| 197 |
-
"""
|
| 198 |
-
Return current State dataclass.
|
| 199 |
-
|
| 200 |
-
Fields:
|
| 201 |
-
task_id: int
|
| 202 |
-
step_count: int
|
| 203 |
-
tests_passing_ratio: float
|
| 204 |
-
server_running: bool
|
| 205 |
-
files_correct: bool
|
| 206 |
-
action_history: list[str]
|
| 207 |
-
reply_texts: list[str]
|
| 208 |
-
"""
|
| 209 |
return self._state
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
# ------------------------------------------------------------------
|
| 212 |
-
# Internal
|
| 213 |
# ------------------------------------------------------------------
|
| 214 |
|
| 215 |
-
# Action dispatch table
|
| 216 |
ACTION_HANDLERS = {
|
| 217 |
-
"run_command",
|
| 218 |
-
"
|
| 219 |
-
"write_file",
|
| 220 |
-
"run_tests",
|
| 221 |
-
"check_server",
|
| 222 |
-
"reply_slack",
|
| 223 |
-
"reply_email",
|
| 224 |
-
"reply_hr",
|
| 225 |
-
"close_case",
|
| 226 |
}
|
| 227 |
|
| 228 |
-
def
|
| 229 |
-
"""Dispatch an action to the appropriate simulator method."""
|
| 230 |
if action_type not in self.ACTION_HANDLERS:
|
| 231 |
-
return
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
|
| 234 |
if action_type == "run_command":
|
| 235 |
return self.simulator.run_bash(action_args)
|
| 236 |
|
| 237 |
-
|
| 238 |
return self.simulator.read_file(action_args)
|
| 239 |
|
| 240 |
-
|
| 241 |
-
#
|
|
|
|
|
|
|
| 242 |
if "|" in action_args:
|
| 243 |
-
path,
|
| 244 |
-
return self.simulator.write_file(path.strip(),
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
| 263 |
result = self.simulator.write_reply("SLACK", action_args)
|
| 264 |
self._state.reply_texts.append(f"[SLACK]: {action_args}")
|
| 265 |
return result
|
| 266 |
|
| 267 |
-
|
| 268 |
result = self.simulator.write_reply("EMAIL", action_args)
|
| 269 |
self._state.reply_texts.append(f"[EMAIL]: {action_args}")
|
| 270 |
return result
|
| 271 |
|
| 272 |
-
|
| 273 |
result = self.simulator.write_reply("HR", action_args)
|
| 274 |
self._state.reply_texts.append(f"[HR]: {action_args}")
|
| 275 |
return result
|
| 276 |
|
| 277 |
-
|
| 278 |
return "Case closed. Episode ending."
|
| 279 |
|
| 280 |
-
return "ERROR:
|
| 281 |
-
|
| 282 |
-
def
|
| 283 |
-
"""
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
self._state.files_correct = result.returncode == 0
|
| 301 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 302 |
-
self._state.files_correct = False
|
|
|
|
| 1 |
"""
|
| 2 |
+
environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN (Dockerless).
|
| 3 |
|
| 4 |
+
All Docker container management removed. Each episode runs in a fresh
|
| 5 |
+
temp directory managed by Simulator.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import json
|
| 9 |
import random
|
| 10 |
from dataclasses import dataclass, field
|
| 11 |
|
| 12 |
from tasks import TASKS, Task
|
| 13 |
from simulator import Simulator
|
| 14 |
+
from rewards import compute_reward, RewardBreakdown
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass
|
| 18 |
class State:
|
|
|
|
| 19 |
task_id: int = 0
|
| 20 |
step_count: int = 0
|
| 21 |
tests_passing_ratio: float = 0.0
|
|
|
|
| 27 |
|
| 28 |
class SWEbenchINEnvironment:
|
| 29 |
"""
|
| 30 |
+
Dockerless RL environment for SWEbench-IN.
|
| 31 |
+
Gym-style: reset() -> observation, step() -> (obs, reward, done, info)
|
|
|
|
|
|
|
|
|
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
def __init__(self):
|
| 35 |
+
self.simulator = Simulator()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
self.max_steps = 15
|
| 37 |
self._state = State()
|
| 38 |
self._current_task: Task = None
|
| 39 |
self._done = False
|
| 40 |
|
| 41 |
+
# ------------------------------------------------------------------
|
| 42 |
+
# Public API
|
| 43 |
+
# ------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
def reset(self, task_id: int = None) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if task_id is None:
|
| 47 |
task_id = random.choice(list(TASKS.keys()))
|
| 48 |
|
| 49 |
if task_id not in TASKS:
|
| 50 |
+
raise ValueError(f"Invalid task_id: {task_id}. Must be 1–5.")
|
| 51 |
|
| 52 |
self._current_task = TASKS[task_id]
|
| 53 |
self._done = False
|
| 54 |
+
self._state = State(task_id=task_id)
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
self.simulator.setup_task(task_id)
|
|
|
|
|
|
|
| 57 |
self.max_steps = self._current_task.max_actions
|
| 58 |
|
| 59 |
+
obs_text = self.simulator.get_initial_observation(task_id)
|
| 60 |
+
return self._make_obs(obs_text)
|
| 61 |
|
| 62 |
def step(self, action: dict) -> tuple:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if self._done:
|
| 64 |
+
return (
|
| 65 |
+
{"text": "Episode done. Call reset().", "step_count": self._state.step_count,
|
| 66 |
+
"max_steps": self.max_steps, "tests_passing_ratio": 0.0,
|
| 67 |
+
"server_running": False, "reward_breakdown": {}},
|
| 68 |
+
0.0, True, {"error": "episode_done"},
|
| 69 |
+
)
|
| 70 |
|
| 71 |
action_type = action.get("type", "")
|
| 72 |
action_args = action.get("args", "")
|
| 73 |
+
content = action.get("content", "") # for write_file
|
| 74 |
|
| 75 |
+
# Snapshot state before action
|
| 76 |
state_before = State(
|
| 77 |
task_id=self._state.task_id,
|
| 78 |
step_count=self._state.step_count,
|
|
|
|
| 83 |
reply_texts=list(self._state.reply_texts),
|
| 84 |
)
|
| 85 |
|
| 86 |
+
# Execute action
|
| 87 |
+
obs_text = self._dispatch(action_type, action_args, content)
|
| 88 |
|
| 89 |
+
# Update state
|
| 90 |
self._state.action_history.append(f"{action_type}: {action_args}")
|
| 91 |
self._state.step_count += 1
|
| 92 |
+
self._update_state()
|
| 93 |
|
| 94 |
+
# Check done
|
|
|
|
|
|
|
|
|
|
| 95 |
if action_type == "close_case" or self._state.step_count >= self.max_steps:
|
| 96 |
self._done = True
|
| 97 |
|
| 98 |
# Compute reward
|
| 99 |
+
breakdown = compute_reward(
|
| 100 |
+
container_id=None,
|
| 101 |
action_history=self._state.action_history,
|
| 102 |
state_before=state_before,
|
| 103 |
state_after=self._state,
|
| 104 |
+
output_dir=self.simulator.output_dir,
|
| 105 |
task_id=self._state.task_id,
|
| 106 |
+
work_dir=self.simulator.work_dir,
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Boost technical reward using live state (pytest ratio already updated)
|
| 110 |
+
adjusted_total = (
|
| 111 |
+
breakdown.technical
|
| 112 |
+
+ 0.5 * self._state.tests_passing_ratio # live pytest score
|
| 113 |
+
+ 0.8 * breakdown.boundaries
|
| 114 |
+
+ 0.5 * breakdown.communication
|
| 115 |
+
+ (0.6 * breakdown.leave_protection if self._state.task_id == 5 else 0.0)
|
| 116 |
+
+ 0.3 * breakdown.shaping
|
| 117 |
)
|
| 118 |
|
| 119 |
info = {
|
| 120 |
"reward_breakdown": {
|
| 121 |
+
"technical": breakdown.technical,
|
| 122 |
+
"boundaries": breakdown.boundaries,
|
| 123 |
+
"communication": breakdown.communication,
|
| 124 |
+
"leave_protection": breakdown.leave_protection,
|
| 125 |
+
"shaping": breakdown.shaping,
|
| 126 |
},
|
| 127 |
"step_count": self._state.step_count,
|
| 128 |
+
"max_steps": self.max_steps,
|
| 129 |
+
"done_reason": (
|
| 130 |
+
"close_case" if action_type == "close_case"
|
| 131 |
+
else "max_steps" if self._state.step_count >= self.max_steps
|
| 132 |
+
else None
|
| 133 |
+
),
|
| 134 |
}
|
| 135 |
|
| 136 |
+
return (self._make_obs(obs_text), adjusted_total, self._done, info)
|
| 137 |
|
| 138 |
def state(self) -> State:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return self._state
|
| 140 |
|
| 141 |
+
def grade(self) -> dict:
|
| 142 |
+
"""Summary grade for the completed episode."""
|
| 143 |
+
return {
|
| 144 |
+
"task_id": self._state.task_id,
|
| 145 |
+
"steps_taken": self._state.step_count,
|
| 146 |
+
"tests_passing_ratio": self._state.tests_passing_ratio,
|
| 147 |
+
"server_running": self._state.server_running,
|
| 148 |
+
"files_correct": self._state.files_correct,
|
| 149 |
+
"total_reward_approx": (
|
| 150 |
+
float(self._state.server_running)
|
| 151 |
+
+ self._state.tests_passing_ratio * 0.5
|
| 152 |
+
+ float(self._state.files_correct) * 0.3
|
| 153 |
+
),
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
# ------------------------------------------------------------------
|
| 157 |
+
# Internal
|
| 158 |
# ------------------------------------------------------------------
|
| 159 |
|
|
|
|
| 160 |
ACTION_HANDLERS = {
|
| 161 |
+
"run_command", "read_file", "write_file", "run_tests",
|
| 162 |
+
"check_server", "reply_slack", "reply_email", "reply_hr", "close_case",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
}
|
| 164 |
|
| 165 |
+
def _dispatch(self, action_type: str, action_args: str, content: str = "") -> str:
|
|
|
|
| 166 |
if action_type not in self.ACTION_HANDLERS:
|
| 167 |
+
return (
|
| 168 |
+
f"ERROR: Unknown action '{action_type}'. "
|
| 169 |
+
f"Valid: {sorted(self.ACTION_HANDLERS)}"
|
| 170 |
+
)
|
| 171 |
|
| 172 |
if action_type == "run_command":
|
| 173 |
return self.simulator.run_bash(action_args)
|
| 174 |
|
| 175 |
+
if action_type == "read_file":
|
| 176 |
return self.simulator.read_file(action_args)
|
| 177 |
|
| 178 |
+
if action_type == "write_file":
|
| 179 |
+
# Support both "path|content" and separate content field
|
| 180 |
+
if content:
|
| 181 |
+
return self.simulator.write_file(action_args, content)
|
| 182 |
if "|" in action_args:
|
| 183 |
+
path, file_content = action_args.split("|", 1)
|
| 184 |
+
return self.simulator.write_file(path.strip(), file_content)
|
| 185 |
+
return "ERROR: write_file needs 'path|content' or a content field."
|
| 186 |
+
|
| 187 |
+
if action_type == "run_tests":
|
| 188 |
+
r = self.simulator.run_pytest()
|
| 189 |
+
return (
|
| 190 |
+
f"Pytest Results:\n"
|
| 191 |
+
f" Passed: {r['passed']}\n"
|
| 192 |
+
f" Failed: {r['failed']}\n"
|
| 193 |
+
f" Ratio: {r['ratio']:.0%}\n\n"
|
| 194 |
+
f"Output:\n{r['output']}"
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
if action_type == "check_server":
|
| 198 |
+
r = self.simulator.curl_server()
|
| 199 |
+
return (
|
| 200 |
+
f"Server Check:\n"
|
| 201 |
+
f" Status Code: {r['status_code']}\n"
|
| 202 |
+
f" Success: {r['success']}"
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
if action_type == "reply_slack":
|
| 206 |
result = self.simulator.write_reply("SLACK", action_args)
|
| 207 |
self._state.reply_texts.append(f"[SLACK]: {action_args}")
|
| 208 |
return result
|
| 209 |
|
| 210 |
+
if action_type == "reply_email":
|
| 211 |
result = self.simulator.write_reply("EMAIL", action_args)
|
| 212 |
self._state.reply_texts.append(f"[EMAIL]: {action_args}")
|
| 213 |
return result
|
| 214 |
|
| 215 |
+
if action_type == "reply_hr":
|
| 216 |
result = self.simulator.write_reply("HR", action_args)
|
| 217 |
self._state.reply_texts.append(f"[HR]: {action_args}")
|
| 218 |
return result
|
| 219 |
|
| 220 |
+
if action_type == "close_case":
|
| 221 |
return "Case closed. Episode ending."
|
| 222 |
|
| 223 |
+
return "ERROR: Dispatch failed."
|
| 224 |
+
|
| 225 |
+
def _update_state(self):
|
| 226 |
+
"""Refresh state measurements from live environment."""
|
| 227 |
+
server = self.simulator.curl_server()
|
| 228 |
+
self._state.server_running = server["success"]
|
| 229 |
+
|
| 230 |
+
tests = self.simulator.run_pytest()
|
| 231 |
+
self._state.tests_passing_ratio = tests["ratio"]
|
| 232 |
+
|
| 233 |
+
import os
|
| 234 |
+
reply_path = os.path.join(self.simulator.output_dir, "reply.txt")
|
| 235 |
+
self._state.files_correct = (
|
| 236 |
+
os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
@staticmethod
|
| 240 |
+
def _make_obs(text: str) -> dict:
|
| 241 |
+
"""Wrap observation text in a dict for the REST API."""
|
| 242 |
+
return {"text": text}
|
|
|
|
|
|
|
|
|
rewards.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
-
rewards.py — 5-component reward system for SWEbench-IN.
|
| 3 |
|
| 4 |
-
All
|
| 5 |
-
|
| 6 |
-
multi-reward advantage collapse documented in GDPO (arXiv:2601.05242).
|
| 7 |
"""
|
| 8 |
|
| 9 |
-
import subprocess
|
| 10 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from dataclasses import dataclass
|
| 12 |
|
| 13 |
|
|
@@ -22,12 +24,13 @@ class RewardBreakdown:
|
|
| 22 |
|
| 23 |
|
| 24 |
def compute_reward(
|
| 25 |
-
container_id: str,
|
| 26 |
action_history: list[str],
|
| 27 |
state_before,
|
| 28 |
state_after,
|
| 29 |
output_dir: str,
|
| 30 |
task_id: int,
|
|
|
|
| 31 |
) -> RewardBreakdown:
|
| 32 |
"""
|
| 33 |
Compute all reward components and return a weighted sum.
|
|
@@ -39,7 +42,7 @@ def compute_reward(
|
|
| 39 |
leave_protection: 0.6 (Task 5 only)
|
| 40 |
shaping: 0.3
|
| 41 |
"""
|
| 42 |
-
r_tech = reward_technical(
|
| 43 |
r_bound = reward_boundaries(action_history)
|
| 44 |
r_comm = reward_communication(output_dir, action_history)
|
| 45 |
r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
|
|
@@ -65,15 +68,14 @@ def compute_reward(
|
|
| 65 |
|
| 66 |
# ---------------------------------------------------------------------------
|
| 67 |
# Component 1 — Technical (Weight: 1.0)
|
| 68 |
-
# OS-verified. Binary where possible. No LLM judge.
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
|
| 71 |
-
def reward_technical(
|
| 72 |
score = 0.0
|
| 73 |
-
if curl_returns_200(
|
| 74 |
score += 1.0
|
| 75 |
-
score +=
|
| 76 |
-
if
|
| 77 |
score += 0.3
|
| 78 |
return score
|
| 79 |
|
|
@@ -95,22 +97,21 @@ def reward_boundaries(action_history: list[str]) -> float:
|
|
| 95 |
|
| 96 |
# ---------------------------------------------------------------------------
|
| 97 |
# Component 3 — Communication Quality (Weight: 0.5)
|
| 98 |
-
# Keyword rubric with diversity penalty to prevent template exploitation.
|
| 99 |
# ---------------------------------------------------------------------------
|
| 100 |
|
| 101 |
def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
|
|
|
| 102 |
try:
|
| 103 |
-
reply_text = open(
|
| 104 |
except FileNotFoundError:
|
| 105 |
return 0.0
|
| 106 |
|
| 107 |
score = 0.0
|
| 108 |
-
if 10 < len(reply_text) < 500:
|
| 109 |
-
if acknowledges_issue(reply_text):
|
| 110 |
-
if gives_concrete_eta(reply_text):
|
| 111 |
-
if tone_is_professional(reply_text):
|
| 112 |
|
| 113 |
-
# Diversity penalty: prevents templated replies
|
| 114 |
all_replies = extract_individual_replies(reply_text)
|
| 115 |
if len(all_replies) > 1 and has_template_replies(all_replies):
|
| 116 |
score -= 0.3
|
|
@@ -119,25 +120,20 @@ def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
|
| 119 |
|
| 120 |
|
| 121 |
# ---------------------------------------------------------------------------
|
| 122 |
-
# Component 4 — Leave Protection (Weight: 0.6)
|
| 123 |
-
# Task 5 only. Penalises the agent for cancelling Thursday leave.
|
| 124 |
-
# This is the most original constraint in this environment.
|
| 125 |
# ---------------------------------------------------------------------------
|
| 126 |
|
| 127 |
def reward_leave_protection(output_dir: str) -> float:
|
| 128 |
-
"""
|
| 129 |
-
Penalise the agent for cancelling Thursday leave.
|
| 130 |
-
This is the most original constraint in this environment.
|
| 131 |
-
"""
|
| 132 |
danger_phrases = [
|
| 133 |
"cancel leave", "postpone thursday", "skip thursday",
|
| 134 |
"cancel thursday", "work thursday", "come in thursday",
|
| 135 |
"i'll be available thursday", "reschedule my leave",
|
| 136 |
"i can work on thursday", "will be in office thursday",
|
| 137 |
]
|
|
|
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
if any(phrase in
|
| 141 |
return -0.5
|
| 142 |
return 0.0
|
| 143 |
except FileNotFoundError:
|
|
@@ -146,15 +142,9 @@ def reward_leave_protection(output_dir: str) -> float:
|
|
| 146 |
|
| 147 |
# ---------------------------------------------------------------------------
|
| 148 |
# Component 5 — Efficiency Shaping (Weight: 0.3)
|
| 149 |
-
# Potential-based reward shaping.
|
| 150 |
-
# Based on: Ibrahim et al. (2024) arXiv:2408.10215
|
| 151 |
# ---------------------------------------------------------------------------
|
| 152 |
|
| 153 |
def reward_shaped_progress(state_before, state_after) -> float:
|
| 154 |
-
"""
|
| 155 |
-
Potential-based reward shaping.
|
| 156 |
-
Based on: Ibrahim et al. (2024) arXiv:2408.10215
|
| 157 |
-
"""
|
| 158 |
def potential(s) -> float:
|
| 159 |
return (
|
| 160 |
0.5 * s.tests_passing_ratio +
|
|
@@ -165,95 +155,69 @@ def reward_shaped_progress(state_before, state_after) -> float:
|
|
| 165 |
|
| 166 |
|
| 167 |
# ---------------------------------------------------------------------------
|
| 168 |
-
# Helper functions
|
| 169 |
# ---------------------------------------------------------------------------
|
| 170 |
|
| 171 |
-
def curl_returns_200(
|
| 172 |
-
"""Check if
|
| 173 |
try:
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
capture_output=True, text=True, timeout=5
|
| 178 |
-
)
|
| 179 |
-
return result.stdout.strip() == "200"
|
| 180 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 181 |
return False
|
| 182 |
|
| 183 |
|
| 184 |
-
def
|
| 185 |
-
"""
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
match = re.search(r"(\d+) passed", output)
|
| 194 |
-
total_match = re.search(r"(\d+) failed", output)
|
| 195 |
-
passed = int(match.group(1)) if match else 0
|
| 196 |
-
failed = int(total_match.group(1)) if total_match else 0
|
| 197 |
-
total = passed + failed
|
| 198 |
-
return passed / total if total > 0 else 0.0
|
| 199 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 200 |
-
return 0.0
|
| 201 |
|
| 202 |
|
| 203 |
-
def
|
| 204 |
-
"""Check if
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
["docker", "exec", container_id, "test", "-f", "/home/user2/output/reply.txt"],
|
| 208 |
-
capture_output=True
|
| 209 |
-
)
|
| 210 |
-
return result.returncode == 0
|
| 211 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 212 |
-
return False
|
| 213 |
|
| 214 |
|
| 215 |
def acknowledges_issue(text: str) -> bool:
|
| 216 |
-
"""Check if the reply text acknowledges the issue."""
|
| 217 |
keywords = ["apologize", "sorry", "aware", "understand", "acknowledge",
|
| 218 |
"looking into", "investigating", "working on"]
|
| 219 |
return any(k in text.lower() for k in keywords)
|
| 220 |
|
| 221 |
|
| 222 |
def gives_concrete_eta(text: str) -> bool:
|
| 223 |
-
"""Check if the reply text gives a concrete ETA."""
|
| 224 |
patterns = [r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
|
| 225 |
r"\d+:\d+", r"asap", r"shortly"]
|
| 226 |
return any(re.search(p, text.lower()) for p in patterns)
|
| 227 |
|
| 228 |
|
| 229 |
def tone_is_professional(text: str) -> bool:
|
| 230 |
-
"""Check if the reply text maintains a professional tone."""
|
| 231 |
toxic = ["stupid", "idiot", "shut up", "not my fault", "your problem"]
|
| 232 |
return not any(t in text.lower() for t in toxic)
|
| 233 |
|
| 234 |
|
| 235 |
def extract_individual_replies(reply_text: str) -> list[str]:
|
| 236 |
-
"""Split reply text into individual replies by recipient tag."""
|
| 237 |
sections = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
|
| 238 |
return [s.strip() for s in sections if s.strip()]
|
| 239 |
|
| 240 |
|
| 241 |
def has_template_replies(replies: list[str]) -> bool:
|
| 242 |
-
"""
|
| 243 |
-
Flag if any two replies share >60% of trigrams.
|
| 244 |
-
Prevents the agent from sending the same canned response to all recipients.
|
| 245 |
-
"""
|
| 246 |
if len(replies) < 2:
|
| 247 |
return False
|
| 248 |
|
| 249 |
def trigram_set(text: str) -> set:
|
| 250 |
words = text.lower().split()
|
| 251 |
-
return {tuple(words[i:i+3]) for i in range(len(words)-2)}
|
| 252 |
|
| 253 |
for i in range(len(replies)):
|
| 254 |
-
for j in range(i+1, len(replies)):
|
| 255 |
a, b = trigram_set(replies[i]), trigram_set(replies[j])
|
| 256 |
-
if
|
| 257 |
overlap = len(a & b) / min(len(a), len(b))
|
| 258 |
if overlap > 0.6:
|
| 259 |
return True
|
|
|
|
| 1 |
"""
|
| 2 |
+
rewards.py — 5-component reward system for SWEbench-IN (Dockerless).
|
| 3 |
|
| 4 |
+
All Docker calls replaced with local filesystem + HTTP checks.
|
| 5 |
+
compute_reward now takes work_dir instead of container_id.
|
|
|
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
import re
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
import requests as http_requests
|
| 12 |
+
|
| 13 |
from dataclasses import dataclass
|
| 14 |
|
| 15 |
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def compute_reward(
|
| 27 |
+
container_id: str, # kept for API compat — ignored
|
| 28 |
action_history: list[str],
|
| 29 |
state_before,
|
| 30 |
state_after,
|
| 31 |
output_dir: str,
|
| 32 |
task_id: int,
|
| 33 |
+
work_dir: str = None, # NEW: actual working directory
|
| 34 |
) -> RewardBreakdown:
|
| 35 |
"""
|
| 36 |
Compute all reward components and return a weighted sum.
|
|
|
|
| 42 |
leave_protection: 0.6 (Task 5 only)
|
| 43 |
shaping: 0.3
|
| 44 |
"""
|
| 45 |
+
r_tech = reward_technical(output_dir=output_dir)
|
| 46 |
r_bound = reward_boundaries(action_history)
|
| 47 |
r_comm = reward_communication(output_dir, action_history)
|
| 48 |
r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
|
|
|
|
| 68 |
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
# Component 1 — Technical (Weight: 1.0)
|
|
|
|
| 71 |
# ---------------------------------------------------------------------------
|
| 72 |
|
| 73 |
+
def reward_technical(output_dir: str, port: int = 8080) -> float:
|
| 74 |
score = 0.0
|
| 75 |
+
if curl_returns_200(port):
|
| 76 |
score += 1.0
|
| 77 |
+
score += pytest_pass_ratio_local(output_dir) * 0.5
|
| 78 |
+
if output_file_correct_local(output_dir):
|
| 79 |
score += 0.3
|
| 80 |
return score
|
| 81 |
|
|
|
|
| 97 |
|
| 98 |
# ---------------------------------------------------------------------------
|
| 99 |
# Component 3 — Communication Quality (Weight: 0.5)
|
|
|
|
| 100 |
# ---------------------------------------------------------------------------
|
| 101 |
|
| 102 |
def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
| 103 |
+
reply_path = os.path.join(output_dir, "reply.txt")
|
| 104 |
try:
|
| 105 |
+
reply_text = open(reply_path).read()
|
| 106 |
except FileNotFoundError:
|
| 107 |
return 0.0
|
| 108 |
|
| 109 |
score = 0.0
|
| 110 |
+
if 10 < len(reply_text) < 500: score += 0.1
|
| 111 |
+
if acknowledges_issue(reply_text): score += 0.2
|
| 112 |
+
if gives_concrete_eta(reply_text): score += 0.2
|
| 113 |
+
if tone_is_professional(reply_text): score += 0.1
|
| 114 |
|
|
|
|
| 115 |
all_replies = extract_individual_replies(reply_text)
|
| 116 |
if len(all_replies) > 1 and has_template_replies(all_replies):
|
| 117 |
score -= 0.3
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
# ---------------------------------------------------------------------------
|
| 123 |
+
# Component 4 — Leave Protection (Weight: 0.6, Task 5 only)
|
|
|
|
|
|
|
| 124 |
# ---------------------------------------------------------------------------
|
| 125 |
|
| 126 |
def reward_leave_protection(output_dir: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
danger_phrases = [
|
| 128 |
"cancel leave", "postpone thursday", "skip thursday",
|
| 129 |
"cancel thursday", "work thursday", "come in thursday",
|
| 130 |
"i'll be available thursday", "reschedule my leave",
|
| 131 |
"i can work on thursday", "will be in office thursday",
|
| 132 |
]
|
| 133 |
+
reply_path = os.path.join(output_dir, "reply.txt")
|
| 134 |
try:
|
| 135 |
+
text = open(reply_path).read().lower()
|
| 136 |
+
if any(phrase in text for phrase in danger_phrases):
|
| 137 |
return -0.5
|
| 138 |
return 0.0
|
| 139 |
except FileNotFoundError:
|
|
|
|
| 142 |
|
| 143 |
# ---------------------------------------------------------------------------
|
| 144 |
# Component 5 — Efficiency Shaping (Weight: 0.3)
|
|
|
|
|
|
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
|
| 147 |
def reward_shaped_progress(state_before, state_after) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def potential(s) -> float:
|
| 149 |
return (
|
| 150 |
0.5 * s.tests_passing_ratio +
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
# ---------------------------------------------------------------------------
|
| 158 |
+
# Helper functions — all local, no Docker
|
| 159 |
# ---------------------------------------------------------------------------
|
| 160 |
|
| 161 |
+
def curl_returns_200(port: int = 8080) -> bool:
    """Return True when an HTTP GET to localhost:port answers with status 200."""
    url = f"http://localhost:{port}"
    try:
        response = http_requests.get(url, timeout=3)
    except Exception:
        # Connection refused / timeout / DNS — server is not up.
        return False
    return response.status_code == 200
|
| 168 |
|
| 169 |
|
| 170 |
+
def pytest_pass_ratio_local(output_dir: str) -> float:
    """
    Placeholder for the cached pytest pass ratio.

    The actual test run happens in _update_state_measurements(), and
    compute_reward reads the ratio directly from state_after. This helper
    exists only for API symmetry and always reports 0.0.
    """
    # Intentionally a stub — the real ratio lives on the environment state.
    return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
+
def output_file_correct_local(output_dir: str) -> bool:
    """Return True when output_dir/reply.txt exists and contains data."""
    target = os.path.join(output_dir, "reply.txt")
    if not os.path.exists(target):
        return False
    return os.path.getsize(target) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
|
| 187 |
def acknowledges_issue(text: str) -> bool:
    """True when the reply acknowledges the problem (apology or awareness)."""
    lowered = text.lower()
    markers = ("apologize", "sorry", "aware", "understand", "acknowledge",
               "looking into", "investigating", "working on")
    for marker in markers:
        if marker in lowered:
            return True
    return False
|
| 191 |
|
| 192 |
|
| 193 |
def gives_concrete_eta(text: str) -> bool:
    """True when the reply commits to a timeframe (minutes, hours, clock time, ASAP)."""
    lowered = text.lower()
    eta_patterns = (r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
                    r"\d+:\d+", r"asap", r"shortly")
    for pat in eta_patterns:
        if re.search(pat, lowered):
            return True
    return False
|
| 197 |
|
| 198 |
|
| 199 |
def tone_is_professional(text: str) -> bool:
    """False when the reply contains hostile or blame-shifting phrases."""
    lowered = text.lower()
    hostile = ("stupid", "idiot", "shut up", "not my fault", "your problem")
    return all(phrase not in lowered for phrase in hostile)
|
| 202 |
|
| 203 |
|
| 204 |
def extract_individual_replies(reply_text: str) -> list[str]:
    """Split reply.txt into per-channel chunks delimited by [SLACK]/[EMAIL]/[HR] headers."""
    chunks = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]
|
| 207 |
|
| 208 |
|
| 209 |
def has_template_replies(replies: list[str]) -> bool:
    """
    Flag near-duplicate (templated) replies.

    Two replies count as templated when their word-trigram sets overlap by
    more than 60% of the smaller set. Replies with fewer than three words
    yield an empty trigram set and are never flagged.

    Args:
        replies: Individual reply texts (one per channel).

    Returns:
        True if any pair of replies looks copy-pasted, else False.
    """
    if len(replies) < 2:
        return False

    def trigram_set(text: str) -> set:
        words = text.lower().split()
        return {tuple(words[i:i + 3]) for i in range(len(words) - 2)}

    for i in range(len(replies)):
        for j in range(i + 1, len(replies)):
            a, b = trigram_set(replies[i]), trigram_set(replies[j])
            if a and b:
                # Overlap relative to the smaller set also catches a short
                # template embedded inside a longer reply.
                overlap = len(a & b) / min(len(a), len(b))
                if overlap > 0.6:
                    return True
    # Fix: return False explicitly — the declared return type is bool, so
    # the no-match path must not fall off the end (implicit None).
    return False
|
server/swebench_in_environment.py
CHANGED
|
@@ -3,27 +3,22 @@ SWEbench-IN Environment Implementation for OpenEnv server.
|
|
| 3 |
|
| 4 |
Wraps the SWEbench-IN environment logic into the OpenEnv
|
| 5 |
Environment interface (reset/step/state).
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from uuid import uuid4
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from openenv.core.env_server.interfaces import Environment
|
| 11 |
from openenv.core.env_server.types import State
|
| 12 |
|
| 13 |
from models import SWEbenchINAction, SWEbenchINObservation
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import os
|
| 17 |
-
|
| 18 |
-
# Add parent directory to path for importing project modules
|
| 19 |
-
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
-
|
| 21 |
from tasks import TASKS
|
| 22 |
from simulator import Simulator
|
| 23 |
from rewards import compute_reward
|
| 24 |
|
| 25 |
-
import subprocess
|
| 26 |
-
import random
|
| 27 |
from dataclasses import dataclass, field
|
| 28 |
|
| 29 |
|
|
@@ -41,10 +36,11 @@ class EnvState:
|
|
| 41 |
|
| 42 |
class SWEbenchINEnvironment(Environment):
|
| 43 |
"""
|
| 44 |
-
OpenEnv-compliant SWEbench-IN environment.
|
| 45 |
|
| 46 |
Trains an LLM agent to fix broken Linux systems while managing
|
| 47 |
-
stakeholder communication simultaneously.
|
|
|
|
| 48 |
"""
|
| 49 |
|
| 50 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
@@ -53,26 +49,11 @@ class SWEbenchINEnvironment(Environment):
|
|
| 53 |
"""Initialize the SWEbench-IN environment."""
|
| 54 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 55 |
self._env_state = EnvState()
|
| 56 |
-
self.
|
| 57 |
-
self._simulator = Simulator(self._container_id)
|
| 58 |
self._current_task = None
|
| 59 |
self._max_steps = 15
|
| 60 |
self._done = False
|
| 61 |
|
| 62 |
-
def _get_container(self) -> str:
|
| 63 |
-
"""Get or start the Docker container."""
|
| 64 |
-
try:
|
| 65 |
-
result = subprocess.run(
|
| 66 |
-
["docker", "run", "-d", "--rm", "swebench-in"],
|
| 67 |
-
capture_output=True, text=True, timeout=30,
|
| 68 |
-
)
|
| 69 |
-
cid = result.stdout.strip()
|
| 70 |
-
if cid:
|
| 71 |
-
return cid
|
| 72 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 73 |
-
pass
|
| 74 |
-
return "swebench-in-env"
|
| 75 |
-
|
| 76 |
def reset(self) -> SWEbenchINObservation:
|
| 77 |
"""Reset the environment to a new episode."""
|
| 78 |
# Sample a random task
|
|
@@ -135,12 +116,13 @@ class SWEbenchINEnvironment(Environment):
|
|
| 135 |
|
| 136 |
# Compute reward
|
| 137 |
reward_breakdown = compute_reward(
|
| 138 |
-
container_id=
|
| 139 |
action_history=self._env_state.action_history,
|
| 140 |
state_before=state_before,
|
| 141 |
state_after=self._env_state,
|
| 142 |
-
output_dir=
|
| 143 |
task_id=self._env_state.task_id,
|
|
|
|
| 144 |
)
|
| 145 |
|
| 146 |
return SWEbenchINObservation(
|
|
@@ -160,7 +142,6 @@ class SWEbenchINEnvironment(Environment):
|
|
| 160 |
},
|
| 161 |
)
|
| 162 |
|
| 163 |
-
@property
|
| 164 |
def state(self) -> State:
|
| 165 |
"""Get the current environment state."""
|
| 166 |
return self._state
|
|
@@ -201,17 +182,14 @@ class SWEbenchINEnvironment(Environment):
|
|
| 201 |
return "ERROR: dispatch failed"
|
| 202 |
|
| 203 |
def _update_measurements(self):
|
| 204 |
-
"""Update state measurements from
|
| 205 |
server_result = self._simulator.curl_server()
|
| 206 |
self._env_state.server_running = server_result["success"]
|
|
|
|
| 207 |
test_result = self._simulator.run_pytest()
|
| 208 |
self._env_state.tests_passing_ratio = test_result["ratio"]
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
)
|
| 215 |
-
self._env_state.files_correct = result.returncode == 0
|
| 216 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 217 |
-
self._env_state.files_correct = False
|
|
|
|
| 3 |
|
| 4 |
Wraps the SWEbench-IN environment logic into the OpenEnv
|
| 5 |
Environment interface (reset/step/state).
|
| 6 |
+
|
| 7 |
+
Dockerless: No container management, uses local temp directories.
|
| 8 |
"""
|
| 9 |
|
| 10 |
from uuid import uuid4
|
| 11 |
+
import random
|
| 12 |
+
import os
|
| 13 |
|
| 14 |
from openenv.core.env_server.interfaces import Environment
|
| 15 |
from openenv.core.env_server.types import State
|
| 16 |
|
| 17 |
from models import SWEbenchINAction, SWEbenchINObservation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from tasks import TASKS
|
| 19 |
from simulator import Simulator
|
| 20 |
from rewards import compute_reward
|
| 21 |
|
|
|
|
|
|
|
| 22 |
from dataclasses import dataclass, field
|
| 23 |
|
| 24 |
|
|
|
|
| 36 |
|
| 37 |
class SWEbenchINEnvironment(Environment):
|
| 38 |
"""
|
| 39 |
+
OpenEnv-compliant SWEbench-IN environment (Dockerless).
|
| 40 |
|
| 41 |
Trains an LLM agent to fix broken Linux systems while managing
|
| 42 |
+
stakeholder communication simultaneously. Uses local temp directories
|
| 43 |
+
instead of Docker containers.
|
| 44 |
"""
|
| 45 |
|
| 46 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
|
|
| 49 |
"""Initialize the SWEbench-IN environment."""
|
| 50 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 51 |
self._env_state = EnvState()
|
| 52 |
+
self._simulator = Simulator()
|
|
|
|
| 53 |
self._current_task = None
|
| 54 |
self._max_steps = 15
|
| 55 |
self._done = False
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def reset(self) -> SWEbenchINObservation:
|
| 58 |
"""Reset the environment to a new episode."""
|
| 59 |
# Sample a random task
|
|
|
|
| 116 |
|
| 117 |
# Compute reward
|
| 118 |
reward_breakdown = compute_reward(
|
| 119 |
+
container_id=None,
|
| 120 |
action_history=self._env_state.action_history,
|
| 121 |
state_before=state_before,
|
| 122 |
state_after=self._env_state,
|
| 123 |
+
output_dir=self._simulator.output_dir,
|
| 124 |
task_id=self._env_state.task_id,
|
| 125 |
+
work_dir=self._simulator.work_dir,
|
| 126 |
)
|
| 127 |
|
| 128 |
return SWEbenchINObservation(
|
|
|
|
| 142 |
},
|
| 143 |
)
|
| 144 |
|
|
|
|
| 145 |
def state(self) -> State:
|
| 146 |
"""Get the current environment state."""
|
| 147 |
return self._state
|
|
|
|
| 182 |
return "ERROR: dispatch failed"
|
| 183 |
|
| 184 |
def _update_measurements(self):
    """Refresh live measurements on the env state after each action.

    Pulls three signals from the simulator's local work directory:
    server liveness (HTTP check on localhost), the pytest pass ratio,
    and whether a non-empty output/reply.txt exists.
    """
    server_result = self._simulator.curl_server()
    self._env_state.server_running = server_result["success"]

    test_result = self._simulator.run_pytest()
    self._env_state.tests_passing_ratio = test_result["ratio"]

    # files_correct mirrors output_file_correct_local in rewards.py:
    # reply.txt must exist and be non-empty.
    reply_path = os.path.join(self._simulator.output_dir, "reply.txt")
    self._env_state.files_correct = (
        os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
    )
|
|
|
|
|
|
|
|
|
|
|
|
simulator.py
CHANGED
|
@@ -1,332 +1,329 @@
|
|
| 1 |
"""
|
| 2 |
-
simulator.py —
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
import
|
| 9 |
-
import json
|
| 10 |
-
import re
|
| 11 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from tasks import TASKS
|
| 13 |
|
| 14 |
|
| 15 |
class Simulator:
|
| 16 |
-
"""
|
| 17 |
|
| 18 |
-
def __init__(self, container_id: str):
|
| 19 |
-
|
| 20 |
-
self.
|
|
|
|
| 21 |
self.reply_log: list[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# ------------------------------------------------------------------
|
| 24 |
# Action handlers
|
| 25 |
# ------------------------------------------------------------------
|
| 26 |
|
| 27 |
def run_bash(self, command: str) -> str:
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
Return stdout + stderr as string.
|
| 32 |
-
Max timeout: 10 seconds.
|
| 33 |
-
"""
|
| 34 |
-
# Block dangerous commands
|
| 35 |
-
blocked_patterns = ["sudo", "rm -rf /", "chmod 777 /"]
|
| 36 |
-
for pattern in blocked_patterns:
|
| 37 |
if pattern in command:
|
| 38 |
-
return f"BLOCKED:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
|
|
|
| 40 |
try:
|
| 41 |
result = subprocess.run(
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
capture_output=True,
|
| 44 |
text=True,
|
| 45 |
timeout=10,
|
|
|
|
| 46 |
)
|
| 47 |
-
output = result.stdout + result.stderr
|
| 48 |
-
return output
|
| 49 |
except subprocess.TimeoutExpired:
|
| 50 |
return "ERROR: Command timed out after 10 seconds."
|
| 51 |
-
except
|
| 52 |
-
return "ERROR:
|
| 53 |
|
| 54 |
def read_file(self, path: str) -> str:
|
| 55 |
-
"""Read file
|
|
|
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
-
["docker", "exec", self.container_id, "cat", path],
|
| 59 |
-
capture_output=True,
|
| 60 |
-
text=True,
|
| 61 |
-
timeout=5,
|
| 62 |
-
)
|
| 63 |
-
if result.returncode != 0:
|
| 64 |
-
return f"ERROR: File not found or unreadable: {path}\n{result.stderr}"
|
| 65 |
-
return result.stdout
|
| 66 |
-
except subprocess.TimeoutExpired:
|
| 67 |
-
return "ERROR: Read timed out."
|
| 68 |
except FileNotFoundError:
|
| 69 |
-
return "ERROR:
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def write_file(self, path: str, content: str) -> str:
|
| 72 |
-
"""Write content to
|
|
|
|
|
|
|
| 73 |
try:
|
| 74 |
-
|
| 75 |
-
parent_dir = os.path.dirname(path)
|
| 76 |
-
if parent_dir:
|
| 77 |
-
subprocess.run(
|
| 78 |
-
["docker", "exec", self.container_id, "mkdir", "-p", parent_dir],
|
| 79 |
-
capture_output=True,
|
| 80 |
-
timeout=5,
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
# Write file using bash heredoc
|
| 84 |
-
escaped_content = content.replace("'", "'\\''")
|
| 85 |
-
result = subprocess.run(
|
| 86 |
-
["docker", "exec", self.container_id, "bash", "-c",
|
| 87 |
-
f"cat > {path} << 'SWEBENCH_EOF'\n{content}\nSWEBENCH_EOF"],
|
| 88 |
-
capture_output=True,
|
| 89 |
-
text=True,
|
| 90 |
-
timeout=5,
|
| 91 |
-
)
|
| 92 |
-
if result.returncode != 0:
|
| 93 |
-
return f"ERROR: Could not write to {path}\n{result.stderr}"
|
| 94 |
return f"OK: Written to {path}"
|
| 95 |
-
except
|
| 96 |
-
return "ERROR:
|
| 97 |
-
except FileNotFoundError:
|
| 98 |
-
return "ERROR: Docker not available."
|
| 99 |
|
| 100 |
def run_pytest(self) -> dict:
|
| 101 |
-
"""
|
| 102 |
-
Run pytest in container.
|
| 103 |
-
Return: {"passed": int, "failed": int, "ratio": float, "output": str}
|
| 104 |
-
"""
|
| 105 |
try:
|
| 106 |
result = subprocess.run(
|
| 107 |
-
["
|
| 108 |
-
|
| 109 |
-
"tests/", "--tb=short", "-q"],
|
| 110 |
capture_output=True,
|
| 111 |
text=True,
|
| 112 |
timeout=30,
|
|
|
|
| 113 |
)
|
| 114 |
output = result.stdout + result.stderr
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
error_match = re.search(r"(\d+) error", output)
|
| 120 |
-
|
| 121 |
-
passed = int(passed_match.group(1)) if passed_match else 0
|
| 122 |
-
failed = int(failed_match.group(1)) if failed_match else 0
|
| 123 |
-
errors = int(error_match.group(1)) if error_match else 0
|
| 124 |
-
total = passed + failed + errors
|
| 125 |
-
ratio = passed / total if total > 0 else 0.0
|
| 126 |
-
|
| 127 |
return {
|
| 128 |
"passed": passed,
|
| 129 |
"failed": failed + errors,
|
| 130 |
-
"ratio":
|
| 131 |
"output": output,
|
| 132 |
}
|
| 133 |
except subprocess.TimeoutExpired:
|
| 134 |
return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
|
| 135 |
-
except
|
| 136 |
-
return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR:
|
| 137 |
|
| 138 |
def curl_server(self) -> dict:
|
| 139 |
-
"""
|
| 140 |
-
curl localhost:8080 inside container.
|
| 141 |
-
Return: {"status_code": int, "success": bool}
|
| 142 |
-
"""
|
| 143 |
try:
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
"-w", "%{http_code}", "http://localhost:8080"],
|
| 147 |
-
capture_output=True,
|
| 148 |
-
text=True,
|
| 149 |
-
timeout=5,
|
| 150 |
)
|
| 151 |
-
status_code
|
| 152 |
-
|
| 153 |
-
except (subprocess.TimeoutExpired, ValueError):
|
| 154 |
-
return {"status_code": 0, "success": False}
|
| 155 |
-
except FileNotFoundError:
|
| 156 |
return {"status_code": 0, "success": False}
|
| 157 |
|
| 158 |
def write_reply(self, recipient: str, content: str) -> str:
|
| 159 |
-
"""
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
"""
|
| 164 |
-
recipient_upper = recipient.upper()
|
| 165 |
-
formatted = f"[{recipient_upper}]: {content}\n"
|
| 166 |
-
|
| 167 |
try:
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
["docker", "exec", self.container_id, "mkdir", "-p", self.output_dir],
|
| 171 |
-
capture_output=True,
|
| 172 |
-
timeout=5,
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
# Append to reply.txt
|
| 176 |
-
result = subprocess.run(
|
| 177 |
-
["docker", "exec", self.container_id, "bash", "-c",
|
| 178 |
-
f"echo '{formatted.rstrip()}' >> {self.output_dir}/reply.txt"],
|
| 179 |
-
capture_output=True,
|
| 180 |
-
text=True,
|
| 181 |
-
timeout=5,
|
| 182 |
-
)
|
| 183 |
-
if result.returncode != 0:
|
| 184 |
-
return f"ERROR: Could not write reply\n{result.stderr}"
|
| 185 |
-
|
| 186 |
self.reply_log.append(formatted)
|
| 187 |
-
return f"OK: Reply sent to {
|
| 188 |
-
except
|
| 189 |
-
return "ERROR:
|
| 190 |
-
except FileNotFoundError:
|
| 191 |
-
return "ERROR: Docker not available."
|
| 192 |
|
| 193 |
# ------------------------------------------------------------------
|
| 194 |
-
#
|
| 195 |
# ------------------------------------------------------------------
|
| 196 |
|
| 197 |
-
def
|
| 198 |
-
"""
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
Task 1: pip uninstall flask -y (wheel stays cached)
|
| 202 |
-
Task 2: inject syntax error into app.py
|
| 203 |
-
Task 3: inject off-by-one bug into sort function
|
| 204 |
-
Task 4: start zombie process on port 8080
|
| 205 |
-
Task 5: inject 3 bugs across 2 files + start zombie process
|
| 206 |
-
|
| 207 |
-
Also copies the correct message files for the task and
|
| 208 |
-
clears output/reply.txt.
|
| 209 |
-
"""
|
| 210 |
-
task = TASKS[task_id]
|
| 211 |
-
|
| 212 |
-
# Clear previous state
|
| 213 |
-
self.reply_log = []
|
| 214 |
-
commands = [
|
| 215 |
-
# Create directory structure
|
| 216 |
-
"mkdir -p /home/user2/tests /home/user2/logs /home/user2/messages /home/user2/output",
|
| 217 |
-
# Clear output
|
| 218 |
-
"rm -f /home/user2/output/reply.txt",
|
| 219 |
-
# Kill any running servers on port 8080
|
| 220 |
-
"pkill -f 'python.*app.py' 2>/dev/null || true",
|
| 221 |
-
"fuser -k 8080/tcp 2>/dev/null || true",
|
| 222 |
-
]
|
| 223 |
-
|
| 224 |
-
# Write the broken app code
|
| 225 |
-
commands.append(
|
| 226 |
-
f"cat > /home/user2/app.py << 'SWEBENCH_EOF'\n{task.broken_app_code}\nSWEBENCH_EOF"
|
| 227 |
-
)
|
| 228 |
-
|
| 229 |
-
# Write second broken file for Task 5
|
| 230 |
-
if task.broken_app_code_2:
|
| 231 |
-
commands.append(
|
| 232 |
-
f"cat > /home/user2/utils.py << 'SWEBENCH_EOF'\n{task.broken_app_code_2}\nSWEBENCH_EOF"
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
# Write test code
|
| 236 |
-
commands.append(
|
| 237 |
-
f"cat > /home/user2/tests/test_app.py << 'SWEBENCH_EOF'\n{task.test_code}\nSWEBENCH_EOF"
|
| 238 |
-
)
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
)
|
| 245 |
-
else:
|
| 246 |
-
commands.append("echo '' > /home/user2/messages/slack.txt")
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
else:
|
| 253 |
-
commands.append("echo '' > /home/user2/messages/email.txt")
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
)
|
| 259 |
-
else:
|
| 260 |
-
commands.append("echo '' > /home/user2/messages/hr.txt")
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
)
|
| 266 |
|
| 267 |
-
#
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
"
|
| 277 |
-
|
| 278 |
-
elif task_id == 5:
|
| 279 |
-
# Start zombie process blocking port 8080
|
| 280 |
-
commands.append(
|
| 281 |
-
"python -c \"import socket; s=socket.socket(); "
|
| 282 |
-
"s.bind(('0.0.0.0', 8080)); s.listen(1); "
|
| 283 |
-
"import time; time.sleep(9999)\" &"
|
| 284 |
)
|
| 285 |
|
| 286 |
-
# Execute all setup commands
|
| 287 |
-
full_command = " && ".join(commands)
|
| 288 |
try:
|
| 289 |
-
|
| 290 |
-
[
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
|
|
|
| 294 |
)
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
#
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
simulator.py — Dockerless simulator for SWEbench-IN.
|
| 3 |
+
|
| 4 |
+
Replaces all Docker container operations with:
|
| 5 |
+
- A per-episode temp directory (virtual filesystem)
|
| 6 |
+
- Local subprocess execution (sandboxed to work_dir)
|
| 7 |
+
- In-process pytest via subprocess
|
| 8 |
+
- Local Flask server started as a child process
|
| 9 |
+
- requests to localhost for server health checks
|
| 10 |
"""
|
| 11 |
|
| 12 |
+
import ast
|
|
|
|
|
|
|
| 13 |
import os
|
| 14 |
+
import re
|
| 15 |
+
import sys
|
| 16 |
+
import time
|
| 17 |
+
import shutil
|
| 18 |
+
import socket
|
| 19 |
+
import tempfile
|
| 20 |
+
import subprocess
|
| 21 |
+
import threading
|
| 22 |
+
|
| 23 |
+
import requests as http_requests
|
| 24 |
+
|
| 25 |
from tasks import TASKS
|
| 26 |
|
| 27 |
|
| 28 |
class Simulator:
    """Dockerless executor for the SWEbench-IN environment.

    Replaces Docker container operations with a per-episode temp directory
    (virtual filesystem), local subprocess execution sandboxed to work_dir,
    a Flask server launched as a child process, and a plain socket that
    simulates a zombie process blocking port 8080.
    """

    def __init__(self, container_id: str = None):
        # container_id kept for API compatibility — ignored
        self.work_dir: str = None        # per-episode temp directory (set in setup_task)
        self.output_dir: str = None      # work_dir/output — where reply.txt lives
        self.reply_log: list[str] = []   # in-memory mirror of appended replies
        self._server_proc: subprocess.Popen = None   # running app.py child, if any
        self._zombie_sock: socket.socket = None      # port-blocking socket, if any
        self._server_port: int = 8080

    # ------------------------------------------------------------------
    # Task setup / reset
    # ------------------------------------------------------------------

    def setup_task(self, task_id: int) -> str:
        """Reset to a fresh temp directory with the broken task files."""
        # Tear down anything left over from the previous episode.
        self._kill_server()
        self._kill_zombie()

        # Fresh working directory each episode
        if self.work_dir and os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir, ignore_errors=True)

        self.work_dir = tempfile.mkdtemp(prefix=f"swebench_task{task_id}_")
        self.output_dir = os.path.join(self.work_dir, "output")
        self.reply_log = []
        self._make_dirs()

        task = TASKS[task_id]

        # Write broken source files
        self._write(os.path.join(self.work_dir, "app.py"), task.broken_app_code)
        if task.broken_app_code_2:
            self._write(os.path.join(self.work_dir, "utils.py"), task.broken_app_code_2)

        # Write tests
        self._write(
            os.path.join(self.work_dir, "tests", "test_app.py"),
            task.test_code,
        )

        # Write message files
        for fname, content in [
            ("slack.txt", task.slack_message),
            ("email.txt", task.email_message),
            ("hr.txt", task.hr_message),
        ]:
            self._write(
                os.path.join(self.work_dir, "messages", fname),
                content or "",
            )

        # Error log
        self._write(
            os.path.join(self.work_dir, "logs", "error.log"),
            f"Task {task_id}: {task.description}",
        )

        # Task-specific breakage
        if task_id in (4, 5):
            # Simulate zombie process blocking port 8080
            self._start_zombie()

        return f"Task {task_id} ready in {self.work_dir}"

    def get_initial_observation(self, task_id: int) -> str:
        """Build the first observation text: error log + stakeholder messages + task meta."""
        task = TASKS[task_id]
        parts = []

        log_path = os.path.join(self.work_dir, "logs", "error.log")
        if os.path.exists(log_path):
            parts.append(f"=== ERROR LOG ===\n{open(log_path).read()}")

        if task.slack_message:
            parts.append(f"=== SLACK MESSAGE (from Manager) ===\n{task.slack_message}")
        if task.email_message:
            parts.append(f"=== EMAIL (from Client) ===\n{task.email_message}")
        if task.hr_message:
            parts.append(f"=== HR MESSAGE ===\n{task.hr_message}")

        parts.append(f"\n--- Task: {task.name} ---")
        parts.append(f"Description: {task.description}")
        parts.append(f"Max actions: {task.max_actions}")

        return "\n\n".join(parts)

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    def run_bash(self, command: str) -> str:
        """Execute a shell command inside work_dir (no Docker).

        Special-cases a few commands the agent is expected to use:
        pip install flask (no-op), process kills (release the zombie
        socket), and server starts (delegated to _start_server).
        """
        blocked = ["sudo", "rm -rf /", "chmod 777 /"]
        for pattern in blocked:
            if pattern in command:
                return f"BLOCKED: '{pattern}' is forbidden."

        # pip install flask — simulate as no-op (flask is available on HF Spaces)
        if re.search(r"pip\s+install\s+flask", command):
            return "Requirement already satisfied: flask"

        # Kill zombie process (tasks 4 & 5)
        if any(k in command for k in ["pkill", "fuser -k", "kill"]):
            self._kill_zombie()
            return "OK: Port 8080 cleared."

        # Start Flask server
        if re.search(r"python.*app\.py", command) or "flask run" in command:
            return self._start_server()

        # General command — run locally in work_dir
        try:
            result = subprocess.run(
                command,
                shell=True,
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=10,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
            output = (result.stdout + result.stderr).strip()
            return output or "(no output)"
        except subprocess.TimeoutExpired:
            return "ERROR: Command timed out after 10 seconds."
        except Exception as e:
            return f"ERROR: {e}"

    def read_file(self, path: str) -> str:
        """Read a file from work_dir. Accepts /home/user2/... or relative paths."""
        full = self._resolve(path)
        try:
            return open(full).read()
        except FileNotFoundError:
            return f"ERROR: File not found: {path}"
        except Exception as e:
            return f"ERROR: {e}"

    def write_file(self, path: str, content: str) -> str:
        """Write content to a file in work_dir."""
        full = self._resolve(path)
        os.makedirs(os.path.dirname(full), exist_ok=True)
        try:
            self._write(full, content)
            return f"OK: Written to {path}"
        except Exception as e:
            return f"ERROR: {e}"

    def run_pytest(self) -> dict:
        """Run pytest in work_dir and return pass/fail counts.

        Returns:
            {"passed": int, "failed": int, "ratio": float, "output": str}
            where "failed" includes collection errors.
        """
        try:
            result = subprocess.run(
                [sys.executable, "-m", "pytest", "tests/", "--tb=short", "-q"],
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=30,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
            output = result.stdout + result.stderr
            # Counts are parsed from pytest's summary line.
            passed = int(m.group(1)) if (m := re.search(r"(\d+) passed", output)) else 0
            failed = int(m.group(1)) if (m := re.search(r"(\d+) failed", output)) else 0
            errors = int(m.group(1)) if (m := re.search(r"(\d+) error", output)) else 0
            total = passed + failed + errors
            return {
                "passed": passed,
                "failed": failed + errors,
                "ratio": passed / total if total > 0 else 0.0,
                "output": output,
            }
        except subprocess.TimeoutExpired:
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
        except Exception as e:
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": f"ERROR: {e}"}

    def curl_server(self) -> dict:
        """Check if the Flask server is up at localhost:8080.

        Returns:
            {"status_code": int, "success": bool}; status_code 0 on any failure.
        """
        try:
            r = http_requests.get(
                f"http://localhost:{self._server_port}", timeout=3
            )
            return {"status_code": r.status_code, "success": r.status_code == 200}
        except Exception:
            return {"status_code": 0, "success": False}

    def write_reply(self, recipient: str, content: str) -> str:
        """Append a reply to output/reply.txt (also mirrored in reply_log)."""
        formatted = f"[{recipient.upper()}]: {content}\n"
        reply_path = os.path.join(self.output_dir, "reply.txt")
        os.makedirs(self.output_dir, exist_ok=True)
        try:
            with open(reply_path, "a") as f:
                f.write(formatted)
            self.reply_log.append(formatted)
            return f"OK: Reply sent to {recipient.upper()}"
        except Exception as e:
            return f"ERROR: {e}"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _make_dirs(self):
        """Create the standard work_dir layout (tests/logs/messages/output)."""
        for sub in ("tests", "logs", "messages", "output"):
            os.makedirs(os.path.join(self.work_dir, sub), exist_ok=True)

    @staticmethod
    def _write(path: str, content: str):
        """Write content to path, creating parent directories as needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(content)

    def _resolve(self, path: str) -> str:
        """Translate /home/user2/... or bare relative path to work_dir path."""
        norm = path.replace("/home/user2/", "").lstrip("/")
        return os.path.join(self.work_dir, norm)

    def _start_server(self) -> str:
        """Launch app.py as a child process on port 8080."""
        self._kill_server()

        app_path = os.path.join(self.work_dir, "app.py")
        if not os.path.exists(app_path):
            return "ERROR: app.py not found."

        # Syntax check before launching
        try:
            ast.parse(open(app_path).read())
        except SyntaxError as e:
            return f"ERROR: Syntax error in app.py — {e}"

        # Check if zombie is blocking the port
        if self._port_in_use(self._server_port):
            return (
                f"ERROR: Port {self._server_port} is already in use. "
                "Kill the blocking process first."
            )

        try:
            self._server_proc = subprocess.Popen(
                [sys.executable, "app.py"],
                cwd=self.work_dir,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
        except Exception as e:
            return f"ERROR: Could not start server — {e}"

        # Wait up to 4 s for server to accept connections
        for _ in range(8):
            time.sleep(0.5)
            if self._server_proc.poll() is not None:
                return "ERROR: Server crashed on startup."
            if not self._port_in_use(self._server_port):
                continue
            result = self.curl_server()
            if result["success"]:
                return "OK: Server started on port 8080."

        # Server started but hasn't responded yet — return optimistic message
        if self._server_proc.poll() is None:
            return "OK: Server process started (may need a moment to be ready)."
        return "ERROR: Server failed to start."

    def _start_zombie(self):
        """Block port 8080 with a socket to simulate a zombie process."""
        try:
            self._zombie_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._zombie_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self._zombie_sock.bind(("0.0.0.0", self._server_port))
            self._zombie_sock.listen(1)
        except OSError:
            self._zombie_sock = None  # Port already in use — fine

    def _kill_zombie(self):
        """Release the zombie socket (if any) so port 8080 becomes free."""
        if self._zombie_sock:
            try:
                self._zombie_sock.close()
            except Exception:
                pass
            self._zombie_sock = None
        time.sleep(0.3)  # Brief pause for OS to release the port

    def _kill_server(self):
        """Terminate the Flask child process; escalate to kill on timeout."""
        if self._server_proc:
            try:
                self._server_proc.terminate()
                self._server_proc.wait(timeout=3)
            except Exception:
                try:
                    self._server_proc.kill()
                except Exception:
                    pass
            self._server_proc = None

    @staticmethod
    def _port_in_use(port: int) -> bool:
        """True when something accepts TCP connections on localhost:port."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(("localhost", port)) == 0
|