"""
SWEbench-IN Environment Implementation for OpenEnv server.
Wraps the SWEbench-IN environment logic into the OpenEnv
Environment interface (reset/step/state).
Dockerless: No container management, uses local temp directories.
"""
import os
import random
from dataclasses import dataclass, field
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

from models import SWEbenchINAction, SWEbenchINObservation
from rewards import compute_reward
from simulator import Simulator
from tasks import TASKS


@dataclass
class EnvState:
"""Internal environment state tracking."""
task_id: int = 0
step_count: int = 0
tests_passing_ratio: float = 0.0
server_running: bool = False
files_correct: bool = False
action_history: list = field(default_factory=list)
    reply_texts: list = field(default_factory=list)


class SWEbenchINEnvironment(Environment):
"""
OpenEnv-compliant SWEbench-IN environment (Dockerless).
Trains an LLM agent to fix broken Linux systems while managing
stakeholder communication simultaneously. Uses local temp directories
instead of Docker containers.
"""
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

def __init__(self):
"""Initialize the SWEbench-IN environment."""
self._state = State(episode_id=str(uuid4()), step_count=0)
self._env_state = EnvState()
self._simulator = Simulator()
self._current_task = None
        self._max_steps = 15  # placeholder; reset() overrides this with the task's max_actions
        self._done = False
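        # Note: reset() is expected before the first step(); per-task setup
        # (setup_task and the temp directories) happens in reset(), not here.
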
def reset(self) -> SWEbenchINObservation:
"""Reset the environment to a new episode."""
# Sample a random task
task_id = random.choice(list(TASKS.keys()))
self._current_task = TASKS[task_id]
self._done = False
self._max_steps = self._current_task.max_actions
self._state = State(episode_id=str(uuid4()), step_count=0)
self._env_state = EnvState(task_id=task_id)
self._simulator.setup_task(task_id)
obs_text = self._simulator.get_initial_observation(task_id)
return SWEbenchINObservation(
text=obs_text,
reward=0.0,
done=False,
step_count=0,
max_steps=self._max_steps,
tests_passing_ratio=0.0,
server_running=False,
        )

def step(self, action: SWEbenchINAction) -> SWEbenchINObservation:
"""Execute a step in the environment."""
        if self._done:
            return SWEbenchINObservation(
                text="Episode is done. Call reset() to start a new episode.",
                reward=0.0,
                done=True,
                step_count=self._state.step_count,
                max_steps=self._max_steps,
                tests_passing_ratio=self._env_state.tests_passing_ratio,
                server_running=self._env_state.server_running,
            )
action_type = action.type
action_args = action.args
        # Snapshot the scalar fields before the action; action_history is
        # passed to compute_reward separately.
state_before = EnvState(
task_id=self._env_state.task_id,
step_count=self._env_state.step_count,
tests_passing_ratio=self._env_state.tests_passing_ratio,
server_running=self._env_state.server_running,
files_correct=self._env_state.files_correct,
)
# Dispatch action
obs_text = self._dispatch_action(action_type, action_args)
# Update state
self._env_state.action_history.append(f"{action_type}: {action_args}")
self._env_state.step_count += 1
self._state.step_count += 1
self._update_measurements()
# Check done
if action_type == "close_case" or self._env_state.step_count >= self._max_steps:
self._done = True
# Compute reward
reward_breakdown = compute_reward(
container_id=None,
action_history=self._env_state.action_history,
state_before=state_before,
state_after=self._env_state,
output_dir=self._simulator.output_dir,
task_id=self._env_state.task_id,
work_dir=self._simulator.work_dir,
)
return SWEbenchINObservation(
text=obs_text,
reward=reward_breakdown.total,
done=self._done,
step_count=self._env_state.step_count,
max_steps=self._max_steps,
tests_passing_ratio=self._env_state.tests_passing_ratio,
server_running=self._env_state.server_running,
reward_breakdown={
"technical": reward_breakdown.technical,
"boundaries": reward_breakdown.boundaries,
"communication": reward_breakdown.communication,
"leave_protection": reward_breakdown.leave_protection,
"shaping": reward_breakdown.shaping,
},
        )

def state(self) -> State:
"""Get the current environment state."""
        return self._state

# --- Internal helpers ---
VALID_ACTIONS = {
"run_command", "read_file", "write_file", "run_tests",
"check_server", "reply_slack", "reply_email", "reply_hr", "close_case",
}
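
    # Example action payloads (illustrative; the exact field names come from
    # models.SWEbenchINAction):
    #   SWEbenchINAction(type="run_command", args="systemctl status nginx")
    #   SWEbenchINAction(type="write_file", args="app/config.py|DEBUG = False")
    #   SWEbenchINAction(type="reply_slack", args="Investigating the outage now.")
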
def _dispatch_action(self, action_type: str, action_args: str) -> str:
"""Dispatch an action to the simulator."""
if action_type not in self.VALID_ACTIONS:
return f"ERROR: Unknown action '{action_type}'"
if action_type == "run_command":
return self._simulator.run_bash(action_args)
elif action_type == "read_file":
return self._simulator.read_file(action_args)
elif action_type == "write_file":
if "|" in action_args:
path, content = action_args.split("|", 1)
return self._simulator.write_file(path.strip(), content)
return "ERROR: write_file args must be 'path|content'"
elif action_type == "run_tests":
result = self._simulator.run_pytest()
return f"Passed: {result['passed']}, Failed: {result['failed']}, Ratio: {result['ratio']:.0%}\n{result['output']}"
elif action_type == "check_server":
result = self._simulator.curl_server()
return f"Status: {result['status_code']}, Success: {result['success']}"
elif action_type in ("reply_slack", "reply_email", "reply_hr"):
recipient = action_type.replace("reply_", "").upper()
self._env_state.reply_texts.append(f"[{recipient}]: {action_args}")
return self._simulator.write_reply(recipient, action_args)
elif action_type == "close_case":
return "Case closed. Episode ending."
return "ERROR: dispatch failed"
def _update_measurements(self):
"""Update state measurements from live environment."""
server_result = self._simulator.curl_server()
self._env_state.server_running = server_result["success"]
test_result = self._simulator.run_pytest()
self._env_state.tests_passing_ratio = test_result["ratio"]
reply_path = os.path.join(self._simulator.output_dir, "reply.txt")
self._env_state.files_correct = (
os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
)
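

if __name__ == "__main__":
    # Minimal local smoke test (illustrative; assumes the sibling modules
    # models/tasks/simulator/rewards are importable from this directory).
    env = SWEbenchINEnvironment()
    obs = env.reset()
    print(obs.text[:200])
    obs = env.step(SWEbenchINAction(type="run_tests", args=""))
    print("reward:", obs.reward, "done:", obs.done)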