""" SWEbench-IN Environment Implementation for OpenEnv server. Wraps the SWEbench-IN environment logic into the OpenEnv Environment interface (reset/step/state). Dockerless: No container management, uses local temp directories. """ from uuid import uuid4 import random import os from openenv.core.env_server.interfaces import Environment from openenv.core.env_server.types import State from models import SWEbenchINAction, SWEbenchINObservation from tasks import TASKS from simulator import Simulator from rewards import compute_reward from dataclasses import dataclass, field @dataclass class EnvState: """Internal environment state tracking.""" task_id: int = 0 step_count: int = 0 tests_passing_ratio: float = 0.0 server_running: bool = False files_correct: bool = False action_history: list = field(default_factory=list) reply_texts: list = field(default_factory=list) class SWEbenchINEnvironment(Environment): """ OpenEnv-compliant SWEbench-IN environment (Dockerless). Trains an LLM agent to fix broken Linux systems while managing stakeholder communication simultaneously. Uses local temp directories instead of Docker containers. """ SUPPORTS_CONCURRENT_SESSIONS: bool = True def __init__(self): """Initialize the SWEbench-IN environment.""" self._state = State(episode_id=str(uuid4()), step_count=0) self._env_state = EnvState() self._simulator = Simulator() self._current_task = None self._max_steps = 15 self._done = False def reset(self) -> SWEbenchINObservation: """Reset the environment to a new episode.""" # Sample a random task task_id = random.choice(list(TASKS.keys())) self._current_task = TASKS[task_id] self._done = False self._max_steps = self._current_task.max_actions self._state = State(episode_id=str(uuid4()), step_count=0) self._env_state = EnvState(task_id=task_id) self._simulator.setup_task(task_id) obs_text = self._simulator.get_initial_observation(task_id) return SWEbenchINObservation( text=obs_text, reward=0.0, done=False, step_count=0, max_steps=self._max_steps, tests_passing_ratio=0.0, server_running=False, ) def step(self, action: SWEbenchINAction) -> SWEbenchINObservation: """Execute a step in the environment.""" if self._done: return SWEbenchINObservation( text="Episode is done. Call reset() to start a new episode.", reward=0.0, done=True, step_count=self._state.step_count, max_steps=self._max_steps, ) action_type = action.type action_args = action.args # Record state before state_before = EnvState( task_id=self._env_state.task_id, step_count=self._env_state.step_count, tests_passing_ratio=self._env_state.tests_passing_ratio, server_running=self._env_state.server_running, files_correct=self._env_state.files_correct, ) # Dispatch action obs_text = self._dispatch_action(action_type, action_args) # Update state self._env_state.action_history.append(f"{action_type}: {action_args}") self._env_state.step_count += 1 self._state.step_count += 1 self._update_measurements() # Check done if action_type == "close_case" or self._env_state.step_count >= self._max_steps: self._done = True # Compute reward reward_breakdown = compute_reward( container_id=None, action_history=self._env_state.action_history, state_before=state_before, state_after=self._env_state, output_dir=self._simulator.output_dir, task_id=self._env_state.task_id, work_dir=self._simulator.work_dir, ) return SWEbenchINObservation( text=obs_text, reward=reward_breakdown.total, done=self._done, step_count=self._env_state.step_count, max_steps=self._max_steps, tests_passing_ratio=self._env_state.tests_passing_ratio, server_running=self._env_state.server_running, reward_breakdown={ "technical": reward_breakdown.technical, "boundaries": reward_breakdown.boundaries, "communication": reward_breakdown.communication, "leave_protection": reward_breakdown.leave_protection, "shaping": reward_breakdown.shaping, }, ) def state(self) -> State: """Get the current environment state.""" return self._state # --- Internal helpers --- VALID_ACTIONS = { "run_command", "read_file", "write_file", "run_tests", "check_server", "reply_slack", "reply_email", "reply_hr", "close_case", } def _dispatch_action(self, action_type: str, action_args: str) -> str: """Dispatch an action to the simulator.""" if action_type not in self.VALID_ACTIONS: return f"ERROR: Unknown action '{action_type}'" if action_type == "run_command": return self._simulator.run_bash(action_args) elif action_type == "read_file": return self._simulator.read_file(action_args) elif action_type == "write_file": if "|" in action_args: path, content = action_args.split("|", 1) return self._simulator.write_file(path.strip(), content) return "ERROR: write_file args must be 'path|content'" elif action_type == "run_tests": result = self._simulator.run_pytest() return f"Passed: {result['passed']}, Failed: {result['failed']}, Ratio: {result['ratio']:.0%}\n{result['output']}" elif action_type == "check_server": result = self._simulator.curl_server() return f"Status: {result['status_code']}, Success: {result['success']}" elif action_type in ("reply_slack", "reply_email", "reply_hr"): recipient = action_type.replace("reply_", "").upper() self._env_state.reply_texts.append(f"[{recipient}]: {action_args}") return self._simulator.write_reply(recipient, action_args) elif action_type == "close_case": return "Case closed. Episode ending." return "ERROR: dispatch failed" def _update_measurements(self): """Update state measurements from live environment.""" server_result = self._simulator.curl_server() self._env_state.server_running = server_result["success"] test_result = self._simulator.run_pytest() self._env_state.tests_passing_ratio = test_result["ratio"] reply_path = os.path.join(self._simulator.output_dir, "reply.txt") self._env_state.files_correct = ( os.path.exists(reply_path) and os.path.getsize(reply_path) > 0 )