Spaces:
Sleeping
Sleeping
| """ | |
| SWEbench-IN Environment Implementation for OpenEnv server. | |
| Wraps the SWEbench-IN environment logic into the OpenEnv | |
| Environment interface (reset/step/state). | |
| Dockerless: No container management, uses local temp directories. | |
| """ | |
import os
import random
from dataclasses import dataclass, field
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

from models import SWEbenchINAction, SWEbenchINObservation
from rewards import compute_reward
from simulator import Simulator
from tasks import TASKS
@dataclass
class EnvState:
    """Internal environment state tracking for one episode.

    Fix: the original class declared ``field(default_factory=list)``
    attributes and was constructed with keyword arguments (e.g.
    ``EnvState(task_id=...)``) but was missing the ``@dataclass``
    decorator, so keyword construction raised ``TypeError`` and the
    list fields were shared ``field(...)`` sentinels instead of
    per-instance lists.
    """

    task_id: int = 0                    # id of the currently active task
    step_count: int = 0                 # actions taken so far this episode
    tests_passing_ratio: float = 0.0    # fraction of pytest tests passing
    server_running: bool = False        # last curl probe succeeded
    files_correct: bool = False         # non-empty reply.txt exists in output dir
    # Per-instance histories; default_factory gives each episode fresh lists.
    action_history: list = field(default_factory=list)
    reply_texts: list = field(default_factory=list)
class SWEbenchINEnvironment(Environment):
    """
    OpenEnv-compliant SWEbench-IN environment (Dockerless).

    Trains an LLM agent to fix broken Linux systems while managing
    stakeholder communication simultaneously. Uses local temp directories
    instead of Docker containers.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Action names step() will dispatch; anything else yields an ERROR text.
    VALID_ACTIONS = {
        "run_command", "read_file", "write_file", "run_tests",
        "check_server", "reply_slack", "reply_email", "reply_hr", "close_case",
    }

    def __init__(self):
        """Initialize the SWEbench-IN environment with empty episode state."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._env_state = EnvState()
        self._simulator = Simulator()
        self._current_task = None
        self._max_steps = 15  # placeholder; reset() installs the per-task budget
        self._done = False

    def reset(self) -> SWEbenchINObservation:
        """Start a new episode on a randomly sampled task.

        Returns:
            The initial observation for the sampled task, with zeroed
            reward and measurement fields.
        """
        # Sample a random task and make it current.
        task_id = random.choice(list(TASKS.keys()))
        self._current_task = TASKS[task_id]
        self._done = False
        self._max_steps = self._current_task.max_actions
        # Fresh episode bookkeeping.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._env_state = EnvState(task_id=task_id)
        # Materialize the task's broken system in the local simulator.
        self._simulator.setup_task(task_id)
        obs_text = self._simulator.get_initial_observation(task_id)
        return SWEbenchINObservation(
            text=obs_text,
            reward=0.0,
            done=False,
            step_count=0,
            max_steps=self._max_steps,
            tests_passing_ratio=0.0,
            server_running=False,
        )

    def step(self, action: SWEbenchINAction) -> SWEbenchINObservation:
        """Execute one agent action and return the resulting observation.

        The episode terminates on ``close_case`` or when the per-task
        action budget (``max_actions``) is exhausted.
        """
        if self._done:
            # Fix: report the last known measurements instead of omitting
            # them, so post-terminal observations stay consistent with
            # in-episode ones.
            return SWEbenchINObservation(
                text="Episode is done. Call reset() to start a new episode.",
                reward=0.0,
                done=True,
                step_count=self._state.step_count,
                max_steps=self._max_steps,
                tests_passing_ratio=self._env_state.tests_passing_ratio,
                server_running=self._env_state.server_running,
            )
        action_type = action.type
        action_args = action.args
        # Shallow snapshot of the pre-action state for reward computation;
        # the history lists are deliberately not copied (only the scalar
        # measurements are read from the "before" side).
        state_before = EnvState(
            task_id=self._env_state.task_id,
            step_count=self._env_state.step_count,
            tests_passing_ratio=self._env_state.tests_passing_ratio,
            server_running=self._env_state.server_running,
            files_correct=self._env_state.files_correct,
        )
        # Dispatch the action to the simulator.
        obs_text = self._dispatch_action(action_type, action_args)
        # Bookkeeping: record the action and advance both step counters.
        self._env_state.action_history.append(f"{action_type}: {action_args}")
        self._env_state.step_count += 1
        self._state.step_count += 1
        self._update_measurements()
        # Episode ends when the agent closes the case or the budget runs out.
        if action_type == "close_case" or self._env_state.step_count >= self._max_steps:
            self._done = True
        # Compute the shaped reward from the before/after state pair.
        reward_breakdown = compute_reward(
            container_id=None,  # Dockerless: there is no container to inspect
            action_history=self._env_state.action_history,
            state_before=state_before,
            state_after=self._env_state,
            output_dir=self._simulator.output_dir,
            task_id=self._env_state.task_id,
            work_dir=self._simulator.work_dir,
        )
        return SWEbenchINObservation(
            text=obs_text,
            reward=reward_breakdown.total,
            done=self._done,
            step_count=self._env_state.step_count,
            max_steps=self._max_steps,
            tests_passing_ratio=self._env_state.tests_passing_ratio,
            server_running=self._env_state.server_running,
            reward_breakdown={
                "technical": reward_breakdown.technical,
                "boundaries": reward_breakdown.boundaries,
                "communication": reward_breakdown.communication,
                "leave_protection": reward_breakdown.leave_protection,
                "shaping": reward_breakdown.shaping,
            },
        )

    def state(self) -> State:
        """Return the OpenEnv episode-level state (episode id + step count)."""
        return self._state

    # --- Internal helpers ---

    def _dispatch_action(self, action_type: str, action_args: str) -> str:
        """Route a validated action to the matching simulator call.

        Returns:
            The observation text produced by the simulator, or an
            ``ERROR: ...`` string for unknown or malformed actions.
        """
        if action_type not in self.VALID_ACTIONS:
            return f"ERROR: Unknown action '{action_type}'"
        if action_type == "run_command":
            return self._simulator.run_bash(action_args)
        if action_type == "read_file":
            return self._simulator.read_file(action_args)
        if action_type == "write_file":
            # Args are encoded as "path|content"; content may itself contain
            # '|', hence split with maxsplit=1.
            if "|" in action_args:
                path, content = action_args.split("|", 1)
                return self._simulator.write_file(path.strip(), content)
            return "ERROR: write_file args must be 'path|content'"
        if action_type == "run_tests":
            result = self._simulator.run_pytest()
            return f"Passed: {result['passed']}, Failed: {result['failed']}, Ratio: {result['ratio']:.0%}\n{result['output']}"
        if action_type == "check_server":
            result = self._simulator.curl_server()
            return f"Status: {result['status_code']}, Success: {result['success']}"
        if action_type in ("reply_slack", "reply_email", "reply_hr"):
            # "reply_slack" -> "SLACK", etc.
            recipient = action_type.replace("reply_", "").upper()
            self._env_state.reply_texts.append(f"[{recipient}]: {action_args}")
            return self._simulator.write_reply(recipient, action_args)
        if action_type == "close_case":
            return "Case closed. Episode ending."
        return "ERROR: dispatch failed"  # unreachable: VALID_ACTIONS is exhaustive

    def _update_measurements(self):
        """Refresh server/test/file measurements from the live simulator."""
        server_result = self._simulator.curl_server()
        self._env_state.server_running = server_result["success"]
        test_result = self._simulator.run_pytest()
        self._env_state.tests_passing_ratio = test_result["ratio"]
        # A non-empty reply.txt in the output dir counts as "files correct".
        reply_path = os.path.join(self._simulator.output_dir, "reply.txt")
        self._env_state.files_correct = (
            os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
        )