Spaces:
Sleeping
Sleeping
feat: Complete Dockerless migration - update environment, rewards, app, and server wrapper
Browse files- app.py +7 -2
- environment.py +131 -191
- rewards.py +46 -82
- server/swebench_in_environment.py +18 -40
- simulator.py +257 -260
app.py
CHANGED
|
@@ -26,8 +26,11 @@ def run_episode(task_id: int, action_type: str, action_args: str):
|
|
| 26 |
f" {k}: {v:.3f}" for k, v in breakdown.items()
|
| 27 |
)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
return (
|
| 30 |
-
f"Observation:\n{
|
| 31 |
f"Reward: {reward:.3f}\n"
|
| 32 |
f"Done: {done}\n"
|
| 33 |
f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
|
|
@@ -40,7 +43,9 @@ def run_episode(task_id: int, action_type: str, action_args: str):
|
|
| 40 |
def reset_env(task_id: int):
|
| 41 |
"""Reset environment to a specific task."""
|
| 42 |
obs = env.reset(task_id=int(task_id))
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
with gr.Blocks(title="SWEbench-IN") as demo:
|
|
|
|
| 26 |
f" {k}: {v:.3f}" for k, v in breakdown.items()
|
| 27 |
)
|
| 28 |
|
| 29 |
+
# Extract text from dict observation
|
| 30 |
+
obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
|
| 31 |
+
|
| 32 |
return (
|
| 33 |
+
f"Observation:\n{obs_text}\n\n"
|
| 34 |
f"Reward: {reward:.3f}\n"
|
| 35 |
f"Done: {done}\n"
|
| 36 |
f"Step: {info.get('step_count', '?')}/{info.get('max_steps', '?')}\n"
|
|
|
|
| 43 |
def reset_env(task_id: int):
|
| 44 |
"""Reset environment to a specific task."""
|
| 45 |
obs = env.reset(task_id=int(task_id))
|
| 46 |
+
# Extract text from dict observation
|
| 47 |
+
obs_text = obs.get("text", str(obs)) if isinstance(obs, dict) else obs
|
| 48 |
+
return f"Episode reset. Task {int(task_id)} loaded.\n\nInitial observation:\n{obs_text}"
|
| 49 |
|
| 50 |
|
| 51 |
with gr.Blocks(title="SWEbench-IN") as demo:
|
environment.py
CHANGED
|
@@ -1,23 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
-
environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
Simulator and computes rewards after each step.
|
| 7 |
"""
|
| 8 |
|
| 9 |
-
import
|
| 10 |
import random
|
| 11 |
from dataclasses import dataclass, field
|
| 12 |
|
| 13 |
from tasks import TASKS, Task
|
| 14 |
from simulator import Simulator
|
| 15 |
-
from rewards import compute_reward
|
| 16 |
|
| 17 |
|
| 18 |
@dataclass
|
| 19 |
class State:
|
| 20 |
-
"""Current environment state, returned by state()."""
|
| 21 |
task_id: int = 0
|
| 22 |
step_count: int = 0
|
| 23 |
tests_passing_ratio: float = 0.0
|
|
@@ -29,119 +27,52 @@ class State:
|
|
| 29 |
|
| 30 |
class SWEbenchINEnvironment:
|
| 31 |
"""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
stakeholder communication simultaneously.
|
| 35 |
-
|
| 36 |
-
Gym-style interface: reset() -> observation, step() -> (obs, reward, done, info)
|
| 37 |
"""
|
| 38 |
|
| 39 |
-
def __init__(self
|
| 40 |
-
|
| 41 |
-
Initialize the environment.
|
| 42 |
-
|
| 43 |
-
Args:
|
| 44 |
-
container_id: Docker container ID. If None, attempts to start
|
| 45 |
-
a new container from the swebench-in image.
|
| 46 |
-
"""
|
| 47 |
-
self.container_id = container_id or self._start_container()
|
| 48 |
-
self.simulator = Simulator(self.container_id)
|
| 49 |
self.max_steps = 15
|
| 50 |
self._state = State()
|
| 51 |
self._current_task: Task = None
|
| 52 |
self._done = False
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
"-p", "8080:8080", "swebench-in"],
|
| 60 |
-
capture_output=True,
|
| 61 |
-
text=True,
|
| 62 |
-
timeout=30,
|
| 63 |
-
)
|
| 64 |
-
container_id = run_result.stdout.strip()
|
| 65 |
-
if run_result.returncode == 0 and container_id:
|
| 66 |
-
return container_id
|
| 67 |
-
|
| 68 |
-
# If container already exists (or run failed), try starting it.
|
| 69 |
-
start_result = subprocess.run(
|
| 70 |
-
["docker", "start", "swebench-in-env"],
|
| 71 |
-
capture_output=True,
|
| 72 |
-
text=True,
|
| 73 |
-
timeout=10,
|
| 74 |
-
)
|
| 75 |
-
if start_result.returncode == 0:
|
| 76 |
-
return "swebench-in-env"
|
| 77 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 78 |
-
pass
|
| 79 |
-
|
| 80 |
-
# Fallback: return a placeholder for demo/testing without Docker.
|
| 81 |
-
return "swebench-in-env"
|
| 82 |
-
|
| 83 |
-
def reset(self, task_id: int = None) -> str:
|
| 84 |
-
"""
|
| 85 |
-
Reset the environment to a new episode.
|
| 86 |
-
|
| 87 |
-
Args:
|
| 88 |
-
task_id: Task to load (1-5). If None, sample from current
|
| 89 |
-
curriculum tier.
|
| 90 |
-
|
| 91 |
-
Returns:
|
| 92 |
-
Initial observation as text: contents of error.log +
|
| 93 |
-
messages/slack.txt + messages/email.txt
|
| 94 |
-
"""
|
| 95 |
-
# Sample task if not specified
|
| 96 |
if task_id is None:
|
| 97 |
task_id = random.choice(list(TASKS.keys()))
|
| 98 |
|
| 99 |
if task_id not in TASKS:
|
| 100 |
-
raise ValueError(f"Invalid task_id: {task_id}. Must be 1
|
| 101 |
|
| 102 |
self._current_task = TASKS[task_id]
|
| 103 |
self._done = False
|
|
|
|
| 104 |
|
| 105 |
-
# Reset state
|
| 106 |
-
self._state = State(
|
| 107 |
-
task_id=task_id,
|
| 108 |
-
step_count=0,
|
| 109 |
-
tests_passing_ratio=0.0,
|
| 110 |
-
server_running=False,
|
| 111 |
-
files_correct=False,
|
| 112 |
-
action_history=[],
|
| 113 |
-
reply_texts=[],
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
# Setup the task in the container
|
| 117 |
self.simulator.setup_task(task_id)
|
| 118 |
-
|
| 119 |
-
# Update max_steps from task definition
|
| 120 |
self.max_steps = self._current_task.max_actions
|
| 121 |
|
| 122 |
-
|
| 123 |
-
return self.
|
| 124 |
|
| 125 |
def step(self, action: dict) -> tuple:
|
| 126 |
-
"""
|
| 127 |
-
Take one step in the environment.
|
| 128 |
-
|
| 129 |
-
Args:
|
| 130 |
-
action: dict with "type" and "args" keys.
|
| 131 |
-
type: one of the action names from openenv.yaml
|
| 132 |
-
args: string arguments for the action
|
| 133 |
-
|
| 134 |
-
Returns:
|
| 135 |
-
Tuple of (observation: str, reward: float, done: bool, info: dict)
|
| 136 |
-
"""
|
| 137 |
if self._done:
|
| 138 |
-
return (
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
action_type = action.get("type", "")
|
| 142 |
action_args = action.get("args", "")
|
|
|
|
| 143 |
|
| 144 |
-
#
|
| 145 |
state_before = State(
|
| 146 |
task_id=self._state.task_id,
|
| 147 |
step_count=self._state.step_count,
|
|
@@ -152,151 +83,160 @@ class SWEbenchINEnvironment:
|
|
| 152 |
reply_texts=list(self._state.reply_texts),
|
| 153 |
)
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
|
| 158 |
-
# Update
|
| 159 |
self._state.action_history.append(f"{action_type}: {action_args}")
|
| 160 |
self._state.step_count += 1
|
|
|
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
self._update_state_measurements()
|
| 164 |
-
|
| 165 |
-
# Check done condition
|
| 166 |
if action_type == "close_case" or self._state.step_count >= self.max_steps:
|
| 167 |
self._done = True
|
| 168 |
|
| 169 |
# Compute reward
|
| 170 |
-
|
| 171 |
-
container_id=
|
| 172 |
action_history=self._state.action_history,
|
| 173 |
state_before=state_before,
|
| 174 |
state_after=self._state,
|
| 175 |
-
output_dir=
|
| 176 |
task_id=self._state.task_id,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
)
|
| 178 |
|
| 179 |
info = {
|
| 180 |
"reward_breakdown": {
|
| 181 |
-
"technical":
|
| 182 |
-
"boundaries":
|
| 183 |
-
"communication":
|
| 184 |
-
"leave_protection":
|
| 185 |
-
"shaping":
|
| 186 |
},
|
| 187 |
"step_count": self._state.step_count,
|
| 188 |
-
"max_steps":
|
| 189 |
-
"done_reason":
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
}
|
| 193 |
|
| 194 |
-
return (
|
| 195 |
|
| 196 |
def state(self) -> State:
|
| 197 |
-
"""
|
| 198 |
-
Return current State dataclass.
|
| 199 |
-
|
| 200 |
-
Fields:
|
| 201 |
-
task_id: int
|
| 202 |
-
step_count: int
|
| 203 |
-
tests_passing_ratio: float
|
| 204 |
-
server_running: bool
|
| 205 |
-
files_correct: bool
|
| 206 |
-
action_history: list[str]
|
| 207 |
-
reply_texts: list[str]
|
| 208 |
-
"""
|
| 209 |
return self._state
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
# ------------------------------------------------------------------
|
| 212 |
-
# Internal
|
| 213 |
# ------------------------------------------------------------------
|
| 214 |
|
| 215 |
-
# Action dispatch table
|
| 216 |
ACTION_HANDLERS = {
|
| 217 |
-
"run_command",
|
| 218 |
-
"
|
| 219 |
-
"write_file",
|
| 220 |
-
"run_tests",
|
| 221 |
-
"check_server",
|
| 222 |
-
"reply_slack",
|
| 223 |
-
"reply_email",
|
| 224 |
-
"reply_hr",
|
| 225 |
-
"close_case",
|
| 226 |
}
|
| 227 |
|
| 228 |
-
def
|
| 229 |
-
"""Dispatch an action to the appropriate simulator method."""
|
| 230 |
if action_type not in self.ACTION_HANDLERS:
|
| 231 |
-
return
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
|
| 234 |
if action_type == "run_command":
|
| 235 |
return self.simulator.run_bash(action_args)
|
| 236 |
|
| 237 |
-
|
| 238 |
return self.simulator.read_file(action_args)
|
| 239 |
|
| 240 |
-
|
| 241 |
-
#
|
|
|
|
|
|
|
| 242 |
if "|" in action_args:
|
| 243 |
-
path,
|
| 244 |
-
return self.simulator.write_file(path.strip(),
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
| 263 |
result = self.simulator.write_reply("SLACK", action_args)
|
| 264 |
self._state.reply_texts.append(f"[SLACK]: {action_args}")
|
| 265 |
return result
|
| 266 |
|
| 267 |
-
|
| 268 |
result = self.simulator.write_reply("EMAIL", action_args)
|
| 269 |
self._state.reply_texts.append(f"[EMAIL]: {action_args}")
|
| 270 |
return result
|
| 271 |
|
| 272 |
-
|
| 273 |
result = self.simulator.write_reply("HR", action_args)
|
| 274 |
self._state.reply_texts.append(f"[HR]: {action_args}")
|
| 275 |
return result
|
| 276 |
|
| 277 |
-
|
| 278 |
return "Case closed. Episode ending."
|
| 279 |
|
| 280 |
-
return "ERROR:
|
| 281 |
-
|
| 282 |
-
def
|
| 283 |
-
"""
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
self._state.files_correct = result.returncode == 0
|
| 301 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 302 |
-
self._state.files_correct = False
|
|
|
|
| 1 |
"""
|
| 2 |
+
environment.py — OpenEnv-compliant environment wrapper for SWEbench-IN (Dockerless).
|
| 3 |
|
| 4 |
+
All Docker container management removed. Each episode runs in a fresh
|
| 5 |
+
temp directory managed by Simulator.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import json
|
| 9 |
import random
|
| 10 |
from dataclasses import dataclass, field
|
| 11 |
|
| 12 |
from tasks import TASKS, Task
|
| 13 |
from simulator import Simulator
|
| 14 |
+
from rewards import compute_reward, RewardBreakdown
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass
|
| 18 |
class State:
|
|
|
|
| 19 |
task_id: int = 0
|
| 20 |
step_count: int = 0
|
| 21 |
tests_passing_ratio: float = 0.0
|
|
|
|
| 27 |
|
| 28 |
class SWEbenchINEnvironment:
|
| 29 |
"""
|
| 30 |
+
Dockerless RL environment for SWEbench-IN.
|
| 31 |
+
Gym-style: reset() -> observation, step() -> (obs, reward, done, info)
|
|
|
|
|
|
|
|
|
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
def __init__(self):
|
| 35 |
+
self.simulator = Simulator()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
self.max_steps = 15
|
| 37 |
self._state = State()
|
| 38 |
self._current_task: Task = None
|
| 39 |
self._done = False
|
| 40 |
|
| 41 |
+
# ------------------------------------------------------------------
|
| 42 |
+
# Public API
|
| 43 |
+
# ------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
def reset(self, task_id: int = None) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if task_id is None:
|
| 47 |
task_id = random.choice(list(TASKS.keys()))
|
| 48 |
|
| 49 |
if task_id not in TASKS:
|
| 50 |
+
raise ValueError(f"Invalid task_id: {task_id}. Must be 1–5.")
|
| 51 |
|
| 52 |
self._current_task = TASKS[task_id]
|
| 53 |
self._done = False
|
| 54 |
+
self._state = State(task_id=task_id)
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
self.simulator.setup_task(task_id)
|
|
|
|
|
|
|
| 57 |
self.max_steps = self._current_task.max_actions
|
| 58 |
|
| 59 |
+
obs_text = self.simulator.get_initial_observation(task_id)
|
| 60 |
+
return self._make_obs(obs_text)
|
| 61 |
|
| 62 |
def step(self, action: dict) -> tuple:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if self._done:
|
| 64 |
+
return (
|
| 65 |
+
{"text": "Episode done. Call reset().", "step_count": self._state.step_count,
|
| 66 |
+
"max_steps": self.max_steps, "tests_passing_ratio": 0.0,
|
| 67 |
+
"server_running": False, "reward_breakdown": {}},
|
| 68 |
+
0.0, True, {"error": "episode_done"},
|
| 69 |
+
)
|
| 70 |
|
| 71 |
action_type = action.get("type", "")
|
| 72 |
action_args = action.get("args", "")
|
| 73 |
+
content = action.get("content", "") # for write_file
|
| 74 |
|
| 75 |
+
# Snapshot state before action
|
| 76 |
state_before = State(
|
| 77 |
task_id=self._state.task_id,
|
| 78 |
step_count=self._state.step_count,
|
|
|
|
| 83 |
reply_texts=list(self._state.reply_texts),
|
| 84 |
)
|
| 85 |
|
| 86 |
+
# Execute action
|
| 87 |
+
obs_text = self._dispatch(action_type, action_args, content)
|
| 88 |
|
| 89 |
+
# Update state
|
| 90 |
self._state.action_history.append(f"{action_type}: {action_args}")
|
| 91 |
self._state.step_count += 1
|
| 92 |
+
self._update_state()
|
| 93 |
|
| 94 |
+
# Check done
|
|
|
|
|
|
|
|
|
|
| 95 |
if action_type == "close_case" or self._state.step_count >= self.max_steps:
|
| 96 |
self._done = True
|
| 97 |
|
| 98 |
# Compute reward
|
| 99 |
+
breakdown = compute_reward(
|
| 100 |
+
container_id=None,
|
| 101 |
action_history=self._state.action_history,
|
| 102 |
state_before=state_before,
|
| 103 |
state_after=self._state,
|
| 104 |
+
output_dir=self.simulator.output_dir,
|
| 105 |
task_id=self._state.task_id,
|
| 106 |
+
work_dir=self.simulator.work_dir,
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Boost technical reward using live state (pytest ratio already updated)
|
| 110 |
+
adjusted_total = (
|
| 111 |
+
breakdown.technical
|
| 112 |
+
+ 0.5 * self._state.tests_passing_ratio # live pytest score
|
| 113 |
+
+ 0.8 * breakdown.boundaries
|
| 114 |
+
+ 0.5 * breakdown.communication
|
| 115 |
+
+ (0.6 * breakdown.leave_protection if self._state.task_id == 5 else 0.0)
|
| 116 |
+
+ 0.3 * breakdown.shaping
|
| 117 |
)
|
| 118 |
|
| 119 |
info = {
|
| 120 |
"reward_breakdown": {
|
| 121 |
+
"technical": breakdown.technical,
|
| 122 |
+
"boundaries": breakdown.boundaries,
|
| 123 |
+
"communication": breakdown.communication,
|
| 124 |
+
"leave_protection": breakdown.leave_protection,
|
| 125 |
+
"shaping": breakdown.shaping,
|
| 126 |
},
|
| 127 |
"step_count": self._state.step_count,
|
| 128 |
+
"max_steps": self.max_steps,
|
| 129 |
+
"done_reason": (
|
| 130 |
+
"close_case" if action_type == "close_case"
|
| 131 |
+
else "max_steps" if self._state.step_count >= self.max_steps
|
| 132 |
+
else None
|
| 133 |
+
),
|
| 134 |
}
|
| 135 |
|
| 136 |
+
return (self._make_obs(obs_text), adjusted_total, self._done, info)
|
| 137 |
|
| 138 |
def state(self) -> State:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return self._state
|
| 140 |
|
| 141 |
+
def grade(self) -> dict:
|
| 142 |
+
"""Summary grade for the completed episode."""
|
| 143 |
+
return {
|
| 144 |
+
"task_id": self._state.task_id,
|
| 145 |
+
"steps_taken": self._state.step_count,
|
| 146 |
+
"tests_passing_ratio": self._state.tests_passing_ratio,
|
| 147 |
+
"server_running": self._state.server_running,
|
| 148 |
+
"files_correct": self._state.files_correct,
|
| 149 |
+
"total_reward_approx": (
|
| 150 |
+
float(self._state.server_running)
|
| 151 |
+
+ self._state.tests_passing_ratio * 0.5
|
| 152 |
+
+ float(self._state.files_correct) * 0.3
|
| 153 |
+
),
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
# ------------------------------------------------------------------
|
| 157 |
+
# Internal
|
| 158 |
# ------------------------------------------------------------------
|
| 159 |
|
|
|
|
| 160 |
ACTION_HANDLERS = {
|
| 161 |
+
"run_command", "read_file", "write_file", "run_tests",
|
| 162 |
+
"check_server", "reply_slack", "reply_email", "reply_hr", "close_case",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
}
|
| 164 |
|
| 165 |
+
def _dispatch(self, action_type: str, action_args: str, content: str = "") -> str:
|
|
|
|
| 166 |
if action_type not in self.ACTION_HANDLERS:
|
| 167 |
+
return (
|
| 168 |
+
f"ERROR: Unknown action '{action_type}'. "
|
| 169 |
+
f"Valid: {sorted(self.ACTION_HANDLERS)}"
|
| 170 |
+
)
|
| 171 |
|
| 172 |
if action_type == "run_command":
|
| 173 |
return self.simulator.run_bash(action_args)
|
| 174 |
|
| 175 |
+
if action_type == "read_file":
|
| 176 |
return self.simulator.read_file(action_args)
|
| 177 |
|
| 178 |
+
if action_type == "write_file":
|
| 179 |
+
# Support both "path|content" and separate content field
|
| 180 |
+
if content:
|
| 181 |
+
return self.simulator.write_file(action_args, content)
|
| 182 |
if "|" in action_args:
|
| 183 |
+
path, file_content = action_args.split("|", 1)
|
| 184 |
+
return self.simulator.write_file(path.strip(), file_content)
|
| 185 |
+
return "ERROR: write_file needs 'path|content' or a content field."
|
| 186 |
+
|
| 187 |
+
if action_type == "run_tests":
|
| 188 |
+
r = self.simulator.run_pytest()
|
| 189 |
+
return (
|
| 190 |
+
f"Pytest Results:\n"
|
| 191 |
+
f" Passed: {r['passed']}\n"
|
| 192 |
+
f" Failed: {r['failed']}\n"
|
| 193 |
+
f" Ratio: {r['ratio']:.0%}\n\n"
|
| 194 |
+
f"Output:\n{r['output']}"
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
if action_type == "check_server":
|
| 198 |
+
r = self.simulator.curl_server()
|
| 199 |
+
return (
|
| 200 |
+
f"Server Check:\n"
|
| 201 |
+
f" Status Code: {r['status_code']}\n"
|
| 202 |
+
f" Success: {r['success']}"
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
if action_type == "reply_slack":
|
| 206 |
result = self.simulator.write_reply("SLACK", action_args)
|
| 207 |
self._state.reply_texts.append(f"[SLACK]: {action_args}")
|
| 208 |
return result
|
| 209 |
|
| 210 |
+
if action_type == "reply_email":
|
| 211 |
result = self.simulator.write_reply("EMAIL", action_args)
|
| 212 |
self._state.reply_texts.append(f"[EMAIL]: {action_args}")
|
| 213 |
return result
|
| 214 |
|
| 215 |
+
if action_type == "reply_hr":
|
| 216 |
result = self.simulator.write_reply("HR", action_args)
|
| 217 |
self._state.reply_texts.append(f"[HR]: {action_args}")
|
| 218 |
return result
|
| 219 |
|
| 220 |
+
if action_type == "close_case":
|
| 221 |
return "Case closed. Episode ending."
|
| 222 |
|
| 223 |
+
return "ERROR: Dispatch failed."
|
| 224 |
+
|
| 225 |
+
def _update_state(self):
|
| 226 |
+
"""Refresh state measurements from live environment."""
|
| 227 |
+
server = self.simulator.curl_server()
|
| 228 |
+
self._state.server_running = server["success"]
|
| 229 |
+
|
| 230 |
+
tests = self.simulator.run_pytest()
|
| 231 |
+
self._state.tests_passing_ratio = tests["ratio"]
|
| 232 |
+
|
| 233 |
+
import os
|
| 234 |
+
reply_path = os.path.join(self.simulator.output_dir, "reply.txt")
|
| 235 |
+
self._state.files_correct = (
|
| 236 |
+
os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
@staticmethod
|
| 240 |
+
def _make_obs(text: str) -> dict:
|
| 241 |
+
"""Wrap observation text in a dict for the REST API."""
|
| 242 |
+
return {"text": text}
|
|
|
|
|
|
|
|
|
rewards.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
-
rewards.py — 5-component reward system for SWEbench-IN.
|
| 3 |
|
| 4 |
-
All
|
| 5 |
-
|
| 6 |
-
multi-reward advantage collapse documented in GDPO (arXiv:2601.05242).
|
| 7 |
"""
|
| 8 |
|
| 9 |
-
import subprocess
|
| 10 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from dataclasses import dataclass
|
| 12 |
|
| 13 |
|
|
@@ -22,12 +24,13 @@ class RewardBreakdown:
|
|
| 22 |
|
| 23 |
|
| 24 |
def compute_reward(
|
| 25 |
-
container_id: str,
|
| 26 |
action_history: list[str],
|
| 27 |
state_before,
|
| 28 |
state_after,
|
| 29 |
output_dir: str,
|
| 30 |
task_id: int,
|
|
|
|
| 31 |
) -> RewardBreakdown:
|
| 32 |
"""
|
| 33 |
Compute all reward components and return a weighted sum.
|
|
@@ -39,7 +42,7 @@ def compute_reward(
|
|
| 39 |
leave_protection: 0.6 (Task 5 only)
|
| 40 |
shaping: 0.3
|
| 41 |
"""
|
| 42 |
-
r_tech = reward_technical(
|
| 43 |
r_bound = reward_boundaries(action_history)
|
| 44 |
r_comm = reward_communication(output_dir, action_history)
|
| 45 |
r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
|
|
@@ -65,15 +68,14 @@ def compute_reward(
|
|
| 65 |
|
| 66 |
# ---------------------------------------------------------------------------
|
| 67 |
# Component 1 — Technical (Weight: 1.0)
|
| 68 |
-
# OS-verified. Binary where possible. No LLM judge.
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
|
| 71 |
-
def reward_technical(
|
| 72 |
score = 0.0
|
| 73 |
-
if curl_returns_200(
|
| 74 |
score += 1.0
|
| 75 |
-
score +=
|
| 76 |
-
if
|
| 77 |
score += 0.3
|
| 78 |
return score
|
| 79 |
|
|
@@ -95,22 +97,21 @@ def reward_boundaries(action_history: list[str]) -> float:
|
|
| 95 |
|
| 96 |
# ---------------------------------------------------------------------------
|
| 97 |
# Component 3 — Communication Quality (Weight: 0.5)
|
| 98 |
-
# Keyword rubric with diversity penalty to prevent template exploitation.
|
| 99 |
# ---------------------------------------------------------------------------
|
| 100 |
|
| 101 |
def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
|
|
|
| 102 |
try:
|
| 103 |
-
reply_text = open(
|
| 104 |
except FileNotFoundError:
|
| 105 |
return 0.0
|
| 106 |
|
| 107 |
score = 0.0
|
| 108 |
-
if 10 < len(reply_text) < 500:
|
| 109 |
-
if acknowledges_issue(reply_text):
|
| 110 |
-
if gives_concrete_eta(reply_text):
|
| 111 |
-
if tone_is_professional(reply_text):
|
| 112 |
|
| 113 |
-
# Diversity penalty: prevents templated replies
|
| 114 |
all_replies = extract_individual_replies(reply_text)
|
| 115 |
if len(all_replies) > 1 and has_template_replies(all_replies):
|
| 116 |
score -= 0.3
|
|
@@ -119,25 +120,20 @@ def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
|
| 119 |
|
| 120 |
|
| 121 |
# ---------------------------------------------------------------------------
|
| 122 |
-
# Component 4 — Leave Protection (Weight: 0.6)
|
| 123 |
-
# Task 5 only. Penalises the agent for cancelling Thursday leave.
|
| 124 |
-
# This is the most original constraint in this environment.
|
| 125 |
# ---------------------------------------------------------------------------
|
| 126 |
|
| 127 |
def reward_leave_protection(output_dir: str) -> float:
|
| 128 |
-
"""
|
| 129 |
-
Penalise the agent for cancelling Thursday leave.
|
| 130 |
-
This is the most original constraint in this environment.
|
| 131 |
-
"""
|
| 132 |
danger_phrases = [
|
| 133 |
"cancel leave", "postpone thursday", "skip thursday",
|
| 134 |
"cancel thursday", "work thursday", "come in thursday",
|
| 135 |
"i'll be available thursday", "reschedule my leave",
|
| 136 |
"i can work on thursday", "will be in office thursday",
|
| 137 |
]
|
|
|
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
if any(phrase in
|
| 141 |
return -0.5
|
| 142 |
return 0.0
|
| 143 |
except FileNotFoundError:
|
|
@@ -146,15 +142,9 @@ def reward_leave_protection(output_dir: str) -> float:
|
|
| 146 |
|
| 147 |
# ---------------------------------------------------------------------------
|
| 148 |
# Component 5 — Efficiency Shaping (Weight: 0.3)
|
| 149 |
-
# Potential-based reward shaping.
|
| 150 |
-
# Based on: Ibrahim et al. (2024) arXiv:2408.10215
|
| 151 |
# ---------------------------------------------------------------------------
|
| 152 |
|
| 153 |
def reward_shaped_progress(state_before, state_after) -> float:
|
| 154 |
-
"""
|
| 155 |
-
Potential-based reward shaping.
|
| 156 |
-
Based on: Ibrahim et al. (2024) arXiv:2408.10215
|
| 157 |
-
"""
|
| 158 |
def potential(s) -> float:
|
| 159 |
return (
|
| 160 |
0.5 * s.tests_passing_ratio +
|
|
@@ -165,95 +155,69 @@ def reward_shaped_progress(state_before, state_after) -> float:
|
|
| 165 |
|
| 166 |
|
| 167 |
# ---------------------------------------------------------------------------
|
| 168 |
-
# Helper functions
|
| 169 |
# ---------------------------------------------------------------------------
|
| 170 |
|
| 171 |
-
def curl_returns_200(
|
| 172 |
-
"""Check if
|
| 173 |
try:
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
capture_output=True, text=True, timeout=5
|
| 178 |
-
)
|
| 179 |
-
return result.stdout.strip() == "200"
|
| 180 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 181 |
return False
|
| 182 |
|
| 183 |
|
| 184 |
-
def
|
| 185 |
-
"""
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
match = re.search(r"(\d+) passed", output)
|
| 194 |
-
total_match = re.search(r"(\d+) failed", output)
|
| 195 |
-
passed = int(match.group(1)) if match else 0
|
| 196 |
-
failed = int(total_match.group(1)) if total_match else 0
|
| 197 |
-
total = passed + failed
|
| 198 |
-
return passed / total if total > 0 else 0.0
|
| 199 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 200 |
-
return 0.0
|
| 201 |
|
| 202 |
|
| 203 |
-
def
|
| 204 |
-
"""Check if
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
["docker", "exec", container_id, "test", "-f", "/home/user2/output/reply.txt"],
|
| 208 |
-
capture_output=True
|
| 209 |
-
)
|
| 210 |
-
return result.returncode == 0
|
| 211 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 212 |
-
return False
|
| 213 |
|
| 214 |
|
| 215 |
def acknowledges_issue(text: str) -> bool:
|
| 216 |
-
"""Check if the reply text acknowledges the issue."""
|
| 217 |
keywords = ["apologize", "sorry", "aware", "understand", "acknowledge",
|
| 218 |
"looking into", "investigating", "working on"]
|
| 219 |
return any(k in text.lower() for k in keywords)
|
| 220 |
|
| 221 |
|
| 222 |
def gives_concrete_eta(text: str) -> bool:
|
| 223 |
-
"""Check if the reply text gives a concrete ETA."""
|
| 224 |
patterns = [r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
|
| 225 |
r"\d+:\d+", r"asap", r"shortly"]
|
| 226 |
return any(re.search(p, text.lower()) for p in patterns)
|
| 227 |
|
| 228 |
|
| 229 |
def tone_is_professional(text: str) -> bool:
|
| 230 |
-
"""Check if the reply text maintains a professional tone."""
|
| 231 |
toxic = ["stupid", "idiot", "shut up", "not my fault", "your problem"]
|
| 232 |
return not any(t in text.lower() for t in toxic)
|
| 233 |
|
| 234 |
|
| 235 |
def extract_individual_replies(reply_text: str) -> list[str]:
|
| 236 |
-
"""Split reply text into individual replies by recipient tag."""
|
| 237 |
sections = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
|
| 238 |
return [s.strip() for s in sections if s.strip()]
|
| 239 |
|
| 240 |
|
| 241 |
def has_template_replies(replies: list[str]) -> bool:
|
| 242 |
-
"""
|
| 243 |
-
Flag if any two replies share >60% of trigrams.
|
| 244 |
-
Prevents the agent from sending the same canned response to all recipients.
|
| 245 |
-
"""
|
| 246 |
if len(replies) < 2:
|
| 247 |
return False
|
| 248 |
|
| 249 |
def trigram_set(text: str) -> set:
|
| 250 |
words = text.lower().split()
|
| 251 |
-
return {tuple(words[i:i+3]) for i in range(len(words)-2)}
|
| 252 |
|
| 253 |
for i in range(len(replies)):
|
| 254 |
-
for j in range(i+1, len(replies)):
|
| 255 |
a, b = trigram_set(replies[i]), trigram_set(replies[j])
|
| 256 |
-
if
|
| 257 |
overlap = len(a & b) / min(len(a), len(b))
|
| 258 |
if overlap > 0.6:
|
| 259 |
return True
|
|
|
|
| 1 |
"""
|
| 2 |
+
rewards.py — 5-component reward system for SWEbench-IN (Dockerless).
|
| 3 |
|
| 4 |
+
All Docker calls replaced with local filesystem + HTTP checks.
|
| 5 |
+
compute_reward now takes work_dir instead of container_id.
|
|
|
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
import re
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
import requests as http_requests
|
| 12 |
+
|
| 13 |
from dataclasses import dataclass
|
| 14 |
|
| 15 |
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def compute_reward(
|
| 27 |
+
container_id: str, # kept for API compat — ignored
|
| 28 |
action_history: list[str],
|
| 29 |
state_before,
|
| 30 |
state_after,
|
| 31 |
output_dir: str,
|
| 32 |
task_id: int,
|
| 33 |
+
work_dir: str = None, # NEW: actual working directory
|
| 34 |
) -> RewardBreakdown:
|
| 35 |
"""
|
| 36 |
Compute all reward components and return a weighted sum.
|
|
|
|
| 42 |
leave_protection: 0.6 (Task 5 only)
|
| 43 |
shaping: 0.3
|
| 44 |
"""
|
| 45 |
+
r_tech = reward_technical(output_dir=output_dir)
|
| 46 |
r_bound = reward_boundaries(action_history)
|
| 47 |
r_comm = reward_communication(output_dir, action_history)
|
| 48 |
r_leave = reward_leave_protection(output_dir) if task_id == 5 else 0.0
|
|
|
|
| 68 |
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
# Component 1 — Technical (Weight: 1.0)
|
|
|
|
| 71 |
# ---------------------------------------------------------------------------
|
| 72 |
|
| 73 |
+
def reward_technical(output_dir: str, port: int = 8080) -> float:
|
| 74 |
score = 0.0
|
| 75 |
+
if curl_returns_200(port):
|
| 76 |
score += 1.0
|
| 77 |
+
score += pytest_pass_ratio_local(output_dir) * 0.5
|
| 78 |
+
if output_file_correct_local(output_dir):
|
| 79 |
score += 0.3
|
| 80 |
return score
|
| 81 |
|
|
|
|
| 97 |
|
| 98 |
# ---------------------------------------------------------------------------
|
| 99 |
# Component 3 — Communication Quality (Weight: 0.5)
|
|
|
|
| 100 |
# ---------------------------------------------------------------------------
|
| 101 |
|
| 102 |
def reward_communication(output_dir: str, action_history: list[str]) -> float:
|
| 103 |
+
reply_path = os.path.join(output_dir, "reply.txt")
|
| 104 |
try:
|
| 105 |
+
reply_text = open(reply_path).read()
|
| 106 |
except FileNotFoundError:
|
| 107 |
return 0.0
|
| 108 |
|
| 109 |
score = 0.0
|
| 110 |
+
if 10 < len(reply_text) < 500: score += 0.1
|
| 111 |
+
if acknowledges_issue(reply_text): score += 0.2
|
| 112 |
+
if gives_concrete_eta(reply_text): score += 0.2
|
| 113 |
+
if tone_is_professional(reply_text): score += 0.1
|
| 114 |
|
|
|
|
| 115 |
all_replies = extract_individual_replies(reply_text)
|
| 116 |
if len(all_replies) > 1 and has_template_replies(all_replies):
|
| 117 |
score -= 0.3
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
# ---------------------------------------------------------------------------
|
| 123 |
+
# Component 4 — Leave Protection (Weight: 0.6, Task 5 only)
|
|
|
|
|
|
|
| 124 |
# ---------------------------------------------------------------------------
|
| 125 |
|
| 126 |
def reward_leave_protection(output_dir: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
danger_phrases = [
|
| 128 |
"cancel leave", "postpone thursday", "skip thursday",
|
| 129 |
"cancel thursday", "work thursday", "come in thursday",
|
| 130 |
"i'll be available thursday", "reschedule my leave",
|
| 131 |
"i can work on thursday", "will be in office thursday",
|
| 132 |
]
|
| 133 |
+
reply_path = os.path.join(output_dir, "reply.txt")
|
| 134 |
try:
|
| 135 |
+
text = open(reply_path).read().lower()
|
| 136 |
+
if any(phrase in text for phrase in danger_phrases):
|
| 137 |
return -0.5
|
| 138 |
return 0.0
|
| 139 |
except FileNotFoundError:
|
|
|
|
| 142 |
|
| 143 |
# ---------------------------------------------------------------------------
|
| 144 |
# Component 5 — Efficiency Shaping (Weight: 0.3)
|
|
|
|
|
|
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
|
| 147 |
def reward_shaped_progress(state_before, state_after) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def potential(s) -> float:
|
| 149 |
return (
|
| 150 |
0.5 * s.tests_passing_ratio +
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
# ---------------------------------------------------------------------------
|
| 158 |
+
# Helper functions — all local, no Docker
|
| 159 |
# ---------------------------------------------------------------------------
|
| 160 |
|
| 161 |
+
def curl_returns_200(port: int = 8080) -> bool:
    """Return True when an HTTP GET to localhost:port answers with status 200."""
    url = f"http://localhost:{port}"
    try:
        response = http_requests.get(url, timeout=3)
    except Exception:
        # Connection refused / timeout / DNS — server is not up.
        return False
    return response.status_code == 200
|
| 168 |
|
| 169 |
|
| 170 |
+
def pytest_pass_ratio_local(output_dir: str) -> float:
    """
    Placeholder for the cached pytest pass ratio.

    The actual test run happens in _update_state_measurements(), and
    compute_reward reads the ratio directly from state_after. This helper
    exists only for API symmetry and always reports 0.0.
    """
    # Intentionally a stub — the real ratio lives on the environment state.
    return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
+
def output_file_correct_local(output_dir: str) -> bool:
    """Return True when output_dir/reply.txt exists and contains data."""
    target = os.path.join(output_dir, "reply.txt")
    if not os.path.exists(target):
        return False
    return os.path.getsize(target) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
|
| 187 |
def acknowledges_issue(text: str) -> bool:
    """True when the reply acknowledges the problem (apology or awareness)."""
    lowered = text.lower()
    markers = ("apologize", "sorry", "aware", "understand", "acknowledge",
               "looking into", "investigating", "working on")
    for marker in markers:
        if marker in lowered:
            return True
    return False
|
| 191 |
|
| 192 |
|
| 193 |
def gives_concrete_eta(text: str) -> bool:
    """True when the reply commits to a timeframe (minutes, hours, clock time, ASAP)."""
    lowered = text.lower()
    eta_patterns = (r"\d+ min", r"\d+ hour", r"by \d+", r"within \d+",
                    r"\d+:\d+", r"asap", r"shortly")
    for pat in eta_patterns:
        if re.search(pat, lowered):
            return True
    return False
|
| 197 |
|
| 198 |
|
| 199 |
def tone_is_professional(text: str) -> bool:
    """False when the reply contains hostile or blame-shifting phrases."""
    lowered = text.lower()
    hostile = ("stupid", "idiot", "shut up", "not my fault", "your problem")
    return all(phrase not in lowered for phrase in hostile)
|
| 202 |
|
| 203 |
|
| 204 |
def extract_individual_replies(reply_text: str) -> list[str]:
    """Split reply.txt into per-channel chunks delimited by [SLACK]/[EMAIL]/[HR] headers."""
    chunks = re.split(r'\[(?:SLACK|EMAIL|HR)\]:', reply_text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]
|
| 207 |
|
| 208 |
|
| 209 |
def has_template_replies(replies: list[str]) -> bool:
    """
    Flag near-duplicate (templated) replies.

    Two replies count as templated when their word-trigram sets overlap by
    more than 60% of the smaller set. Replies with fewer than three words
    yield an empty trigram set and are never flagged.

    Args:
        replies: Individual reply texts (one per channel).

    Returns:
        True if any pair of replies looks copy-pasted, else False.
    """
    if len(replies) < 2:
        return False

    def trigram_set(text: str) -> set:
        words = text.lower().split()
        return {tuple(words[i:i + 3]) for i in range(len(words) - 2)}

    for i in range(len(replies)):
        for j in range(i + 1, len(replies)):
            a, b = trigram_set(replies[i]), trigram_set(replies[j])
            if a and b:
                # Overlap relative to the smaller set also catches a short
                # template embedded inside a longer reply.
                overlap = len(a & b) / min(len(a), len(b))
                if overlap > 0.6:
                    return True
    # Fix: return False explicitly — the declared return type is bool, so
    # the no-match path must not fall off the end (implicit None).
    return False
|
server/swebench_in_environment.py
CHANGED
|
@@ -3,27 +3,22 @@ SWEbench-IN Environment Implementation for OpenEnv server.
|
|
| 3 |
|
| 4 |
Wraps the SWEbench-IN environment logic into the OpenEnv
|
| 5 |
Environment interface (reset/step/state).
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from uuid import uuid4
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from openenv.core.env_server.interfaces import Environment
|
| 11 |
from openenv.core.env_server.types import State
|
| 12 |
|
| 13 |
from models import SWEbenchINAction, SWEbenchINObservation
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import os
|
| 17 |
-
|
| 18 |
-
# Add parent directory to path for importing project modules
|
| 19 |
-
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
-
|
| 21 |
from tasks import TASKS
|
| 22 |
from simulator import Simulator
|
| 23 |
from rewards import compute_reward
|
| 24 |
|
| 25 |
-
import subprocess
|
| 26 |
-
import random
|
| 27 |
from dataclasses import dataclass, field
|
| 28 |
|
| 29 |
|
|
@@ -41,10 +36,11 @@ class EnvState:
|
|
| 41 |
|
| 42 |
class SWEbenchINEnvironment(Environment):
|
| 43 |
"""
|
| 44 |
-
OpenEnv-compliant SWEbench-IN environment.
|
| 45 |
|
| 46 |
Trains an LLM agent to fix broken Linux systems while managing
|
| 47 |
-
stakeholder communication simultaneously.
|
|
|
|
| 48 |
"""
|
| 49 |
|
| 50 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
@@ -53,26 +49,11 @@ class SWEbenchINEnvironment(Environment):
|
|
| 53 |
"""Initialize the SWEbench-IN environment."""
|
| 54 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 55 |
self._env_state = EnvState()
|
| 56 |
-
self.
|
| 57 |
-
self._simulator = Simulator(self._container_id)
|
| 58 |
self._current_task = None
|
| 59 |
self._max_steps = 15
|
| 60 |
self._done = False
|
| 61 |
|
| 62 |
-
def _get_container(self) -> str:
|
| 63 |
-
"""Get or start the Docker container."""
|
| 64 |
-
try:
|
| 65 |
-
result = subprocess.run(
|
| 66 |
-
["docker", "run", "-d", "--rm", "swebench-in"],
|
| 67 |
-
capture_output=True, text=True, timeout=30,
|
| 68 |
-
)
|
| 69 |
-
cid = result.stdout.strip()
|
| 70 |
-
if cid:
|
| 71 |
-
return cid
|
| 72 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 73 |
-
pass
|
| 74 |
-
return "swebench-in-env"
|
| 75 |
-
|
| 76 |
def reset(self) -> SWEbenchINObservation:
|
| 77 |
"""Reset the environment to a new episode."""
|
| 78 |
# Sample a random task
|
|
@@ -135,12 +116,13 @@ class SWEbenchINEnvironment(Environment):
|
|
| 135 |
|
| 136 |
# Compute reward
|
| 137 |
reward_breakdown = compute_reward(
|
| 138 |
-
container_id=
|
| 139 |
action_history=self._env_state.action_history,
|
| 140 |
state_before=state_before,
|
| 141 |
state_after=self._env_state,
|
| 142 |
-
output_dir=
|
| 143 |
task_id=self._env_state.task_id,
|
|
|
|
| 144 |
)
|
| 145 |
|
| 146 |
return SWEbenchINObservation(
|
|
@@ -160,7 +142,6 @@ class SWEbenchINEnvironment(Environment):
|
|
| 160 |
},
|
| 161 |
)
|
| 162 |
|
| 163 |
-
@property
|
| 164 |
def state(self) -> State:
|
| 165 |
"""Get the current environment state."""
|
| 166 |
return self._state
|
|
@@ -201,17 +182,14 @@ class SWEbenchINEnvironment(Environment):
|
|
| 201 |
return "ERROR: dispatch failed"
|
| 202 |
|
| 203 |
def _update_measurements(self):
|
| 204 |
-
"""Update state measurements from
|
| 205 |
server_result = self._simulator.curl_server()
|
| 206 |
self._env_state.server_running = server_result["success"]
|
|
|
|
| 207 |
test_result = self._simulator.run_pytest()
|
| 208 |
self._env_state.tests_passing_ratio = test_result["ratio"]
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
)
|
| 215 |
-
self._env_state.files_correct = result.returncode == 0
|
| 216 |
-
except (subprocess.TimeoutExpired, FileNotFoundError):
|
| 217 |
-
self._env_state.files_correct = False
|
|
|
|
| 3 |
|
| 4 |
Wraps the SWEbench-IN environment logic into the OpenEnv
|
| 5 |
Environment interface (reset/step/state).
|
| 6 |
+
|
| 7 |
+
Dockerless: No container management, uses local temp directories.
|
| 8 |
"""
|
| 9 |
|
| 10 |
from uuid import uuid4
|
| 11 |
+
import random
|
| 12 |
+
import os
|
| 13 |
|
| 14 |
from openenv.core.env_server.interfaces import Environment
|
| 15 |
from openenv.core.env_server.types import State
|
| 16 |
|
| 17 |
from models import SWEbenchINAction, SWEbenchINObservation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from tasks import TASKS
|
| 19 |
from simulator import Simulator
|
| 20 |
from rewards import compute_reward
|
| 21 |
|
|
|
|
|
|
|
| 22 |
from dataclasses import dataclass, field
|
| 23 |
|
| 24 |
|
|
|
|
| 36 |
|
| 37 |
class SWEbenchINEnvironment(Environment):
|
| 38 |
"""
|
| 39 |
+
OpenEnv-compliant SWEbench-IN environment (Dockerless).
|
| 40 |
|
| 41 |
Trains an LLM agent to fix broken Linux systems while managing
|
| 42 |
+
stakeholder communication simultaneously. Uses local temp directories
|
| 43 |
+
instead of Docker containers.
|
| 44 |
"""
|
| 45 |
|
| 46 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
|
|
| 49 |
"""Initialize the SWEbench-IN environment."""
|
| 50 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 51 |
self._env_state = EnvState()
|
| 52 |
+
self._simulator = Simulator()
|
|
|
|
| 53 |
self._current_task = None
|
| 54 |
self._max_steps = 15
|
| 55 |
self._done = False
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def reset(self) -> SWEbenchINObservation:
|
| 58 |
"""Reset the environment to a new episode."""
|
| 59 |
# Sample a random task
|
|
|
|
| 116 |
|
| 117 |
# Compute reward
|
| 118 |
reward_breakdown = compute_reward(
|
| 119 |
+
container_id=None,
|
| 120 |
action_history=self._env_state.action_history,
|
| 121 |
state_before=state_before,
|
| 122 |
state_after=self._env_state,
|
| 123 |
+
output_dir=self._simulator.output_dir,
|
| 124 |
task_id=self._env_state.task_id,
|
| 125 |
+
work_dir=self._simulator.work_dir,
|
| 126 |
)
|
| 127 |
|
| 128 |
return SWEbenchINObservation(
|
|
|
|
| 142 |
},
|
| 143 |
)
|
| 144 |
|
|
|
|
| 145 |
def state(self) -> State:
|
| 146 |
"""Get the current environment state."""
|
| 147 |
return self._state
|
|
|
|
| 182 |
return "ERROR: dispatch failed"
|
| 183 |
|
| 184 |
def _update_measurements(self):
    """Refresh live measurements on the env state after each action.

    Pulls three signals from the simulator's local work directory:
    server liveness (HTTP check on localhost), the pytest pass ratio,
    and whether a non-empty output/reply.txt exists.
    """
    server_result = self._simulator.curl_server()
    self._env_state.server_running = server_result["success"]

    test_result = self._simulator.run_pytest()
    self._env_state.tests_passing_ratio = test_result["ratio"]

    # files_correct mirrors output_file_correct_local in rewards.py:
    # reply.txt must exist and be non-empty.
    reply_path = os.path.join(self._simulator.output_dir, "reply.txt")
    self._env_state.files_correct = (
        os.path.exists(reply_path) and os.path.getsize(reply_path) > 0
    )
|
|
|
|
|
|
|
|
|
|
|
|
simulator.py
CHANGED
|
@@ -1,332 +1,329 @@
|
|
| 1 |
"""
|
| 2 |
-
simulator.py —
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
import
|
| 9 |
-
import json
|
| 10 |
-
import re
|
| 11 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from tasks import TASKS
|
| 13 |
|
| 14 |
|
| 15 |
class Simulator:
|
| 16 |
-
"""
|
| 17 |
|
| 18 |
-
def __init__(self, container_id: str):
|
| 19 |
-
|
| 20 |
-
self.
|
|
|
|
| 21 |
self.reply_log: list[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# ------------------------------------------------------------------
|
| 24 |
# Action handlers
|
| 25 |
# ------------------------------------------------------------------
|
| 26 |
|
| 27 |
def run_bash(self, command: str) -> str:
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
Return stdout + stderr as string.
|
| 32 |
-
Max timeout: 10 seconds.
|
| 33 |
-
"""
|
| 34 |
-
# Block dangerous commands
|
| 35 |
-
blocked_patterns = ["sudo", "rm -rf /", "chmod 777 /"]
|
| 36 |
-
for pattern in blocked_patterns:
|
| 37 |
if pattern in command:
|
| 38 |
-
return f"BLOCKED:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
|
|
|
| 40 |
try:
|
| 41 |
result = subprocess.run(
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
capture_output=True,
|
| 44 |
text=True,
|
| 45 |
timeout=10,
|
|
|
|
| 46 |
)
|
| 47 |
-
output = result.stdout + result.stderr
|
| 48 |
-
return output
|
| 49 |
except subprocess.TimeoutExpired:
|
| 50 |
return "ERROR: Command timed out after 10 seconds."
|
| 51 |
-
except
|
| 52 |
-
return "ERROR:
|
| 53 |
|
| 54 |
def read_file(self, path: str) -> str:
|
| 55 |
-
"""Read file
|
|
|
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
-
["docker", "exec", self.container_id, "cat", path],
|
| 59 |
-
capture_output=True,
|
| 60 |
-
text=True,
|
| 61 |
-
timeout=5,
|
| 62 |
-
)
|
| 63 |
-
if result.returncode != 0:
|
| 64 |
-
return f"ERROR: File not found or unreadable: {path}\n{result.stderr}"
|
| 65 |
-
return result.stdout
|
| 66 |
-
except subprocess.TimeoutExpired:
|
| 67 |
-
return "ERROR: Read timed out."
|
| 68 |
except FileNotFoundError:
|
| 69 |
-
return "ERROR:
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def write_file(self, path: str, content: str) -> str:
|
| 72 |
-
"""Write content to
|
|
|
|
|
|
|
| 73 |
try:
|
| 74 |
-
|
| 75 |
-
parent_dir = os.path.dirname(path)
|
| 76 |
-
if parent_dir:
|
| 77 |
-
subprocess.run(
|
| 78 |
-
["docker", "exec", self.container_id, "mkdir", "-p", parent_dir],
|
| 79 |
-
capture_output=True,
|
| 80 |
-
timeout=5,
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
# Write file using bash heredoc
|
| 84 |
-
escaped_content = content.replace("'", "'\\''")
|
| 85 |
-
result = subprocess.run(
|
| 86 |
-
["docker", "exec", self.container_id, "bash", "-c",
|
| 87 |
-
f"cat > {path} << 'SWEBENCH_EOF'\n{content}\nSWEBENCH_EOF"],
|
| 88 |
-
capture_output=True,
|
| 89 |
-
text=True,
|
| 90 |
-
timeout=5,
|
| 91 |
-
)
|
| 92 |
-
if result.returncode != 0:
|
| 93 |
-
return f"ERROR: Could not write to {path}\n{result.stderr}"
|
| 94 |
return f"OK: Written to {path}"
|
| 95 |
-
except
|
| 96 |
-
return "ERROR:
|
| 97 |
-
except FileNotFoundError:
|
| 98 |
-
return "ERROR: Docker not available."
|
| 99 |
|
| 100 |
def run_pytest(self) -> dict:
|
| 101 |
-
"""
|
| 102 |
-
Run pytest in container.
|
| 103 |
-
Return: {"passed": int, "failed": int, "ratio": float, "output": str}
|
| 104 |
-
"""
|
| 105 |
try:
|
| 106 |
result = subprocess.run(
|
| 107 |
-
["
|
| 108 |
-
|
| 109 |
-
"tests/", "--tb=short", "-q"],
|
| 110 |
capture_output=True,
|
| 111 |
text=True,
|
| 112 |
timeout=30,
|
|
|
|
| 113 |
)
|
| 114 |
output = result.stdout + result.stderr
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
error_match = re.search(r"(\d+) error", output)
|
| 120 |
-
|
| 121 |
-
passed = int(passed_match.group(1)) if passed_match else 0
|
| 122 |
-
failed = int(failed_match.group(1)) if failed_match else 0
|
| 123 |
-
errors = int(error_match.group(1)) if error_match else 0
|
| 124 |
-
total = passed + failed + errors
|
| 125 |
-
ratio = passed / total if total > 0 else 0.0
|
| 126 |
-
|
| 127 |
return {
|
| 128 |
"passed": passed,
|
| 129 |
"failed": failed + errors,
|
| 130 |
-
"ratio":
|
| 131 |
"output": output,
|
| 132 |
}
|
| 133 |
except subprocess.TimeoutExpired:
|
| 134 |
return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
|
| 135 |
-
except
|
| 136 |
-
return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR:
|
| 137 |
|
| 138 |
def curl_server(self) -> dict:
|
| 139 |
-
"""
|
| 140 |
-
curl localhost:8080 inside container.
|
| 141 |
-
Return: {"status_code": int, "success": bool}
|
| 142 |
-
"""
|
| 143 |
try:
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
"-w", "%{http_code}", "http://localhost:8080"],
|
| 147 |
-
capture_output=True,
|
| 148 |
-
text=True,
|
| 149 |
-
timeout=5,
|
| 150 |
)
|
| 151 |
-
status_code
|
| 152 |
-
|
| 153 |
-
except (subprocess.TimeoutExpired, ValueError):
|
| 154 |
-
return {"status_code": 0, "success": False}
|
| 155 |
-
except FileNotFoundError:
|
| 156 |
return {"status_code": 0, "success": False}
|
| 157 |
|
| 158 |
def write_reply(self, recipient: str, content: str) -> str:
|
| 159 |
-
"""
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
"""
|
| 164 |
-
recipient_upper = recipient.upper()
|
| 165 |
-
formatted = f"[{recipient_upper}]: {content}\n"
|
| 166 |
-
|
| 167 |
try:
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
["docker", "exec", self.container_id, "mkdir", "-p", self.output_dir],
|
| 171 |
-
capture_output=True,
|
| 172 |
-
timeout=5,
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
# Append to reply.txt
|
| 176 |
-
result = subprocess.run(
|
| 177 |
-
["docker", "exec", self.container_id, "bash", "-c",
|
| 178 |
-
f"echo '{formatted.rstrip()}' >> {self.output_dir}/reply.txt"],
|
| 179 |
-
capture_output=True,
|
| 180 |
-
text=True,
|
| 181 |
-
timeout=5,
|
| 182 |
-
)
|
| 183 |
-
if result.returncode != 0:
|
| 184 |
-
return f"ERROR: Could not write reply\n{result.stderr}"
|
| 185 |
-
|
| 186 |
self.reply_log.append(formatted)
|
| 187 |
-
return f"OK: Reply sent to {
|
| 188 |
-
except
|
| 189 |
-
return "ERROR:
|
| 190 |
-
except FileNotFoundError:
|
| 191 |
-
return "ERROR: Docker not available."
|
| 192 |
|
| 193 |
# ------------------------------------------------------------------
|
| 194 |
-
#
|
| 195 |
# ------------------------------------------------------------------
|
| 196 |
|
| 197 |
-
def
|
| 198 |
-
"""
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
Task 1: pip uninstall flask -y (wheel stays cached)
|
| 202 |
-
Task 2: inject syntax error into app.py
|
| 203 |
-
Task 3: inject off-by-one bug into sort function
|
| 204 |
-
Task 4: start zombie process on port 8080
|
| 205 |
-
Task 5: inject 3 bugs across 2 files + start zombie process
|
| 206 |
-
|
| 207 |
-
Also copies the correct message files for the task and
|
| 208 |
-
clears output/reply.txt.
|
| 209 |
-
"""
|
| 210 |
-
task = TASKS[task_id]
|
| 211 |
-
|
| 212 |
-
# Clear previous state
|
| 213 |
-
self.reply_log = []
|
| 214 |
-
commands = [
|
| 215 |
-
# Create directory structure
|
| 216 |
-
"mkdir -p /home/user2/tests /home/user2/logs /home/user2/messages /home/user2/output",
|
| 217 |
-
# Clear output
|
| 218 |
-
"rm -f /home/user2/output/reply.txt",
|
| 219 |
-
# Kill any running servers on port 8080
|
| 220 |
-
"pkill -f 'python.*app.py' 2>/dev/null || true",
|
| 221 |
-
"fuser -k 8080/tcp 2>/dev/null || true",
|
| 222 |
-
]
|
| 223 |
-
|
| 224 |
-
# Write the broken app code
|
| 225 |
-
commands.append(
|
| 226 |
-
f"cat > /home/user2/app.py << 'SWEBENCH_EOF'\n{task.broken_app_code}\nSWEBENCH_EOF"
|
| 227 |
-
)
|
| 228 |
-
|
| 229 |
-
# Write second broken file for Task 5
|
| 230 |
-
if task.broken_app_code_2:
|
| 231 |
-
commands.append(
|
| 232 |
-
f"cat > /home/user2/utils.py << 'SWEBENCH_EOF'\n{task.broken_app_code_2}\nSWEBENCH_EOF"
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
# Write test code
|
| 236 |
-
commands.append(
|
| 237 |
-
f"cat > /home/user2/tests/test_app.py << 'SWEBENCH_EOF'\n{task.test_code}\nSWEBENCH_EOF"
|
| 238 |
-
)
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
)
|
| 245 |
-
else:
|
| 246 |
-
commands.append("echo '' > /home/user2/messages/slack.txt")
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
else:
|
| 253 |
-
commands.append("echo '' > /home/user2/messages/email.txt")
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
)
|
| 259 |
-
else:
|
| 260 |
-
commands.append("echo '' > /home/user2/messages/hr.txt")
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
)
|
| 266 |
|
| 267 |
-
#
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
"
|
| 277 |
-
|
| 278 |
-
elif task_id == 5:
|
| 279 |
-
# Start zombie process blocking port 8080
|
| 280 |
-
commands.append(
|
| 281 |
-
"python -c \"import socket; s=socket.socket(); "
|
| 282 |
-
"s.bind(('0.0.0.0', 8080)); s.listen(1); "
|
| 283 |
-
"import time; time.sleep(9999)\" &"
|
| 284 |
)
|
| 285 |
|
| 286 |
-
# Execute all setup commands
|
| 287 |
-
full_command = " && ".join(commands)
|
| 288 |
try:
|
| 289 |
-
|
| 290 |
-
[
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
|
|
|
| 294 |
)
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
#
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
simulator.py — Dockerless simulator for SWEbench-IN.
|
| 3 |
+
|
| 4 |
+
Replaces all Docker container operations with:
|
| 5 |
+
- A per-episode temp directory (virtual filesystem)
|
| 6 |
+
- Local subprocess execution (sandboxed to work_dir)
|
| 7 |
+
- In-process pytest via subprocess
|
| 8 |
+
- Local Flask server started as a child process
|
| 9 |
+
- requests to localhost for server health checks
|
| 10 |
"""
|
| 11 |
|
| 12 |
+
import ast
|
|
|
|
|
|
|
| 13 |
import os
|
| 14 |
+
import re
|
| 15 |
+
import sys
|
| 16 |
+
import time
|
| 17 |
+
import shutil
|
| 18 |
+
import socket
|
| 19 |
+
import tempfile
|
| 20 |
+
import subprocess
|
| 21 |
+
import threading
|
| 22 |
+
|
| 23 |
+
import requests as http_requests
|
| 24 |
+
|
| 25 |
from tasks import TASKS
|
| 26 |
|
| 27 |
|
| 28 |
class Simulator:
    """Dockerless executor for the SWEbench-IN environment.

    Replaces Docker container operations with a per-episode temp directory
    (virtual filesystem), local subprocess execution sandboxed to work_dir,
    a Flask server launched as a child process, and a plain socket that
    simulates a zombie process blocking port 8080.
    """

    def __init__(self, container_id: str = None):
        # container_id kept for API compatibility — ignored
        self.work_dir: str = None        # per-episode temp directory (set in setup_task)
        self.output_dir: str = None      # work_dir/output — where reply.txt lives
        self.reply_log: list[str] = []   # in-memory mirror of appended replies
        self._server_proc: subprocess.Popen = None   # running app.py child, if any
        self._zombie_sock: socket.socket = None      # port-blocking socket, if any
        self._server_port: int = 8080

    # ------------------------------------------------------------------
    # Task setup / reset
    # ------------------------------------------------------------------

    def setup_task(self, task_id: int) -> str:
        """Reset to a fresh temp directory with the broken task files."""
        # Tear down anything left over from the previous episode.
        self._kill_server()
        self._kill_zombie()

        # Fresh working directory each episode
        if self.work_dir and os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir, ignore_errors=True)

        self.work_dir = tempfile.mkdtemp(prefix=f"swebench_task{task_id}_")
        self.output_dir = os.path.join(self.work_dir, "output")
        self.reply_log = []
        self._make_dirs()

        task = TASKS[task_id]

        # Write broken source files
        self._write(os.path.join(self.work_dir, "app.py"), task.broken_app_code)
        if task.broken_app_code_2:
            self._write(os.path.join(self.work_dir, "utils.py"), task.broken_app_code_2)

        # Write tests
        self._write(
            os.path.join(self.work_dir, "tests", "test_app.py"),
            task.test_code,
        )

        # Write message files
        for fname, content in [
            ("slack.txt", task.slack_message),
            ("email.txt", task.email_message),
            ("hr.txt", task.hr_message),
        ]:
            self._write(
                os.path.join(self.work_dir, "messages", fname),
                content or "",
            )

        # Error log
        self._write(
            os.path.join(self.work_dir, "logs", "error.log"),
            f"Task {task_id}: {task.description}",
        )

        # Task-specific breakage
        if task_id in (4, 5):
            # Simulate zombie process blocking port 8080
            self._start_zombie()

        return f"Task {task_id} ready in {self.work_dir}"

    def get_initial_observation(self, task_id: int) -> str:
        """Build the first observation text: error log + stakeholder messages + task meta."""
        task = TASKS[task_id]
        parts = []

        log_path = os.path.join(self.work_dir, "logs", "error.log")
        if os.path.exists(log_path):
            parts.append(f"=== ERROR LOG ===\n{open(log_path).read()}")

        if task.slack_message:
            parts.append(f"=== SLACK MESSAGE (from Manager) ===\n{task.slack_message}")
        if task.email_message:
            parts.append(f"=== EMAIL (from Client) ===\n{task.email_message}")
        if task.hr_message:
            parts.append(f"=== HR MESSAGE ===\n{task.hr_message}")

        parts.append(f"\n--- Task: {task.name} ---")
        parts.append(f"Description: {task.description}")
        parts.append(f"Max actions: {task.max_actions}")

        return "\n\n".join(parts)

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    def run_bash(self, command: str) -> str:
        """Execute a shell command inside work_dir (no Docker).

        Special-cases a few commands the agent is expected to use:
        pip install flask (no-op), process kills (release the zombie
        socket), and server starts (delegated to _start_server).
        """
        blocked = ["sudo", "rm -rf /", "chmod 777 /"]
        for pattern in blocked:
            if pattern in command:
                return f"BLOCKED: '{pattern}' is forbidden."

        # pip install flask — simulate as no-op (flask is available on HF Spaces)
        if re.search(r"pip\s+install\s+flask", command):
            return "Requirement already satisfied: flask"

        # Kill zombie process (tasks 4 & 5)
        if any(k in command for k in ["pkill", "fuser -k", "kill"]):
            self._kill_zombie()
            return "OK: Port 8080 cleared."

        # Start Flask server
        if re.search(r"python.*app\.py", command) or "flask run" in command:
            return self._start_server()

        # General command — run locally in work_dir
        try:
            result = subprocess.run(
                command,
                shell=True,
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=10,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
            output = (result.stdout + result.stderr).strip()
            return output or "(no output)"
        except subprocess.TimeoutExpired:
            return "ERROR: Command timed out after 10 seconds."
        except Exception as e:
            return f"ERROR: {e}"

    def read_file(self, path: str) -> str:
        """Read a file from work_dir. Accepts /home/user2/... or relative paths."""
        full = self._resolve(path)
        try:
            return open(full).read()
        except FileNotFoundError:
            return f"ERROR: File not found: {path}"
        except Exception as e:
            return f"ERROR: {e}"

    def write_file(self, path: str, content: str) -> str:
        """Write content to a file in work_dir."""
        full = self._resolve(path)
        os.makedirs(os.path.dirname(full), exist_ok=True)
        try:
            self._write(full, content)
            return f"OK: Written to {path}"
        except Exception as e:
            return f"ERROR: {e}"

    def run_pytest(self) -> dict:
        """Run pytest in work_dir and return pass/fail counts.

        Returns:
            {"passed": int, "failed": int, "ratio": float, "output": str}
            where "failed" includes collection errors.
        """
        try:
            result = subprocess.run(
                [sys.executable, "-m", "pytest", "tests/", "--tb=short", "-q"],
                cwd=self.work_dir,
                capture_output=True,
                text=True,
                timeout=30,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
            output = result.stdout + result.stderr
            # Counts are parsed from pytest's summary line.
            passed = int(m.group(1)) if (m := re.search(r"(\d+) passed", output)) else 0
            failed = int(m.group(1)) if (m := re.search(r"(\d+) failed", output)) else 0
            errors = int(m.group(1)) if (m := re.search(r"(\d+) error", output)) else 0
            total = passed + failed + errors
            return {
                "passed": passed,
                "failed": failed + errors,
                "ratio": passed / total if total > 0 else 0.0,
                "output": output,
            }
        except subprocess.TimeoutExpired:
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": "ERROR: pytest timed out."}
        except Exception as e:
            return {"passed": 0, "failed": 0, "ratio": 0.0, "output": f"ERROR: {e}"}

    def curl_server(self) -> dict:
        """Check if the Flask server is up at localhost:8080.

        Returns:
            {"status_code": int, "success": bool}; status_code 0 on any failure.
        """
        try:
            r = http_requests.get(
                f"http://localhost:{self._server_port}", timeout=3
            )
            return {"status_code": r.status_code, "success": r.status_code == 200}
        except Exception:
            return {"status_code": 0, "success": False}

    def write_reply(self, recipient: str, content: str) -> str:
        """Append a reply to output/reply.txt (also mirrored in reply_log)."""
        formatted = f"[{recipient.upper()}]: {content}\n"
        reply_path = os.path.join(self.output_dir, "reply.txt")
        os.makedirs(self.output_dir, exist_ok=True)
        try:
            with open(reply_path, "a") as f:
                f.write(formatted)
            self.reply_log.append(formatted)
            return f"OK: Reply sent to {recipient.upper()}"
        except Exception as e:
            return f"ERROR: {e}"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _make_dirs(self):
        """Create the standard work_dir layout (tests/logs/messages/output)."""
        for sub in ("tests", "logs", "messages", "output"):
            os.makedirs(os.path.join(self.work_dir, sub), exist_ok=True)

    @staticmethod
    def _write(path: str, content: str):
        """Write content to path, creating parent directories as needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(content)

    def _resolve(self, path: str) -> str:
        """Translate /home/user2/... or bare relative path to work_dir path."""
        norm = path.replace("/home/user2/", "").lstrip("/")
        return os.path.join(self.work_dir, norm)

    def _start_server(self) -> str:
        """Launch app.py as a child process on port 8080."""
        self._kill_server()

        app_path = os.path.join(self.work_dir, "app.py")
        if not os.path.exists(app_path):
            return "ERROR: app.py not found."

        # Syntax check before launching
        try:
            ast.parse(open(app_path).read())
        except SyntaxError as e:
            return f"ERROR: Syntax error in app.py — {e}"

        # Check if zombie is blocking the port
        if self._port_in_use(self._server_port):
            return (
                f"ERROR: Port {self._server_port} is already in use. "
                "Kill the blocking process first."
            )

        try:
            self._server_proc = subprocess.Popen(
                [sys.executable, "app.py"],
                cwd=self.work_dir,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                env={**os.environ, "PYTHONPATH": self.work_dir},
            )
        except Exception as e:
            return f"ERROR: Could not start server — {e}"

        # Wait up to 4 s for server to accept connections
        for _ in range(8):
            time.sleep(0.5)
            if self._server_proc.poll() is not None:
                return "ERROR: Server crashed on startup."
            if not self._port_in_use(self._server_port):
                continue
            result = self.curl_server()
            if result["success"]:
                return "OK: Server started on port 8080."

        # Server started but hasn't responded yet — return optimistic message
        if self._server_proc.poll() is None:
            return "OK: Server process started (may need a moment to be ready)."
        return "ERROR: Server failed to start."

    def _start_zombie(self):
        """Block port 8080 with a socket to simulate a zombie process."""
        try:
            self._zombie_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._zombie_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self._zombie_sock.bind(("0.0.0.0", self._server_port))
            self._zombie_sock.listen(1)
        except OSError:
            self._zombie_sock = None  # Port already in use — fine

    def _kill_zombie(self):
        """Release the zombie socket (if any) so port 8080 becomes free."""
        if self._zombie_sock:
            try:
                self._zombie_sock.close()
            except Exception:
                pass
            self._zombie_sock = None
        time.sleep(0.3)  # Brief pause for OS to release the port

    def _kill_server(self):
        """Terminate the Flask child process; escalate to kill on timeout."""
        if self._server_proc:
            try:
                self._server_proc.terminate()
                self._server_proc.wait(timeout=3)
            except Exception:
                try:
                    self._server_proc.kill()
                except Exception:
                    pass
            self._server_proc = None

    @staticmethod
    def _port_in_use(port: int) -> bool:
        """True when something accepts TCP connections on localhost:port."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(("localhost", port)) == 0
|