Spaces:

anugrah55
/

opensleuth-env-gemini-cli

Paused

App Files Files Community

anugrah55 committed 12 days ago

Commit

d3cd20c

verified ·

1 Parent(s): 63bb50c

Overhaul env: per-episode state, ast.literal_eval probe parsing, sandboxed verifier with timeouts, 9 black-box functions, slim Dockerfile

Browse files

Files changed (10) hide show

.gitignore +9 -0
Dockerfile +12 -13
README.md +45 -4
opensleuth_env/__init__.py +28 -1
opensleuth_env/black_box.py +249 -21
opensleuth_env/env.py +195 -83
opensleuth_env/models.py +65 -15
opensleuth_env/verifier.py +220 -52
requirements.txt +3 -3
server.py +73 -24

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+__pycache__/
+*.pyc
+.DS_Store
+.env
+.venv/
+.pytest_cache/
+.cache/
+verifier_log.txt
+*.log

Dockerfile CHANGED Viewed

@@ -1,19 +1,18 @@
-# Use a standard Python 3.9 image
-FROM python:3.9-slim
-# Set the working directory
-WORKDIR /app
-# Copy the environment files into the container
-COPY ./opensleuth_env /app/opensleuth_env
-COPY ./server.py /app/
-COPY ./requirements.txt /app/
-# Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
-# Expose the port the app runs on
-EXPOSE 8000
-# Run the application
-CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]

+FROM python:3.11-slim
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+WORKDIR /app
+COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
+COPY opensleuth_env /app/opensleuth_env
+COPY server.py /app/
+EXPOSE 7860
+# HF Docker Spaces route traffic to the Space's app_port (7860, set in README.md). The port is hard-coded below rather than read from $PORT. uvicorn binds 0.0.0.0.
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,51 @@
 ---
-title: Opensleuth Env Gemini Cli
-emoji: 📊
 colorFrom: indigo
-colorTo: blue
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: OpenSleuth Env
+emoji: 🕵️
 colorFrom: indigo
+colorTo: pink
 sdk: docker
+app_port: 7860
 pinned: false
+suggested_hardware: cpu-basic
 ---
+# OpenSleuth — Environment
+FastAPI service that exposes an OpenEnv-style `/reset` + `/step` API for the
+**Algorithmic Detective** task. An agent has to figure out an unknown Python
+function by probing it, then submit Python source that replicates it.
+## Endpoints
+| Method | Path          | Body                                   | Notes                                  |
+|-------:|---------------|----------------------------------------|----------------------------------------|
+| GET    | `/health`     | —                                      | Liveness probe.                        |
+| GET    | `/functions`  | —                                      | Catalogue of available black-boxes.    |
+| POST   | `/reset`      | `{"target_name": "fibonacci", "seed": 0}` | Starts a new episode, returns initial obs + `episode_id`. |
+| POST   | `/step`       | `{"episode_id": "...", "action": {...}}` | One agent action.                      |
+| GET    | `/state/{eid}`| —                                      | Inspect the live state of an episode (debug). |
+### Action shapes
+```json
+{"action_type": "probe",  "input_repr": "5"}             // input_repr is parsed via ast.literal_eval
+{"action_type": "submit", "code": "def fibonacci(n):..."}
+```
+### Reward
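+* **Probe:** `-1` step cost, plus `+2` per newly-seen output and `+5` per
+  newly-seen exception type, encouraging exploration of edge cases. A probe
+  whose `input_repr` cannot be parsed still costs the `-1` step, and every
+  action counts toward the episode's `max_steps` budget (default 25).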
+* **Submit (terminal):** `100 * matches/fuzz_count` minus a logarithmic
+  cyclomatic-complexity penalty. A perfect submission gets a `+50` bonus.
+## Hardware
+CPU-only — `cpu-basic` is plenty. Do **not** assign GPU to this Space.
+## Running locally
+```bash
+pip install -r requirements.txt
+uvicorn server:app --port 7860 --reload
+```

opensleuth_env/__init__.py CHANGED Viewed

	@@ -1 +1,28 @@
-# This file makes the 'opensleuth_env' directory a Python package.

+"""OpenSleuth environment library."""
+from .env import OpenSleuthEnv
+from .models import (
+    Action,
+    ProbeAction,
+    SubmitAction,
+    Observation,
+    State,
+    StepResponse,
+    ResetRequest,
+    StepRequest,
+)
+from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
+__all__ = [
+    "OpenSleuthEnv",
+    "Action",
+    "ProbeAction",
+    "SubmitAction",
+    "Observation",
+    "State",
+    "StepResponse",
+    "ResetRequest",
+    "StepRequest",
+    "BLACK_BOX_FUNCTIONS",
+    "FunctionSpec",
+]

opensleuth_env/black_box.py CHANGED Viewed

@@ -1,31 +1,259 @@
-def fibonacci(n: int) -> int:
-    """
-    Calculates the nth Fibonacci number.
-    - Handles positive integers up to 90 to avoid large numbers.
-    - Raises ValueError for non-positive inputs or large inputs.
-    """
-    if not isinstance(n, int) or n <= 0 or n > 90:
-        raise ValueError("Input must be a positive integer less than or equal to 90.")
-    if n == 1:
-        return 1
     a, b = 0, 1
     for _ in range(n - 1):
         a, b = b, a + b
-    return b
-# --- Add more black-box functions for later stages ---
-def reverse_string(s: str) -> str:
-    """
-    Reverses a string.
-    - Raises TypeError for non-string inputs.
-    """
     if not isinstance(s, str):
         raise TypeError("Input must be a string.")
     return s[::-1]
-# --- Dictionary to hold all available black-box functions ---
-BLACK_BOX_FUNCTIONS = {
-    "fibonacci": fibonacci,
-    "reverse_string": reverse_string,
 }

+"""Catalogue of hidden 'black-box' Python functions the agent must reproduce.
+Each entry pairs the reference implementation with a *typed input domain*
+generator so the verifier can fuzz it, plus a public signature/docstring shown
+to the agent in the prompt. The reference implementation itself is never
+shown to the agent.
+"""
+from __future__ import annotations
+import random
+import string
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List
+# ----- Reference implementations --------------------------------------------
+def _fibonacci(n: int) -> int:
+    if not isinstance(n, int) or isinstance(n, bool) or n <= 0 or n > 90:
+        raise ValueError("Input must be a positive integer <= 90.")
     a, b = 0, 1
     for _ in range(n - 1):
         a, b = b, a + b
+    return b if n > 0 else a
+def _reverse_string(s: str) -> str:
     if not isinstance(s, str):
         raise TypeError("Input must be a string.")
     return s[::-1]
+def _is_palindrome(s: str) -> bool:
+    if not isinstance(s, str):
+        raise TypeError("Input must be a string.")
+    cleaned = "".join(ch.lower() for ch in s if ch.isalnum())
+    return cleaned == cleaned[::-1]
+def _digit_sum(n: int) -> int:
+    if not isinstance(n, int) or isinstance(n, bool):
+        raise TypeError("Input must be int.")
+    if n < 0:
+        raise ValueError("Input must be non-negative.")
+    return sum(int(c) for c in str(n))
+def _count_vowels(s: str) -> int:
+    if not isinstance(s, str):
+        raise TypeError("Input must be a string.")
+    return sum(1 for c in s.lower() if c in "aeiou")
+def _gcd(pair) -> int:
+    """Greatest common divisor of two non-negative ints, given as a 2-tuple
+    or 2-list. Hidden trick: tuple/list both accepted, ints only."""
+    if not isinstance(pair, (list, tuple)) or len(pair) != 2:
+        raise TypeError("Input must be a 2-element list or tuple.")
+    a, b = pair
+    if not all(isinstance(x, int) and not isinstance(x, bool) for x in (a, b)):
+        raise TypeError("Both elements must be int.")
+    if a < 0 or b < 0:
+        raise ValueError("Both elements must be non-negative.")
+    while b:
+        a, b = b, a % b
+    return a
+def _sort_unique(xs) -> list:
+    """Return sorted unique elements from a list of ints."""
+    if not isinstance(xs, list):
+        raise TypeError("Input must be a list.")
+    if not all(isinstance(x, int) and not isinstance(x, bool) for x in xs):
+        raise TypeError("All elements must be int.")
+    return sorted(set(xs))
+def _caesar_cipher(s: str) -> str:
+    """Caesar shift by +3 on lowercase letters; everything else unchanged."""
+    if not isinstance(s, str):
+        raise TypeError("Input must be a string.")
+    out = []
+    for ch in s:
+        if "a" <= ch <= "z":
+            out.append(chr((ord(ch) - ord("a") + 3) % 26 + ord("a")))
+        else:
+            out.append(ch)
+    return "".join(out)
+def _is_prime(n: int) -> bool:
+    if not isinstance(n, int) or isinstance(n, bool):
+        raise TypeError("Input must be int.")
+    if n < 2:
+        return False
+    if n < 4:
+        return True
+    if n % 2 == 0:
+        return False
+    i = 3
+    while i * i <= n:
+        if n % i == 0:
+            return False
+        i += 2
+    return True
+# ----- Fuzz input generators ------------------------------------------------
+def _fuzz_small_pos_int(rng: random.Random, n: int) -> List[int]:
+    return [rng.randint(1, 30) for _ in range(n)]
+def _fuzz_fib_int(rng: random.Random, n: int) -> List[int]:
+    # Mix common values, edges, and random.
+    pool = [1, 2, 3, 10, 20, 30, 50, 89, 90]
+    return [rng.choice(pool) if rng.random() < 0.3 else rng.randint(1, 90) for _ in range(n)]
+def _fuzz_short_string(rng: random.Random, n: int) -> List[str]:
+    alpha = string.ascii_letters + string.digits
+    return ["".join(rng.choices(alpha, k=rng.randint(0, 12))) for _ in range(n)]
+def _fuzz_palindrome_string(rng: random.Random, n: int) -> List[str]:
+    out = []
+    for _ in range(n):
+        if rng.random() < 0.4:
+            base = "".join(rng.choices(string.ascii_lowercase, k=rng.randint(0, 6)))
+            out.append(base + base[::-1])
+        else:
+            out.append("".join(rng.choices(string.ascii_letters + " ", k=rng.randint(0, 12))))
+    return out
+def _fuzz_nonneg_int(rng: random.Random, n: int) -> List[int]:
+    return [rng.randint(0, 10_000) for _ in range(n)]
+def _fuzz_int_pair(rng: random.Random, n: int):
+    return [(rng.randint(0, 1000), rng.randint(0, 1000)) for _ in range(n)]
+def _fuzz_int_list(rng: random.Random, n: int):
+    return [
+        [rng.randint(-50, 50) for _ in range(rng.randint(0, 8))] for _ in range(n)
+    ]
+def _fuzz_lower_string(rng: random.Random, n: int) -> List[str]:
+    return [
+        "".join(rng.choices(string.ascii_lowercase + " ,!", k=rng.randint(0, 16)))
+        for _ in range(n)
+    ]
+def _fuzz_prime_int(rng: random.Random, n: int) -> List[int]:
+    # Mix in known primes and composites to cover both branches.
+    seeded = [0, 1, 2, 3, 4, 9, 11, 15, 17, 25, 29, 97, 100]
+    return [rng.choice(seeded) if rng.random() < 0.3 else rng.randint(0, 200) for _ in range(n)]
+# ----- Spec ----------------------------------------------------------------
+@dataclass(frozen=True)
+class FunctionSpec:
+    name: str
+    fn: Callable[[Any], Any]
+    signature: str
+    description: str
+    fuzzer: Callable[[random.Random, int], list]
+BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
+    spec.name: spec
+    for spec in [
+        FunctionSpec(
+            name="fibonacci",
+            fn=_fibonacci,
+            signature="fibonacci(n: int) -> int",
+            description=(
+                "Returns the n-th Fibonacci number. Raises ValueError for "
+                "invalid n (n must be a positive int <= 90)."
+            ),
+            fuzzer=_fuzz_fib_int,
+        ),
+        FunctionSpec(
+            name="reverse_string",
+            fn=_reverse_string,
+            signature="reverse_string(s: str) -> str",
+            description="Returns the reversed string. Raises TypeError for non-str.",
+            fuzzer=_fuzz_short_string,
+        ),
+        FunctionSpec(
+            name="is_palindrome",
+            fn=_is_palindrome,
+            signature="is_palindrome(s: str) -> bool",
+            description=(
+                "Case-insensitive palindrome check, ignoring non-alphanumeric "
+                "characters. Raises TypeError for non-str."
+            ),
+            fuzzer=_fuzz_palindrome_string,
+        ),
+        FunctionSpec(
+            name="digit_sum",
+            fn=_digit_sum,
+            signature="digit_sum(n: int) -> int",
+            description=(
+                "Sum of the decimal digits of n. n must be a non-negative int."
+            ),
+            fuzzer=_fuzz_nonneg_int,
+        ),
+        FunctionSpec(
+            name="count_vowels",
+            fn=_count_vowels,
+            signature="count_vowels(s: str) -> int",
+            description="Count of vowels (a/e/i/o/u, case-insensitive) in s.",
+            fuzzer=_fuzz_lower_string,
+        ),
+        FunctionSpec(
+            name="gcd",
+            fn=_gcd,
+            signature="gcd(pair: tuple[int, int] | list[int]) -> int",
+            description=(
+                "Greatest common divisor of a 2-element tuple/list of "
+                "non-negative ints."
+            ),
+            fuzzer=_fuzz_int_pair,
+        ),
+        FunctionSpec(
+            name="sort_unique",
+            fn=_sort_unique,
+            signature="sort_unique(xs: list[int]) -> list[int]",
+            description="Sorted, deduplicated list of ints from xs.",
+            fuzzer=_fuzz_int_list,
+        ),
+        FunctionSpec(
+            name="caesar_cipher",
+            fn=_caesar_cipher,
+            signature="caesar_cipher(s: str) -> str",
+            description=(
+                "Caesar shift by +3 on lowercase letters; non-lowercase chars "
+                "are unchanged."
+            ),
+            fuzzer=_fuzz_lower_string,
+        ),
+        FunctionSpec(
+            name="is_prime",
+            fn=_is_prime,
+            signature="is_prime(n: int) -> bool",
+            description="True iff n is a prime int. n must be int.",
+            fuzzer=_fuzz_prime_int,
+        ),
+    ]
 }

opensleuth_env/env.py CHANGED Viewed

@@ -1,93 +1,205 @@
-from opensleuth_env.models import Action, Observation, State, ProbeAction, SubmitAction
-from opensleuth_env.black_box import BLACK_BOX_FUNCTIONS
-from opensleuth_env.verifier import verify_submission
-import random
-import traceback
 class OpenSleuthEnv:
-    def __init__(self):
-        self.state = None
-        # The verifier is now a static function, so no need to init it
-    def reset(self, target_name: str = "fibonacci") -> Observation:
-        """
-        Resets the environment to a new episode.
-        Selects a black-box function and clears the history.
-        """
-        if target_name not in BLACK_BOX_FUNCTIONS:
-            raise ValueError(f"Unknown target function: {target_name}")
-        self.state = State(
             target_function_name=target_name,
-            probe_history=[],
-            seen_outputs=set(),
-            seen_error_types=set(),
         )
-        return Observation(probe_history=[], last_error="")
-    def step(self, action: Action) -> tuple[Observation, float, bool]:
-        """
-        Takes a step in the environment.
-        """
-        if self.state is None:
-            # If reset() was not called, do it now.
-            self.reset()
-        # The Pydantic model binding in FastAPI should handle the conversion.
-        # This check is for robustness.
-        if not isinstance(action, (ProbeAction, SubmitAction)):
-            try:
-                if action.get("action_type") == "probe":
-                    action = ProbeAction(**action)
-                elif action.get("action_type") == "submit":
-                    action = SubmitAction(**action)
-                else:
-                    raise ValueError("Invalid action_type")
-            except Exception as e:
-                obs = Observation(probe_history=self.state.probe_history, last_error=f"Invalid action format: {e}")
-                return obs, -20.0, True
-        if action.action_type == "probe":
-            return self._handle_probe(action)
-        elif action.action_type == "submit":
-            return self._handle_submit(action)
         else:
-            obs = Observation(probe_history=self.state.probe_history, last_error=f"Invalid action type: {action.action_type}")
-            return obs, -20.0, True
-    def _handle_probe(self, action: ProbeAction) -> tuple[Observation, float, bool]:
-        target_func = BLACK_BOX_FUNCTIONS[self.state.target_function_name]
-        intrinsic_reward = 0.0
-        last_error = ""
         try:
-            eval_input = action.input
-            output = target_func(eval_input)
-            self.state.probe_history.append((eval_input, output))
-            if str(output) not in self.state.seen_outputs:
-                intrinsic_reward += 2.0
-                self.state.seen_outputs.add(str(output))
-        except Exception as e:
             error_type = type(e).__name__
-            error_str = traceback.format_exc()
-            self.state.probe_history.append((action.input, error_str))
-            last_error = error_str
-            if error_type not in self.state.seen_error_types:
-                intrinsic_reward += 5.0
-                self.state.seen_error_types.add(error_type)
-        reward = intrinsic_reward - 1.0
-        obs = Observation(probe_history=self.state.probe_history, last_error=last_error)
-        return obs, reward, False
-    def _handle_submit(self, action: SubmitAction) -> tuple[Observation, float, bool]:
-        target_func = BLACK_BOX_FUNCTIONS[self.state.target_function_name]
-        execution_reward, complexity_penalty = verify_submission(action.code, target_func)
-        total_reward = execution_reward - complexity_penalty
-        if execution_reward == 100.0:
-            total_reward += 50.0
-        obs = Observation(probe_history=self.state.probe_history, last_error="")
-        return obs, total_reward, True

+"""Core OpenSleuth episodic environment.
+A single OpenSleuthEnv holds a *registry of episodes* keyed by episode_id, so
+multiple training rollouts can hit the same FastAPI server in parallel without
+stepping on each other's state.
+"""
+from __future__ import annotations
+import ast
+import logging
+import uuid
+from typing import Tuple
+from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
+from .models import (
+    Action,
+    Observation,
+    ProbeAction,
+    ProbeRecord,
+    State,
+    StepResponse,
+    SubmitAction,
+)
+from .verifier import generate_fuzz_inputs, verify_submission
+log = logging.getLogger("opensleuth.env")
+# Reward shaping knobs (kept here so they're easy to tune).
+PROBE_STEP_COST = -1.0
+NEW_OUTPUT_BONUS = 2.0
+NEW_ERROR_TYPE_BONUS = 5.0
+PERFECT_SUBMISSION_BONUS = 50.0
+MAX_PROBE_HISTORY_IN_OBS = 25
 class OpenSleuthEnv:
+    """Multi-episode environment registry."""
+    def __init__(self, fuzz_count: int = 100) -> None:
+        self._states: dict[str, State] = {}
+        self._configs: dict[str, dict] = {}
+        self.fuzz_count = fuzz_count
+    # --- Lifecycle ---------------------------------------------------------
+    def reset(self, target_name: str, seed: int = 0, max_steps: int = 25) -> Observation:
+        if target_name not in BLACK_BOX_FUNCTIONS:
+            raise ValueError(
+                f"Unknown target function: {target_name!r}. "
+                f"Available: {sorted(BLACK_BOX_FUNCTIONS)}"
+            )
+        spec = BLACK_BOX_FUNCTIONS[target_name]
+        episode_id = uuid.uuid4().hex
+        self._states[episode_id] = State(
+            episode_id=episode_id,
             target_function_name=target_name,
+            seed=seed,
         )
+        self._configs[episode_id] = {"max_steps": max_steps}
+        return self._build_observation(episode_id, spec, last_error="")
+    def step(self, episode_id: str, action: Action) -> StepResponse:
+        state = self._states.get(episode_id)
+        if state is None:
+            raise KeyError(f"Unknown episode_id {episode_id!r}. Did you /reset first?")
+        if state.done:
+            spec = BLACK_BOX_FUNCTIONS[state.target_function_name]
+            obs = self._build_observation(episode_id, spec, last_error="Episode already terminated.")
+            return StepResponse(observation=obs, reward=0.0, done=True, info={"reason": "already_done"})
+        spec = BLACK_BOX_FUNCTIONS[state.target_function_name]
+        state.steps_taken += 1
+        max_steps = self._configs[episode_id]["max_steps"]
+        if isinstance(action, ProbeAction):
+            obs, reward, done, info = self._handle_probe(state, spec, action)
+        elif isinstance(action, SubmitAction):
+            obs, reward, done, info = self._handle_submit(state, spec, action)
         else:
+            obs = self._build_observation(
+                episode_id, spec, last_error=f"Invalid action type: {type(action).__name__}"
+            )
+            reward, done, info = -20.0, True, {"reason": "invalid_action"}
+        # Step-budget exhaustion ends the episode with no extra reward.
+        if not done and state.steps_taken >= max_steps:
+            done = True
+            info = {**info, "reason": info.get("reason", "step_limit")}
+        if done:
+            state.done = True
+        return StepResponse(observation=obs, reward=reward, done=done, info=info)
+    # --- Action handlers ---------------------------------------------------
+    def _handle_probe(
+        self, state: State, spec: FunctionSpec, action: ProbeAction
+    ) -> Tuple[Observation, float, bool, dict]:
+        # Parse the agent's input from a Python literal repr.
+        try:
+            parsed = ast.literal_eval(action.input_repr)
+        except (ValueError, SyntaxError) as e:
+            err = f"Could not parse input_repr as a Python literal: {e}"
+            state.probe_history.append(
+                ProbeRecord(
+                    input_repr=action.input_repr,
+                    output_repr=err,
+                    is_error=True,
+                    error_type="ParseError",
+                )
+            )
+            obs = self._build_observation(state.episode_id, spec, last_error=err)
+            return obs, PROBE_STEP_COST, False, {"reason": "parse_error"}
+        intrinsic = 0.0
+        last_error = ""
         try:
+            output = spec.fn(parsed)
+            output_repr = repr(output)
+            state.probe_history.append(
+                ProbeRecord(input_repr=repr(parsed), output_repr=output_repr, is_error=False)
+            )
+            if output_repr not in state.seen_outputs:
+                intrinsic += NEW_OUTPUT_BONUS
+                state.seen_outputs.add(output_repr)
+        except Exception as e:  # noqa: BLE001
             error_type = type(e).__name__
+            err_repr = f"{error_type}: {e}"
+            state.probe_history.append(
+                ProbeRecord(
+                    input_repr=repr(parsed),
+                    output_repr=err_repr,
+                    is_error=True,
+                    error_type=error_type,
+                )
+            )
+            last_error = err_repr
+            if error_type not in state.seen_error_types:
+                intrinsic += NEW_ERROR_TYPE_BONUS
+                state.seen_error_types.add(error_type)
+        reward = intrinsic + PROBE_STEP_COST
+        obs = self._build_observation(state.episode_id, spec, last_error=last_error)
+        return obs, reward, False, {"intrinsic": intrinsic}
+    def _handle_submit(
+        self, state: State, spec: FunctionSpec, action: SubmitAction
+    ) -> Tuple[Observation, float, bool, dict]:
+        fuzz_inputs = generate_fuzz_inputs(spec, count=self.fuzz_count, seed=state.seed)
+        result = verify_submission(action.code, spec.fn, fuzz_inputs, target_name=spec.name)
+        total = result.execution_reward - result.complexity_penalty
+        if result.execution_reward >= 99.999:
+            total += PERFECT_SUBMISSION_BONUS
+        obs = self._build_observation(
+            state.episode_id,
+            spec,
+            last_error=result.define_error or "",
+        )
+        info = {
+            "execution_reward": result.execution_reward,
+            "complexity_penalty": result.complexity_penalty,
+            "matches": result.matches,
+            "fuzz_count": result.fuzz_count,
+            "define_error": result.define_error,
+            "reason": "submission",
+        }
+        return obs, total, True, info
+    # --- Helpers -----------------------------------------------------------
+    def _build_observation(
+        self, episode_id: str, spec: FunctionSpec, last_error: str
+    ) -> Observation:
+        state = self._states[episode_id]
+        max_steps = self._configs[episode_id]["max_steps"]
+        history = state.probe_history[-MAX_PROBE_HISTORY_IN_OBS:]
+        return Observation(
+            episode_id=episode_id,
+            target_function_name=state.target_function_name,
+            target_function_signature=f"{spec.signature}\n\n{spec.description}",
+            probe_history=history,
+            last_error=last_error,
+            steps_taken=state.steps_taken,
+            max_steps=max_steps,
+        )
+    # --- Introspection -----------------------------------------------------
+    def get_state(self, episode_id: str) -> dict:
+        s = self._states.get(episode_id)
+        if s is None:
+            return {}
+        return {
+            "episode_id": s.episode_id,
+            "target_function_name": s.target_function_name,
+            "steps_taken": s.steps_taken,
+            "done": s.done,
+            "seen_outputs": sorted(s.seen_outputs),
+            "seen_error_types": sorted(s.seen_error_types),
+            "probe_history": [r.model_dump() for r in s.probe_history],
+        }

opensleuth_env/models.py CHANGED Viewed

@@ -1,29 +1,79 @@
-from typing import Union, List, Tuple, Any, Literal
-from pydantic import BaseModel, Field
 class ProbeAction(BaseModel):
     action_type: Literal["probe"] = "probe"
-    input: Any
 class SubmitAction(BaseModel):
     action_type: Literal["submit"] = "submit"
-    code: str
 Action = Union[ProbeAction, SubmitAction]
 class Observation(BaseModel):
-    probe_history: List[Tuple[Any, Any]] = Field(
-        ...,
-        description="A list of (input, output) pairs from previous probes. Output can be a value or an error string."
-    )
-    last_error: str = Field(
-        "",
-        description="The error message from the last action, if any."
     )
 class State(BaseModel):
     target_function_name: str
-    probe_history: List[Tuple[Any, Any]]
-    # Store unique outputs and error types to calculate intrinsic reward
-    seen_outputs: set
-    seen_error_types: set

+"""Pydantic models for the OpenSleuth API and core state."""
+from __future__ import annotations
+from typing import Any, List, Literal, Optional, Tuple, Union
+from pydantic import BaseModel, ConfigDict, Field
 class ProbeAction(BaseModel):
     action_type: Literal["probe"] = "probe"
+    # The agent submits inputs as a Python literal string (e.g. "5", "'abc'",
+    # "[1, 2, 3]"). We parse it server-side with ast.literal_eval. Keeping it
+    # as a string avoids a class of FastAPI auto-coercion bugs and matches
+    # what an LLM naturally emits.
+    input_repr: str = Field(..., description="Python literal repr of the probe input")
 class SubmitAction(BaseModel):
     action_type: Literal["submit"] = "submit"
+    code: str = Field(..., description="Python source defining the target function")
 Action = Union[ProbeAction, SubmitAction]
+class ProbeRecord(BaseModel):
+    """One entry in the probe history. Output is either the function's return
+    value (Pythonic repr) or, if it raised, an error string."""
+    input_repr: str
+    output_repr: str
+    is_error: bool = False
+    error_type: Optional[str] = None
 class Observation(BaseModel):
+    episode_id: str
+    target_function_name: str
+    target_function_signature: str = Field(
+        "", description="Human readable signature + docstring shown to the agent"
     )
+    probe_history: List[ProbeRecord] = Field(default_factory=list)
+    last_error: str = ""
+    steps_taken: int = 0
+    max_steps: int = 25
+class StepResponse(BaseModel):
+    observation: Observation
+    reward: float
+    done: bool
+    info: dict = Field(default_factory=dict)
 class State(BaseModel):
+    """Internal mutable state for one episode. Not exposed in /step responses
+    in full, but available via /state/{eid} for debugging."""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    episode_id: str
     target_function_name: str
+    probe_history: List[ProbeRecord] = Field(default_factory=list)
+    seen_outputs: set = Field(default_factory=set)
+    seen_error_types: set = Field(default_factory=set)
+    steps_taken: int = 0
+    done: bool = False
+    seed: int = 0
+class ResetRequest(BaseModel):
+    target_name: str = "fibonacci"
+    seed: int = 0
+    max_steps: int = 25
+class StepRequest(BaseModel):
+    episode_id: str
+    action: Action

opensleuth_env/verifier.py CHANGED Viewed

@@ -1,68 +1,236 @@
 import ast
-import random
-import string
 import math
-class ComplexityVisitor(ast.NodeVisitor):
     def __init__(self):
-        self.complexity = 1
-    def visit_If(self, node):
-        self.complexity += 1
-        self.generic_visit(node)
-    def visit_For(self, node):
-        self.complexity += 1
-        self.generic_visit(node)
-    def visit_While(self, node):
-        self.complexity += 1
-        self.generic_visit(node)
-    def visit_And(self, node):
-        self.complexity += 1
-        self.generic_visit(node)
-    def visit_Or(self, node):
-        self.complexity += 1
         self.generic_visit(node)
-    def visit_ExceptHandler(self, node):
-        self.complexity += 1
         self.generic_visit(node)
-def _calculate_cyclomatic_complexity(code: str) -> int:
     try:
         tree = ast.parse(code)
-        visitor = ComplexityVisitor()
-        visitor.visit(tree)
-        return math.log(visitor.complexity)
     except SyntaxError:
-        return 50
-def _generate_fuzz_inputs(target_func, count=100):
-    inputs = []
-    if target_func.__name__ == "fibonacci":
-        inputs = [random.randint(1, 90) for _ in range(count)]
-    elif target_func.__name__ == "reverse_string":
-        inputs = [''.join(random.choices(string.ascii_letters + string.digits, k=random.randint(1, 20))) for _ in range(count)]
-    return inputs
-def verify_submission(submitted_code: str, target_function: callable, fuzz_count: int = 100) -> tuple[float, float]:
     try:
-        local_scope = {}
-        exec(submitted_code, {}, local_scope)
-        submitted_func = local_scope.get(target_function.__name__)
-        if not callable(submitted_func):
-            return 0.0, 50.0
-    except Exception:
-        return 0.0, 50.0
-    fuzz_inputs = _generate_fuzz_inputs(target_function, fuzz_count)
     matches = 0
     for inp in fuzz_inputs:
         try:
-            target_output = target_function(inp)
-            submitted_output = submitted_func(inp)
-            if target_output == submitted_output:
-                matches += 1
-        except Exception:
-            continue
-    execution_reward = 100.0 * (matches / fuzz_count)
-    complexity_penalty = _calculate_cyclomatic_complexity(submitted_code)
-    return execution_reward, complexity_penalty

+"""Verifier: scores a submitted Python source against a hidden reference
+function by domain-aware fuzzing, with sandboxed execution and a complexity
+penalty.
+Reward design:
+  execution_reward in [0, 100]   = 100 * matches/fuzz_count
+  complexity_penalty in [0, 50]  = log2(cyclomatic) clipped, else 50 on syntax error
+  exec_failure_penalty           = 25 if def-time exec raised, before fuzzing
+"""
+from __future__ import annotations
 import ast
 import math
+import multiprocessing as mp
+import random
+import signal
+from dataclasses import dataclass
+from typing import Any, Callable, List, Optional
+# ----- AST complexity ------------------------------------------------------
class _CCVisitor(ast.NodeVisitor):
    """AST walker that accumulates a cyclomatic-style branch count in ``cc``.

    The count starts at 1. Each ``if``, ``for``, ``while``, ``async for``,
    ``except`` handler, ``with`` and conditional expression adds 1; a boolean
    operation with N operands adds N - 1.
    """

    def __init__(self):
        self.cc = 1

    def _bump(self, node):
        # Count this node as one decision point, then keep descending.
        self.cc += 1
        self.generic_visit(node)

    def visit_BoolOp(self, node):
        # `a and b and c` is a single BoolOp with three values -> two branches.
        extra = len(node.values) - 1
        if extra > 0:
            self.cc += extra
        self.generic_visit(node)

    # Every node type below is handled by the same single-increment visitor.
    visit_If = visit_For = visit_While = visit_AsyncFor = _bump
    visit_ExceptHandler = visit_With = visit_IfExp = _bump
def calculate_complexity_penalty(code: str) -> float:
    """Return a bounded, log-scaled cyclomatic complexity for ``code``.

    Args:
        code: Python source text (untrusted).

    Returns:
        ``min(50.0, log2(cc))`` where ``cc`` is the branch count produced by
        ``_CCVisitor``, or ``50.0`` when the source cannot be analysed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        # ValueError: on some Python versions source containing NUL bytes is
        # rejected with ValueError rather than SyntaxError. RecursionError:
        # pathologically nested input. All mean "won't parse" -> max penalty.
        return 50.0
    v = _CCVisitor()
    try:
        v.visit(tree)
    except RecursionError:
        # A tree too deep to walk is treated like unparseable code.
        return 50.0
    # log2 keeps small functions at ~0..2 and aggressive 100-branch lookups
    # up around log2(100) ≈ 6.6, then we clamp.
    return min(50.0, math.log2(v.cc))
+# ----- Sandboxed execution -------------------------------------------------
def _exec_target_in_sandbox(code: str, target_name: str, queue: mp.Queue) -> None:
    """Execute ``code`` and report whether it defines a callable ``target_name``.

    Intended to run inside a child process so the parent can hard-kill it on
    timeout. Exactly one tuple is put on ``queue``: ``("ok", None)`` on
    success, or ``("err", message)`` if execution raised or no callable with
    that name exists afterwards.

    Args:
        code: Submitted Python source (untrusted).
        target_name: Name the source must bind to a callable.
        queue: Any object with a ``put`` method used to return the result.
    """
    try:
        # NOTE: builtins are NOT restricted here -- the full __builtins__ is
        # exposed, so isolation comes only from running in a child process.
        # These modules are pre-imported as a convenience so submissions can
        # use them without import statements.
        preloaded = ("math", "string", "itertools", "functools", "collections", "re")
        namespace: dict = {
            "__builtins__": __builtins__,
            "__name__": "__sandbox__",
        }
        for mod_name in preloaded:
            namespace[mod_name] = __import__(mod_name)
        # Use ONE dict for globals and locals. With two separate dicts exec()
        # runs the code as if it were a class body, so top-level helpers are
        # invisible to functions and comprehensions defined next to them.
        exec(code, namespace)
        fn = namespace.get(target_name)
        if not callable(fn):
            queue.put(("err", f"No callable named {target_name!r} defined."))
            return
        queue.put(("ok", None))
    except Exception as e:  # noqa: BLE001
        queue.put(("err", f"{type(e).__name__}: {e}"))
def _can_define(code: str, target_name: str, timeout_s: float) -> Optional[str]:
    """Check in a child process that ``code`` defines the target callable.

    Args:
        code: Submitted Python source (untrusted).
        target_name: Name that must be bound to a callable after execution.
        timeout_s: Wall-clock budget for the child, in seconds.

    Returns:
        ``None`` if the child reported success, otherwise an error string
        (the child's error, a timeout message, or a note that the child
        exited without reporting).
    """
    import queue as _queue
    import time

    start_method = mp.get_start_method(allow_none=True)
    ctx = mp.get_context("spawn" if start_method == "spawn" else "fork")
    q = ctx.Queue()
    p = ctx.Process(target=_exec_target_in_sandbox, args=(code, target_name, q))
    p.start()

    # Read the result BEFORE joining. A child that has put a large payload
    # cannot exit until the pipe is drained, so join-then-get can misreport a
    # big error message as a timeout. Queue.empty() is also documented as
    # unreliable, so we poll get() with short timeouts instead.
    deadline = time.monotonic() + timeout_s
    result = None
    while result is None:
        remaining = deadline - time.monotonic()
        try:
            result = q.get(timeout=max(0.0, min(0.05, remaining)))
        except _queue.Empty:
            if remaining <= 0:
                break
            if not p.is_alive():
                # Child already exited: allow one short grace read for data
                # still in flight, then give up.
                try:
                    result = q.get(timeout=0.1)
                except _queue.Empty:
                    pass
                break

    if result is None and p.is_alive():
        p.terminate()
        p.join(1.0)
        if p.is_alive():
            p.kill()
            p.join(1.0)
        return f"Definition timed out after {timeout_s}s."

    # Reap the child; escalate if it lingers after having reported.
    p.join(1.0)
    if p.is_alive():
        p.kill()
        p.join(1.0)

    if result is None:
        return "Sandbox produced no result."
    status, payload = result
    return None if status == "ok" else payload
+# Per-call (per-input) sandboxing is too slow for 100 fuzz inputs, so we
+# accept the trade-off of running the submitted callable in-process for
+# fuzzing, but we wrap each call in a SIGALRM-based timeout and we already
+# proved at definition-time that the import didn't blow up.
class _CallTimeout(Exception):
    """Raised when a single call exceeds its wall-clock budget."""


def _call_in_worker_thread(fn: Callable, arg: Any, timeout_s: float):
    """Run ``fn(arg)`` in a daemon thread and wait at most ``timeout_s``.

    Fallback for contexts where SIGALRM cannot be used. A call that overruns
    cannot be killed; it is abandoned (the daemon thread keeps running) and
    ``_CallTimeout`` is raised to the caller.
    """
    import threading

    outcome: dict = {}

    def _runner():
        try:
            outcome["value"] = fn(arg)
        except BaseException as exc:  # noqa: BLE001 - re-raised in the caller
            outcome["error"] = exc

    worker = threading.Thread(target=_runner, daemon=True)
    worker.start()
    worker.join(timeout_s)
    if worker.is_alive():
        raise _CallTimeout()
    if "error" in outcome:
        raise outcome["error"]
    return outcome["value"]


def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float):
    """Return ``fn(arg)``, raising ``_CallTimeout`` if it runs too long.

    On the main thread a SIGALRM interval timer interrupts the call. Signal
    handlers can only be installed from the main thread, so on any other
    thread (e.g. a web server's worker pool), or where SIGALRM does not
    exist, the call is delegated to ``_call_in_worker_thread``. Without this
    fallback ``signal.signal`` raises ``ValueError`` before ``fn`` ever runs.

    Args:
        fn: Single-argument callable to invoke.
        arg: Argument passed to ``fn``.
        timeout_s: Wall-clock budget in seconds.

    Raises:
        _CallTimeout: If the budget is exceeded.
        Exception: Whatever ``fn`` itself raises.
    """
    import threading

    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread or not hasattr(signal, "SIGALRM"):
        return _call_in_worker_thread(fn, arg, timeout_s)

    def _handler(signum, frame):  # noqa: ARG001
        raise _CallTimeout()

    old = signal.signal(signal.SIGALRM, _handler)
    try:
        signal.setitimer(signal.ITIMER_REAL, timeout_s)
        return fn(arg)
    finally:
        # Always disarm the timer and restore the previous handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, old)
def _safe_call(fn: Callable, arg: Any, timeout_s: float):
    """Invoke ``fn(arg)`` under a time limit and never raise ``Exception``.

    Returns a ``(kind, value)`` pair:
      * ``("val", result)`` when the call returned normally,
      * ``("timeout", message)`` when it exceeded ``timeout_s``,
      * ``("err", "<ExcType>: <message>")`` when it raised.
    """
    try:
        value = _call_with_timeout(fn, arg, timeout_s)
    except _CallTimeout:
        return ("timeout", f"timed out after {timeout_s}s")
    except Exception as exc:  # noqa: BLE001
        return ("err", f"{type(exc).__name__}: {exc}")
    return ("val", value)
+# ----- Public scoring ------------------------------------------------------
@dataclass
class VerificationResult:
    """Outcome of scoring one submission."""

    # 100 * matches / fuzz_count; 0.0 when the submission failed to define.
    execution_reward: float
    # Value returned by the complexity scorer for the submitted source.
    complexity_penalty: float
    # Error text when the target callable could not be defined, else None.
    define_error: Optional[str]
    # Number of fuzz inputs on which submission and reference agreed.
    matches: int
    # Number of fuzz inputs used as the denominator of the reward.
    fuzz_count: int
def verify_submission(
    submitted_code: str,
    target_function: Callable[[Any], Any],
    fuzz_inputs: List[Any],
    *,
    target_name: Optional[str] = None,
    define_timeout_s: float = 5.0,
    call_timeout_s: float = 1.0,
) -> VerificationResult:
    """Score ``submitted_code`` against ``target_function`` over ``fuzz_inputs``.

    The submission must define a top-level callable with the same name as
    ``target_function`` (overridable via ``target_name``).

    Args:
        submitted_code: Python source from the agent (untrusted).
        target_function: Hidden reference implementation.
        fuzz_inputs: Inputs on which both callables are compared.
        target_name: Name to look up in the submission; defaults to
            ``target_function.__name__``.
        define_timeout_s: Budget for the child-process definition check.
        call_timeout_s: Budget for each individual call during fuzzing.

    Returns:
        A ``VerificationResult``. When the submission cannot be defined,
        ``execution_reward`` is 0.0 and ``define_error`` holds the reason.
    """
    name = target_name or target_function.__name__
    define_err = _can_define(submitted_code, name, define_timeout_s)
    complexity = calculate_complexity_penalty(submitted_code)

    submitted_fn = None
    if define_err is None:
        # Re-define in-process for fast fuzzing; each call is still
        # time-bounded. SECURITY: this exec()s untrusted code inside the
        # server process with full builtins -- the earlier child-process check
        # only shows that definition terminates, not that the code is benign.
        namespace: dict = {
            "__builtins__": __builtins__,
            "__name__": "__opensleuth_submission__",
            "math": __import__("math"),
            "string": __import__("string"),
            "itertools": __import__("itertools"),
            "functools": __import__("functools"),
            "collections": __import__("collections"),
            "re": __import__("re"),
        }
        try:
            # ONE namespace for globals and locals. With two dicts exec()
            # behaves like a class body: the submitted function cannot see
            # itself or sibling helpers, so any recursive submission raises
            # NameError on every call.
            exec(submitted_code, namespace)
        except Exception as e:  # noqa: BLE001
            # Definition passed in the child but failed here (e.g.
            # nondeterministic code); report it instead of crashing.
            define_err = f"{type(e).__name__}: {e}"
        else:
            submitted_fn = namespace.get(name)
            if not callable(submitted_fn):
                define_err = f"No callable named {name!r} defined."

    if define_err is not None:
        return VerificationResult(
            execution_reward=0.0,
            complexity_penalty=complexity,
            define_error=define_err,
            matches=0,
            fuzz_count=len(fuzz_inputs),
        )

    matches = 0
    for inp in fuzz_inputs:
        ref = _safe_call(target_function, inp, call_timeout_s)
        sub = _safe_call(submitted_fn, inp, call_timeout_s)
        if _outputs_equivalent(ref, sub):
            matches += 1
    # Guard the division when no fuzz inputs were supplied.
    fuzz_count = len(fuzz_inputs) or 1
    exec_reward = 100.0 * (matches / fuzz_count)
    return VerificationResult(
        execution_reward=exec_reward,
        complexity_penalty=complexity,
        define_error=None,
        matches=matches,
        fuzz_count=fuzz_count,
    )
def _outputs_equivalent(ref, sub) -> bool:
    """Decide whether two ``(kind, value)`` call outcomes count as a match.

    Args:
        ref: Outcome of the reference call, as ``(kind, value)`` with kind in
            ``{"val", "err", "timeout"}``.
        sub: Outcome of the submitted call, same shape.

    Returns:
        True if both returned values that compare ``==``, both raised the
        same exception type, or both timed out; False otherwise.
    """
    rkind, rval = ref
    skind, sval = sub
    if rkind == "val" and skind == "val":
        try:
            # Coerce inside the try: a submitted object's __eq__ may return
            # something whose truth value raises (or is not a bool at all),
            # which would otherwise escape into the caller's `if`.
            return bool(rval == sval)
        except Exception:  # noqa: BLE001
            return False
    if rkind == "err" and skind == "err":
        # Error strings look like "<ExcType>: <message>"; match on the type.
        return rval.split(":", 1)[0] == sval.split(":", 1)[0]
    if rkind == "timeout" and skind == "timeout":
        return True
    return False
def generate_fuzz_inputs(spec, count: int = 100, seed: Optional[int] = None) -> List[Any]:
    """Ask ``spec.fuzzer`` for ``count`` fuzz inputs.

    A fresh ``random.Random(seed)`` is handed to the fuzzer, so a fixed
    ``seed`` gives a reproducible generator; ``None`` seeds it from system
    entropy.
    """
    return spec.fuzzer(random.Random(seed), count)

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-fastapi
-uvicorn
-pydantic

+fastapi==0.115.6
+uvicorn[standard]==0.32.1
+pydantic==2.10.3

server.py CHANGED Viewed

@@ -1,27 +1,76 @@
-from fastapi import FastAPI
-from pydantic import BaseModel
-from opensleuth_env.env import OpenSleuthEnv
-from opensleuth_env.models import Action, Observation
-app = FastAPI()
 env = OpenSleuthEnv()
-class ResetBody(BaseModel):
-    target_name: str = "fibonacci"
-@app.post("/reset", response_model=Observation)
-def reset_env(body: ResetBody):
-    # Ensure the environment is reset for a new session
-    return env.reset(target_name=body.target_name)
-@app.post("/step")
-def step_env(action: Action):
-    # The environment now handles the case where it's not reset
-    obs, reward, done = env.step(action)
-    return {"observation": obs, "reward": reward, "done": done}
-@app.get("/state")
-def get_state():
-    if env.state is None:
-        return {}
-    return env.get_state()

+"""FastAPI server exposing the OpenSleuth environment over HTTP."""
+from __future__ import annotations
+import logging
+from fastapi import FastAPI, HTTPException
+from opensleuth_env import (
+    BLACK_BOX_FUNCTIONS,
+    OpenSleuthEnv,
+    ProbeAction,
+    ResetRequest,
+    StepRequest,
+    StepResponse,
+    SubmitAction,
+)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+log = logging.getLogger("opensleuth.server")
+app = FastAPI(title="OpenSleuth Env", version="0.2.0")
 env = OpenSleuthEnv()
+@app.get("/health")
+def health():
+    """Report a static "ok" status plus the number of entries in the env's
+    private ``_states`` mapping."""
+    return {"status": "ok", "episodes_tracked": len(env._states)}  # noqa: SLF001
+@app.get("/functions")
+def list_functions():
+    """List name, signature and description for every entry in
+    ``BLACK_BOX_FUNCTIONS``."""
+    return {
+        "functions": [
+            {
+                "name": s.name,
+                "signature": s.signature,
+                "description": s.description,
+            }
+            for s in BLACK_BOX_FUNCTIONS.values()
+        ]
+    }
+@app.post("/reset")
+def reset(req: ResetRequest):
+    """Start an episode via ``env.reset`` and return what it returns.
+
+    A ``ValueError`` from the environment is translated to HTTP 400 with the
+    exception text as detail.
+    """
+    try:
+        obs = env.reset(target_name=req.target_name, seed=req.seed, max_steps=req.max_steps)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    return obs
+@app.post("/step", response_model=StepResponse)
+def step(req: StepRequest):
+    """Apply one action to the given episode via ``env.step``.
+
+    A ``KeyError`` from the environment is translated to HTTP 404.
+    NOTE(review): presumably raised for an unknown episode_id -- confirm in
+    env.py.
+    """
+    try:
+        return env.step(req.episode_id, req.action)
+    except KeyError as e:
+        raise HTTPException(status_code=404, detail=str(e)) from e
+@app.get("/state/{episode_id}")
+def get_state(episode_id: str):
+    """Return ``env.get_state(episode_id)``, or HTTP 404 when the result is
+    falsy."""
+    state = env.get_state(episode_id)
+    # Truthiness check: None and empty containers both map to 404.
+    if not state:
+        raise HTTPException(status_code=404, detail=f"Unknown episode_id {episode_id!r}")
+    return state
# Convenience: a flat /step that does reset+step in one call is occasionally
# useful for shell-style debugging.
@app.post("/probe_once")
def probe_once(target_name: str, input_repr: str):
    """Start a fresh episode for ``target_name`` and run one probe on it.

    Args:
        target_name: Name of the black-box function to probe.
        input_repr: Python literal repr of the probe input.

    Returns:
        Whatever ``env.step`` returns for the probe.

    Raises:
        HTTPException: 400 when ``env.reset`` rejects the request with
            ``ValueError`` (handled the same way as the /reset endpoint).
    """
    try:
        obs = env.reset(target_name=target_name)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    # NOTE(review): each call creates a new episode that this handler never
    # releases; confirm whether the env evicts old episodes.
    return env.step(obs.episode_id, ProbeAction(input_repr=input_repr))