Spaces:

anugrah55
/

opensleuth-env-gemini-cli

Paused

App Files Files Community

anugrah55 committed on 13 days ago

Commit

536dda7

verified ·

1 Parent(s): d3cd20c

Apply paper-driven improvements: stratified verifier, sandbox hardening, coverage bonus, anti-hacking penalties, curriculum metadata

Browse files

Files changed (6) hide show

README.md +31 -6
opensleuth_env/black_box.py +37 -3
opensleuth_env/env.py +134 -8
opensleuth_env/models.py +23 -1
opensleuth_env/verifier.py +204 -43
server.py +18 -8

README.md CHANGED Viewed

@@ -20,7 +20,7 @@ function by probing it, then submit Python source that replicates it.
 | Method | Path          | Body                                   | Notes                                  |
 |-------:|---------------|----------------------------------------|----------------------------------------|
 | GET    | `/health`     | —                                      | Liveness probe.                        |
-| GET    | `/functions`  | —                                      | Catalogue of available black-boxes.    |
 | POST   | `/reset`      | `{"target_name": "fibonacci", "seed": 0}` | Starts a new episode, returns initial obs + `episode_id`. |
 | POST   | `/step`       | `{"episode_id": "...", "action": {...}}` | One agent action.                      |
 | GET    | `/state/{eid}`| —                                      | Inspect the live state of an episode (debug). |
@@ -32,12 +32,37 @@ function by probing it, then submit Python source that replicates it.
 {"action_type": "submit", "code": "def fibonacci(n):..."}
 ```
-### Reward
-* **Probe:** `-1` step cost, plus `+2` per newly-seen output and `+5` per
-  newly-seen exception type, encouraging exploration of edge cases.
-* **Submit (terminal):** `100 * matches/fuzz_count` minus a logarithmic
-  cyclomatic-complexity penalty. A perfect submission gets a `+50` bonus.
 ## Hardware

 | Method | Path          | Body                                   | Notes                                  |
 |-------:|---------------|----------------------------------------|----------------------------------------|
 | GET    | `/health`     | —                                      | Liveness probe.                        |
+| GET    | `/functions`  | optional `?difficulty=easy\|medium\|hard` | Catalogue of available black-boxes (with curriculum metadata). |
 | POST   | `/reset`      | `{"target_name": "fibonacci", "seed": 0}` | Starts a new episode, returns initial obs + `episode_id`. |
 | POST   | `/step`       | `{"episode_id": "...", "action": {...}}` | One agent action.                      |
 | GET    | `/state/{eid}`| —                                      | Inspect the live state of an episode (debug). |
 {"action_type": "submit", "code": "def fibonacci(n):..."}
 ```
+### Reward (v0.3 – paper-driven update)
+Inspired by Masud et al. 2026 (*Reward Engineering for RL in Software Tasks*,
+arXiv:2601.19100) and Ibrahim et al. 2024 (*Comprehensive Overview of Reward
+Engineering and Shaping*, arXiv:2408.10215).
+* **Probe:** `-1` step cost, plus `+2` per newly-seen output, `+5` per
+  newly-seen exception type, **and `+0.5` per newly-explored input bucket**
+  (CovRL-Fuzz / SimHash-style coverage bonus).
+* **Submit (terminal):**
+  `execution_reward − complexity_penalty − reward_hack_penalty − floor_penalty
+  (+50 perfect bonus if 100% match)` where:
+  * `execution_reward` ∈ `[0, 100]` is computed over **stratified** fuzz
+    inputs: spec-defined `edge_cases` are *always* tested in addition to the
+    random fuzz batch, and the per-category match counts are returned in
+    `info["matches_by_category"]`.
+  * `floor_penalty` is a flat `25` for sub-50% match-rate submissions; because
+    it is subtracted in the formula above, it acts as a hard `-25` on the
+    total (Vul-R2 style; Wen et al. 2025), preventing agents from learning
+    that emitting *any* function pays out.
+  * `reward_hack_penalty` fires for static import-of-reference attempts
+    (penalty of `25`) and for "constant-output" collapse against a diverse
+    reference (penalty of `15`); the penalty is subtracted from the total.
+    The sandbox additionally **blocks** `__import__`, `open`, `eval`, `exec`,
+    `compile`, etc.
+### Backwards compatibility
+Existing trainer / eval clients only read `info["execution_reward"]`,
+`info["matches"]`, `info["fuzz_count"]` and `resp["reward"]` — all preserved
+with the same meaning. New fields (`difficulty`, `coverage_buckets_seen`,
+`matches_by_category`, `edge_pass_rate`, `reward_hack_penalty`,
+`floor_penalty`, `perfect_bonus`) are additive and ignored by older clients.
 ## Hardware

opensleuth_env/black_box.py CHANGED Viewed

@@ -4,13 +4,26 @@ Each entry pairs the reference implementation with a *typed input domain*
 generator so the verifier can fuzz it, plus a public signature/docstring shown
 to the agent in the prompt. The reference implementation itself is never
 shown to the agent.
 """
 from __future__ import annotations
 import random
 import string
-from dataclasses import dataclass
 from typing import Any, Callable, Dict, List
@@ -115,7 +128,6 @@ def _fuzz_small_pos_int(rng: random.Random, n: int) -> List[int]:
 def _fuzz_fib_int(rng: random.Random, n: int) -> List[int]:
-    # Mix common values, edges, and random.
     pool = [1, 2, 3, 10, 20, 30, 50, 89, 90]
     return [rng.choice(pool) if rng.random() < 0.3 else rng.randint(1, 90) for _ in range(n)]
@@ -158,7 +170,6 @@ def _fuzz_lower_string(rng: random.Random, n: int) -> List[str]:
 def _fuzz_prime_int(rng: random.Random, n: int) -> List[int]:
-    # Mix in known primes and composites to cover both branches.
     seeded = [0, 1, 2, 3, 4, 9, 11, 15, 17, 25, 29, 97, 100]
     return [rng.choice(seeded) if rng.random() < 0.3 else rng.randint(0, 200) for _ in range(n)]
@@ -173,6 +184,11 @@ class FunctionSpec:
     signature: str
     description: str
     fuzzer: Callable[[random.Random, int], list]
 BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
@@ -187,6 +203,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "invalid n (n must be a positive int <= 90)."
             ),
             fuzzer=_fuzz_fib_int,
         ),
         FunctionSpec(
             name="reverse_string",
@@ -194,6 +212,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
             signature="reverse_string(s: str) -> str",
             description="Returns the reversed string. Raises TypeError for non-str.",
             fuzzer=_fuzz_short_string,
         ),
         FunctionSpec(
             name="is_palindrome",
@@ -204,6 +224,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "characters. Raises TypeError for non-str."
             ),
             fuzzer=_fuzz_palindrome_string,
         ),
         FunctionSpec(
             name="digit_sum",
@@ -213,6 +235,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "Sum of the decimal digits of n. n must be a non-negative int."
             ),
             fuzzer=_fuzz_nonneg_int,
         ),
         FunctionSpec(
             name="count_vowels",
@@ -220,6 +244,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
             signature="count_vowels(s: str) -> int",
             description="Count of vowels (a/e/i/o/u, case-insensitive) in s.",
             fuzzer=_fuzz_lower_string,
         ),
         FunctionSpec(
             name="gcd",
@@ -230,6 +256,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "non-negative ints."
             ),
             fuzzer=_fuzz_int_pair,
         ),
         FunctionSpec(
             name="sort_unique",
@@ -237,6 +265,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
             signature="sort_unique(xs: list[int]) -> list[int]",
             description="Sorted, deduplicated list of ints from xs.",
             fuzzer=_fuzz_int_list,
         ),
         FunctionSpec(
             name="caesar_cipher",
@@ -247,6 +277,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "are unchanged."
             ),
             fuzzer=_fuzz_lower_string,
         ),
         FunctionSpec(
             name="is_prime",
@@ -254,6 +286,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
             signature="is_prime(n: int) -> bool",
             description="True iff n is a prime int. n must be int.",
             fuzzer=_fuzz_prime_int,
         ),
     ]
 }

 generator so the verifier can fuzz it, plus a public signature/docstring shown
 to the agent in the prompt. The reference implementation itself is never
 shown to the agent.
+Each spec also carries:
+* ``difficulty`` -- one of ``"easy" | "medium" | "hard"``. The trainer / eval
+  harnesses use this for curriculum scheduling (easy first, then medium, ...).
+  Inspired by the curriculum recommendation in Masud et al. (2026) §C2 and the
+  reward-horizon shortening discussion in Ibrahim et al. (2024) §IV-F.
+* ``edge_cases`` -- a list of must-be-included probe inputs the verifier
+  *always* injects on top of the random fuzz batch. This makes the execution
+  reward robust against agents that learn to handle the random regime but miss
+  edge cases (Masud et al. (2026) C1: proxy-failure mitigation; Ibrahim et al.
+  (2024) §III-A: deceptive-reward mitigation).
 """
 from __future__ import annotations
 import random
 import string
+from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List
 def _fuzz_fib_int(rng: random.Random, n: int) -> List[int]:
     pool = [1, 2, 3, 10, 20, 30, 50, 89, 90]
     return [rng.choice(pool) if rng.random() < 0.3 else rng.randint(1, 90) for _ in range(n)]
 def _fuzz_prime_int(rng: random.Random, n: int) -> List[int]:
     seeded = [0, 1, 2, 3, 4, 9, 11, 15, 17, 25, 29, 97, 100]
     return [rng.choice(seeded) if rng.random() < 0.3 else rng.randint(0, 200) for _ in range(n)]
     signature: str
     description: str
     fuzzer: Callable[[random.Random, int], list]
+    difficulty: str = "medium"
+    # `edge_cases` are *always* probed by the verifier on top of the random
+    # fuzz batch. They are scored as their own category ("edge") so the
+    # verifier can report stratified pass-rates back to the trainer.
+    edge_cases: List[Any] = field(default_factory=list)
 BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
                 "invalid n (n must be a positive int <= 90)."
             ),
             fuzzer=_fuzz_fib_int,
+            difficulty="easy",
+            edge_cases=[1, 2, 3, 10, 89, 90],
         ),
         FunctionSpec(
             name="reverse_string",
             signature="reverse_string(s: str) -> str",
             description="Returns the reversed string. Raises TypeError for non-str.",
             fuzzer=_fuzz_short_string,
+            difficulty="easy",
+            edge_cases=["", "a", "ab", "racecar", "Hello, World!"],
         ),
         FunctionSpec(
             name="is_palindrome",
                 "characters. Raises TypeError for non-str."
             ),
             fuzzer=_fuzz_palindrome_string,
+            difficulty="medium",
+            edge_cases=["", "a", "ab", "abba", "A man, a plan, a canal: Panama", "Hello"],
         ),
         FunctionSpec(
             name="digit_sum",
                 "Sum of the decimal digits of n. n must be a non-negative int."
             ),
             fuzzer=_fuzz_nonneg_int,
+            difficulty="easy",
+            edge_cases=[0, 1, 9, 10, 99, 100, 9999],
         ),
         FunctionSpec(
             name="count_vowels",
             signature="count_vowels(s: str) -> int",
             description="Count of vowels (a/e/i/o/u, case-insensitive) in s.",
             fuzzer=_fuzz_lower_string,
+            difficulty="easy",
+            edge_cases=["", "bcd", "AEIOU", "Hello, World!", "aaaaa"],
         ),
         FunctionSpec(
             name="gcd",
                 "non-negative ints."
             ),
             fuzzer=_fuzz_int_pair,
+            difficulty="medium",
+            edge_cases=[(0, 0), (0, 7), (12, 18), (17, 13), (100, 75)],
         ),
         FunctionSpec(
             name="sort_unique",
             signature="sort_unique(xs: list[int]) -> list[int]",
             description="Sorted, deduplicated list of ints from xs.",
             fuzzer=_fuzz_int_list,
+            difficulty="easy",
+            edge_cases=[[], [1], [1, 1, 1], [3, 1, 2], [-5, 5, 0, -5, 5]],
         ),
         FunctionSpec(
             name="caesar_cipher",
                 "are unchanged."
             ),
             fuzzer=_fuzz_lower_string,
+            difficulty="hard",
+            edge_cases=["", "abc", "xyz", "Hello, World!", "ABC", "hello world"],
         ),
         FunctionSpec(
             name="is_prime",
             signature="is_prime(n: int) -> bool",
             description="True iff n is a prime int. n must be int.",
             fuzzer=_fuzz_prime_int,
+            difficulty="medium",
+            edge_cases=[0, 1, 2, 3, 4, 17, 25, 97, 100],
         ),
     ]
 }

opensleuth_env/env.py CHANGED Viewed

@@ -3,6 +3,34 @@
 A single OpenSleuthEnv holds a *registry of episodes* keyed by episode_id, so
 multiple training rollouts can hit the same FastAPI server in parallel without
 stepping on each other's state.
 """
 from __future__ import annotations
@@ -10,7 +38,7 @@ from __future__ import annotations
 import ast
 import logging
 import uuid
-from typing import Tuple
 from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
 from .models import (
@@ -22,7 +50,7 @@ from .models import (
     StepResponse,
     SubmitAction,
 )
-from .verifier import generate_fuzz_inputs, verify_submission
 log = logging.getLogger("opensleuth.env")
@@ -31,10 +59,65 @@ log = logging.getLogger("opensleuth.env")
 PROBE_STEP_COST = -1.0
 NEW_OUTPUT_BONUS = 2.0
 NEW_ERROR_TYPE_BONUS = 5.0
 PERFECT_SUBMISSION_BONUS = 50.0
 MAX_PROBE_HISTORY_IN_OBS = 25
 class OpenSleuthEnv:
     """Multi-episode environment registry."""
@@ -98,7 +181,6 @@ class OpenSleuthEnv:
     def _handle_probe(
         self, state: State, spec: FunctionSpec, action: ProbeAction
     ) -> Tuple[Observation, float, bool, dict]:
-        # Parse the agent's input from a Python literal repr.
         try:
             parsed = ast.literal_eval(action.input_repr)
         except (ValueError, SyntaxError) as e:
@@ -114,13 +196,24 @@ class OpenSleuthEnv:
             obs = self._build_observation(state.episode_id, spec, last_error=err)
             return obs, PROBE_STEP_COST, False, {"reason": "parse_error"}
         intrinsic = 0.0
         last_error = ""
         try:
             output = spec.fn(parsed)
             output_repr = repr(output)
             state.probe_history.append(
-                ProbeRecord(input_repr=repr(parsed), output_repr=output_repr, is_error=False)
             )
             if output_repr not in state.seen_outputs:
                 intrinsic += NEW_OUTPUT_BONUS
@@ -134,6 +227,7 @@ class OpenSleuthEnv:
                     output_repr=err_repr,
                     is_error=True,
                     error_type=error_type,
                 )
             )
             last_error = err_repr
@@ -141,17 +235,34 @@ class OpenSleuthEnv:
                 intrinsic += NEW_ERROR_TYPE_BONUS
                 state.seen_error_types.add(error_type)
-        reward = intrinsic + PROBE_STEP_COST
         obs = self._build_observation(state.episode_id, spec, last_error=last_error)
-        return obs, reward, False, {"intrinsic": intrinsic}
     def _handle_submit(
         self, state: State, spec: FunctionSpec, action: SubmitAction
     ) -> Tuple[Observation, float, bool, dict]:
         fuzz_inputs = generate_fuzz_inputs(spec, count=self.fuzz_count, seed=state.seed)
-        result = verify_submission(action.code, spec.fn, fuzz_inputs, target_name=spec.name)
-        total = result.execution_reward - result.complexity_penalty
         if result.execution_reward >= 99.999:
             total += PERFECT_SUBMISSION_BONUS
@@ -161,12 +272,22 @@ class OpenSleuthEnv:
             last_error=result.define_error or "",
         )
         info = {
             "execution_reward": result.execution_reward,
             "complexity_penalty": result.complexity_penalty,
             "matches": result.matches,
             "fuzz_count": result.fuzz_count,
             "define_error": result.define_error,
             "reason": "submission",
         }
         return obs, total, True, info
@@ -186,6 +307,10 @@ class OpenSleuthEnv:
             last_error=last_error,
             steps_taken=state.steps_taken,
             max_steps=max_steps,
         )
     # --- Introspection -----------------------------------------------------
@@ -201,5 +326,6 @@ class OpenSleuthEnv:
             "done": s.done,
             "seen_outputs": sorted(s.seen_outputs),
             "seen_error_types": sorted(s.seen_error_types),
             "probe_history": [r.model_dump() for r in s.probe_history],
         }

 A single OpenSleuthEnv holds a *registry of episodes* keyed by episode_id, so
 multiple training rollouts can hit the same FastAPI server in parallel without
 stepping on each other's state.
+Reward shaping (v0.3 -- paper-driven update):
+* ``PROBE_STEP_COST`` -- per-step cost so the agent doesn't probe forever.
+* ``NEW_OUTPUT_BONUS`` -- first-visit bonus for an output value the target
+  hasn't produced yet (existing behaviour, kept).
+* ``NEW_ERROR_TYPE_BONUS`` -- first-visit bonus for an exception type the
+  target hasn't raised yet (existing behaviour, kept).
+* ``NEW_BUCKET_BONUS`` -- *new*: first-visit bonus for an input bucket the
+  agent hasn't probed yet in this episode, modelled on TF-IDF / count-based
+  exploration bonuses (CovRL-Fuzz; Eom et al. 2024 in Masud et al. 2026
+  §3.5.2 and SimHash; Ibrahim et al. 2024 §IV-C-1). Encourages probing
+  *under-explored regions of the input domain* (negative ints, empty
+  strings, edge values, ...), not just under-observed outputs. Small
+  magnitude so it doesn't drown out the output/error-type bonuses.
+* ``PERFECT_SUBMISSION_BONUS`` -- existing terminal bonus, gated to require
+  100% match (including all edge cases).
+The submission reward formula is now::
+    reward = execution_reward
+           - complexity_penalty
+           - reward_hack_penalty   # new: import-of-reference detector etc.
+           - floor_penalty         # new: -25 floor below 50% match rate
+           + (PERFECT_SUBMISSION_BONUS if execution_reward >= 99.999 else 0)
+This keeps the ``reward`` field a single float (so the in-flight trainer's
+``reward / 100`` GRPO scaling still works) but pushes wrong submissions
+clearly into the negative regime.
 """
 from __future__ import annotations
 import ast
 import logging
 import uuid
+from typing import Any, Tuple
 from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
 from .models import (
     StepResponse,
     SubmitAction,
 )
+from .verifier import generate_fuzz_inputs, get_edge_inputs, verify_submission
 log = logging.getLogger("opensleuth.env")
 PROBE_STEP_COST = -1.0
 NEW_OUTPUT_BONUS = 2.0
 NEW_ERROR_TYPE_BONUS = 5.0
+NEW_BUCKET_BONUS = 0.5  # CovRL-style coverage bonus; small to avoid drowning the rest.
 PERFECT_SUBMISSION_BONUS = 50.0
 MAX_PROBE_HISTORY_IN_OBS = 25
+def _bucket_of(x: Any) -> str:
+    """Map a parsed probe input to a coarse, deterministic bucket label.
+    The label feeds the coverage-based intrinsic reward (CovRL-Fuzz
+    inspired). Buckets combine the value's type with a qualitative
+    magnitude (sign / size / emptiness), so that e.g. ``-1`` and ``-99``
+    share a bucket, while ``-1`` and ``0`` don't.
+    Labels produced:
+    * ``bool:True`` / ``bool:False``
+    * ``int:negative`` (< 0), ``int:zero``, ``int:small`` (1-9),
+      ``int:medium`` (10-99), ``int:large`` (100-9999), ``int:huge``
+    * ``float:nan``, ``float:negative``, ``float:zero``, ``float:positive``
+    * ``str:empty``, ``str:singleton``, ``str:short`` (2-5 chars),
+      ``str:medium`` (6-20 chars), ``str:long``
+    * ``list:...`` / ``tuple:...`` with ``empty``, ``singleton``,
+      ``short`` (2-5 items) or ``long``
+    * ``dict:<len>``, ``none``, and ``other:<type name>`` as the fallback
+    Args:
+        x: The probe input to classify.
+    Returns:
+        The bucket label as a string.
+    """
+    # bool is tested before int because bool is a subclass of int; otherwise
+    # True/False would land in the int buckets.
+    if isinstance(x, bool):
+        return f"bool:{x}"
+    if isinstance(x, int):
+        if x < 0:
+            return "int:negative"
+        if x == 0:
+            return "int:zero"
+        if x < 10:
+            return "int:small"
+        if x < 100:
+            return "int:medium"
+        if x < 10_000:
+            return "int:large"
+        return "int:huge"
+    if isinstance(x, float):
+        # NaN is the only float that compares unequal to itself; it must be
+        # caught first because every ordering comparison below is False for it.
+        if x != x:  # NaN
+            return "float:nan"
+        if x < 0:
+            return "float:negative"
+        if x == 0:
+            return "float:zero"
+        return "float:positive"
+    if isinstance(x, str):
+        if x == "":
+            return "str:empty"
+        if len(x) == 1:
+            return "str:singleton"
+        if len(x) <= 5:
+            return "str:short"
+        if len(x) <= 20:
+            return "str:medium"
+        return "str:long"
+    if isinstance(x, (list, tuple)):
+        # Use the concrete type name so lists and tuples get separate buckets.
+        kind = type(x).__name__
+        if len(x) == 0:
+            return f"{kind}:empty"
+        if len(x) == 1:
+            return f"{kind}:singleton"
+        if len(x) <= 5:
+            return f"{kind}:short"
+        return f"{kind}:long"
+    if isinstance(x, dict):
+        # NOTE(review): this yields one label per distinct dict size, so the
+        # number of dict buckets is unbounded, unlike the fixed size classes
+        # used for str/list/tuple above -- confirm this is intended.
+        return f"dict:{len(x)}"
+    if x is None:
+        return "none"
+    # Anything else is bucketed by type name only.
+    return f"other:{type(x).__name__}"
 class OpenSleuthEnv:
     """Multi-episode environment registry."""
     def _handle_probe(
         self, state: State, spec: FunctionSpec, action: ProbeAction
     ) -> Tuple[Observation, float, bool, dict]:
         try:
             parsed = ast.literal_eval(action.input_repr)
         except (ValueError, SyntaxError) as e:
             obs = self._build_observation(state.episode_id, spec, last_error=err)
             return obs, PROBE_STEP_COST, False, {"reason": "parse_error"}
+        bucket = _bucket_of(parsed)
+        bucket_bonus = 0.0
+        if bucket not in state.seen_buckets:
+            bucket_bonus = NEW_BUCKET_BONUS
+            state.seen_buckets.add(bucket)
         intrinsic = 0.0
         last_error = ""
         try:
             output = spec.fn(parsed)
             output_repr = repr(output)
             state.probe_history.append(
+                ProbeRecord(
+                    input_repr=repr(parsed),
+                    output_repr=output_repr,
+                    is_error=False,
+                    bucket=bucket,
+                )
             )
             if output_repr not in state.seen_outputs:
                 intrinsic += NEW_OUTPUT_BONUS
                     output_repr=err_repr,
                     is_error=True,
                     error_type=error_type,
+                    bucket=bucket,
                 )
             )
             last_error = err_repr
                 intrinsic += NEW_ERROR_TYPE_BONUS
                 state.seen_error_types.add(error_type)
+        reward = intrinsic + bucket_bonus + PROBE_STEP_COST
         obs = self._build_observation(state.episode_id, spec, last_error=last_error)
+        return obs, reward, False, {
+            "intrinsic": intrinsic,
+            "coverage_bonus": bucket_bonus,
+            "bucket": bucket,
+            "buckets_seen": len(state.seen_buckets),
+        }
     def _handle_submit(
         self, state: State, spec: FunctionSpec, action: SubmitAction
     ) -> Tuple[Observation, float, bool, dict]:
         fuzz_inputs = generate_fuzz_inputs(spec, count=self.fuzz_count, seed=state.seed)
+        edge_inputs = get_edge_inputs(spec)
+        result = verify_submission(
+            action.code,
+            spec.fn,
+            fuzz_inputs,
+            target_name=spec.name,
+            edge_inputs=edge_inputs,
+        )
+        total = (
+            result.execution_reward
+            - result.complexity_penalty
+            - result.reward_hack_penalty
+            - result.floor_penalty
+        )
         if result.execution_reward >= 99.999:
             total += PERFECT_SUBMISSION_BONUS
             last_error=result.define_error or "",
         )
         info = {
+            # --- Existing fields the live trainer + eval already read. ----
             "execution_reward": result.execution_reward,
             "complexity_penalty": result.complexity_penalty,
             "matches": result.matches,
             "fuzz_count": result.fuzz_count,
             "define_error": result.define_error,
             "reason": "submission",
+            # --- New, additive fields. -----------------------------------
+            "matches_by_category": result.matches_by_category,
+            "counts_by_category": result.counts_by_category,
+            "edge_pass_rate": result.edge_pass_rate,
+            "reward_hack_penalty": result.reward_hack_penalty,
+            "floor_penalty": result.floor_penalty,
+            "perfect_bonus": (
+                PERFECT_SUBMISSION_BONUS if result.execution_reward >= 99.999 else 0.0
+            ),
         }
         return obs, total, True, info
             last_error=last_error,
             steps_taken=state.steps_taken,
             max_steps=max_steps,
+            difficulty=getattr(spec, "difficulty", None),
+            coverage_buckets_seen=len(state.seen_buckets),
+            seen_outputs_count=len(state.seen_outputs),
+            seen_error_types_count=len(state.seen_error_types),
         )
     # --- Introspection -----------------------------------------------------
             "done": s.done,
             "seen_outputs": sorted(s.seen_outputs),
             "seen_error_types": sorted(s.seen_error_types),
+            "seen_buckets": sorted(s.seen_buckets),
             "probe_history": [r.model_dump() for r in s.probe_history],
         }

opensleuth_env/models.py CHANGED Viewed

@@ -1,4 +1,9 @@
-"""Pydantic models for the OpenSleuth API and core state."""
 from __future__ import annotations
@@ -31,6 +36,9 @@ class ProbeRecord(BaseModel):
     output_repr: str
     is_error: bool = False
     error_type: Optional[str] = None
 class Observation(BaseModel):
@@ -43,6 +51,19 @@ class Observation(BaseModel):
     last_error: str = ""
     steps_taken: int = 0
     max_steps: int = 25
 class StepResponse(BaseModel):
@@ -63,6 +84,7 @@ class State(BaseModel):
     probe_history: List[ProbeRecord] = Field(default_factory=list)
     seen_outputs: set = Field(default_factory=set)
     seen_error_types: set = Field(default_factory=set)
     steps_taken: int = 0
     done: bool = False
     seed: int = 0

+"""Pydantic models for the OpenSleuth API and core state.
+Backwards-compat note: any field added to ``Observation`` /
+``StepResponse`` / ``State`` after v0.2 carries a default value so the
+in-flight trainer (which only inspects a small subset of fields) keeps
+working.
+"""
 from __future__ import annotations
     output_repr: str
     is_error: bool = False
     error_type: Optional[str] = None
+    # Coverage bucket label assigned by the env when the probe was recorded.
+    # ``None`` for parse-error probes (we never executed the target).
+    bucket: Optional[str] = None
 class Observation(BaseModel):
     last_error: str = ""
     steps_taken: int = 0
     max_steps: int = 25
+    # --- New, optional metadata fields (default-safe; trainer ignores them) ---
+    difficulty: Optional[str] = Field(
+        None, description="Curriculum difficulty: easy / medium / hard."
+    )
+    coverage_buckets_seen: int = Field(
+        0, description="How many distinct input-domain buckets the agent has probed so far."
+    )
+    seen_outputs_count: int = Field(
+        0, description="How many distinct outputs the target function has produced so far."
+    )
+    seen_error_types_count: int = Field(
+        0, description="How many distinct error types the target function has raised so far."
+    )
 class StepResponse(BaseModel):
     probe_history: List[ProbeRecord] = Field(default_factory=list)
     seen_outputs: set = Field(default_factory=set)
     seen_error_types: set = Field(default_factory=set)
+    seen_buckets: set = Field(default_factory=set)
     steps_taken: int = 0
     done: bool = False
     seed: int = 0

opensleuth_env/verifier.py CHANGED Viewed

@@ -2,10 +2,34 @@
 function by domain-aware fuzzing, with sandboxed execution and a complexity
 penalty.
-Reward design:
-  execution_reward in [0, 100]   = 100 * matches/fuzz_count
-  complexity_penalty in [0, 50]  = log(cyclomatic) clipped, else 50 on syntax error
-  exec_failure_penalty           = 25 if def-time exec raised, before fuzzing
 """
 from __future__ import annotations
@@ -15,8 +39,8 @@ import math
 import multiprocessing as mp
 import random
 import signal
-from dataclasses import dataclass
-from typing import Any, Callable, List, Optional
 # ----- AST complexity ------------------------------------------------------
@@ -56,24 +80,66 @@ def calculate_complexity_penalty(code: str) -> float:
     return min(50.0, math.log2(v.cc))
-# ----- Sandboxed execution -------------------------------------------------
 def _exec_target_in_sandbox(code: str, target_name: str, queue: mp.Queue) -> None:
     """Run inside a child process so we can hard-kill on timeout."""
     try:
-        # Restricted but still useful builtins. Submitted code rarely needs
-        # imports beyond math/string/itertools/functools, so we whitelist.
-        allowed_modules = {"math", "string", "itertools", "functools", "collections", "re"}
-        safe_globals = {
-            "__builtins__": __builtins__,
-            "__name__": "__sandbox__",
-        }
-        # Pre-import the whitelisted modules so the agent can use them
-        # without needing import statements (and we keep everything inproc).
-        for mod_name in allowed_modules:
-            safe_globals[mod_name] = __import__(mod_name)
         local_scope: dict = {}
         exec(code, safe_globals, local_scope)
         fn = local_scope.get(target_name) or safe_globals.get(target_name)
@@ -147,6 +213,59 @@ class VerificationResult:
     define_error: Optional[str]
     matches: int
     fuzz_count: int
 def verify_submission(
@@ -157,54 +276,91 @@ def verify_submission(
     target_name: Optional[str] = None,
     define_timeout_s: float = 5.0,
     call_timeout_s: float = 1.0,
 ) -> VerificationResult:
-    """Score `submitted_code` against `target_function` over the supplied
-    `fuzz_inputs`. The agent is expected to define a top-level function with
-    the same name as `target_function` (overridable via `target_name`)."""
     name = target_name or target_function.__name__
     define_err = _can_define(submitted_code, name, define_timeout_s)
     complexity = calculate_complexity_penalty(submitted_code)
     if define_err is not None:
         return VerificationResult(
             execution_reward=0.0,
             complexity_penalty=complexity,
             define_error=define_err,
             matches=0,
-            fuzz_count=len(fuzz_inputs),
         )
     # Re-define in-process for fast fuzzing. We just confirmed it won't blow
-    # up at import-time; we still time-bound each call.
-    safe_globals: dict = {
-        "__builtins__": __builtins__,
-        "__name__": "__opensleuth_submission__",
-        "math": __import__("math"),
-        "string": __import__("string"),
-        "itertools": __import__("itertools"),
-        "functools": __import__("functools"),
-        "collections": __import__("collections"),
-        "re": __import__("re"),
-    }
     local_scope: dict = {}
     exec(submitted_code, safe_globals, local_scope)
     submitted_fn = local_scope.get(name) or safe_globals.get(name)
-    matches = 0
-    for inp in fuzz_inputs:
-        ref = _safe_call(target_function, inp, call_timeout_s)
-        sub = _safe_call(submitted_fn, inp, call_timeout_s)
-        if _outputs_equivalent(ref, sub):
-            matches += 1
-    fuzz_count = len(fuzz_inputs) or 1
     exec_reward = 100.0 * (matches / fuzz_count)
     return VerificationResult(
         execution_reward=exec_reward,
         complexity_penalty=complexity,
         define_error=None,
         matches=matches,
         fuzz_count=fuzz_count,
     )
@@ -220,7 +376,6 @@ def _outputs_equivalent(ref, sub) -> bool:
         except Exception:  # noqa: BLE001
             return False
     if rkind == "err" and skind == "err":
-        # Match on exception class name.
         return rval.split(":", 1)[0] == sval.split(":", 1)[0]
     if rkind == "timeout" and skind == "timeout":
         return True
@@ -230,7 +385,13 @@ def _outputs_equivalent(ref, sub) -> bool:
 def generate_fuzz_inputs(
     spec, count: int = 100, seed: Optional[int] = None
 ) -> List[Any]:
-    """Public helper: pull `count` fuzz inputs from a FunctionSpec, optionally
     seeded for reproducibility."""
     rng = random.Random(seed)
     return spec.fuzzer(rng, count)

 function by domain-aware fuzzing, with sandboxed execution and a complexity
 penalty.
+Reward design (v0.3, paper-driven update):
+* ``execution_reward`` in ``[0, 100]`` is the fraction of fuzz inputs whose
+  outputs match the reference, scaled to 100. Inputs are drawn from two
+  categories that are scored separately so the trainer can see *which*
+  regime the agent fails on (Masud et al., 2026 §P3 "reward granularity"):
+    - ``"edge"``   -- spec-defined must-pass cases (anti-deception, paper
+      §C1 of Ibrahim et al., 2024).
+    - ``"random"`` -- the original sampler.
+* ``complexity_penalty`` in ``[0, 50]`` is a bounded log-scaled cyclomatic
+  complexity, or 50 on syntax error.
+* ``reward_hack_penalty`` is a soft anti-hacking signal that fires when the
+  submission is a "constant function" (single distinct output / single
+  exception type) while the reference is genuinely diverse, OR the agent
+  attempts to import the reference module (we block this at sandbox-level
+  too, but we surface the attempt so the trainer can punish it).
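+* ``floor_penalty`` is a flat 25-point penalty (stored as a positive
+  number) applied when fewer than 50% of the inputs match, or when the
+  submission fails to define the target function at all
+  (Vul-R2 style; Wen et al. 2025 in Masud et al. 2026 §3.4.2). This stops
+  agents from learning that emitting *any* syntactically-valid function
+  pays positive reward.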
+The headline ``total_reward`` returned in ``info`` is the *recommended*
+total the env should hand back; the env is free to add a perfect-bonus on
+top.
 """
 from __future__ import annotations
 import multiprocessing as mp
 import random
 import signal
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
 # ----- AST complexity ------------------------------------------------------
     return min(50.0, math.log2(v.cc))
+# ----- Hardened sandbox ----------------------------------------------------
+#
+# Previous version exposed the real ``__builtins__`` to submitted code,
+# which let an agent reward-hack with::
+#
+#     def fibonacci(n):
+#         from opensleuth_env.black_box import _fibonacci
+#         return _fibonacci(n)
+#
+# We now restrict builtins to a hand-picked safe subset and hand-import the
+# whitelisted helper modules so the agent doesn't need ``import`` at all.
+# This is cheap defence-in-depth; the multiprocessing wall-clock timeout
+# below handles infinite loops independently.
# Builtins safe to expose. Notably *no* ``__import__``, ``open``, ``exec``,
# ``eval``, ``compile`` or ``input``. ``__build_class__`` IS on the list on
# purpose: the ``class`` statement looks it up in builtins, so without it a
# submission could not define helper classes.
_SAFE_BUILTINS_NAMES = (
    "abs all any ascii bin bool bytes bytearray callable chr complex dict "
    "divmod enumerate filter float format frozenset getattr hasattr hash "
    "hex id int isinstance issubclass iter len list map max min next object "
    "oct ord pow print property range repr reversed round set slice sorted "
    "str sum tuple type zip True False None NotImplemented Ellipsis "
    "ArithmeticError AssertionError AttributeError BaseException "
    "BufferError BytesWarning DeprecationWarning EOFError Exception "
    "FloatingPointError IndexError KeyError LookupError MemoryError "
    "NameError NotImplementedError OverflowError RecursionError "
    "ReferenceError RuntimeError StopAsyncIteration StopIteration "
    "SyntaxError TypeError UnboundLocalError UnicodeError ValueError "
    "ZeroDivisionError __build_class__"
).split()


def _make_safe_builtins() -> Dict[str, Any]:
    """Build the name -> object mapping for the whitelisted builtins.

    Names from ``_SAFE_BUILTINS_NAMES`` that the running interpreter's
    ``builtins`` module does not provide are skipped silently.
    """
    import builtins as _b

    return {n: getattr(_b, n) for n in _SAFE_BUILTINS_NAMES if hasattr(_b, n)}


# Pristine template. Never hand this dict itself to submitted code; always
# copy it (see ``_make_safe_globals``).
_SAFE_BUILTINS = _make_safe_builtins()
_PREIMPORTED_MODULES = ("math", "string", "itertools", "functools", "collections", "re")


def _make_safe_globals() -> Dict[str, Any]:
    """Return a fresh globals dict for ``exec``-ing one submission.

    The dict carries a restricted ``__builtins__`` mapping, a fixed
    ``__name__`` and the pre-imported helper modules, so submitted code has
    no need for (and no access to) ``import``.

    Each call gets its *own copy* of the builtins mapping. Submitted code can
    reach that mapping through the ``__builtins__`` global and mutate it
    (``__builtins__['len'] = ...``); sharing the module-level
    ``_SAFE_BUILTINS`` dict would let one submission poison the sandbox of
    every later submission evaluated in the same process.
    """
    g: Dict[str, Any] = {
        "__builtins__": dict(_SAFE_BUILTINS),
        "__name__": "__opensleuth_submission__",
    }
    # NOTE(review): these are the real, process-wide module objects, so an
    # attribute assignment made by a submission (e.g. on ``math``) is still
    # visible outside the sandbox -- isolate them if that matters.
    for mod_name in _PREIMPORTED_MODULES:
        g[mod_name] = __import__(mod_name)
    return g
 def _exec_target_in_sandbox(code: str, target_name: str, queue: mp.Queue) -> None:
     """Run inside a child process so we can hard-kill on timeout."""
     try:
+        safe_globals = _make_safe_globals()
         local_scope: dict = {}
         exec(code, safe_globals, local_scope)
         fn = local_scope.get(target_name) or safe_globals.get(target_name)
     define_error: Optional[str]
     matches: int
     fuzz_count: int
+    # New, additive fields (do not change existing field meanings).
+    matches_by_category: Dict[str, int] = field(default_factory=dict)
+    counts_by_category: Dict[str, int] = field(default_factory=dict)
+    edge_pass_rate: float = 0.0
+    reward_hack_penalty: float = 0.0
+    floor_penalty: float = 0.0
def _detect_constant_collapse(
    sub_outputs: List[Any], ref_outputs: List[Any], min_inputs: int = 6
) -> bool:
    """Flag the 'always return 0' / 'always raise' reward-hacking pattern.

    Each element of ``sub_outputs`` / ``ref_outputs`` is a ``(kind, value)``
    pair where ``kind`` is ``"val"``, ``"err"`` or anything else (treated as
    a timeout). Returns True when the submission produced exactly one
    distinct outcome while the reference produced at least three. With fewer
    than ``min_inputs`` submission results the sample is considered too
    small and the answer is always False.
    """
    if len(sub_outputs) < min_inputs:
        return False

    def _fingerprint(outcome):
        tag, payload = outcome
        if tag == "val":
            # Compare by repr so unhashable values can still go in a set;
            # fall back to identity when repr itself blows up.
            try:
                return ("val", repr(payload))
            except Exception:  # noqa: BLE001
                return ("val", id(payload))
        if tag == "err":
            # Only the text before the first ':' takes part in the comparison.
            return ("err", payload.split(":", 1)[0])
        return ("timeout", "")

    distinct_sub = set(map(_fingerprint, sub_outputs))
    distinct_ref = set(map(_fingerprint, ref_outputs))
    return len(distinct_sub) == 1 and len(distinct_ref) >= 3
def _looks_like_reference_import(code: str) -> bool:
    """Statically detect an attempt to import the reference implementation.

    Walks the AST of ``code`` and returns True if any ``import`` or
    ``from ... import`` statement, at any nesting depth, names a module
    whose dotted path starts with ``opensleuth``. The sandbox is expected to
    block the import itself; flagging the attempt lets the env apply an
    explicit penalty instead of a silent zero.

    Returns False when ``code`` cannot be parsed at all. The input is
    untrusted and this check runs before the sandboxed definition check, so
    it must never raise: besides ``SyntaxError``, ``ast.parse`` can raise
    ``ValueError`` (source containing NUL bytes on older interpreters) and
    ``RecursionError`` (pathologically nested source).
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        return False
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            if any(alias.name.startswith("opensleuth") for alias in node.names):
                return True
        elif isinstance(node, ast.ImportFrom):
            # ``node.module`` is None for ``from . import x``.
            if node.module and node.module.startswith("opensleuth"):
                return True
    return False
 def verify_submission(
     target_name: Optional[str] = None,
     define_timeout_s: float = 5.0,
     call_timeout_s: float = 1.0,
+    edge_inputs: Optional[List[Any]] = None,
 ) -> VerificationResult:
+    """Score ``submitted_code`` against ``target_function`` over the supplied
+    ``fuzz_inputs`` (random regime) and ``edge_inputs`` (must-pass regime).
+    The agent is expected to define a top-level function with the same name as
+    ``target_function`` (overridable via ``target_name``)."""
     name = target_name or target_function.__name__
+    edge_inputs = list(edge_inputs or [])
+    # Static reward-hack flag: import-of-reference is always a -25 hit on top
+    # of whatever score the rest of the rubric assigns. Even if the sandbox
+    # successfully blocks the import (it will), we want to *teach* the agent
+    # not to try.
+    hack_penalty = 25.0 if _looks_like_reference_import(submitted_code) else 0.0
     define_err = _can_define(submitted_code, name, define_timeout_s)
     complexity = calculate_complexity_penalty(submitted_code)
     if define_err is not None:
+        total = len(fuzz_inputs) + len(edge_inputs)
         return VerificationResult(
             execution_reward=0.0,
             complexity_penalty=complexity,
             define_error=define_err,
             matches=0,
+            fuzz_count=total,
+            matches_by_category={"edge": 0, "random": 0},
+            counts_by_category={"edge": len(edge_inputs), "random": len(fuzz_inputs)},
+            edge_pass_rate=0.0,
+            reward_hack_penalty=hack_penalty,
+            floor_penalty=25.0,
         )
     # Re-define in-process for fast fuzzing. We just confirmed it won't blow
+    # up at import-time; we still time-bound each call. Note: we use the
+    # restricted globals so e.g. `__import__` is unavailable here too.
+    safe_globals = _make_safe_globals()
     local_scope: dict = {}
     exec(submitted_code, safe_globals, local_scope)
     submitted_fn = local_scope.get(name) or safe_globals.get(name)
+    matches_by_cat: Dict[str, int] = {"edge": 0, "random": 0}
+    counts_by_cat: Dict[str, int] = {"edge": len(edge_inputs), "random": len(fuzz_inputs)}
+    sub_results: List[Any] = []
+    ref_results: List[Any] = []
+    def _score(inputs: List[Any], category: str) -> None:
+        for inp in inputs:
+            ref = _safe_call(target_function, inp, call_timeout_s)
+            sub = _safe_call(submitted_fn, inp, call_timeout_s)
+            sub_results.append(sub)
+            ref_results.append(ref)
+            if _outputs_equivalent(ref, sub):
+                matches_by_cat[category] += 1
+    _score(edge_inputs, "edge")
+    _score(fuzz_inputs, "random")
+    matches = matches_by_cat["edge"] + matches_by_cat["random"]
+    fuzz_count = len(fuzz_inputs) + len(edge_inputs) or 1
     exec_reward = 100.0 * (matches / fuzz_count)
+    edge_pass_rate = (
+        matches_by_cat["edge"] / counts_by_cat["edge"] if counts_by_cat["edge"] else 0.0
+    )
+    # Anti-hacking: constant collapse penalty.
+    if _detect_constant_collapse(sub_results, ref_results):
+        hack_penalty += 15.0
+    # Hard floor for sub-50% match rate. Vul-R2 style: a wrong patch deserves
+    # a clearly negative signal so the agent doesn't learn that 'any defined
+    # function' pays out via the small complexity-bonus / step structure.
+    floor_penalty = 25.0 if exec_reward < 50.0 else 0.0
     return VerificationResult(
         execution_reward=exec_reward,
         complexity_penalty=complexity,
         define_error=None,
         matches=matches,
         fuzz_count=fuzz_count,
+        matches_by_category=matches_by_cat,
+        counts_by_category=counts_by_cat,
+        edge_pass_rate=edge_pass_rate,
+        reward_hack_penalty=hack_penalty,
+        floor_penalty=floor_penalty,
     )
         except Exception:  # noqa: BLE001
             return False
     if rkind == "err" and skind == "err":
         return rval.split(":", 1)[0] == sval.split(":", 1)[0]
     if rkind == "timeout" and skind == "timeout":
         return True
 def generate_fuzz_inputs(
     spec, count: int = 100, seed: Optional[int] = None
 ) -> List[Any]:
+    """Public helper: pull ``count`` fuzz inputs from a FunctionSpec, optionally
     seeded for reproducibility."""
     rng = random.Random(seed)
     return spec.fuzzer(rng, count)
def get_edge_inputs(spec) -> List[Any]:
    """Return a new list holding the spec's must-pass edge inputs.

    Specs without an ``edge_cases`` attribute (e.g. ones that predate the
    v0.3 schema), or whose ``edge_cases`` is empty / ``None``, yield ``[]``.
    """
    cases = getattr(spec, "edge_cases", None)
    if not cases:
        return []
    return list(cases)

server.py CHANGED Viewed

@@ -3,8 +3,9 @@
 from __future__ import annotations
 import logging
-from fastapi import FastAPI, HTTPException
 from opensleuth_env import (
     BLACK_BOX_FUNCTIONS,
@@ -19,7 +20,7 @@ from opensleuth_env import (
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
 log = logging.getLogger("opensleuth.server")
-app = FastAPI(title="OpenSleuth Env", version="0.2.0")
 env = OpenSleuthEnv()
@@ -29,17 +30,26 @@ def health():
 @app.get("/functions")
-def list_functions():
-    return {
-        "functions": [
             {
                 "name": s.name,
                 "signature": s.signature,
                 "description": s.description,
             }
-            for s in BLACK_BOX_FUNCTIONS.values()
-        ]
-    }
 @app.post("/reset")

 from __future__ import annotations
 import logging
+from typing import Optional
+from fastapi import FastAPI, HTTPException, Query
 from opensleuth_env import (
     BLACK_BOX_FUNCTIONS,
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
 log = logging.getLogger("opensleuth.server")
+app = FastAPI(title="OpenSleuth Env", version="0.3.0")
 env = OpenSleuthEnv()
 @app.get("/functions")
+def list_functions(
+    difficulty: Optional[str] = Query(
+        None,
+        description="Optional filter: easy / medium / hard. Used by the trainer for curriculum scheduling.",
+    ),
+):
+    items = []
+    for s in BLACK_BOX_FUNCTIONS.values():
+        if difficulty is not None and getattr(s, "difficulty", None) != difficulty:
+            continue
+        items.append(
             {
                 "name": s.name,
                 "signature": s.signature,
                 "description": s.description,
+                "difficulty": getattr(s, "difficulty", None),
+                "edge_case_count": len(getattr(s, "edge_cases", []) or []),
             }
+        )
+    return {"functions": items}
 @app.post("/reset")