Apply paper-driven improvements: stratified verifier, sandbox hardening, coverage bonus, anti-hacking penalties, curriculum metadata
Browse files- README.md +31 -6
- opensleuth_env/black_box.py +37 -3
- opensleuth_env/env.py +134 -8
- opensleuth_env/models.py +23 -1
- opensleuth_env/verifier.py +204 -43
- server.py +18 -8
README.md
CHANGED
|
@@ -20,7 +20,7 @@ function by probing it, then submit Python source that replicates it.
|
|
| 20 |
| Method | Path | Body | Notes |
|
| 21 |
|-------:|---------------|----------------------------------------|----------------------------------------|
|
| 22 |
| GET | `/health` | — | Liveness probe. |
|
| 23 |
-
| GET | `/functions` |
|
| 24 |
| POST | `/reset` | `{"target_name": "fibonacci", "seed": 0}` | Starts a new episode, returns initial obs + `episode_id`. |
|
| 25 |
| POST | `/step` | `{"episode_id": "...", "action": {...}}` | One agent action. |
|
| 26 |
| GET | `/state/{eid}`| — | Inspect the live state of an episode (debug). |
|
|
@@ -32,12 +32,37 @@ function by probing it, then submit Python source that replicates it.
|
|
| 32 |
{"action_type": "submit", "code": "def fibonacci(n):..."}
|
| 33 |
```
|
| 34 |
|
| 35 |
-
### Reward
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
## Hardware
|
| 43 |
|
|
|
|
| 20 |
| Method | Path | Body | Notes |
|
| 21 |
|-------:|---------------|----------------------------------------|----------------------------------------|
|
| 22 |
| GET | `/health` | — | Liveness probe. |
|
| 23 |
+
| GET | `/functions` | optional `?difficulty=easy\|medium\|hard` | Catalogue of available black-boxes (with curriculum metadata). |
|
| 24 |
| POST | `/reset` | `{"target_name": "fibonacci", "seed": 0}` | Starts a new episode, returns initial obs + `episode_id`. |
|
| 25 |
| POST | `/step` | `{"episode_id": "...", "action": {...}}` | One agent action. |
|
| 26 |
| GET | `/state/{eid}`| — | Inspect the live state of an episode (debug). |
|
|
|
|
| 32 |
{"action_type": "submit", "code": "def fibonacci(n):..."}
|
| 33 |
```
|
| 34 |
|
| 35 |
+
### Reward (v0.3 – paper-driven update)
|
| 36 |
|
| 37 |
+
Inspired by Masud et al. 2026 (*Reward Engineering for RL in Software Tasks*,
|
| 38 |
+
arXiv:2601.19100) and Ibrahim et al. 2024 (*Comprehensive Overview of Reward
|
| 39 |
+
Engineering and Shaping*, arXiv:2408.10215).
|
| 40 |
+
|
| 41 |
+
* **Probe:** `-1` step cost, plus `+2` per newly-seen output, `+5` per
|
| 42 |
+
newly-seen exception type, **and `+0.5` per newly-explored input bucket**
|
| 43 |
+
(CovRL-Fuzz / SimHash-style coverage bonus).
|
| 44 |
+
* **Submit (terminal):**
|
| 45 |
+
`execution_reward − complexity_penalty − reward_hack_penalty − floor_penalty
|
| 46 |
+
(+50 perfect bonus if 100% match)` where:
|
| 47 |
+
* `execution_reward` ∈ `[0, 100]` is computed over **stratified** fuzz
|
| 48 |
+
inputs: spec-defined `edge_cases` are *always* tested in addition to the
|
| 49 |
+
random fuzz batch, and the per-category match counts are returned in
|
| 50 |
+
`info["matches_by_category"]`.
|
| 51 |
+
* `floor_penalty` is a hard `25`-point deduction (net `-25`) for sub-50% match-rate submissions
|
| 52 |
+
(Vul-R2 style; Wen et al. 2025), preventing agents from learning that
|
| 53 |
+
emitting *any* function pays out.
|
| 54 |
+
* `reward_hack_penalty` fires for static import-of-reference attempts
|
| 55 |
+
(`+25`) and for "constant-output" collapse against a diverse reference
|
| 56 |
+
(`+15`). The sandbox additionally **blocks** `__import__`, `open`,
|
| 57 |
+
`eval`, `exec`, `compile`, etc.
|
| 58 |
+
|
| 59 |
+
### Backwards compatibility
|
| 60 |
+
|
| 61 |
+
Existing trainer / eval clients only read `info["execution_reward"]`,
|
| 62 |
+
`info["matches"]`, `info["fuzz_count"]` and `resp["reward"]` — all preserved
|
| 63 |
+
with the same meaning. New fields (`difficulty`, `coverage_buckets_seen`,
|
| 64 |
+
`matches_by_category`, `edge_pass_rate`, `reward_hack_penalty`,
|
| 65 |
+
`floor_penalty`, `perfect_bonus`) are additive and ignored by older clients.
|
| 66 |
|
| 67 |
## Hardware
|
| 68 |
|
opensleuth_env/black_box.py
CHANGED
|
@@ -4,13 +4,26 @@ Each entry pairs the reference implementation with a *typed input domain*
|
|
| 4 |
generator so the verifier can fuzz it, plus a public signature/docstring shown
|
| 5 |
to the agent in the prompt. The reference implementation itself is never
|
| 6 |
shown to the agent.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
import random
|
| 12 |
import string
|
| 13 |
-
from dataclasses import dataclass
|
| 14 |
from typing import Any, Callable, Dict, List
|
| 15 |
|
| 16 |
|
|
@@ -115,7 +128,6 @@ def _fuzz_small_pos_int(rng: random.Random, n: int) -> List[int]:
|
|
| 115 |
|
| 116 |
|
| 117 |
def _fuzz_fib_int(rng: random.Random, n: int) -> List[int]:
|
| 118 |
-
# Mix common values, edges, and random.
|
| 119 |
pool = [1, 2, 3, 10, 20, 30, 50, 89, 90]
|
| 120 |
return [rng.choice(pool) if rng.random() < 0.3 else rng.randint(1, 90) for _ in range(n)]
|
| 121 |
|
|
@@ -158,7 +170,6 @@ def _fuzz_lower_string(rng: random.Random, n: int) -> List[str]:
|
|
| 158 |
|
| 159 |
|
| 160 |
def _fuzz_prime_int(rng: random.Random, n: int) -> List[int]:
|
| 161 |
-
# Mix in known primes and composites to cover both branches.
|
| 162 |
seeded = [0, 1, 2, 3, 4, 9, 11, 15, 17, 25, 29, 97, 100]
|
| 163 |
return [rng.choice(seeded) if rng.random() < 0.3 else rng.randint(0, 200) for _ in range(n)]
|
| 164 |
|
|
@@ -173,6 +184,11 @@ class FunctionSpec:
|
|
| 173 |
signature: str
|
| 174 |
description: str
|
| 175 |
fuzzer: Callable[[random.Random, int], list]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
@@ -187,6 +203,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 187 |
"invalid n (n must be a positive int <= 90)."
|
| 188 |
),
|
| 189 |
fuzzer=_fuzz_fib_int,
|
|
|
|
|
|
|
| 190 |
),
|
| 191 |
FunctionSpec(
|
| 192 |
name="reverse_string",
|
|
@@ -194,6 +212,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 194 |
signature="reverse_string(s: str) -> str",
|
| 195 |
description="Returns the reversed string. Raises TypeError for non-str.",
|
| 196 |
fuzzer=_fuzz_short_string,
|
|
|
|
|
|
|
| 197 |
),
|
| 198 |
FunctionSpec(
|
| 199 |
name="is_palindrome",
|
|
@@ -204,6 +224,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 204 |
"characters. Raises TypeError for non-str."
|
| 205 |
),
|
| 206 |
fuzzer=_fuzz_palindrome_string,
|
|
|
|
|
|
|
| 207 |
),
|
| 208 |
FunctionSpec(
|
| 209 |
name="digit_sum",
|
|
@@ -213,6 +235,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 213 |
"Sum of the decimal digits of n. n must be a non-negative int."
|
| 214 |
),
|
| 215 |
fuzzer=_fuzz_nonneg_int,
|
|
|
|
|
|
|
| 216 |
),
|
| 217 |
FunctionSpec(
|
| 218 |
name="count_vowels",
|
|
@@ -220,6 +244,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 220 |
signature="count_vowels(s: str) -> int",
|
| 221 |
description="Count of vowels (a/e/i/o/u, case-insensitive) in s.",
|
| 222 |
fuzzer=_fuzz_lower_string,
|
|
|
|
|
|
|
| 223 |
),
|
| 224 |
FunctionSpec(
|
| 225 |
name="gcd",
|
|
@@ -230,6 +256,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 230 |
"non-negative ints."
|
| 231 |
),
|
| 232 |
fuzzer=_fuzz_int_pair,
|
|
|
|
|
|
|
| 233 |
),
|
| 234 |
FunctionSpec(
|
| 235 |
name="sort_unique",
|
|
@@ -237,6 +265,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 237 |
signature="sort_unique(xs: list[int]) -> list[int]",
|
| 238 |
description="Sorted, deduplicated list of ints from xs.",
|
| 239 |
fuzzer=_fuzz_int_list,
|
|
|
|
|
|
|
| 240 |
),
|
| 241 |
FunctionSpec(
|
| 242 |
name="caesar_cipher",
|
|
@@ -247,6 +277,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 247 |
"are unchanged."
|
| 248 |
),
|
| 249 |
fuzzer=_fuzz_lower_string,
|
|
|
|
|
|
|
| 250 |
),
|
| 251 |
FunctionSpec(
|
| 252 |
name="is_prime",
|
|
@@ -254,6 +286,8 @@ BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
| 254 |
signature="is_prime(n: int) -> bool",
|
| 255 |
description="True iff n is a prime int. n must be int.",
|
| 256 |
fuzzer=_fuzz_prime_int,
|
|
|
|
|
|
|
| 257 |
),
|
| 258 |
]
|
| 259 |
}
|
|
|
|
| 4 |
generator so the verifier can fuzz it, plus a public signature/docstring shown
|
| 5 |
to the agent in the prompt. The reference implementation itself is never
|
| 6 |
shown to the agent.
|
| 7 |
+
|
| 8 |
+
Each spec also carries:
|
| 9 |
+
|
| 10 |
+
* ``difficulty`` -- one of ``"easy" | "medium" | "hard"``. The trainer / eval
|
| 11 |
+
harnesses use this for curriculum scheduling (easy first, then medium, ...).
|
| 12 |
+
Inspired by the curriculum recommendation in Masud et al. (2026) §C2 and the
|
| 13 |
+
reward-horizon shortening discussion in Ibrahim et al. (2024) §IV-F.
|
| 14 |
+
|
| 15 |
+
* ``edge_cases`` -- a list of must-be-included probe inputs the verifier
|
| 16 |
+
*always* injects on top of the random fuzz batch. This makes the execution
|
| 17 |
+
reward robust against agents that learn to handle the random regime but miss
|
| 18 |
+
edge cases (Masud et al. (2026) C1: proxy-failure mitigation; Ibrahim et al.
|
| 19 |
+
(2024) §III-A: deceptive-reward mitigation).
|
| 20 |
"""
|
| 21 |
|
| 22 |
from __future__ import annotations
|
| 23 |
|
| 24 |
import random
|
| 25 |
import string
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
from typing import Any, Callable, Dict, List
|
| 28 |
|
| 29 |
|
|
|
|
| 128 |
|
| 129 |
|
| 130 |
def _fuzz_fib_int(rng: random.Random, n: int) -> List[int]:
|
|
|
|
| 131 |
pool = [1, 2, 3, 10, 20, 30, 50, 89, 90]
|
| 132 |
return [rng.choice(pool) if rng.random() < 0.3 else rng.randint(1, 90) for _ in range(n)]
|
| 133 |
|
|
|
|
| 170 |
|
| 171 |
|
| 172 |
def _fuzz_prime_int(rng: random.Random, n: int) -> List[int]:
|
|
|
|
| 173 |
seeded = [0, 1, 2, 3, 4, 9, 11, 15, 17, 25, 29, 97, 100]
|
| 174 |
return [rng.choice(seeded) if rng.random() < 0.3 else rng.randint(0, 200) for _ in range(n)]
|
| 175 |
|
|
|
|
| 184 |
signature: str
|
| 185 |
description: str
|
| 186 |
fuzzer: Callable[[random.Random, int], list]
|
| 187 |
+
difficulty: str = "medium"
|
| 188 |
+
# `edge_cases` are *always* probed by the verifier on top of the random
|
| 189 |
+
# fuzz batch. They are scored as their own category ("edge") so the
|
| 190 |
+
# verifier can report stratified pass-rates back to the trainer.
|
| 191 |
+
edge_cases: List[Any] = field(default_factory=list)
|
| 192 |
|
| 193 |
|
| 194 |
BLACK_BOX_FUNCTIONS: Dict[str, FunctionSpec] = {
|
|
|
|
| 203 |
"invalid n (n must be a positive int <= 90)."
|
| 204 |
),
|
| 205 |
fuzzer=_fuzz_fib_int,
|
| 206 |
+
difficulty="easy",
|
| 207 |
+
edge_cases=[1, 2, 3, 10, 89, 90],
|
| 208 |
),
|
| 209 |
FunctionSpec(
|
| 210 |
name="reverse_string",
|
|
|
|
| 212 |
signature="reverse_string(s: str) -> str",
|
| 213 |
description="Returns the reversed string. Raises TypeError for non-str.",
|
| 214 |
fuzzer=_fuzz_short_string,
|
| 215 |
+
difficulty="easy",
|
| 216 |
+
edge_cases=["", "a", "ab", "racecar", "Hello, World!"],
|
| 217 |
),
|
| 218 |
FunctionSpec(
|
| 219 |
name="is_palindrome",
|
|
|
|
| 224 |
"characters. Raises TypeError for non-str."
|
| 225 |
),
|
| 226 |
fuzzer=_fuzz_palindrome_string,
|
| 227 |
+
difficulty="medium",
|
| 228 |
+
edge_cases=["", "a", "ab", "abba", "A man, a plan, a canal: Panama", "Hello"],
|
| 229 |
),
|
| 230 |
FunctionSpec(
|
| 231 |
name="digit_sum",
|
|
|
|
| 235 |
"Sum of the decimal digits of n. n must be a non-negative int."
|
| 236 |
),
|
| 237 |
fuzzer=_fuzz_nonneg_int,
|
| 238 |
+
difficulty="easy",
|
| 239 |
+
edge_cases=[0, 1, 9, 10, 99, 100, 9999],
|
| 240 |
),
|
| 241 |
FunctionSpec(
|
| 242 |
name="count_vowels",
|
|
|
|
| 244 |
signature="count_vowels(s: str) -> int",
|
| 245 |
description="Count of vowels (a/e/i/o/u, case-insensitive) in s.",
|
| 246 |
fuzzer=_fuzz_lower_string,
|
| 247 |
+
difficulty="easy",
|
| 248 |
+
edge_cases=["", "bcd", "AEIOU", "Hello, World!", "aaaaa"],
|
| 249 |
),
|
| 250 |
FunctionSpec(
|
| 251 |
name="gcd",
|
|
|
|
| 256 |
"non-negative ints."
|
| 257 |
),
|
| 258 |
fuzzer=_fuzz_int_pair,
|
| 259 |
+
difficulty="medium",
|
| 260 |
+
edge_cases=[(0, 0), (0, 7), (12, 18), (17, 13), (100, 75)],
|
| 261 |
),
|
| 262 |
FunctionSpec(
|
| 263 |
name="sort_unique",
|
|
|
|
| 265 |
signature="sort_unique(xs: list[int]) -> list[int]",
|
| 266 |
description="Sorted, deduplicated list of ints from xs.",
|
| 267 |
fuzzer=_fuzz_int_list,
|
| 268 |
+
difficulty="easy",
|
| 269 |
+
edge_cases=[[], [1], [1, 1, 1], [3, 1, 2], [-5, 5, 0, -5, 5]],
|
| 270 |
),
|
| 271 |
FunctionSpec(
|
| 272 |
name="caesar_cipher",
|
|
|
|
| 277 |
"are unchanged."
|
| 278 |
),
|
| 279 |
fuzzer=_fuzz_lower_string,
|
| 280 |
+
difficulty="hard",
|
| 281 |
+
edge_cases=["", "abc", "xyz", "Hello, World!", "ABC", "hello world"],
|
| 282 |
),
|
| 283 |
FunctionSpec(
|
| 284 |
name="is_prime",
|
|
|
|
| 286 |
signature="is_prime(n: int) -> bool",
|
| 287 |
description="True iff n is a prime int. n must be int.",
|
| 288 |
fuzzer=_fuzz_prime_int,
|
| 289 |
+
difficulty="medium",
|
| 290 |
+
edge_cases=[0, 1, 2, 3, 4, 17, 25, 97, 100],
|
| 291 |
),
|
| 292 |
]
|
| 293 |
}
|
opensleuth_env/env.py
CHANGED
|
@@ -3,6 +3,34 @@
|
|
| 3 |
A single OpenSleuthEnv holds a *registry of episodes* keyed by episode_id, so
|
| 4 |
multiple training rollouts can hit the same FastAPI server in parallel without
|
| 5 |
stepping on each other's state.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
|
@@ -10,7 +38,7 @@ from __future__ import annotations
|
|
| 10 |
import ast
|
| 11 |
import logging
|
| 12 |
import uuid
|
| 13 |
-
from typing import Tuple
|
| 14 |
|
| 15 |
from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
|
| 16 |
from .models import (
|
|
@@ -22,7 +50,7 @@ from .models import (
|
|
| 22 |
StepResponse,
|
| 23 |
SubmitAction,
|
| 24 |
)
|
| 25 |
-
from .verifier import generate_fuzz_inputs, verify_submission
|
| 26 |
|
| 27 |
log = logging.getLogger("opensleuth.env")
|
| 28 |
|
|
@@ -31,10 +59,65 @@ log = logging.getLogger("opensleuth.env")
|
|
| 31 |
PROBE_STEP_COST = -1.0
|
| 32 |
NEW_OUTPUT_BONUS = 2.0
|
| 33 |
NEW_ERROR_TYPE_BONUS = 5.0
|
|
|
|
| 34 |
PERFECT_SUBMISSION_BONUS = 50.0
|
| 35 |
MAX_PROBE_HISTORY_IN_OBS = 25
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
class OpenSleuthEnv:
|
| 39 |
"""Multi-episode environment registry."""
|
| 40 |
|
|
@@ -98,7 +181,6 @@ class OpenSleuthEnv:
|
|
| 98 |
def _handle_probe(
|
| 99 |
self, state: State, spec: FunctionSpec, action: ProbeAction
|
| 100 |
) -> Tuple[Observation, float, bool, dict]:
|
| 101 |
-
# Parse the agent's input from a Python literal repr.
|
| 102 |
try:
|
| 103 |
parsed = ast.literal_eval(action.input_repr)
|
| 104 |
except (ValueError, SyntaxError) as e:
|
|
@@ -114,13 +196,24 @@ class OpenSleuthEnv:
|
|
| 114 |
obs = self._build_observation(state.episode_id, spec, last_error=err)
|
| 115 |
return obs, PROBE_STEP_COST, False, {"reason": "parse_error"}
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
intrinsic = 0.0
|
| 118 |
last_error = ""
|
| 119 |
try:
|
| 120 |
output = spec.fn(parsed)
|
| 121 |
output_repr = repr(output)
|
| 122 |
state.probe_history.append(
|
| 123 |
-
ProbeRecord(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
)
|
| 125 |
if output_repr not in state.seen_outputs:
|
| 126 |
intrinsic += NEW_OUTPUT_BONUS
|
|
@@ -134,6 +227,7 @@ class OpenSleuthEnv:
|
|
| 134 |
output_repr=err_repr,
|
| 135 |
is_error=True,
|
| 136 |
error_type=error_type,
|
|
|
|
| 137 |
)
|
| 138 |
)
|
| 139 |
last_error = err_repr
|
|
@@ -141,17 +235,34 @@ class OpenSleuthEnv:
|
|
| 141 |
intrinsic += NEW_ERROR_TYPE_BONUS
|
| 142 |
state.seen_error_types.add(error_type)
|
| 143 |
|
| 144 |
-
reward = intrinsic + PROBE_STEP_COST
|
| 145 |
obs = self._build_observation(state.episode_id, spec, last_error=last_error)
|
| 146 |
-
return obs, reward, False, {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
def _handle_submit(
|
| 149 |
self, state: State, spec: FunctionSpec, action: SubmitAction
|
| 150 |
) -> Tuple[Observation, float, bool, dict]:
|
| 151 |
fuzz_inputs = generate_fuzz_inputs(spec, count=self.fuzz_count, seed=state.seed)
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
total =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
if result.execution_reward >= 99.999:
|
| 156 |
total += PERFECT_SUBMISSION_BONUS
|
| 157 |
|
|
@@ -161,12 +272,22 @@ class OpenSleuthEnv:
|
|
| 161 |
last_error=result.define_error or "",
|
| 162 |
)
|
| 163 |
info = {
|
|
|
|
| 164 |
"execution_reward": result.execution_reward,
|
| 165 |
"complexity_penalty": result.complexity_penalty,
|
| 166 |
"matches": result.matches,
|
| 167 |
"fuzz_count": result.fuzz_count,
|
| 168 |
"define_error": result.define_error,
|
| 169 |
"reason": "submission",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
}
|
| 171 |
return obs, total, True, info
|
| 172 |
|
|
@@ -186,6 +307,10 @@ class OpenSleuthEnv:
|
|
| 186 |
last_error=last_error,
|
| 187 |
steps_taken=state.steps_taken,
|
| 188 |
max_steps=max_steps,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
|
| 191 |
# --- Introspection -----------------------------------------------------
|
|
@@ -201,5 +326,6 @@ class OpenSleuthEnv:
|
|
| 201 |
"done": s.done,
|
| 202 |
"seen_outputs": sorted(s.seen_outputs),
|
| 203 |
"seen_error_types": sorted(s.seen_error_types),
|
|
|
|
| 204 |
"probe_history": [r.model_dump() for r in s.probe_history],
|
| 205 |
}
|
|
|
|
| 3 |
A single OpenSleuthEnv holds a *registry of episodes* keyed by episode_id, so
|
| 4 |
multiple training rollouts can hit the same FastAPI server in parallel without
|
| 5 |
stepping on each other's state.
|
| 6 |
+
|
| 7 |
+
Reward shaping (v0.3 -- paper-driven update):
|
| 8 |
+
|
| 9 |
+
* ``PROBE_STEP_COST`` -- per-step cost so the agent doesn't probe forever.
|
| 10 |
+
* ``NEW_OUTPUT_BONUS`` -- first-visit bonus for an output value the target
|
| 11 |
+
hasn't produced yet (existing behaviour, kept).
|
| 12 |
+
* ``NEW_ERROR_TYPE_BONUS`` -- first-visit bonus for an exception type the
|
| 13 |
+
target hasn't raised yet (existing behaviour, kept).
|
| 14 |
+
* ``NEW_BUCKET_BONUS`` -- *new*: TF-IDF / count-based exploration bonus
|
| 15 |
+
(CovRL-Fuzz; Eom et al. 2024 in Masud et al. 2026 §3.5.2 and SimHash;
|
| 16 |
+
Ibrahim et al. 2024 §IV-C-1). Encourages probing *under-explored regions
|
| 17 |
+
of the input domain* (negative ints, empty strings, edge values, ...) not
|
| 18 |
+
just under-observed outputs. Small magnitude so it doesn't drown out the
|
| 19 |
+
output/error-type bonuses.
|
| 20 |
+
* ``PERFECT_SUBMISSION_BONUS`` -- existing terminal bonus, gated to require
|
| 21 |
+
100% match (including all edge cases).
|
| 22 |
+
|
| 23 |
+
The submission reward formula is now::
|
| 24 |
+
|
| 25 |
+
reward = execution_reward
|
| 26 |
+
- complexity_penalty
|
| 27 |
+
- reward_hack_penalty # new: import-of-reference detector etc.
|
| 28 |
+
- floor_penalty # new: 25-point deduction below 50% match rate
|
| 29 |
+
+ (PERFECT_SUBMISSION_BONUS if execution_reward >= 99.999 else 0)
|
| 30 |
+
|
| 31 |
+
This keeps the ``reward`` field a single float (so the in-flight trainer's
|
| 32 |
+
``reward / 100`` GRPO scaling still works) but pushes wrong submissions
|
| 33 |
+
clearly into the negative regime.
|
| 34 |
"""
|
| 35 |
|
| 36 |
from __future__ import annotations
|
|
|
|
| 38 |
import ast
|
| 39 |
import logging
|
| 40 |
import uuid
|
| 41 |
+
from typing import Any, Tuple
|
| 42 |
|
| 43 |
from .black_box import BLACK_BOX_FUNCTIONS, FunctionSpec
|
| 44 |
from .models import (
|
|
|
|
| 50 |
StepResponse,
|
| 51 |
SubmitAction,
|
| 52 |
)
|
| 53 |
+
from .verifier import generate_fuzz_inputs, get_edge_inputs, verify_submission
|
| 54 |
|
| 55 |
log = logging.getLogger("opensleuth.env")
|
| 56 |
|
|
|
|
| 59 |
PROBE_STEP_COST = -1.0
|
| 60 |
NEW_OUTPUT_BONUS = 2.0
|
| 61 |
NEW_ERROR_TYPE_BONUS = 5.0
|
| 62 |
+
NEW_BUCKET_BONUS = 0.5 # CovRL-style coverage bonus; small to avoid drowning the rest.
|
| 63 |
PERFECT_SUBMISSION_BONUS = 50.0
|
| 64 |
MAX_PROBE_HISTORY_IN_OBS = 25
|
| 65 |
|
| 66 |
|
| 67 |
+
def _bucket_of(x: Any) -> str:
|
| 68 |
+
"""Coarse, deterministic bucketisation of a probe input, used for
|
| 69 |
+
coverage-based intrinsic reward (CovRL-Fuzz inspired). Buckets are by
|
| 70 |
+
type + a few qualitative magnitudes (sign / size / emptiness) so that
|
| 71 |
+
e.g. ``-1`` and ``-99`` share a bucket, while ``-1`` and ``0`` don't.
|
| 72 |
+
"""
|
| 73 |
+
if isinstance(x, bool):
|
| 74 |
+
return f"bool:{x}"
|
| 75 |
+
if isinstance(x, int):
|
| 76 |
+
if x < 0:
|
| 77 |
+
return "int:negative"
|
| 78 |
+
if x == 0:
|
| 79 |
+
return "int:zero"
|
| 80 |
+
if x < 10:
|
| 81 |
+
return "int:small"
|
| 82 |
+
if x < 100:
|
| 83 |
+
return "int:medium"
|
| 84 |
+
if x < 10_000:
|
| 85 |
+
return "int:large"
|
| 86 |
+
return "int:huge"
|
| 87 |
+
if isinstance(x, float):
|
| 88 |
+
if x != x: # NaN
|
| 89 |
+
return "float:nan"
|
| 90 |
+
if x < 0:
|
| 91 |
+
return "float:negative"
|
| 92 |
+
if x == 0:
|
| 93 |
+
return "float:zero"
|
| 94 |
+
return "float:positive"
|
| 95 |
+
if isinstance(x, str):
|
| 96 |
+
if x == "":
|
| 97 |
+
return "str:empty"
|
| 98 |
+
if len(x) == 1:
|
| 99 |
+
return "str:singleton"
|
| 100 |
+
if len(x) <= 5:
|
| 101 |
+
return "str:short"
|
| 102 |
+
if len(x) <= 20:
|
| 103 |
+
return "str:medium"
|
| 104 |
+
return "str:long"
|
| 105 |
+
if isinstance(x, (list, tuple)):
|
| 106 |
+
kind = type(x).__name__
|
| 107 |
+
if len(x) == 0:
|
| 108 |
+
return f"{kind}:empty"
|
| 109 |
+
if len(x) == 1:
|
| 110 |
+
return f"{kind}:singleton"
|
| 111 |
+
if len(x) <= 5:
|
| 112 |
+
return f"{kind}:short"
|
| 113 |
+
return f"{kind}:long"
|
| 114 |
+
if isinstance(x, dict):
|
| 115 |
+
return f"dict:{len(x)}"
|
| 116 |
+
if x is None:
|
| 117 |
+
return "none"
|
| 118 |
+
return f"other:{type(x).__name__}"
|
| 119 |
+
|
| 120 |
+
|
| 121 |
class OpenSleuthEnv:
|
| 122 |
"""Multi-episode environment registry."""
|
| 123 |
|
|
|
|
| 181 |
def _handle_probe(
|
| 182 |
self, state: State, spec: FunctionSpec, action: ProbeAction
|
| 183 |
) -> Tuple[Observation, float, bool, dict]:
|
|
|
|
| 184 |
try:
|
| 185 |
parsed = ast.literal_eval(action.input_repr)
|
| 186 |
except (ValueError, SyntaxError) as e:
|
|
|
|
| 196 |
obs = self._build_observation(state.episode_id, spec, last_error=err)
|
| 197 |
return obs, PROBE_STEP_COST, False, {"reason": "parse_error"}
|
| 198 |
|
| 199 |
+
bucket = _bucket_of(parsed)
|
| 200 |
+
bucket_bonus = 0.0
|
| 201 |
+
if bucket not in state.seen_buckets:
|
| 202 |
+
bucket_bonus = NEW_BUCKET_BONUS
|
| 203 |
+
state.seen_buckets.add(bucket)
|
| 204 |
+
|
| 205 |
intrinsic = 0.0
|
| 206 |
last_error = ""
|
| 207 |
try:
|
| 208 |
output = spec.fn(parsed)
|
| 209 |
output_repr = repr(output)
|
| 210 |
state.probe_history.append(
|
| 211 |
+
ProbeRecord(
|
| 212 |
+
input_repr=repr(parsed),
|
| 213 |
+
output_repr=output_repr,
|
| 214 |
+
is_error=False,
|
| 215 |
+
bucket=bucket,
|
| 216 |
+
)
|
| 217 |
)
|
| 218 |
if output_repr not in state.seen_outputs:
|
| 219 |
intrinsic += NEW_OUTPUT_BONUS
|
|
|
|
| 227 |
output_repr=err_repr,
|
| 228 |
is_error=True,
|
| 229 |
error_type=error_type,
|
| 230 |
+
bucket=bucket,
|
| 231 |
)
|
| 232 |
)
|
| 233 |
last_error = err_repr
|
|
|
|
| 235 |
intrinsic += NEW_ERROR_TYPE_BONUS
|
| 236 |
state.seen_error_types.add(error_type)
|
| 237 |
|
| 238 |
+
reward = intrinsic + bucket_bonus + PROBE_STEP_COST
|
| 239 |
obs = self._build_observation(state.episode_id, spec, last_error=last_error)
|
| 240 |
+
return obs, reward, False, {
|
| 241 |
+
"intrinsic": intrinsic,
|
| 242 |
+
"coverage_bonus": bucket_bonus,
|
| 243 |
+
"bucket": bucket,
|
| 244 |
+
"buckets_seen": len(state.seen_buckets),
|
| 245 |
+
}
|
| 246 |
|
| 247 |
def _handle_submit(
|
| 248 |
self, state: State, spec: FunctionSpec, action: SubmitAction
|
| 249 |
) -> Tuple[Observation, float, bool, dict]:
|
| 250 |
fuzz_inputs = generate_fuzz_inputs(spec, count=self.fuzz_count, seed=state.seed)
|
| 251 |
+
edge_inputs = get_edge_inputs(spec)
|
| 252 |
+
result = verify_submission(
|
| 253 |
+
action.code,
|
| 254 |
+
spec.fn,
|
| 255 |
+
fuzz_inputs,
|
| 256 |
+
target_name=spec.name,
|
| 257 |
+
edge_inputs=edge_inputs,
|
| 258 |
+
)
|
| 259 |
|
| 260 |
+
total = (
|
| 261 |
+
result.execution_reward
|
| 262 |
+
- result.complexity_penalty
|
| 263 |
+
- result.reward_hack_penalty
|
| 264 |
+
- result.floor_penalty
|
| 265 |
+
)
|
| 266 |
if result.execution_reward >= 99.999:
|
| 267 |
total += PERFECT_SUBMISSION_BONUS
|
| 268 |
|
|
|
|
| 272 |
last_error=result.define_error or "",
|
| 273 |
)
|
| 274 |
info = {
|
| 275 |
+
# --- Existing fields the live trainer + eval already read. ----
|
| 276 |
"execution_reward": result.execution_reward,
|
| 277 |
"complexity_penalty": result.complexity_penalty,
|
| 278 |
"matches": result.matches,
|
| 279 |
"fuzz_count": result.fuzz_count,
|
| 280 |
"define_error": result.define_error,
|
| 281 |
"reason": "submission",
|
| 282 |
+
# --- New, additive fields. -----------------------------------
|
| 283 |
+
"matches_by_category": result.matches_by_category,
|
| 284 |
+
"counts_by_category": result.counts_by_category,
|
| 285 |
+
"edge_pass_rate": result.edge_pass_rate,
|
| 286 |
+
"reward_hack_penalty": result.reward_hack_penalty,
|
| 287 |
+
"floor_penalty": result.floor_penalty,
|
| 288 |
+
"perfect_bonus": (
|
| 289 |
+
PERFECT_SUBMISSION_BONUS if result.execution_reward >= 99.999 else 0.0
|
| 290 |
+
),
|
| 291 |
}
|
| 292 |
return obs, total, True, info
|
| 293 |
|
|
|
|
| 307 |
last_error=last_error,
|
| 308 |
steps_taken=state.steps_taken,
|
| 309 |
max_steps=max_steps,
|
| 310 |
+
difficulty=getattr(spec, "difficulty", None),
|
| 311 |
+
coverage_buckets_seen=len(state.seen_buckets),
|
| 312 |
+
seen_outputs_count=len(state.seen_outputs),
|
| 313 |
+
seen_error_types_count=len(state.seen_error_types),
|
| 314 |
)
|
| 315 |
|
| 316 |
# --- Introspection -----------------------------------------------------
|
|
|
|
| 326 |
"done": s.done,
|
| 327 |
"seen_outputs": sorted(s.seen_outputs),
|
| 328 |
"seen_error_types": sorted(s.seen_error_types),
|
| 329 |
+
"seen_buckets": sorted(s.seen_buckets),
|
| 330 |
"probe_history": [r.model_dump() for r in s.probe_history],
|
| 331 |
}
|
opensleuth_env/models.py
CHANGED
|
@@ -1,4 +1,9 @@
|
|
| 1 |
-
"""Pydantic models for the OpenSleuth API and core state.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -31,6 +36,9 @@ class ProbeRecord(BaseModel):
|
|
| 31 |
output_repr: str
|
| 32 |
is_error: bool = False
|
| 33 |
error_type: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class Observation(BaseModel):
|
|
@@ -43,6 +51,19 @@ class Observation(BaseModel):
|
|
| 43 |
last_error: str = ""
|
| 44 |
steps_taken: int = 0
|
| 45 |
max_steps: int = 25
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
class StepResponse(BaseModel):
|
|
@@ -63,6 +84,7 @@ class State(BaseModel):
|
|
| 63 |
probe_history: List[ProbeRecord] = Field(default_factory=list)
|
| 64 |
seen_outputs: set = Field(default_factory=set)
|
| 65 |
seen_error_types: set = Field(default_factory=set)
|
|
|
|
| 66 |
steps_taken: int = 0
|
| 67 |
done: bool = False
|
| 68 |
seed: int = 0
|
|
|
|
| 1 |
+
"""Pydantic models for the OpenSleuth API and core state.
|
| 2 |
+
|
| 3 |
+
Backwards-compat note: any field added to ``Observation`` /
|
| 4 |
+
``StepResponse`` / ``State`` after v0.2 carries a default value so the in-flight
|
| 5 |
+
trainer (which only inspects a small subset of fields) keeps working.
|
| 6 |
+
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
|
|
|
| 36 |
output_repr: str
|
| 37 |
is_error: bool = False
|
| 38 |
error_type: Optional[str] = None
|
| 39 |
+
# Coverage bucket label assigned by the env when the probe was recorded.
|
| 40 |
+
# ``None`` for parse-error probes (we never executed the target).
|
| 41 |
+
bucket: Optional[str] = None
|
| 42 |
|
| 43 |
|
| 44 |
class Observation(BaseModel):
|
|
|
|
| 51 |
last_error: str = ""
|
| 52 |
steps_taken: int = 0
|
| 53 |
max_steps: int = 25
|
| 54 |
+
# --- New, optional metadata fields (default-safe; trainer ignores them) ---
|
| 55 |
+
difficulty: Optional[str] = Field(
|
| 56 |
+
None, description="Curriculum difficulty: easy / medium / hard."
|
| 57 |
+
)
|
| 58 |
+
coverage_buckets_seen: int = Field(
|
| 59 |
+
0, description="How many distinct input-domain buckets the agent has probed so far."
|
| 60 |
+
)
|
| 61 |
+
seen_outputs_count: int = Field(
|
| 62 |
+
0, description="How many distinct outputs the target function has produced so far."
|
| 63 |
+
)
|
| 64 |
+
seen_error_types_count: int = Field(
|
| 65 |
+
0, description="How many distinct error types the target function has raised so far."
|
| 66 |
+
)
|
| 67 |
|
| 68 |
|
| 69 |
class StepResponse(BaseModel):
|
|
|
|
| 84 |
probe_history: List[ProbeRecord] = Field(default_factory=list)
|
| 85 |
seen_outputs: set = Field(default_factory=set)
|
| 86 |
seen_error_types: set = Field(default_factory=set)
|
| 87 |
+
seen_buckets: set = Field(default_factory=set)
|
| 88 |
steps_taken: int = 0
|
| 89 |
done: bool = False
|
| 90 |
seed: int = 0
|
opensleuth_env/verifier.py
CHANGED
|
@@ -2,10 +2,34 @@
|
|
| 2 |
function by domain-aware fuzzing, with sandboxed execution and a complexity
|
| 3 |
penalty.
|
| 4 |
|
| 5 |
-
Reward design:
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
from __future__ import annotations
|
|
@@ -15,8 +39,8 @@ import math
|
|
| 15 |
import multiprocessing as mp
|
| 16 |
import random
|
| 17 |
import signal
|
| 18 |
-
from dataclasses import dataclass
|
| 19 |
-
from typing import Any, Callable, List, Optional
|
| 20 |
|
| 21 |
|
| 22 |
# ----- AST complexity ------------------------------------------------------
|
|
@@ -56,24 +80,66 @@ def calculate_complexity_penalty(code: str) -> float:
|
|
| 56 |
return min(50.0, math.log2(v.cc))
|
| 57 |
|
| 58 |
|
| 59 |
-
# -----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def _exec_target_in_sandbox(code: str, target_name: str, queue: mp.Queue) -> None:
|
| 63 |
"""Run inside a child process so we can hard-kill on timeout."""
|
| 64 |
try:
|
| 65 |
-
|
| 66 |
-
# imports beyond math/string/itertools/functools, so we whitelist.
|
| 67 |
-
allowed_modules = {"math", "string", "itertools", "functools", "collections", "re"}
|
| 68 |
-
safe_globals = {
|
| 69 |
-
"__builtins__": __builtins__,
|
| 70 |
-
"__name__": "__sandbox__",
|
| 71 |
-
}
|
| 72 |
-
# Pre-import the whitelisted modules so the agent can use them
|
| 73 |
-
# without needing import statements (and we keep everything inproc).
|
| 74 |
-
for mod_name in allowed_modules:
|
| 75 |
-
safe_globals[mod_name] = __import__(mod_name)
|
| 76 |
-
|
| 77 |
local_scope: dict = {}
|
| 78 |
exec(code, safe_globals, local_scope)
|
| 79 |
fn = local_scope.get(target_name) or safe_globals.get(target_name)
|
|
@@ -147,6 +213,59 @@ class VerificationResult:
|
|
| 147 |
define_error: Optional[str]
|
| 148 |
matches: int
|
| 149 |
fuzz_count: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
|
| 152 |
def verify_submission(
|
|
@@ -157,54 +276,91 @@ def verify_submission(
|
|
| 157 |
target_name: Optional[str] = None,
|
| 158 |
define_timeout_s: float = 5.0,
|
| 159 |
call_timeout_s: float = 1.0,
|
|
|
|
| 160 |
) -> VerificationResult:
|
| 161 |
-
"""Score `submitted_code` against `target_function` over the supplied
|
| 162 |
-
`fuzz_inputs`
|
| 163 |
-
|
|
|
|
| 164 |
name = target_name or target_function.__name__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
define_err = _can_define(submitted_code, name, define_timeout_s)
|
| 167 |
complexity = calculate_complexity_penalty(submitted_code)
|
| 168 |
if define_err is not None:
|
|
|
|
| 169 |
return VerificationResult(
|
| 170 |
execution_reward=0.0,
|
| 171 |
complexity_penalty=complexity,
|
| 172 |
define_error=define_err,
|
| 173 |
matches=0,
|
| 174 |
-
fuzz_count=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
)
|
| 176 |
|
| 177 |
# Re-define in-process for fast fuzzing. We just confirmed it won't blow
|
| 178 |
-
# up at import-time; we still time-bound each call.
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
"__name__": "__opensleuth_submission__",
|
| 182 |
-
"math": __import__("math"),
|
| 183 |
-
"string": __import__("string"),
|
| 184 |
-
"itertools": __import__("itertools"),
|
| 185 |
-
"functools": __import__("functools"),
|
| 186 |
-
"collections": __import__("collections"),
|
| 187 |
-
"re": __import__("re"),
|
| 188 |
-
}
|
| 189 |
local_scope: dict = {}
|
| 190 |
exec(submitted_code, safe_globals, local_scope)
|
| 191 |
submitted_fn = local_scope.get(name) or safe_globals.get(name)
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
ref = _safe_call(target_function, inp, call_timeout_s)
|
| 196 |
-
sub = _safe_call(submitted_fn, inp, call_timeout_s)
|
| 197 |
-
if _outputs_equivalent(ref, sub):
|
| 198 |
-
matches += 1
|
| 199 |
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
exec_reward = 100.0 * (matches / fuzz_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
return VerificationResult(
|
| 203 |
execution_reward=exec_reward,
|
| 204 |
complexity_penalty=complexity,
|
| 205 |
define_error=None,
|
| 206 |
matches=matches,
|
| 207 |
fuzz_count=fuzz_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
)
|
| 209 |
|
| 210 |
|
|
@@ -220,7 +376,6 @@ def _outputs_equivalent(ref, sub) -> bool:
|
|
| 220 |
except Exception: # noqa: BLE001
|
| 221 |
return False
|
| 222 |
if rkind == "err" and skind == "err":
|
| 223 |
-
# Match on exception class name.
|
| 224 |
return rval.split(":", 1)[0] == sval.split(":", 1)[0]
|
| 225 |
if rkind == "timeout" and skind == "timeout":
|
| 226 |
return True
|
|
@@ -230,7 +385,13 @@ def _outputs_equivalent(ref, sub) -> bool:
|
|
| 230 |
def generate_fuzz_inputs(
|
| 231 |
spec, count: int = 100, seed: Optional[int] = None
|
| 232 |
) -> List[Any]:
|
| 233 |
-
"""Public helper: pull `count` fuzz inputs from a FunctionSpec, optionally
|
| 234 |
seeded for reproducibility."""
|
| 235 |
rng = random.Random(seed)
|
| 236 |
return spec.fuzzer(rng, count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
function by domain-aware fuzzing, with sandboxed execution and a complexity
|
| 3 |
penalty.
|
| 4 |
|
| 5 |
+
Reward design (v0.3, paper-driven update):
|
| 6 |
+
|
| 7 |
+
* ``execution_reward`` in ``[0, 100]`` is the fraction of fuzz inputs whose
|
| 8 |
+
outputs match the reference, scaled to 100. Inputs are drawn from two
|
| 9 |
+
categories that are scored separately so the trainer can see *which*
|
| 10 |
+
regime the agent fails on (Masud et al., 2026 §P3 "reward granularity"):
|
| 11 |
+
|
| 12 |
+
- ``"edge"`` -- spec-defined must-pass cases (anti-deception, paper
|
| 13 |
+
§C1 of Ibrahim et al., 2024).
|
| 14 |
+
- ``"random"`` -- the original sampler.
|
| 15 |
+
|
| 16 |
+
* ``complexity_penalty`` in ``[0, 50]`` is a bounded log-scaled cyclomatic
|
| 17 |
+
complexity, or 50 on syntax error.
|
| 18 |
+
|
| 19 |
+
* ``reward_hack_penalty`` is a soft anti-hacking signal that fires when the
|
| 20 |
+
submission is a "constant function" (single distinct output / single
|
| 21 |
+
exception type) while the reference is genuinely diverse, OR the agent
|
| 22 |
+
attempts to import the reference module (we block this at sandbox-level
|
| 23 |
+
too, but we surface the attempt so the trainer can punish it).
|
| 24 |
+
|
| 25 |
+
* ``floor_penalty`` adds a hard ``-25`` floor for sub-50% submissions
|
| 26 |
+
(Vul-R2 style; Wen et al. 2025 in Masud et al. 2026 §3.4.2). This stops
|
| 27 |
+
agents from learning that emitting *any* syntactically-valid function
|
| 28 |
+
pays positive reward.
|
| 29 |
+
|
| 30 |
+
The headline ``total_reward`` returned in ``info`` is the *recommended*
|
| 31 |
+
total the env should hand back; the env is free to add a perfect-bonus on
|
| 32 |
+
top.
|
| 33 |
"""
|
| 34 |
|
| 35 |
from __future__ import annotations
|
|
|
|
| 39 |
import multiprocessing as mp
|
| 40 |
import random
|
| 41 |
import signal
|
| 42 |
+
from dataclasses import dataclass, field
|
| 43 |
+
from typing import Any, Callable, Dict, List, Optional
|
| 44 |
|
| 45 |
|
| 46 |
# ----- AST complexity ------------------------------------------------------
|
|
|
|
| 80 |
return min(50.0, math.log2(v.cc))
|
| 81 |
|
| 82 |
|
| 83 |
+
# ----- Hardened sandbox ----------------------------------------------------
|
| 84 |
+
#
|
| 85 |
+
# Previous version exposed the real ``__builtins__`` to submitted code,
|
| 86 |
+
# which let an agent reward-hack with::
|
| 87 |
+
#
|
| 88 |
+
# def fibonacci(n):
|
| 89 |
+
# from opensleuth_env.black_box import _fibonacci
|
| 90 |
+
# return _fibonacci(n)
|
| 91 |
+
#
|
| 92 |
+
# We now restrict builtins to a hand-picked safe subset and hand-import the
|
| 93 |
+
# whitelisted helper modules so the agent doesn't need ``import`` at all.
|
| 94 |
+
# This is cheap defence-in-depth; the multiprocessing wall-clock timeout
|
| 95 |
+
# below handles infinite loops independently.
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# Builtins safe to expose. Notably *no* ``__import__``, ``open``, ``exec``,
# ``eval``, ``compile``, ``input``, etc. ``__build_class__`` is exposed only
# because ``class`` statements in submitted code need it.
|
| 100 |
+
_SAFE_BUILTINS_NAMES = (
    "abs all any ascii bin bool bytes bytearray callable chr complex dict "
    "divmod enumerate filter float format frozenset getattr hasattr hash "
    "hex id int isinstance issubclass iter len list map max min next object "
    "oct ord pow print property range repr reversed round set slice sorted "
    "str sum tuple type zip True False None NotImplemented Ellipsis "
    "ArithmeticError AssertionError AttributeError BaseException "
    "BufferError BytesWarning DeprecationWarning EOFError Exception "
    "FloatingPointError IndexError KeyError LookupError MemoryError "
    "NameError NotImplementedError OverflowError RecursionError "
    "ReferenceError RuntimeError StopAsyncIteration StopIteration "
    "SyntaxError TypeError UnboundLocalError UnicodeError ValueError "
    "ZeroDivisionError __build_class__"
).split()


def _make_safe_builtins() -> Dict[str, Any]:
    """Build the whitelist mapping ``name -> builtin object``.

    Names from ``_SAFE_BUILTINS_NAMES`` that the running interpreter does not
    provide are skipped silently instead of raising.
    """
    import builtins as _b

    return {n: getattr(_b, n) for n in _SAFE_BUILTINS_NAMES if hasattr(_b, n)}


# Pristine template. Never hand this exact dict to untrusted code: anything
# exec'd with it as ``__builtins__`` could mutate it in place.
_SAFE_BUILTINS = _make_safe_builtins()
_PREIMPORTED_MODULES = ("math", "string", "itertools", "functools", "collections", "re")


def _make_safe_globals() -> Dict[str, Any]:
    """Return a fresh globals dict for ``exec``-ing one submission.

    The dict carries a restricted ``__builtins__`` mapping plus the
    pre-imported helper modules, so submitted code does not need ``import``.

    ``__builtins__`` is a *copy* of ``_SAFE_BUILTINS`` made per call.
    Submitted code can reach that mapping by name (it is an ordinary global),
    so sharing one dict across calls would let a statement such as
    ``__builtins__['len'] = ...`` leak into every later submission executed in
    the same process.
    """
    g: Dict[str, Any] = {
        "__builtins__": dict(_SAFE_BUILTINS),
        "__name__": "__opensleuth_submission__",
    }
    # NOTE(review): the module objects themselves are still shared, so a
    # submission could monkey-patch e.g. ``math``; that is not addressed here.
    for mod_name in _PREIMPORTED_MODULES:
        g[mod_name] = __import__(mod_name)
    return g
|
| 137 |
|
| 138 |
|
| 139 |
def _exec_target_in_sandbox(code: str, target_name: str, queue: mp.Queue) -> None:
|
| 140 |
"""Run inside a child process so we can hard-kill on timeout."""
|
| 141 |
try:
|
| 142 |
+
safe_globals = _make_safe_globals()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
local_scope: dict = {}
|
| 144 |
exec(code, safe_globals, local_scope)
|
| 145 |
fn = local_scope.get(target_name) or safe_globals.get(target_name)
|
|
|
|
| 213 |
define_error: Optional[str]
|
| 214 |
matches: int
|
| 215 |
fuzz_count: int
|
| 216 |
+
# New, additive fields (do not change existing field meanings).
|
| 217 |
+
matches_by_category: Dict[str, int] = field(default_factory=dict)
|
| 218 |
+
counts_by_category: Dict[str, int] = field(default_factory=dict)
|
| 219 |
+
edge_pass_rate: float = 0.0
|
| 220 |
+
reward_hack_penalty: float = 0.0
|
| 221 |
+
floor_penalty: float = 0.0
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _detect_constant_collapse(
    sub_outputs: List[Any], ref_outputs: List[Any], min_inputs: int = 6
) -> bool:
    """Flag submissions that behave like a constant function.

    Each call result is a ``(kind, value)`` pair. Results are reduced to a
    fingerprint: the ``repr`` of a returned value, the text before the first
    ``":"`` of an error string, or a single shared marker for anything else.

    Returns ``True`` only when at least ``min_inputs`` submission results were
    collected, they all share one fingerprint, and the reference results show
    three or more distinct fingerprints.
    """
    if len(sub_outputs) < min_inputs:
        return False

    def _fingerprint(outcome):
        tag, payload = outcome
        if tag == "err":
            # Only the part before the first colon counts, so differing
            # messages with the same prefix collapse together.
            return ("err", payload.partition(":")[0])
        if tag != "val":
            return ("timeout", "")
        try:
            text = repr(payload)
        except Exception:  # noqa: BLE001
            # A broken __repr__ must not crash scoring; fall back to identity.
            return ("val", id(payload))
        return ("val", text)

    distinct_sub = set(map(_fingerprint, sub_outputs))
    distinct_ref = set(map(_fingerprint, ref_outputs))
    return len(distinct_sub) == 1 and len(distinct_ref) >= 3
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def _looks_like_reference_import(code: str) -> bool:
    """Statically detect attempts to import the reference implementation.

    Returns ``True`` when ``code`` contains any of:

    * ``import opensleuth...`` / ``from opensleuth... import ...``
    * a call to ``__import__`` or ``import_module`` (bare or as an attribute,
      e.g. ``importlib.import_module``) whose first positional argument is a
      string literal starting with ``"opensleuth"``.

    Source that cannot be parsed returns ``False``: there is nothing to
    inspect. This is a best-effort static check; dynamically assembled module
    names are not detected.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError covers source containing NUL bytes on interpreters that
        # do not report that case as SyntaxError.
        return False

    def _is_reference(module_name: Any) -> bool:
        return isinstance(module_name, str) and module_name.startswith("opensleuth")

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            if any(_is_reference(alias.name) for alias in node.names):
                return True
        elif isinstance(node, ast.ImportFrom):
            # ``node.module`` is None for ``from . import x``.
            if _is_reference(node.module):
                return True
        elif isinstance(node, ast.Call) and node.args:
            func = node.func
            if isinstance(func, ast.Name):
                callee = func.id
            else:
                callee = getattr(func, "attr", None)
            first = node.args[0]
            if (
                callee in ("__import__", "import_module")
                and isinstance(first, ast.Constant)
                and _is_reference(first.value)
            ):
                return True
    return False
|
| 269 |
|
| 270 |
|
| 271 |
def verify_submission(
|
|
|
|
| 276 |
target_name: Optional[str] = None,
|
| 277 |
define_timeout_s: float = 5.0,
|
| 278 |
call_timeout_s: float = 1.0,
|
| 279 |
+
edge_inputs: Optional[List[Any]] = None,
|
| 280 |
) -> VerificationResult:
|
| 281 |
+
"""Score ``submitted_code`` against ``target_function`` over the supplied
|
| 282 |
+
``fuzz_inputs`` (random regime) and ``edge_inputs`` (must-pass regime).
|
| 283 |
+
The agent is expected to define a top-level function with the same name as
|
| 284 |
+
``target_function`` (overridable via ``target_name``)."""
|
| 285 |
name = target_name or target_function.__name__
|
| 286 |
+
edge_inputs = list(edge_inputs or [])
|
| 287 |
+
|
| 288 |
+
# Static reward-hack flag: import-of-reference is always a -25 hit on top
|
| 289 |
+
# of whatever score the rest of the rubric assigns. Even if the sandbox
|
| 290 |
+
# successfully blocks the import (it will), we want to *teach* the agent
|
| 291 |
+
# not to try.
|
| 292 |
+
hack_penalty = 25.0 if _looks_like_reference_import(submitted_code) else 0.0
|
| 293 |
|
| 294 |
define_err = _can_define(submitted_code, name, define_timeout_s)
|
| 295 |
complexity = calculate_complexity_penalty(submitted_code)
|
| 296 |
if define_err is not None:
|
| 297 |
+
total = len(fuzz_inputs) + len(edge_inputs)
|
| 298 |
return VerificationResult(
|
| 299 |
execution_reward=0.0,
|
| 300 |
complexity_penalty=complexity,
|
| 301 |
define_error=define_err,
|
| 302 |
matches=0,
|
| 303 |
+
fuzz_count=total,
|
| 304 |
+
matches_by_category={"edge": 0, "random": 0},
|
| 305 |
+
counts_by_category={"edge": len(edge_inputs), "random": len(fuzz_inputs)},
|
| 306 |
+
edge_pass_rate=0.0,
|
| 307 |
+
reward_hack_penalty=hack_penalty,
|
| 308 |
+
floor_penalty=25.0,
|
| 309 |
)
|
| 310 |
|
| 311 |
# Re-define in-process for fast fuzzing. We just confirmed it won't blow
|
| 312 |
+
# up at import-time; we still time-bound each call. Note: we use the
|
| 313 |
+
# restricted globals so e.g. `__import__` is unavailable here too.
|
| 314 |
+
safe_globals = _make_safe_globals()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
local_scope: dict = {}
|
| 316 |
exec(submitted_code, safe_globals, local_scope)
|
| 317 |
submitted_fn = local_scope.get(name) or safe_globals.get(name)
|
| 318 |
|
| 319 |
+
matches_by_cat: Dict[str, int] = {"edge": 0, "random": 0}
|
| 320 |
+
counts_by_cat: Dict[str, int] = {"edge": len(edge_inputs), "random": len(fuzz_inputs)}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
+
sub_results: List[Any] = []
|
| 323 |
+
ref_results: List[Any] = []
|
| 324 |
+
|
| 325 |
+
def _score(inputs: List[Any], category: str) -> None:
|
| 326 |
+
for inp in inputs:
|
| 327 |
+
ref = _safe_call(target_function, inp, call_timeout_s)
|
| 328 |
+
sub = _safe_call(submitted_fn, inp, call_timeout_s)
|
| 329 |
+
sub_results.append(sub)
|
| 330 |
+
ref_results.append(ref)
|
| 331 |
+
if _outputs_equivalent(ref, sub):
|
| 332 |
+
matches_by_cat[category] += 1
|
| 333 |
+
|
| 334 |
+
_score(edge_inputs, "edge")
|
| 335 |
+
_score(fuzz_inputs, "random")
|
| 336 |
+
|
| 337 |
+
matches = matches_by_cat["edge"] + matches_by_cat["random"]
|
| 338 |
+
fuzz_count = len(fuzz_inputs) + len(edge_inputs) or 1
|
| 339 |
exec_reward = 100.0 * (matches / fuzz_count)
|
| 340 |
+
edge_pass_rate = (
|
| 341 |
+
matches_by_cat["edge"] / counts_by_cat["edge"] if counts_by_cat["edge"] else 0.0
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# Anti-hacking: constant collapse penalty.
|
| 345 |
+
if _detect_constant_collapse(sub_results, ref_results):
|
| 346 |
+
hack_penalty += 15.0
|
| 347 |
+
|
| 348 |
+
# Hard floor for sub-50% match rate. Vul-R2 style: a wrong patch deserves
|
| 349 |
+
# a clearly negative signal so the agent doesn't learn that 'any defined
|
| 350 |
+
# function' pays out via the small complexity-bonus / step structure.
|
| 351 |
+
floor_penalty = 25.0 if exec_reward < 50.0 else 0.0
|
| 352 |
+
|
| 353 |
return VerificationResult(
|
| 354 |
execution_reward=exec_reward,
|
| 355 |
complexity_penalty=complexity,
|
| 356 |
define_error=None,
|
| 357 |
matches=matches,
|
| 358 |
fuzz_count=fuzz_count,
|
| 359 |
+
matches_by_category=matches_by_cat,
|
| 360 |
+
counts_by_category=counts_by_cat,
|
| 361 |
+
edge_pass_rate=edge_pass_rate,
|
| 362 |
+
reward_hack_penalty=hack_penalty,
|
| 363 |
+
floor_penalty=floor_penalty,
|
| 364 |
)
|
| 365 |
|
| 366 |
|
|
|
|
| 376 |
except Exception: # noqa: BLE001
|
| 377 |
return False
|
| 378 |
if rkind == "err" and skind == "err":
|
|
|
|
| 379 |
return rval.split(":", 1)[0] == sval.split(":", 1)[0]
|
| 380 |
if rkind == "timeout" and skind == "timeout":
|
| 381 |
return True
|
|
|
|
| 385 |
def generate_fuzz_inputs(
    spec, count: int = 100, seed: Optional[int] = None
) -> List[Any]:
    """Draw ``count`` fuzz inputs from ``spec``.

    A dedicated ``random.Random(seed)`` is created and passed to
    ``spec.fuzzer`` together with ``count``; whatever the fuzzer returns is
    handed back unchanged. A fixed ``seed`` makes the generator's state
    reproducible.
    """
    seeded_rng = random.Random(seed)
    draw = spec.fuzzer
    return draw(seeded_rng, count)
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def get_edge_inputs(spec) -> List[Any]:
    """Return a new list holding the spec's ``edge_cases``.

    A missing attribute, ``None`` or any other falsy value yields an empty
    list, so specs that do not declare edge cases keep working.
    """
    declared = getattr(spec, "edge_cases", [])
    if not declared:
        return []
    return list(declared)
|
server.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import logging
|
|
|
|
| 6 |
|
| 7 |
-
from fastapi import FastAPI, HTTPException
|
| 8 |
|
| 9 |
from opensleuth_env import (
|
| 10 |
BLACK_BOX_FUNCTIONS,
|
|
@@ -19,7 +20,7 @@ from opensleuth_env import (
|
|
| 19 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
| 20 |
log = logging.getLogger("opensleuth.server")
|
| 21 |
|
| 22 |
-
app = FastAPI(title="OpenSleuth Env", version="0.
|
| 23 |
env = OpenSleuthEnv()
|
| 24 |
|
| 25 |
|
|
@@ -29,17 +30,26 @@ def health():
|
|
| 29 |
|
| 30 |
|
| 31 |
@app.get("/functions")
|
| 32 |
-
def list_functions(
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
{
|
| 36 |
"name": s.name,
|
| 37 |
"signature": s.signature,
|
| 38 |
"description": s.description,
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
}
|
| 43 |
|
| 44 |
|
| 45 |
@app.post("/reset")
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import logging
|
| 6 |
+
from typing import Optional
|
| 7 |
|
| 8 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 9 |
|
| 10 |
from opensleuth_env import (
|
| 11 |
BLACK_BOX_FUNCTIONS,
|
|
|
|
| 20 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
| 21 |
log = logging.getLogger("opensleuth.server")
|
| 22 |
|
| 23 |
+
app = FastAPI(title="OpenSleuth Env", version="0.3.0")
|
| 24 |
env = OpenSleuthEnv()
|
| 25 |
|
| 26 |
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
@app.get("/functions")
def list_functions(
    difficulty: Optional[str] = Query(
        None,
        description="Optional filter: easy / medium / hard. Used by the trainer for curriculum scheduling.",
    ),
):
    # Catalogue endpoint: one entry per registered black-box spec, optionally
    # narrowed to those whose ``difficulty`` attribute equals the query value.
    # Specs without ``difficulty`` / ``edge_cases`` attributes are reported
    # with ``None`` / ``0`` respectively.
    def _filtered_out(spec) -> bool:
        return difficulty is not None and getattr(spec, "difficulty", None) != difficulty

    catalogue = [
        {
            "name": spec.name,
            "signature": spec.signature,
            "description": spec.description,
            "difficulty": getattr(spec, "difficulty", None),
            "edge_case_count": len(getattr(spec, "edge_cases", []) or []),
        }
        for spec in BLACK_BOX_FUNCTIONS.values()
        if not _filtered_out(spec)
    ]
    return {"functions": catalogue}
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
@app.post("/reset")
|