Spaces:

anugrah55
/

opensleuth-training-gemini-cli

Paused

App Files Files Community

anugrah55 committed 29 days ago

Commit

78575eb

verified ·

1 Parent(s): 8c92f05

trainer v0.4: switch to Qwen2.5-3B-Instruct, dynamic task discovery, delegated probe sampling, difficulty-weighted rollouts, push to opensleuth-qwen2.5-3b-grpo-v2; sentinel cleared on FORCE_TRAIN=1.

Browse files

Files changed (5) hide show

entrypoint.sh +10 -0
opensleuth_train/__init__.py +9 -2
opensleuth_train/client.py +42 -1
opensleuth_train/dataset.py +178 -30
train.py +74 -13

entrypoint.sh CHANGED Viewed

@@ -37,7 +37,17 @@ sleep 2
 #    previous container start (the Space orchestrator restarts containers
 #    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
 #    on duplicate runs. Set FORCE_TRAIN=1 to override.
 SENTINEL="/data/.opensleuth-trained"
 if [[ -f "$SENTINEL" && -z "${FORCE_TRAIN:-}" ]]; then
     log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
     sleep infinity

 #    previous container start (the Space orchestrator restarts containers
 #    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
 #    on duplicate runs. Set FORCE_TRAIN=1 to override.
+#
+# v0.4 update: when FORCE_TRAIN=1 is set, we explicitly *delete* the old
+# sentinel up-front. Without this the sentinel from a previous v0.2 run
+# (Qwen 0.5B / 9 builtins) blocks the v0.4 run (Qwen 3B / 15 tasks) on
+# Space restart. The sentinel only ever gets re-touched after a fresh
+# successful training run completes below.
 SENTINEL="/data/.opensleuth-trained"
+if [[ -n "${FORCE_TRAIN:-}" && -f "$SENTINEL" ]]; then
+    log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
+    rm -f "$SENTINEL"
+fi
 if [[ -f "$SENTINEL" && -z "${FORCE_TRAIN:-}" ]]; then
     log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
     sleep infinity

opensleuth_train/__init__.py CHANGED Viewed

@@ -1,13 +1,20 @@
 """OpenSleuth training-side helpers (env client, dataset, reward fn)."""
 from .client import EnvClient
-from .dataset import build_synthesis_dataset, FUNCTIONS_FOR_TRAINING
 from .prompt import SYSTEM_PROMPT, build_prompt, extract_code
 __all__ = [
     "EnvClient",
-    "build_synthesis_dataset",
     "FUNCTIONS_FOR_TRAINING",
     "SYSTEM_PROMPT",
     "build_prompt",
     "extract_code",

 """OpenSleuth training-side helpers (env client, dataset, reward fn)."""
 from .client import EnvClient
+from .dataset import (
+    DEFAULT_N_BY_DIFFICULTY,
+    FUNCTIONS_FOR_TRAINING,
+    build_synthesis_dataset,
+    discover_functions,
+)
 from .prompt import SYSTEM_PROMPT, build_prompt, extract_code
 __all__ = [
     "EnvClient",
+    "DEFAULT_N_BY_DIFFICULTY",
     "FUNCTIONS_FOR_TRAINING",
+    "build_synthesis_dataset",
+    "discover_functions",
     "SYSTEM_PROMPT",
     "build_prompt",
     "extract_code",

opensleuth_train/client.py CHANGED Viewed

@@ -5,7 +5,7 @@ from __future__ import annotations
 import logging
 import os
 import time
-from typing import Any, Dict
 import requests
@@ -32,16 +32,57 @@ class EnvClient:
                 time.sleep(wait)
         raise RuntimeError(f"env POST {path} failed after {self.retries} retries: {last_exc}")
     def health(self) -> Dict[str, Any]:
         r = requests.get(f"{self.base_url}/health", timeout=self.timeout)
         r.raise_for_status()
         return r.json()
     def list_functions(self) -> list[Dict[str, str]]:
         r = requests.get(f"{self.base_url}/functions", timeout=self.timeout)
         r.raise_for_status()
         return r.json()["functions"]
     def reset(self, target_name: str, seed: int = 0, max_steps: int = 25) -> Dict[str, Any]:
         return self._post("/reset", {"target_name": target_name, "seed": seed, "max_steps": max_steps})

 import logging
 import os
 import time
+from typing import Any, Dict, List, Optional
 import requests
                 time.sleep(wait)
         raise RuntimeError(f"env POST {path} failed after {self.retries} retries: {last_exc}")
+    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        last_exc: Exception | None = None
+        for attempt in range(self.retries):
+            try:
+                r = requests.get(f"{self.base_url}{path}", params=params, timeout=self.timeout)
+                r.raise_for_status()
+                return r.json()
+            except (requests.RequestException, ValueError) as e:  # noqa: PERF203
+                last_exc = e
+                wait = 0.5 * (2 ** attempt)
+                log.warning("env GET %s failed (%s); retrying in %.1fs", path, e, wait)
+                time.sleep(wait)
+        raise RuntimeError(f"env GET {path} failed after {self.retries} retries: {last_exc}")
     def health(self) -> Dict[str, Any]:
         r = requests.get(f"{self.base_url}/health", timeout=self.timeout)
         r.raise_for_status()
         return r.json()
     def list_functions(self) -> list[Dict[str, str]]:
+        """Legacy v0.3 endpoint -- only the 9 builtin functions."""
         r = requests.get(f"{self.base_url}/functions", timeout=self.timeout)
         r.raise_for_status()
         return r.json()["functions"]
+    def list_tasks(
+        self,
+        source: str = "all",
+        difficulty: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
+        """v0.4 catalog endpoint -- builtins + Hub-driven tasks.
+        Each item carries: ``name``, ``signature``, ``description``,
+        ``difficulty`` (``easy|medium|hard|None``), ``edge_case_count``,
+        ``source`` (``builtin|hub``).
+        """
+        params: Dict[str, Any] = {"source": source}
+        if difficulty:
+            params["difficulty"] = difficulty
+        return self._get("/tasks", params=params)["tasks"]
+    def sample_inputs(self, target_name: str, n: int = 8, seed: int = 0) -> List[str]:
+        """Pull ``n`` ready-to-probe input_repr strings from the env's own
+        auto-fuzzer. Encapsulates the fuzz logic on the env side so the
+        trainer doesn't have to keep its own per-task input pools in sync."""
+        resp = self._get(
+            f"/tasks/{target_name}/sample_inputs",
+            params={"n": n, "seed": seed},
+        )
+        return list(resp["inputs"])
     def reset(self, target_name: str, seed: int = 0, max_steps: int = 25) -> Dict[str, Any]:
         return self._post("/reset", {"target_name": target_name, "seed": seed, "max_steps": max_steps})

opensleuth_train/dataset.py CHANGED Viewed

@@ -1,16 +1,26 @@
 """Build the training dataset of (function_name, signature, probes) → prompt.
-We pre-sample probes server-side with deterministic seeds so the LLM trains
-on a consistent set of in-context examples per task. The actual *reward* is
-computed by re-submitting the model's code against the env with a fresh fuzz
-seed, so the model can't memorise probe outputs.
 """
 from __future__ import annotations
 import logging
 import random
-from typing import List
 from datasets import Dataset
@@ -19,6 +29,51 @@ from .prompt import build_prompt
 log = logging.getLogger("opensleuth.dataset")
 FUNCTIONS_FOR_TRAINING: List[str] = [
     "fibonacci",
     "reverse_string",
@@ -32,31 +87,50 @@ FUNCTIONS_FOR_TRAINING: List[str] = [
 ]
-def _sample_probes(client: EnvClient, target_name: str, seed: int, n_probes: int) -> tuple[str, list[tuple[str, str, bool]]]:
-    """Open an episode and feed it `n_probes` random valid inputs sourced from
-    the env's own fuzz generator (we just hit /functions and synthesise inputs
-    locally to avoid coupling to a specific spec API)."""
-    rng = random.Random(seed)
-    ep = client.reset(target_name=target_name, seed=seed, max_steps=n_probes + 5)
-    sig = ep["target_function_signature"]
-    eid = ep["episode_id"]
-    inputs = _make_probe_inputs(target_name, rng, n_probes)
-    history: list[tuple[str, str, bool]] = []
-    for inp_repr in inputs:
-        resp = client.probe(eid, inp_repr)
-        last = resp["observation"]["probe_history"][-1]
-        history.append((last["input_repr"], last["output_repr"], bool(last["is_error"])))
-    return sig, history
-def _make_probe_inputs(target_name: str, rng: random.Random, n: int) -> list[str]:
-    """Generate `n` Python-literal repr strings appropriate for this function.
-    Kept in lock-step (loosely) with the env's fuzz generators so probes
-    almost always land on the function's valid domain, with a few intentional
-    out-of-domain inputs to expose error-handling.
     """
     if target_name == "fibonacci":
         pool = [1, 2, 5, 10, 20, 40, 89, -1, 0, 100]
     elif target_name == "reverse_string":
@@ -86,27 +160,101 @@ def _make_probe_inputs(target_name: str, rng: random.Random, n: int) -> list[str
     return [repr(rng.choice(pool)) for _ in range(n)]
 def build_synthesis_dataset(
     client: EnvClient,
     *,
-    n_per_function: int = 24,
     n_probes: int = 6,
     seed: int = 0,
 ) -> Dataset:
-    """Build a HuggingFace Dataset of {prompt, target_function_name} rows."""
     rows = []
     rng = random.Random(seed)
-    for fn_name in FUNCTIONS_FOR_TRAINING:
-        for k in range(n_per_function):
             row_seed = rng.randrange(0, 2**31)
-            sig, probes = _sample_probes(client, fn_name, row_seed, n_probes)
             prompt = build_prompt(fn_name, sig, probes)
             rows.append(
                 {
                     "prompt": prompt,
                     "target_function_name": fn_name,
                     "row_seed": row_seed,
                 }
             )
     rng.shuffle(rows)
     return Dataset.from_list(rows)

 """Build the training dataset of (function_name, signature, probes) → prompt.
+v0.4 update: tasks and probe inputs are *discovered from the live env*, not
+hardcoded on the trainer side. This means a fresh task pushed to the
+``anugrah55/opensleuth-tasks`` Hub dataset is picked up by the next
+trainer run with zero code changes here.
+Per-task probe inputs come from the env's ``/tasks/{name}/sample_inputs``
+endpoint, which delegates to the same hand-written fuzzer (for the 9
+builtins) or auto-fuzzer (for Hub-driven tasks) that the verifier uses.
+This guarantees the in-context probes the model trains on are drawn from
+the same distribution as the verifier's fuzz batch.
+Difficulty-weighted sampling: harder tasks get more rollouts (longer tail
+of unique seeds), since the agent needs more attempts to learn them.
+Defaults: ``easy=8, medium=16, hard=24`` rollouts per task.
 """
 from __future__ import annotations
 import logging
 import random
+from typing import Iterable, List, Optional, Sequence
 from datasets import Dataset
 log = logging.getLogger("opensleuth.dataset")
+# Difficulty bucket → default rollouts per task. Caller can override per call
+# or via N_EASY / N_MEDIUM / N_HARD env vars in train.py.
+DEFAULT_N_BY_DIFFICULTY = {"easy": 8, "medium": 16, "hard": 24}
+# Tasks with no/unknown difficulty fall back to "medium".
+DEFAULT_N_FALLBACK = 16
+# ---------------------------------------------------------------------------
+# Task discovery
+# ---------------------------------------------------------------------------
def discover_functions(
    client: EnvClient,
    *,
    source: str = "all",
    include: Optional[Sequence[str]] = None,
    difficulty: Optional[str] = None,
) -> List[dict]:
    """Return the live task catalog from the env Space, optionally filtered.

    Parameters:
      ``source``: ``"builtin" | "hub" | "all"`` (default ``"all"``);
        forwarded to ``client.list_tasks``.
      ``include``: if non-empty, keep only tasks whose ``name`` is in it.
        Names are stripped and blank entries ignored. A bare string is
        accepted and read as a comma-separated list of names.
      ``difficulty``: ``"easy" | "medium" | "hard" | "all" | None``.
        ``None`` and ``"all"`` mean no filtering. Matching is
        case-insensitive; tasks without a difficulty never match a filter.

    Raises:
      ``RuntimeError`` if the filters leave no task at all.
    """
    tasks = client.list_tasks(source=source)

    level = difficulty.lower() if difficulty else ""
    if level and level != "all":
        tasks = [t for t in tasks if (t.get("difficulty") or "").lower() == level]

    names = include
    if isinstance(names, str):
        # A str is itself a Sequence[str]; iterating it would filter on single
        # characters. Interpret it as a comma-separated list instead.
        names = names.split(",")
    if names:
        wanted = {n.strip() for n in names if n and n.strip()}
        if wanted:
            tasks = [t for t in tasks if t["name"] in wanted]

    if not tasks:
        raise RuntimeError(
            f"discover_functions filtered down to 0 tasks "
            f"(source={source!r}, include={include!r}, difficulty={difficulty!r})."
        )
    return tasks
+# Backwards-compat shim: old callers (eval/run_eval.py) imported a static
+# list. Now defaults to the 9 builtins so import-time consumers don't make
+# a network call. Use ``discover_functions(client)`` for the live catalog.
 FUNCTIONS_FOR_TRAINING: List[str] = [
     "fibonacci",
     "reverse_string",
 ]
+# ---------------------------------------------------------------------------
+# Probe sampling -- delegated to env's auto-fuzzer
+# ---------------------------------------------------------------------------
+def _make_probe_inputs(
+    target_name: str,
+    rng: random.Random,
+    n: int,
+    *,
+    client: Optional[EnvClient] = None,
+    seed: Optional[int] = None,
+) -> List[str]:
+    """Get ``n`` Python-literal repr strings appropriate for ``target_name``.
+    Preferred path: hit the env's ``/tasks/{name}/sample_inputs`` endpoint
+    so the trainer-side probe pool is always in lock-step with the
+    verifier's fuzzer. Falls back to a tiny hardcoded pool only for the
+    9 legacy builtins so callers without a client (e.g. unit tests) still
+    work.
+    ``rng`` is consulted only for the legacy fallback path; when ``client``
+    is provided we forward ``seed`` (or a fresh one drawn from ``rng``) to
+    the env so the result is reproducible across runs.
     """
+    if client is not None:
+        if seed is None:
+            seed = rng.randrange(0, 2**31) if rng is not None else 0
+        try:
+            return client.sample_inputs(target_name=target_name, n=n, seed=seed)
+        except Exception as e:  # noqa: BLE001
+            # Don't crash the dataset build if the env hiccups -- fall through
+            # to the legacy pool for builtins, or "1" * n for unknowns.
+            log.warning(
+                "env sample_inputs(%s, n=%d, seed=%s) failed: %s; falling back to legacy pool",
+                target_name, n, seed, e,
+            )
+    return _legacy_probe_pool(target_name, rng, n)
+def _legacy_probe_pool(target_name: str, rng: random.Random, n: int) -> List[str]:
+    """Hardcoded pool for the 9 builtin functions. Kept as a fallback only
+    so unit tests / offline callers still work; the live trainer uses
+    ``client.sample_inputs`` exclusively."""
     if target_name == "fibonacci":
         pool = [1, 2, 5, 10, 20, 40, 89, -1, 0, 100]
     elif target_name == "reverse_string":
     return [repr(rng.choice(pool)) for _ in range(n)]
+# ---------------------------------------------------------------------------
+# Single-row sampler
+# ---------------------------------------------------------------------------
def _sample_probes(
    client: EnvClient,
    target_name: str,
    seed: int,
    n_probes: int,
) -> tuple[str, list[tuple[str, str, bool]]]:
    """Run one probing episode and collect what the env reported.

    Resets the env for ``target_name`` (step budget ``n_probes + 5``), obtains
    ``n_probes`` inputs via ``_make_probe_inputs`` using the same ``seed``,
    and probes each one. A probe that raises is logged and skipped, so the
    returned history may be shorter than ``n_probes``.

    Returns ``(signature, history)`` where each history entry is
    ``(input_repr, output_repr, is_error)`` taken from the last item of the
    response's ``probe_history``.
    """
    rng = random.Random(seed)
    episode = client.reset(target_name=target_name, seed=seed, max_steps=n_probes + 5)
    signature = episode["target_function_signature"]
    episode_id = episode["episode_id"]

    observed: list[tuple[str, str, bool]] = []
    for literal in _make_probe_inputs(target_name, rng, n_probes, client=client, seed=seed):
        try:
            reply = client.probe(episode_id, literal)
        except Exception as exc:  # noqa: BLE001
            log.warning("probe failed for %s with %r: %s", target_name, literal, exc)
            continue
        entry = reply["observation"]["probe_history"][-1]
        observed.append(
            (entry["input_repr"], entry["output_repr"], bool(entry["is_error"]))
        )
    return signature, observed
+# ---------------------------------------------------------------------------
+# Dataset builder
+# ---------------------------------------------------------------------------
 def build_synthesis_dataset(
     client: EnvClient,
     *,
+    n_per_function: Optional[int] = None,
+    n_easy: int = DEFAULT_N_BY_DIFFICULTY["easy"],
+    n_medium: int = DEFAULT_N_BY_DIFFICULTY["medium"],
+    n_hard: int = DEFAULT_N_BY_DIFFICULTY["hard"],
     n_probes: int = 6,
     seed: int = 0,
+    include: Optional[Sequence[str]] = None,
+    difficulty: Optional[str] = None,
+    tasks: Optional[Iterable[dict]] = None,
 ) -> Dataset:
+    """Build a HuggingFace Dataset of {prompt, target_function_name} rows.
+    ``n_per_function`` (legacy v0.3 knob) overrides the difficulty-weighted
+    sampling and applies a uniform N to every task. The new default behaviour
+    is to sample ``n_easy / n_medium / n_hard`` rollouts per task by
+    difficulty bucket; harder tasks need more rollouts to learn.
+    """
+    if tasks is None:
+        tasks = discover_functions(
+            client, include=include, difficulty=difficulty,
+        )
+    tasks = list(tasks)
+    by_diff = {"easy": n_easy, "medium": n_medium, "hard": n_hard}
     rows = []
     rng = random.Random(seed)
+    log.info("building dataset over %d task(s); per-difficulty rollouts: %s%s",
+             len(tasks), by_diff,
+             f" (override n_per_function={n_per_function})" if n_per_function else "")
+    for task in tasks:
+        fn_name = task["name"]
+        diff = (task.get("difficulty") or "").lower()
+        if n_per_function is not None:
+            n_rollouts = int(n_per_function)
+        else:
+            n_rollouts = by_diff.get(diff, DEFAULT_N_FALLBACK)
+        log.info("  %-22s difficulty=%-8s rollouts=%d source=%s",
+                 fn_name, diff or "?", n_rollouts, task.get("source", "?"))
+        for _ in range(n_rollouts):
             row_seed = rng.randrange(0, 2**31)
+            try:
+                sig, probes = _sample_probes(client, fn_name, row_seed, n_probes)
+            except Exception as e:  # noqa: BLE001
+                log.warning("rollout build failed for %s seed=%d: %s; skipping row",
+                            fn_name, row_seed, e)
+                continue
             prompt = build_prompt(fn_name, sig, probes)
             rows.append(
                 {
                     "prompt": prompt,
                     "target_function_name": fn_name,
                     "row_seed": row_seed,
+                    "difficulty": diff or "unknown",
                 }
             )
     rng.shuffle(rows)
+    log.info("built dataset: %d rows total", len(rows))
     return Dataset.from_list(rows)

train.py CHANGED Viewed

@@ -27,6 +27,7 @@ from opensleuth_train import (
     EnvClient,
     SYSTEM_PROMPT,
     build_synthesis_dataset,
 )
 from opensleuth_train.reward import format_reward, make_env_reward
@@ -39,23 +40,58 @@ logging.basicConfig(
 log = logging.getLogger("opensleuth.train")
 def parse_args() -> argparse.Namespace:
     p = argparse.ArgumentParser()
     p.add_argument("--env-url", default=os.environ.get("ENV_URL", "https://anugrah55-opensleuth-env-gemini-cli.hf.space"))
-    p.add_argument("--model-name", default=os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct"))
     p.add_argument("--output-dir", default=os.environ.get("OUTPUT_DIR", "/data/opensleuth-grpo"))
-    p.add_argument("--push-to-hub", default=os.environ.get("PUSH_TO_HUB", "anugrah55/opensleuth-qwen2.5-0.5b-grpo"))
-    p.add_argument("--n-per-function", type=int, default=int(os.environ.get("N_PER_FUNCTION", "16")))
     p.add_argument("--n-probes", type=int, default=int(os.environ.get("N_PROBES", "6")))
-    p.add_argument("--num-generations", type=int, default=int(os.environ.get("NUM_GENERATIONS", "4")))
-    p.add_argument("--max-completion-length", type=int, default=int(os.environ.get("MAX_COMPLETION_LENGTH", "320")))
-    p.add_argument("--max-prompt-length", type=int, default=int(os.environ.get("MAX_PROMPT_LENGTH", "768")))
     p.add_argument("--learning-rate", type=float, default=float(os.environ.get("LEARNING_RATE", "1e-5")))
     p.add_argument("--num-train-epochs", type=float, default=float(os.environ.get("NUM_TRAIN_EPOCHS", "1")))
     # GRPO requires per_device_train_batch_size to be a multiple of num_generations
     # (one prompt is repeated num_generations times, all in the same forward pass).
-    # Default to 1 prompt × num_generations completions per device step.
-    p.add_argument("--per-device-batch-size", type=int, default=int(os.environ.get("PER_DEVICE_BATCH_SIZE", "0")))
     p.add_argument("--gradient-accumulation-steps", type=int, default=int(os.environ.get("GRAD_ACCUM", "4")))
     p.add_argument("--no-4bit", action="store_true", default=os.environ.get("NO_4BIT", "0") == "1")
     p.add_argument("--seed", type=int, default=int(os.environ.get("SEED", "42")))
@@ -83,12 +119,34 @@ def main() -> int:
     client = EnvClient(base_url=args.env_url, timeout=60.0, retries=4)
     wait_for_env(client)
-    fns = client.list_functions()
-    log.info("env exposes %d functions: %s", len(fns), [f["name"] for f in fns])
-    log.info("building synthesis dataset (n_per_function=%d, n_probes=%d)", args.n_per_function, args.n_probes)
     dataset = build_synthesis_dataset(
-        client, n_per_function=args.n_per_function, n_probes=args.n_probes, seed=args.seed
     )
     log.info("dataset size: %d rows", len(dataset))
@@ -105,7 +163,10 @@ def main() -> int:
             "row_seed": row["row_seed"],
         }
-    dataset = dataset.map(to_chat, remove_columns=["prompt"])
     # ---- Model + LoRA ----
     log.info("loading model %s (4bit=%s)", args.model_name, not args.no_4bit)

     EnvClient,
     SYSTEM_PROMPT,
     build_synthesis_dataset,
+    discover_functions,
 )
 from opensleuth_train.reward import format_reward, make_env_reward
 log = logging.getLogger("opensleuth.train")
+def _split_csv(s: str) -> list[str]:
+    return [x.strip() for x in s.split(",") if x.strip()]
 def parse_args() -> argparse.Namespace:
     p = argparse.ArgumentParser()
     p.add_argument("--env-url", default=os.environ.get("ENV_URL", "https://anugrah55-opensleuth-env-gemini-cli.hf.space"))
+    # v0.4 default: switch to Qwen2.5-3B-Instruct for the open-ended task pool.
+    # The 0.5B baseline saturated easy tasks but couldn't solve the hard /
+    # Hub-driven ones. 3B + LoRA + 4-bit fits T4-small (16GB).
+    p.add_argument("--model-name", default=os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-3B-Instruct"))
     p.add_argument("--output-dir", default=os.environ.get("OUTPUT_DIR", "/data/opensleuth-grpo"))
+    p.add_argument(
+        "--push-to-hub",
+        default=os.environ.get(
+            "PUSH_TO_HUB", "anugrah55/opensleuth-qwen2.5-3b-grpo-v2"
+        ),
+    )
+    # Task selection / curriculum knobs (v0.4).
+    p.add_argument(
+        "--functions",
+        default=os.environ.get("FUNCTIONS_INCLUDE", ""),
+        help="Comma-separated subset of task names to train on. Empty = all "
+        "tasks the env exposes (builtin + Hub).",
+    )
+    p.add_argument(
+        "--difficulty",
+        default=os.environ.get("DIFFICULTY_FILTER", "all"),
+        choices=["easy", "medium", "hard", "all"],
+        help="Curriculum filter: only sample tasks at this difficulty level.",
+    )
+    # Difficulty-weighted rollout counts. Replaces the v0.3 single
+    # n-per-function knob (kept as an optional override).
+    p.add_argument("--n-easy", type=int, default=int(os.environ.get("N_EASY", "8")))
+    p.add_argument("--n-medium", type=int, default=int(os.environ.get("N_MEDIUM", "16")))
+    p.add_argument("--n-hard", type=int, default=int(os.environ.get("N_HARD", "24")))
+    p.add_argument(
+        "--n-per-function",
+        type=int,
+        default=int(os.environ.get("N_PER_FUNCTION", "0")),
+        help="If >0, overrides per-difficulty rollout counts with a uniform N.",
+    )
     p.add_argument("--n-probes", type=int, default=int(os.environ.get("N_PROBES", "6")))
+    # GRPO/optim defaults sized for T4-small (16GB) + Qwen2.5-3B-4bit + LoRA.
+    p.add_argument("--num-generations", type=int, default=int(os.environ.get("NUM_GENERATIONS", "2")))
+    p.add_argument("--max-completion-length", type=int, default=int(os.environ.get("MAX_COMPLETION_LENGTH", "384")))
+    p.add_argument("--max-prompt-length", type=int, default=int(os.environ.get("MAX_PROMPT_LENGTH", "1024")))
     p.add_argument("--learning-rate", type=float, default=float(os.environ.get("LEARNING_RATE", "1e-5")))
     p.add_argument("--num-train-epochs", type=float, default=float(os.environ.get("NUM_TRAIN_EPOCHS", "1")))
     # GRPO requires per_device_train_batch_size to be a multiple of num_generations
     # (one prompt is repeated num_generations times, all in the same forward pass).
+    p.add_argument("--per-device-batch-size", type=int, default=int(os.environ.get("PER_DEVICE_BATCH_SIZE", "2")))
     p.add_argument("--gradient-accumulation-steps", type=int, default=int(os.environ.get("GRAD_ACCUM", "4")))
     p.add_argument("--no-4bit", action="store_true", default=os.environ.get("NO_4BIT", "0") == "1")
     p.add_argument("--seed", type=int, default=int(os.environ.get("SEED", "42")))
     client = EnvClient(base_url=args.env_url, timeout=60.0, retries=4)
     wait_for_env(client)
+    include = _split_csv(args.functions) if args.functions else None
+    difficulty = None if args.difficulty == "all" else args.difficulty
+    tasks = discover_functions(client, include=include, difficulty=difficulty)
+    log.info(
+        "env catalog: %d task(s) after filter (include=%s, difficulty=%s):",
+        len(tasks), include, difficulty,
+    )
+    for t in tasks:
+        log.info(
+            "  - %-22s difficulty=%-8s source=%s",
+            t["name"], t.get("difficulty"), t.get("source"),
+        )
+    n_per_function_override = args.n_per_function if args.n_per_function > 0 else None
+    log.info(
+        "building synthesis dataset (n_easy=%d n_medium=%d n_hard=%d override=%s n_probes=%d)",
+        args.n_easy, args.n_medium, args.n_hard, n_per_function_override, args.n_probes,
+    )
     dataset = build_synthesis_dataset(
+        client,
+        tasks=tasks,
+        n_per_function=n_per_function_override,
+        n_easy=args.n_easy,
+        n_medium=args.n_medium,
+        n_hard=args.n_hard,
+        n_probes=args.n_probes,
+        seed=args.seed,
     )
     log.info("dataset size: %d rows", len(dataset))
             "row_seed": row["row_seed"],
         }
+    # Drop the human-readable difficulty column from the GRPO-visible map so
+    # the trainer doesn't try to forward it as a reward-fn kwarg.
+    drop_cols = [c for c in ("prompt", "difficulty") if c in dataset.column_names]
+    dataset = dataset.map(to_chat, remove_columns=drop_cols)
     # ---- Model + LoRA ----
     log.info("loading model %s (4bit=%s)", args.model_name, not args.no_4bit)