Spaces:

jampuramprem
/

AxiomForgeAI

Sleeping

App Files Files Community

jampuramprem commited on 12 days ago

Commit

ec4ae03

0 Parent(s):

Initial Space deployment

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +73 -0
Dockerfile +130 -0
README.md +132 -0
__init__.py +16 -0
blog.md +94 -0
client.py +76 -0
docs/environment-overview.puml +69 -0
docs/reward-system.puml +51 -0
docs/training-phases.puml +27 -0
images/axiomforgeai_scenes/scene_01.svg +52 -0
images/axiomforgeai_scenes/scene_02.svg +72 -0
images/axiomforgeai_scenes/scene_03.svg +67 -0
images/axiomforgeai_scenes/scene_04.svg +78 -0
images/axiomforgeai_scenes/scene_05.svg +66 -0
images/axiomforgeai_scenes/scene_06.svg +79 -0
images/axiomforgeai_scenes/scene_07.svg +66 -0
images/axiomforgeai_scenes/scene_08.svg +74 -0
images/axiomforgeai_scenes/scene_09.svg +61 -0
images/axiomforgeai_scenes/scene_10.svg +86 -0
images/blog_flow/architecture.svg +50 -0
images/blog_flow/grading.svg +45 -0
images/blog_flow/grpo-loop.svg +44 -0
images/blog_flow/task-sources.svg +35 -0
images/environment_overview.svg +0 -0
images/training_phases.svg +1 -0
logs/grpo/grpo_20260426_024029.log +44 -0
logs/grpo/grpo_20260426_032827.log +0 -0
logs/grpo/grpo_20260426_032827/config.json +44 -0
logs/grpo/grpo_20260426_032827/console_output.log +0 -0
logs/grpo/grpo_20260426_032827/metrics.csv +31 -0
logs/metrics.jsonl +31 -0
models.py +67 -0
openenv.yaml +7 -0
pyproject.toml +55 -0
requirements.txt +160 -0
scripts/__init__.py +1 -0
scripts/convert_gsm8k_to_sft.py +193 -0
scripts/create_dual_task_dataset.py +321 -0
scripts/demo_before_after.py +591 -0
scripts/dual_task_sft_pipeline.py +390 -0
scripts/eval_sft_inference.py +565 -0
scripts/gsm8k_sft_pipeline.py +475 -0
scripts/launch_grpo.sh +127 -0
scripts/plot_grpo_run.py +425 -0
scripts/plot_training_results.py +521 -0
scripts/precompute_extraction_cache.py +174 -0
scripts/prepare_aqua_dataset.py +265 -0
scripts/prepare_combined_dataset.py +711 -0
scripts/run_grpo_training.py +0 -0
scripts/run_inference.py +502 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,73 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+.nox/
+coverage.xml
+*.cover
+*.py,cover
+# Type checkers / static analyzers
+.mypy_cache/
+.pyre/
+.ruff_cache/
+.pytype/
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+# Local environment files
+.env
+.env.*
+*.local
+# IDE / editor files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS files
+.DS_Store
+Thumbs.db
+data/
+*/ui
+images/

Dockerfile ADDED Viewed

	@@ -0,0 +1,130 @@

+# AxiomForgeAI — GRPO Training Image
+# ─────────────────────────────────────────────────────────────────────────────
+# Hardware target  : 1× A100 PCIE 80 GB  |  AMD EPYC 7V13  |  NVMe 300 GB
+#
+# CUDA driver      : >= 13.0  (enforced at container start via entrypoint)
+# CUDA toolkit     : 12.4.1   (backward-compatible with driver 13.x)
+# PyTorch          : 2.5.1+cu124  (pinned in requirements.txt)
+# Flash-Attn       : 2.8.3        (pinned in requirements.txt)
+#
+# All Python package versions are taken exclusively from requirements.txt.
+# No versions are hard-coded in this file.
+#
+# ── Build ─────────────────────────────────────────────────────────────────────
+#   docker build -t axiomforgeai-train:latest .
+#
+# ── Interactive shell ─────────────────────────────────────────────────────────
+#   docker run --gpus all --ipc=host --ulimit memlock=-1 \
+#     -v $(pwd)/data:/workspace/data \
+#     -v $(pwd)/checkpoints:/workspace/checkpoints \
+#     -v $(pwd)/logs:/workspace/logs \
+#     -it axiomforgeai-train:latest bash
+#
+# ── GRPO training (one-shot) ──────────────────────────────────────────────────
+#   docker run --gpus all --ipc=host --ulimit memlock=-1 \
+#     -v $(pwd)/data:/workspace/data \
+#     -v $(pwd)/checkpoints:/workspace/checkpoints \
+#     -v $(pwd)/logs:/workspace/logs \
+#     axiomforgeai-train:latest \
+#     python scripts/run_grpo_training.py \
+#       --base-model checkpoints/dual_task_v1 \
+#       --gsm8k-data data/sft/gsm8k_sft.jsonl \
+#       --num-iterations 30 --group-size 8 --questions-per-iter 16
+# ─────────────────────────────────────────────────────────────────────────────
+# CUDA toolkit 12.4.1 — matches the cu124 wheels in requirements.txt and is
+# fully compatible with the A100's CUDA 13.2 driver (driver is always ≥ toolkit).
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
+LABEL org.opencontainers.image.title="AxiomForgeAI Training" \
+      cuda.driver.minimum="13.0" \
+      cuda.toolkit="12.4.1" \
+      torch.version="2.5.1+cu124" \
+      flash_attn.version="2.8.3"
+# ── System packages ────────────────────────────────────────────────────────────
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        python3.11 \
+        python3.11-dev \
+        python3-pip \
+        python3.11-venv \
+        git \
+        git-lfs \
+        curl \
+        wget \
+        build-essential \
+        ninja-build \
+        pkg-config \
+        libssl-dev \
+        libffi-dev \
+        ca-certificates \
+    && ln -sf /usr/bin/python3.11 /usr/bin/python3 \
+    && ln -sf /usr/bin/python3    /usr/bin/python \
+    && rm -rf /var/lib/apt/lists/*
+# ── Upgrade pip + build tooling ───────────────────────────────────────────────
+RUN python -m pip install --upgrade --no-cache-dir pip setuptools wheel
+# ── PyTorch (CUDA 12.4 wheels) ────────────────────────────────────────────────
+# Must be installed before flash-attn because flash-attn runs a torch version
+# check at install time.  The cu124 index is also used for all CUDA-linked wheels.
+# Version is taken from requirements.txt — the --constraint flag keeps pip from
+# re-resolving to a different version when requirements.txt is processed next.
+RUN pip install --no-cache-dir \
+        --extra-index-url https://download.pytorch.org/whl/cu124 \
+        "torch==2.5.1" "torchvision==0.20.1" "torchaudio==2.5.1"
+# ── All remaining pinned requirements (from requirements.txt) ─────────────────
+# flash-attn, xformers, vllm, triton, bitsandbytes, transformers, accelerate,
+# peft, ray, sympy, scipy, numpy, openenv-core, fastapi, uvicorn, … are all
+# installed here at the exact versions pinned in requirements.txt.
+# The cu124 index is provided so CUDA-linked wheels resolve correctly.
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir \
+        --extra-index-url https://download.pytorch.org/whl/cu124 \
+        -r /tmp/requirements.txt
+# ── Project source ───────────��────────────────────────────────────────────────
+WORKDIR /workspace
+COPY . /workspace/
+# ── Environment variables ─────────────────────────────────────────────────────
+# Repo root on PYTHONPATH so `from src.rl.X import Y` works without editable install
+ENV PYTHONPATH="/workspace:$PYTHONPATH"
+# HuggingFace model cache — mount a host path here to persist model downloads:
+#   -v /host/hf_cache:/workspace/.hf_cache
+ENV HF_HOME="/workspace/.hf_cache"
+ENV TRANSFORMERS_CACHE="/workspace/.hf_cache"
+# A100 CUDA / NCCL tuning
+ENV CUDA_DEVICE_MAX_CONNECTIONS=1
+ENV NCCL_P2P_DISABLE=0
+ENV NCCL_IB_DISABLE=0
+# Required for Flash-Attn 2 with bfloat16 on Ampere
+ENV TORCH_CUDNN_V8_API_ENABLED=1
+# ── Runtime entrypoint: enforce CUDA driver >= 13.0 ──────────────────────────
+# nvidia-smi is injected at runtime via --gpus, so this check runs when the
+# container starts, not at build time.
+RUN printf '%s\n' \
+    '#!/bin/sh' \
+    'if command -v nvidia-smi >/dev/null 2>&1; then' \
+    '  CUDA_VER=$(nvidia-smi 2>/dev/null | grep -oP "CUDA Version: \K[0-9.]+" || echo "0.0")' \
+    '  MAJOR=$(echo "$CUDA_VER" | cut -d. -f1)' \
+    '  echo "[AxiomForgeAI] CUDA driver reports toolkit: $CUDA_VER"' \
+    '  if [ "${MAJOR:-0}" -lt 13 ] 2>/dev/null; then' \
+    '    echo "[ERROR] CUDA driver >= 13.0 required; detected $CUDA_VER. Upgrade your NVIDIA driver."' \
+    '    exit 1' \
+    '  fi' \
+    '  echo "[AxiomForgeAI] CUDA $CUDA_VER >= 13.0 — OK"' \
+    'else' \
+    '  echo "[WARNING] nvidia-smi not found — CUDA driver version check skipped."' \
+    'fi' \
+    'exec "$@"' \
+    > /usr/local/bin/entrypoint.sh \
+    && chmod +x /usr/local/bin/entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["bash"]

README.md ADDED Viewed

	@@ -0,0 +1,132 @@

+---
+title: AxiomForgeAI Environment Server
+emoji: 🌌
+colorFrom: indigo
+colorTo: pink
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+---
+# AxiomForgeAI
+[![OpenEnv](https://img.shields.io/badge/Powered%20by-OpenEnv-blue)](https://github.com/meta-pytorch/OpenEnv)
+*A self-improving math environment where a model practices on verified problems, generates new challenges when ready, and learns from solution attempts whose reasoning steps and final answers agree.*
+## The Problem
+Math reasoning models can fail in two different ways. Sometimes the setup, arithmetic, and algebraic steps look reasonable, but the final answer is wrong. Sometimes the final answer is right, but the reasoning that produced it is incomplete, inconsistent, or hard to trust.
+For a math user, both failures matter. Checking only the final answer misses where the solution went off track. Checking only the steps misses whether the work actually reaches the right result. The useful signal is the agreement between the reasoning path and the final answer.
+This project builds a practice loop around that signal. The model first works on problems with known answers, gets feedback on both the chain of reasoning and the final result, and only then starts generating new challenges for itself. The constraint is intentionally small: a 1.5B math model.
+## The Environment
+The environment is a practice loop for math reasoning. Each training group starts with one problem, asks the model for multiple solution attempts, scores those attempts from several angles, and uses GRPO to reinforce the attempts that are stronger than the rest of the group.
+![AxiomForgeAI environment overview](images/environment_overview.svg)
+The environment has two task sources:
+- **Grounded source:** A dataset problem from GSM8K / MATH comes with a known final answer. This gives the environment a reliable anchor for checking whether the model actually reached the right result.
+- **Self-play source:** The curriculum selects a target skill and difficulty. The model writes a new question, then samples multiple solutions to that question. This adds practice beyond static datasets, but only after the grounded signal is stable enough.
+Both sources feed the same scoring and update loop. For every selected problem, the model samples `K` candidate solutions. The environment checks final-answer correctness when a gold answer exists, scores reasoning quality with a PRM, checks chain consistency and symbolic arithmetic where possible, checks answer formatting, and scores self-generated questions for clarity, novelty, difficulty fit, and solvability.
+GRPO then compares the `K` attempts against each other. The model is not rewarded for a solution in isolation; the strongest attempt in the group becomes the direction for learning. Training starts grounded-only, gradually mixes in self-play groups, and falls back to grounded practice if generated-question quality or answer correctness drops.
+## How Self-Improvement Works
+Self-improvement comes from turning each problem into a small comparison. The model does not produce one solution and move on; the environment samples several attempts, scores each attempt, and asks which reasoning path was strongest.
+GRPO uses that within-group comparison as the learning signal. Attempts with correct answers, stronger reasoning chains, and cleaner final-answer format are reinforced. Attempts with broken chains or unsupported answers become weaker examples.
+```text
+practice -> sample attempts -> verify steps and answer -> compare -> reinforce -> adjust difficulty
+```
+## Reward System
+The reward is designed to avoid a common math-training failure: optimizing for either the final answer or the reasoning trace alone. A good solution should reach the right answer, explain the path clearly, and keep the final result consistent with the steps that produced it.
+| Signal | What it checks | Why it matters |
+| --- | --- | --- |
+| Final answer | Matches the gold answer when one exists | Keeps grounded problems tied to objective correctness |
+| Process score | PRM score over the reasoning steps | Rewards clear mathematical progress, not just the last line |
+| Chain consistency | Correct-prefix and step-answer consistency signals | Gives partial learning signal when a solution goes wrong midway |
+| Format | Parseable final answer and clean response structure | Makes automatic grading reliable |
+| Question quality | Topic fit, difficulty fit, clarity, novelty, and solvability | Keeps self-play from generating vague or useless practice tasks |
+Grounded problems use the gold answer as the anchor. Self-play problems add a question-quality score before the solution reward is trusted. Both paths produce one combined score for each sampled attempt, and GRPO uses those scores only in comparison with the other attempts from the same problem.
+```text
+grounded: answer correctness + process score + chain consistency + format
+self-play: question quality + solution quality
+both -> one combined score per attempt -> GRPO compares attempts within the group
+```
+## Training Phases
+Training follows a simple three-phase schedule. It starts with grounded-only practice so the model learns to keep answers and reasoning stable on problems with known solutions. Self-play is then introduced gradually, while grounded questions remain as an anchor. Once both are stable, training continues with a mixed task source and falls back to grounded-only batches if answer quality drops.
+![Training phases overview](images/training_phases.svg)
+## Training Script
+The GRPO training loop is available in two forms:
+- [`scripts/launch_grpo.sh`](scripts/launch_grpo.sh) — the primary launch script; sets CUDA/threading env vars, verifies Flash-Attention, and calls `run_grpo_training.py` with the full parameter set.
+  ```bash
+  bash scripts/launch_grpo.sh
+  ```
+- [`train_grpo.ipynb`](train_grpo.ipynb) — notebook version with the same parameters, structured around `env.reset / env.step / env.state / env.close` for interactive inspection.
+## Results
+These plots come from a single GPU training run and focus on the core question: did the model get better at making its reasoning and final answer agree?
+### Evaluation Quality Over Training
+![Evaluation quality over training](images/plot1_eval_quality.png)
+The environment tracks final correctness, solution quality, step validity, and how long the reasoning chain stays correct. All four move upward together, which suggests the model is not just finding better final answers. It is also producing reasoning that holds up longer.
+### Training Journey
+![Training journey across all 30 iterations](images/plot2_training_journey.png)
+Training starts with grounded practice on problems with known answers. Self-play is introduced only after the grounded signal is stable, so the model does not train on its own generated problems too early. The transition is conditional, not just a timer.
+### Self-Play Curriculum
+![Self-play curriculum ramp and question quality](images/plot3_selfplay_success.png)
+By the end of training, most practice came from self-play. The important part is that generated problems stayed solvable and novel even after self-play became a larger share of training. That makes the ramp meaningful: self-play added useful practice instead of recycled noise.
+### Reward Confidence
+![Reward confidence and skipped groups](images/plot4_reward_confidence.png)
+The reward spread shows how much contrast exists between the model's best and worst attempts. Wide spread gives GRPO something to learn from. Skipped groups are cases where attempts are too similar to compare usefully. That rate falls as harder material enters the curriculum, which suggests the comparison signal stays useful.
+### Step-Level Reasoning Quality
+![Step accuracy and LCCP across training](images/plot5_reasoning_quality.png)
+Step accuracy checks whether each line of reasoning is valid. Chain integrity checks whether those valid steps form an unbroken path to the answer. Both improve together, which means the model is building solutions that hold together more often instead of only producing better-looking outputs.
+## Why It Matters
+Reliable math reasoning needs more than fluent explanations or lucky final answers. A system that can separate correct reasoning from unsupported answers gives the model a better training target: not just "get the number," but build a chain of logic that reaches the number.
+AxiomForgeAI matters because it turns that target into an environment. The same pattern can extend beyond math to other verifiable domains where attempts can be checked, compared, and improved: code, logic, structured data transformations, and scientific problem solving.
+---
+*Engineered for the OpenEnv Hackathon India 2026*

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Axiomforgeai Environment."""
+from .client import AxiomforgeaiEnv
+from .models import AxiomforgeaiAction, AxiomforgeaiObservation
+__all__ = [
+    "AxiomforgeaiAction",
+    "AxiomforgeaiObservation",
+    "AxiomforgeaiEnv",
+]

blog.md ADDED Viewed

	@@ -0,0 +1,94 @@

+# AxiomForgeAI: Self-Improving Math Models Need More Than the Final Answer
+Math models have a strange failure mode.
+They can write a solution that looks careful, step-by-step, and confident, then end with the wrong answer. They can also produce the right final number with reasoning that is incomplete, inconsistent, or impossible to trust.
+For math, that gap matters. The final answer is not enough. A proof, derivation, or word-problem solution only becomes useful when the path and the answer support each other.
+AxiomForgeAI is built around that idea.
+Instead of treating math reasoning as a one-shot generation problem, AxiomForgeAI turns it into a practice environment. The model does not simply answer a question and move on. It attempts the same problem multiple ways, receives feedback on both the reasoning path and the final answer, and learns from the attempts where the two agree.
+## The Architecture
+![AxiomForgeAI architecture](./images/blog_flow/architecture.svg)
+AxiomForgeAI is a training loop around one simple idea: a math solution should be judged by whether the reasoning path and the final answer support each other.
+The environment first selects one task. It can come from a grounded dataset problem with a known answer, or from a self-play question written from a curriculum target. Only after that task is selected does the model sample `K` candidate solutions. The environment scores each attempt, and GRPO compares the attempts within that same problem group.
+That is the important part. The model is not rewarded for sounding fluent. It is rewarded when the chain of reasoning and the final answer line up.
+## Where Practice Comes From
+![Task sources](./images/blog_flow/task-sources.svg)
+The environment uses two sources of problems.
+Grounded practice starts with dataset problems from sources like GSM8K or MATH. These problems come with known final answers, so the environment has a reliable anchor for correctness.
+Self-play starts later. The curriculum selects a skill and difficulty, and the model writes a new question. That question is only useful if it is clear, solvable, on-topic, and appropriately difficult. This keeps self-play from becoming random problem generation.
+Both sources eventually become the same interface: one selected problem. From there, the model samples multiple candidate solutions and the environment compares the resulting reasoning paths.
+## What Gets Checked
+![Grading signals](./images/blog_flow/grading.svg)
+AxiomForgeAI does not rely on a single reward signal. A final answer check is useful, but it is not enough. A process score is useful, but it is also not enough. The environment combines several signals so that a polished but wrong solution does not look good, and a lucky answer with weak reasoning does not look good either.
+For grounded problems, the gold answer anchors correctness. For all attempts, the environment also looks at reasoning quality, chain consistency, symbolic arithmetic where possible, and whether the answer can be parsed cleanly. For self-play, the generated question itself is scored before the solution reward is trusted.
+The result is one score per attempt. That score is not the end of training. It becomes useful because there are other attempts for the same problem.
+## Why GRPO Fits
+![GRPO loop](./images/blog_flow/grpo-loop.svg)
+GRPO turns a problem into a small comparison game. The model samples several attempts for the same prompt. Some are wrong, some are partially right, and one may be clearly better because the answer follows from the steps.
+Instead of asking whether an attempt is good in isolation, GRPO asks which attempts are stronger relative to the rest of the group. That relative signal is exactly what this project needs. The model learns from contrast: this reasoning path held together better than the others.
+After the update, the improved model goes back into the environment for the next batch. The curriculum can keep it grounded, introduce more self-play, or fall back to grounded-only practice if quality drops.
+## Why the 1.5B Constraint Matters
+AxiomForgeAI is intentionally built around a compact math model.
+That constraint makes the loop easier to see. A smaller model cannot hide every reasoning mistake behind scale. If the setup is wrong, if the arithmetic drifts, or if the final answer does not follow from the steps, the environment has to catch it and turn it into feedback.
+The point is not that a compact model magically solves math. The point is that improvement has to come from better practice, better verification, and better selection of reasoning paths.
+## What the Model Learns From
+AxiomForgeAI rewards attempts that are mathematically useful, not just polished.
+The model learns to solve problems with reasoning that supports the answer. It also learns, during self-play, to generate practice problems that are worth solving. A useful self-generated problem should be clear, solvable, on-topic, appropriately difficult, and not just a duplicate of what the model has already seen.
+That makes the loop different from ordinary fine-tuning. The model is not only seeing more answers. It is practicing, being checked, and learning from the solution paths that survived verification.
+## Where Examples Will Go
+This section will include real model responses from the run.
+- an example where the model had good steps but a wrong final answer
+- an example where the model guessed correctly but the reasoning was weak
+- an example after training where the reasoning chain and final answer agree
+- a self-generated problem that passed the quality checks
+These examples are important because the project is not only about a metric. The clearest evidence is seeing the model become better at making the path and the answer line up.
+## Why This Matters
+Math is a good starting point because mistakes are often checkable. Arithmetic can be verified. Final answers can be compared. Reasoning steps can be scored. That makes math a clean domain for building self-improvement loops.
+But the pattern is bigger than math.
+Many useful AI tasks have the same structure. Generate an attempt, check it, compare it against alternatives, and reinforce the better path. Code, logic, structured data transformation, and scientific problem solving all benefit from environments where progress can be verified.
+AxiomForgeAI is one version of that pattern. It asks a simple question.
+> What if a model could practice until its reasoning and answers agreed?
+That is the loop this project builds.

client.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""AxiomForgeAI Math RL Environment Client."""
+from typing import Any, Dict, Optional
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from .models import AxiomforgeaiAction, AxiomforgeaiObservation
+class AxiomforgeaiEnv(
+    EnvClient[AxiomforgeaiAction, AxiomforgeaiObservation, State]
+):
+    """
+    Client for the AxiomForgeAI math RL environment.
+    Maintains a persistent WebSocket connection to the environment server.
+    Each client instance gets its own session with independent episode state.
+    Episode flow::
+        with AxiomforgeaiEnv(base_url="http://localhost:8000") as env:
+            # 1. Reset — receive a math question
+            result = env.reset()
+            question = result.observation.question
+            # 2. Step — submit a solution, receive reward + feedback
+            solution = "Step 1: ... Final Answer: 42"
+            result = env.step(AxiomforgeaiAction(solution=solution))
+            print(result.reward, result.observation.feedback)
+    Example with Docker::
+        client = AxiomforgeaiEnv.from_docker_image("axiomforgeai-env:latest")
+        try:
+            result = client.reset()
+            result = client.step(AxiomforgeaiAction(solution="Final Answer: 17"))
+        finally:
+            client.close()
+    """
+    def _step_payload(self, action: AxiomforgeaiAction) -> Dict[str, Any]:
+        """Convert AxiomforgeaiAction to JSON payload for the step endpoint."""
+        return {"solution": action.solution}
+    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AxiomforgeaiObservation]:
+        """Parse the server's step response into a StepResult."""
+        obs_data: Dict[str, Any] = payload.get("observation", {})
+        observation = AxiomforgeaiObservation(
+            question=obs_data.get("question", ""),
+            topic=obs_data.get("topic", ""),
+            difficulty=float(obs_data.get("difficulty", 0.5)),
+            feedback=obs_data.get("feedback", ""),
+            done=payload.get("done", False),
+            reward=payload.get("reward"),
+            metadata=obs_data.get("metadata"),
+        )
+        return StepResult(
+            observation=observation,
+            reward=payload.get("reward"),
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: Dict[str, Any]) -> State:
+        """Parse the server's state response into a State object."""
+        return State(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+        )

docs/environment-overview.puml ADDED Viewed

	@@ -0,0 +1,69 @@

+@startuml environment_overview
+!theme plain
+top to bottom direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 42
+skinparam ranksep 42
+title AxiomForgeAI - Phase-Controlled Math Reasoning Loop
+rectangle "Small Math Model\n1.5B parameters" as MODEL #DBEAFE
+rectangle "Phase Controller\nwarmup: grounded only\nramp: gradual self-play\ncontinuous: capped mix + fallback" as PHASE #E2E8F0
+rectangle "Task Source\nfor each GRPO group" as SELECT #E2E8F0
+rectangle "Grounded Source\nKnown-answer practice" as GLANE #ECFDF5 {
+    rectangle "Dataset problem\nGSM8K / MATH" as GQ #CCFBF1
+    rectangle "Gold answer\navailable" as GOLD #CCFBF1
+    rectangle "Model samples\nK solutions" as GSOL #CCFBF1
+}
+rectangle "Self-Play Source\nModel-made challenges" as SLANE #EEF2FF {
+    rectangle "Curriculum picks\nskill + difficulty" as CURRIC #E0E7FF
+    rectangle "Model writes\na new question" as SQ #E0E7FF
+    rectangle "Model samples\nK solutions" as SSOL #E0E7FF
+}
+rectangle "Shared Grading\nanswer, steps, arithmetic, format\n+ question quality for self-play" as GRADERS #F1F5F9
+rectangle "Group Comparison\nWhich attempts worked best?" as COMPARE #EDE9FE
+rectangle "GRPO Update\nReinforce stronger reasoning" as GRPO #DDD6FE
+rectangle "Improved Model\nfor the next round" as NEXT #DBEAFE
+MODEL -down-> PHASE
+PHASE -down-> SELECT
+note right of PHASE
+  sets mix
+end note
+SELECT -left-> GQ : grounded slot
+GQ --> GOLD
+GOLD --> GSOL
+SELECT -right-> CURRIC : self-play slot
+CURRIC --> SQ
+SQ --> SSOL
+GSOL -down-> GRADERS
+SSOL -down-> GRADERS
+GRADERS -right-> COMPARE
+COMPARE -right-> GRPO
+GRPO -right-> NEXT
+NEXT -up-> MODEL : repeat
+note bottom of SELECT
+  Each batch is randomly interleaved.
+  Phase 1 uses grounded only.
+  Later phases add self-play slots by ratio.
+end note
+@enduml

docs/reward-system.puml ADDED Viewed

	@@ -0,0 +1,51 @@

+@startuml reward_system
+!theme plain
+top to bottom direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 54
+skinparam ranksep 60
+title AxiomForgeAI - Reward System
+rectangle "Sampled Solution Attempt" as ATTEMPT #DBEAFE
+rectangle "Grounded Reward\nknown-answer problem" as GROUNDED #ECFDF5 {
+    rectangle "Final answer\nmatches gold" as GOLD #CCFBF1
+    rectangle "PRM process score\nreasoning quality" as GPRM #CCFBF1
+    rectangle "Chain consistency\ncorrect prefix + final check" as GCHAIN #CCFBF1
+    rectangle "Format score\nparseable final answer" as GFORMAT #CCFBF1
+}
+rectangle "Self-Play Reward\ngenerated challenge" as SELFPLAY #EEF2FF {
+    rectangle "Question quality\nclarity, novelty, solvability" as QUALITY #E0E7FF
+    rectangle "Solution quality\nPRM + chain checks" as SOLUTION #E0E7FF
+    rectangle "Format score\nparseable final answer" as SFORMAT #E0E7FF
+}
+rectangle "Combined Reward\none score per attempt" as SCORE #F1F5F9
+rectangle "GRPO Group Comparison\nrank attempts within the same problem" as COMPARE #EDE9FE
+rectangle "Step-Answer Alignment\nreward paths where reasoning supports the result" as ALIGN #DDD6FE
+ATTEMPT -left-> GROUNDED : grounded
+ATTEMPT -right-> SELFPLAY : self-play
+GOLD --> GPRM
+GPRM --> GCHAIN
+GCHAIN --> GFORMAT
+QUALITY --> SOLUTION
+SOLUTION --> SFORMAT
+GFORMAT -down-> SCORE
+SFORMAT -down-> SCORE
+SCORE -right-> COMPARE
+COMPARE -right-> ALIGN
+@enduml

docs/training-phases.puml ADDED Viewed

	@@ -0,0 +1,27 @@

+@startuml training_phases
+!theme plain
+left to right direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 42
+skinparam ranksep 42
+title AxiomForgeAI - Training Phases
+rectangle "Phase 1\nGrounded Only" as Warmup #ECFDF5
+rectangle "Phase 2\nSelf-Play Ramp" as Ramp #EEF2FF
+rectangle "Phase 3\nMixed Training" as Improve #F1F5F9
+rectangle "Fallback\nGrounded Recovery" as Fallback #EDE9FE
+Warmup --> Ramp
+Ramp --> Improve
+Improve --> Fallback : if quality drops
+Fallback --> Improve : recover
+@enduml

images/axiomforgeai_scenes/scene_01.svg ADDED Viewed

images/axiomforgeai_scenes/scene_02.svg ADDED Viewed

images/axiomforgeai_scenes/scene_03.svg ADDED Viewed

images/axiomforgeai_scenes/scene_04.svg ADDED Viewed

images/axiomforgeai_scenes/scene_05.svg ADDED Viewed

images/axiomforgeai_scenes/scene_06.svg ADDED Viewed

images/axiomforgeai_scenes/scene_07.svg ADDED Viewed

images/axiomforgeai_scenes/scene_08.svg ADDED Viewed

images/axiomforgeai_scenes/scene_09.svg ADDED Viewed

images/axiomforgeai_scenes/scene_10.svg ADDED Viewed

images/blog_flow/architecture.svg ADDED Viewed

images/blog_flow/grading.svg ADDED Viewed

images/blog_flow/grpo-loop.svg ADDED Viewed

images/blog_flow/task-sources.svg ADDED Viewed

images/environment_overview.svg ADDED Viewed

images/training_phases.svg ADDED Viewed

logs/grpo/grpo_20260426_024029.log ADDED Viewed

	@@ -0,0 +1,44 @@

+2026-04-26 02:40:33,617 INFO     __main__ - ======================================================================
+2026-04-26 02:40:33,617 INFO     __main__ - GRPO run: grpo_20260426_024029
+2026-04-26 02:40:33,617 INFO     __main__ - Checkpoints : checkpoints/grpo/grpo_20260426_024029
+2026-04-26 02:40:33,618 INFO     __main__ - Logs        : logs/grpo/grpo_20260426_024029
+2026-04-26 02:40:33,618 INFO     __main__ - Console log : logs/grpo/grpo_20260426_024029/console_output.log
+2026-04-26 02:40:33,618 INFO     __main__ - ======================================================================
+2026-04-26 02:40:33,736 INFO     src.utils.attn_backend - Attention backend selected: flash_attention_2
+2026-04-26 02:40:33,736 INFO     __main__ - Device: cuda:0 | attn: flash_attention_2
+2026-04-26 02:40:33,753 INFO     __main__ - GPU: NVIDIA A100 80GB PCIe | 85.1 GB VRAM | capability sm_80
+2026-04-26 02:40:33,753 INFO     __main__ - Run config: K=8 K_q=2 N=16 lr=5.0e-06 T=0.80 max_new=800 | clip_eps=0.20 kl_coef=0.0400 warmup=6 | diff_alpha=3.0 | self_play=70% grounded=30% | math_mix=30% math_maxdiff=3 | overlong_filter=True | eval_every=5 eval_N=100 | grad_clip=0.50 save_every=5 keep_last=3 | question_GRPO=ENABLED (K_q=2)
+2026-04-26 02:40:33,753 INFO     __main__ - Loading model from checkpoints/dual_task_v1 ...
+2026-04-26 02:40:34,405 INFO     __main__ - Tokenizer has no chat_template; loading from base model Qwen/Qwen2.5-Math-1.5B-Instruct
+2026-04-26 02:40:34,731 INFO     __main__ - Chat template loaded successfully.
+2026-04-26 02:40:34,731 INFO     __main__ - Detected PEFT adapter — loading base Qwen/Qwen2.5-Math-1.5B-Instruct then merging checkpoints/dual_task_v1
+2026-04-26 02:40:36,242 WARNING  __main__ - All parameters were frozen on load (PEFT merge_and_unload bug). Re-enabled requires_grad — any prior frozen runs were training nothing.
+2026-04-26 02:40:36,242 INFO     __main__ - Flash-Attn 2 active — gradient checkpointing OFF (Flash already gives O(T) attention memory).
+2026-04-26 02:40:36,243 INFO     __main__ - Trainable parameters: 1,543,714,304 / 1,543,714,304 (100.0%)
+2026-04-26 02:40:36,244 INFO     __main__ - Creating frozen reference policy (kl_coef=0.0400, ~3.1 GB VRAM)...
+2026-04-26 02:40:36,305 INFO     __main__ - Reference policy ready.
+2026-04-26 02:40:36,306 INFO     __main__ - LR schedule: 5.0e-06 warmup(6 iters) → cosine decay(24 iters, min=5.0e-07)
+2026-04-26 02:40:36,415 INFO     __main__ - Loaded 8792 QA pairs from data/sft/gsm8k_sft.jsonl
+2026-04-26 02:40:36,424 INFO     __main__ - Loaded 4072 MATH pairs from data/math/math_numeric.jsonl
+2026-04-26 02:40:36,424 INFO     __main__ - MATH mixing: 30% MATH (4072 problems) + 70% GSM8K (8792 problems)
+2026-04-26 02:40:36,424 INFO     src.rl.prm_scorer - Loading PRM Qwen/Qwen2.5-Math-PRM-7B (4-bit=True, dtype=torch.bfloat16) on cuda:0 …
+Some weights of the model checkpoint at Qwen/Qwen2.5-Math-PRM-7B were not used when initializing Qwen2ForProcessRewardModel: ['lm_head.weight']
+- This IS expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+2026-04-26 02:40:40,150 INFO     src.rl.prm_scorer - PRM ready.  GPU memory allocated: 9.97 GB  step_sep_id=151651
+2026-04-26 02:40:40,151 INFO     __main__ - PRM loaded: Qwen/Qwen2.5-Math-PRM-7B (4-bit)
+2026-04-26 02:40:40,154 INFO     src.rl.unified_accuracy - Extraction cache not found at data/extraction_cache.json — will build on first use
+2026-04-26 02:40:40,154 INFO     __main__ - Unified accuracy calculator ready (extractor=Qwen/Qwen2.5-0.5B-Instruct, cache=data/extraction_cache.json)
+2026-04-26 02:40:40,154 INFO     __main__ - Warming up step-chain extractor (eager load)...
+2026-04-26 02:40:40,154 INFO     src.rl.unified_accuracy - Loading step chain extractor: Qwen/Qwen2.5-0.5B-Instruct
+2026-04-26 02:40:41,033 INFO     src.rl.unified_accuracy - Step chain extractor loaded
+2026-04-26 02:40:41,034 INFO     __main__ - Extractor warmup complete
+2026-04-26 02:40:41,034 INFO     src.rl.llm_question_classifier - LLMQuestionClassifier ready  (model=Qwen2ForCausalLM, cache=10000, topics=24)
+2026-04-26 02:40:42,571 INFO     __main__ - Detected structured dataset (8792 records) — bootstrapping curriculum from skill_ids instead of keyword classifier.
+2026-04-26 02:40:42,575 INFO     src.rl.curriculum_manager - Curriculum bootstrapped from 8792 records across 1 topics
+2026-04-26 02:40:42,575 INFO     __main__ - ======================================================================
+2026-04-26 02:40:42,575 INFO     __main__ - INITIAL EVALUATION (Iteration 0)
+2026-04-26 02:40:42,575 INFO     __main__ - ======================================================================

logs/grpo/grpo_20260426_032827.log ADDED Viewed

The diff for this file is too large to render. See raw diff

logs/grpo/grpo_20260426_032827/config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "base_model": "checkpoints/dual_task_v1",
+  "output_dir": "checkpoints/grpo",
+  "gsm8k_data": "data/sft/gsm8k_sft.jsonl",
+  "eval_data_path": "data/sft/gsm8k_test.jsonl",
+  "num_iterations": 60,
+  "group_size": 10,
+  "q_group_size": 2,
+  "questions_per_iter": 20,
+  "learning_rate": 5e-06,
+  "max_new_tokens": 1000,
+  "temperature": 0.8,
+  "eval_every": 5,
+  "eval_max_samples": 150,
+  "eval_max_new_tokens": 1000,
+  "eval_pass_at_k": 0,
+  "use_prm": true,
+  "prm_model": "Qwen/Qwen2.5-Math-PRM-7B",
+  "skip_initial_eval": false,
+  "run_name": "grpo_20260426_032827",
+  "max_grad_norm": 0.5,
+  "kl_coef": 0.06,
+  "math_data": null,
+  "math_mix_ratio": 0.3,
+  "math_mix_ratio_late": 0.5,
+  "math_ramp_start": 18,
+  "math_max_difficulty": 3,
+  "clip_eps": 0.2,
+  "warmup_iters": 8,
+  "min_lr_ratio": 0.1,
+  "difficulty_alpha": 3.5,
+  "overlong_filter": true,
+  "save_every": 5,
+  "keep_last": 4,
+  "self_play_ratio": 0.7,
+  "min_warmup": 12,
+  "selfplay_gt_thresh": 0.65,
+  "selfplay_grounded_thresh": 0.65,
+  "selfplay_step_thresh": 0.68,
+  "selfplay_ramp_iters": 28,
+  "grounded_floor": 0.55,
+  "extractor_model": "Qwen/Qwen2.5-0.5B-Instruct",
+  "extraction_cache": "data/extraction_cache.json"
+}

logs/grpo/grpo_20260426_032827/console_output.log ADDED Viewed

The diff for this file is too large to render. See raw diff

logs/grpo/grpo_20260426_032827/metrics.csv ADDED Viewed

	@@ -0,0 +1,31 @@

+iteration,timestamp,loss,mean_reward,std_reward,batch_accuracy,grounded_acc,gt_match_rate,step_accuracy,lccp,n_groups,skipped_groups,n_sp_groups,sp_ratio,sp_suspended,training_phase,learning_rate,iter_time_s,q_reward,q_valid_rate,q_novelty,q_solvability,chain_prm_corr,chain_scoring_on,eval_combined,eval_correct_rt,eval_prm,eval_step_acc,eval_lccp,eval_format,eval_n_scored,eval_final_ans
+1,2026-04-26T03:38:38,0.000610,0.914309,0.163605,0.960000,0.960000,0.780000,0.894861,0.814111,12,8,0,0.000000,0,GROUNDED_ONLY,0.000001,127.637996,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+2,2026-04-26T03:41:58,-0.000034,0.847892,0.216018,0.914141,0.914141,0.651500,0.866692,0.765381,18,2,0,0.000000,0,GROUNDED_ONLY,0.000002,199.518393,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+3,2026-04-26T03:45:08,0.000366,0.896391,0.170699,0.954545,0.954545,0.707100,0.876898,0.765238,12,8,0,0.000000,0,GROUNDED_ONLY,0.000002,189.836063,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+4,2026-04-26T03:48:10,0.000942,0.865431,0.218756,0.893939,0.893939,0.732300,0.858504,0.764982,11,9,0,0.000000,0,GROUNDED_ONLY,0.000003,182.125475,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+5,2026-04-26T03:59:39,0.000081,0.856875,0.239487,0.884422,0.884422,0.693500,0.918500,0.843100,16,4,0,0.000000,0,GROUNDED_ONLY,0.000003,201.679190,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.919200,0.793300,0.903500,0.918500,0.843100,0.997700,150,0.793333
+6,2026-04-26T04:02:52,-0.000063,0.879253,0.215318,0.909548,0.909548,0.748700,0.884646,0.805897,12,8,0,0.000000,0,GROUNDED_ONLY,0.000004,193.350312,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+7,2026-04-26T04:06:20,0.001071,0.837888,0.223356,0.883249,0.883249,0.639600,0.813073,0.658069,14,6,0,0.000000,0,GROUNDED_ONLY,0.000004,208.223944,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+8,2026-04-26T04:09:11,-0.000257,0.875536,0.200109,0.895000,0.895000,0.690000,0.864722,0.747928,13,7,0,0.000000,0,GROUNDED_ONLY,0.000005,170.595953,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+9,2026-04-26T04:12:52,0.000060,0.906506,0.176914,0.964646,0.964646,0.803000,0.893573,0.817532,15,5,0,0.000000,0,GROUNDED_ONLY,0.000005,221.350669,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+10,2026-04-26T04:24:49,0.000425,0.880765,0.175501,0.954774,0.954774,0.683400,0.920500,0.842600,14,6,0,0.000000,0,GROUNDED_ONLY,0.000005,188.981772,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.919900,0.793300,0.906600,0.920500,0.842600,0.998000,150,0.793333
+11,2026-04-26T04:27:11,-0.000557,0.969814,0.098322,0.985000,0.985000,0.930000,0.966268,0.921810,8,12,0,0.000000,0,GROUNDED_ONLY,0.000005,141.966778,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+12,2026-04-26T04:30:09,0.000073,0.849274,0.212864,0.900000,0.900000,0.650000,0.820526,0.687272,14,6,0,0.000000,0,SELFPLAY_RAMP,0.000005,177.954757,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+13,2026-04-26T04:39:26,0.000268,0.898824,0.185992,0.930000,0.930000,0.780000,0.870960,0.788730,14,6,0,0.000000,0,SELFPLAY_RAMP,0.000005,556.185637,0.000000,0.000000,0.000000,0.000000,-0.040000,0,,,,,,,,
+14,2026-04-26T04:48:54,0.000496,0.855832,0.208499,0.952381,0.947368,0.673700,0.857607,0.747807,18,3,1,0.036000,0,SELFPLAY_RAMP,0.000005,568.400518,0.763000,1.000000,0.428900,1.000000,0.209000,0,,,,,,,,
+15,2026-04-26T05:06:28,0.000023,0.927972,0.167187,0.937799,0.931217,0.836000,0.924200,0.842400,12,9,1,0.071000,0,SELFPLAY_RAMP,0.000005,550.143772,0.721800,1.000000,0.458000,1.000000,0.079000,0,0.926200,0.800000,0.907200,0.924200,0.842400,1.000000,150,0.800000
+16,2026-04-26T05:16:04,0.000330,0.914605,0.172733,0.949772,0.938547,0.832400,0.895523,0.843899,15,7,2,0.107000,0,SELFPLAY_RAMP,0.000005,575.528946,0.787800,1.000000,0.447500,0.960000,0.089000,0,,,,,,,,
+17,2026-04-26T05:26:20,-0.000137,0.888123,0.195006,0.938326,0.916168,0.700600,0.855796,0.768235,20,3,3,0.143000,0,SELFPLAY_RAMP,0.000005,616.018573,0.798200,1.000000,0.461600,1.000000,-0.191000,0,,,,,,,,
+18,2026-04-26T05:35:30,0.000079,0.866401,0.178010,0.953975,0.943396,0.591200,0.830780,0.692011,19,5,4,0.179000,0,SELFPLAY_RAMP,0.000005,550.572628,0.739400,1.000000,0.452000,0.976200,0.021000,0,,,,,,,,
+19,2026-04-26T05:44:13,0.000151,0.891281,0.172665,0.953586,0.949045,0.764300,0.851398,0.756874,16,8,4,0.214000,0,SELFPLAY_RAMP,0.000005,522.428960,0.733100,1.000000,0.456400,0.972500,0.075000,0,,,,,,,,
+20,2026-04-26T06:02:54,0.000244,0.896291,0.177842,0.927711,0.906040,0.798700,0.925300,0.842800,18,7,5,0.250000,0,SELFPLAY_RAMP,0.000004,619.886349,0.770000,1.000000,0.474100,0.945000,-0.118000,0,0.923400,0.800000,0.905600,0.925300,0.842800,1.000000,150,0.800000
+21,2026-04-26T06:11:04,0.000192,0.841732,0.187981,0.923077,0.914286,0.735700,0.819504,0.693061,21,5,6,0.286000,0,SELFPLAY_RAMP,0.000004,490.366938,0.697200,1.000000,0.449300,0.962500,0.209000,0,,,,,,,,
+22,2026-04-26T06:21:16,0.000579,0.917519,0.124242,0.984314,0.985294,0.904400,0.964735,0.928489,20,6,6,0.321000,0,SELFPLAY_RAMP,0.000004,611.872286,0.699800,1.000000,0.457100,0.979000,0.145000,0,,,,,,,,
+23,2026-04-26T06:28:41,0.000614,0.920698,0.147419,0.977011,0.950820,0.803300,0.907500,0.847631,18,9,7,0.357000,0,SELFPLAY_RAMP,0.000004,444.320885,0.726000,1.000000,0.441200,0.988500,0.143000,0,,,,,,,,
+24,2026-04-26T06:36:32,-0.000213,0.879590,0.173313,0.935714,0.933333,0.791700,0.898819,0.812292,20,8,8,0.393000,0,SELFPLAY_RAMP,0.000004,471.698962,0.662100,1.000000,0.440800,0.968800,0.082000,0,,,,,,,,
+25,2026-04-26T06:53:36,0.000344,0.844528,0.208658,0.927336,0.853211,0.605500,0.919800,0.846800,28,1,9,0.429000,0,SELFPLAY_RAMP,0.000004,524.655717,0.647100,1.000000,0.439400,0.967200,0.127000,0,0.922100,0.793300,0.903400,0.919800,0.846800,1.000000,150,0.793333
+26,2026-04-26T07:02:06,0.000421,0.866649,0.179636,0.920415,0.926606,0.789000,0.889846,0.794302,26,3,9,0.464000,0,SELFPLAY_RAMP,0.000004,509.677450,0.679200,1.000000,0.448800,0.931700,0.065000,0,,,,,,,,
+27,2026-04-26T07:12:03,-0.000227,0.877934,0.162866,0.956376,0.939394,0.686900,0.861628,0.740657,25,5,10,0.500000,0,SELFPLAY_RAMP,0.000004,597.521238,0.683100,1.000000,0.458400,0.975900,0.067000,0,,,,,,,,
+28,2026-04-26T07:22:06,0.000042,0.869600,0.159154,0.941935,0.877778,0.655600,0.833443,0.618623,29,2,11,0.536000,0,SELFPLAY_RAMP,0.000004,603.099793,0.669300,1.000000,0.448900,0.983600,0.047000,0,,,,,,,,
+29,2026-04-26T07:31:46,0.000377,0.867441,0.170826,0.947020,0.892857,0.726200,0.867407,0.760394,28,3,11,0.571000,0,SELFPLAY_RAMP,0.000003,579.690467,0.649600,1.000000,0.442500,0.973900,0.123000,0,,,,,,,,
+30,2026-04-26T07:48:26,-0.000299,0.870581,0.160260,0.965517,0.950000,0.800000,0.923200,0.850000,27,5,12,0.607000,0,SELFPLAY_RAMP,0.000003,503.087982,0.676400,1.000000,0.456600,0.969900,0.099000,0,0.920400,0.793300,0.904400,0.923200,0.850000,1.000000,150,0.793333

logs/metrics.jsonl ADDED Viewed

	@@ -0,0 +1,31 @@

+{"iteration": 0, "accuracy": 0.9162, "combined_score": 0.9162, "step_accuracy": 0.9111, "lccp": 0.8392, "correct_rate": 0.7867, "prm_mean": 0.8988, "prm_final": 0.9275, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 118, "final_answer_accuracy": 0.7866666666666666}
+{"iteration": 1, "loss": 0.0006103356778718686, "mean_reward": 0.914308755129325, "std_reward": 0.1636050993381563, "batch_accuracy": 0.96, "grounded_accuracy": 0.96, "gt_match_rate": 0.78, "step_accuracy": 0.8948611111111111, "lccp": 0.8141111111111111, "n_groups": 12, "skipped_groups": 8, "learning_rate": 1.0625000000000002e-06, "iter_time_s": 127.63799649500288, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 2, "loss": -3.432962815471304e-05, "mean_reward": 0.8478923191518654, "std_reward": 0.2160182166583165, "batch_accuracy": 0.9141414141414141, "grounded_accuracy": 0.9141414141414141, "gt_match_rate": 0.6515, "step_accuracy": 0.8666916416916417, "lccp": 0.7653809153809155, "n_groups": 18, "skipped_groups": 2, "learning_rate": 1.6250000000000001e-06, "iter_time_s": 199.5183933188673, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 3, "loss": 0.0003658987698145211, "mean_reward": 0.8963912433066207, "std_reward": 0.17069859725714537, "batch_accuracy": 0.9545454545454546, "grounded_accuracy": 0.9545454545454546, "gt_match_rate": 0.7071, "step_accuracy": 0.876897947731281, "lccp": 0.765237694404361, "n_groups": 12, "skipped_groups": 8, "learning_rate": 2.1875000000000002e-06, "iter_time_s": 189.83606291818433, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 4, "loss": 0.0009415318305731158, "mean_reward": 0.8654313890820613, "std_reward": 0.21875612713334075, "batch_accuracy": 0.8939393939393939, "grounded_accuracy": 0.8939393939393939, "gt_match_rate": 0.7323, "step_accuracy": 0.8585036876703543, "lccp": 0.7649821628988295, "n_groups": 11, "skipped_groups": 9, "learning_rate": 2.7500000000000004e-06, "iter_time_s": 182.12547484994866, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 5, "loss": 8.118284122815567e-05, "mean_reward": 0.8568747993989829, "std_reward": 0.23948718740823036, "batch_accuracy": 0.8844221105527639, "grounded_accuracy": 0.8844221105527639, "gt_match_rate": 0.6935, "step_accuracy": 0.9185, "lccp": 0.8431, "n_groups": 16, "skipped_groups": 4, "learning_rate": 3.3125000000000005e-06, "iter_time_s": 201.67919013393112, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0, "accuracy": 0.9192, "combined_score": 0.9192, "correct_rate": 0.7933, "prm_mean": 0.9035, "prm_final": 0.9305, "sympy_mean": 0.0, "format_mean": 0.9977, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 6, "loss": -6.271734067316477e-05, "mean_reward": 0.8792530329566163, "std_reward": 0.21531797453446344, "batch_accuracy": 0.9095477386934674, "grounded_accuracy": 0.9095477386934674, "gt_match_rate": 0.7487, "step_accuracy": 0.8846455219822055, "lccp": 0.8058971263242619, "n_groups": 12, "skipped_groups": 8, "learning_rate": 3.875e-06, "iter_time_s": 193.35031225602143, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 7, "loss": 0.0010708057920315436, "mean_reward": 0.8378877251545859, "std_reward": 0.2233563664223874, "batch_accuracy": 0.883248730964467, "grounded_accuracy": 0.883248730964467, "gt_match_rate": 0.6396, "step_accuracy": 0.8130725309659319, "lccp": 0.6580686304671076, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.4375e-06, "iter_time_s": 208.22394350194372, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 8, "loss": -0.0002566667799678376, "mean_reward": 0.8755362041151912, "std_reward": 0.20010863742401203, "batch_accuracy": 0.895, "grounded_accuracy": 0.895, "gt_match_rate": 0.69, "step_accuracy": 0.8647215007215007, "lccp": 0.7479280303030303, "n_groups": 13, "skipped_groups": 7, "learning_rate": 5e-06, "iter_time_s": 170.59595341305248, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 9, "loss": 5.9516330460004004e-05, "mean_reward": 0.906506146327221, "std_reward": 0.1769136401553803, "batch_accuracy": 0.9646464646464646, "grounded_accuracy": 0.9646464646464646, "gt_match_rate": 0.803, "step_accuracy": 0.8935726310726311, "lccp": 0.8175324675324676, "n_groups": 15, "skipped_groups": 5, "learning_rate": 4.995894997002465e-06, "iter_time_s": 221.35066892812029, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 10, "loss": 0.0004252615440886335, "mean_reward": 0.8807654454859567, "std_reward": 0.17550108931309533, "batch_accuracy": 0.9547738693467337, "grounded_accuracy": 0.9547738693467337, "gt_match_rate": 0.6834, "step_accuracy": 0.9205, "lccp": 0.8426, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.983594966720622e-06, "iter_time_s": 188.98177218902856, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0, "accuracy": 0.9199, "combined_score": 0.9199, "correct_rate": 0.7933, "prm_mean": 0.9066, "prm_final": 0.9408, "sympy_mean": 0.0, "format_mean": 0.998, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 11, "loss": -0.0005566358695432427, "mean_reward": 0.9698135460130081, "std_reward": 0.0983216960471261, "batch_accuracy": 0.985, "grounded_accuracy": 0.985, "gt_match_rate": 0.93, "step_accuracy": 0.9662678571428571, "lccp": 0.9218095238095237, "n_groups": 8, "skipped_groups": 12, "learning_rate": 4.963144790631074e-06, "iter_time_s": 141.96677790791728, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 12, "loss": 7.270745637859883e-05, "mean_reward": 0.8492740230597824, "std_reward": 0.2128636238290247, "batch_accuracy": 0.9, "grounded_accuracy": 0.9, "gt_match_rate": 0.65, "step_accuracy": 0.8205257936507937, "lccp": 0.6872718253968253, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.934619089208618e-06, "iter_time_s": 177.9547567779664, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 13, "loss": 0.00026773045517204864, "mean_reward": 0.8988236995312778, "std_reward": 0.18599151493605476, "batch_accuracy": 0.93, "grounded_accuracy": 0.93, "gt_match_rate": 0.78, "step_accuracy": 0.8709603174603174, "lccp": 0.7887301587301587, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.898121949644228e-06, "iter_time_s": 556.1856374200433, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.04, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 14, "loss": 0.0004961729192069066, "mean_reward": 0.8558324048863098, "std_reward": 0.20849902292009304, "batch_accuracy": 0.9523809523809523, "grounded_accuracy": 0.9473684210526315, "gt_match_rate": 0.6737, "step_accuracy": 0.8576065162907268, "lccp": 0.7478070175438597, "n_groups": 18, "skipped_groups": 3, "learning_rate": 4.853786546042184e-06, "iter_time_s": 568.4005180909298, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.036, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.209, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 1, "q_gen_attempts": 1, "q_gen_valid": 1, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.763, "q_quality_rate": 1.0, "q_topic_match": 0.575, "q_difficulty_fit": 0.89, "q_clarity": 1.0, "q_novelty": 0.4289, "q_solvability": 1.0}
+{"iteration": 15, "loss": 2.3262581635208335e-05, "mean_reward": 0.927972135586315, "std_reward": 0.16718736928397065, "batch_accuracy": 0.937799043062201, "grounded_accuracy": 0.9312169312169312, "gt_match_rate": 0.836, "step_accuracy": 0.9242, "lccp": 0.8424, "n_groups": 12, "skipped_groups": 9, "learning_rate": 4.801774653482204e-06, "iter_time_s": 550.1437717408407, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.071, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.079, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 1, "q_gen_attempts": 1, "q_gen_valid": 1, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7218, "q_quality_rate": 1.0, "q_topic_match": 0.35, "q_difficulty_fit": 0.9511, "q_clarity": 1.0, "q_novelty": 0.458, "q_solvability": 1.0, "accuracy": 0.9262, "combined_score": 0.9262, "correct_rate": 0.8, "prm_mean": 0.9072, "prm_final": 0.9404, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 120, "final_answer_accuracy": 0.8}
+{"iteration": 16, "loss": 0.0003296181123005226, "mean_reward": 0.9146047620088099, "std_reward": 0.17273258044260062, "batch_accuracy": 0.9497716894977168, "grounded_accuracy": 0.9385474860335196, "gt_match_rate": 0.8324, "step_accuracy": 0.8955234709424654, "lccp": 0.8438994897095455, "n_groups": 15, "skipped_groups": 7, "learning_rate": 4.742276057719723e-06, "iter_time_s": 575.5289459908381, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.107, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.089, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 2, "q_gen_attempts": 2, "q_gen_valid": 2, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7878, "q_quality_rate": 1.0, "q_topic_match": 0.875, "q_difficulty_fit": 0.5838, "q_clarity": 1.0, "q_novelty": 0.4475, "q_solvability": 0.96}
+{"iteration": 17, "loss": -0.00013719029248022708, "mean_reward": 0.8881227328092163, "std_reward": 0.1950058307020988, "batch_accuracy": 0.9383259911894273, "grounded_accuracy": 0.9161676646706587, "gt_match_rate": 0.7006, "step_accuracy": 0.8557955517536356, "lccp": 0.7682349586541203, "n_groups": 20, "skipped_groups": 3, "learning_rate": 4.675507862678258e-06, "iter_time_s": 616.0185732548125, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.143, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.191, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 3, "q_gen_attempts": 3, "q_gen_valid": 3, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7982, "q_quality_rate": 1.0, "q_topic_match": 0.69, "q_difficulty_fit": 0.8892, "q_clarity": 1.0, "q_novelty": 0.4616, "q_solvability": 1.0}
+{"iteration": 18, "loss": 7.917114673641903e-05, "mean_reward": 0.8664005137011263, "std_reward": 0.178010205898339, "batch_accuracy": 0.9539748953974896, "grounded_accuracy": 0.9433962264150944, "gt_match_rate": 0.5912, "step_accuracy": 0.830780173704702, "lccp": 0.6920110811620246, "n_groups": 19, "skipped_groups": 5, "learning_rate": 4.601713698260728e-06, "iter_time_s": 550.572628196096, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.179, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.021, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 4, "q_gen_attempts": 4, "q_gen_valid": 4, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7394, "q_quality_rate": 1.0, "q_topic_match": 0.6375, "q_difficulty_fit": 0.6293, "q_clarity": 1.0, "q_novelty": 0.452, "q_solvability": 0.9762}
+{"iteration": 19, "loss": 0.00015087392284840462, "mean_reward": 0.8912812767256229, "std_reward": 0.1726645221785555, "batch_accuracy": 0.9535864978902954, "grounded_accuracy": 0.9490445859872612, "gt_match_rate": 0.7643, "step_accuracy": 0.8513975055376328, "lccp": 0.7568744772566428, "n_groups": 16, "skipped_groups": 8, "learning_rate": 4.521162831370364e-06, "iter_time_s": 522.4289600129705, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.214, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.075, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 4, "q_gen_attempts": 4, "q_gen_valid": 4, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7331, "q_quality_rate": 1.0, "q_topic_match": 0.4813, "q_difficulty_fit": 0.8466, "q_clarity": 1.0, "q_novelty": 0.4564, "q_solvability": 0.9725}
+{"iteration": 20, "loss": 0.00024373266084391312, "mean_reward": 0.8962914079724992, "std_reward": 0.1778417367801085, "batch_accuracy": 0.927710843373494, "grounded_accuracy": 0.9060402684563759, "gt_match_rate": 0.7987, "step_accuracy": 0.9253, "lccp": 0.8428, "n_groups": 18, "skipped_groups": 7, "learning_rate": 4.434149183384978e-06, "iter_time_s": 619.8863487117924, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.25, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.118, "extraction_success_rate": 0.96, "chain_scoring_active": 0, "n_self_play_groups": 5, "q_gen_attempts": 5, "q_gen_valid": 5, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.77, "q_quality_rate": 1.0, "q_topic_match": 0.723, "q_difficulty_fit": 0.703, "q_clarity": 1.0, "q_novelty": 0.4741, "q_solvability": 0.945, "accuracy": 0.9234, "combined_score": 0.9234, "correct_rate": 0.8, "prm_mean": 0.9056, "prm_final": 0.9353, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 120, "final_answer_accuracy": 0.8}
+{"iteration": 21, "loss": 0.0001916794737033862, "mean_reward": 0.8417323480901788, "std_reward": 0.1879809468583581, "batch_accuracy": 0.9230769230769231, "grounded_accuracy": 0.9142857142857143, "gt_match_rate": 0.7357, "step_accuracy": 0.8195039682539682, "lccp": 0.6930612244897959, "n_groups": 21, "skipped_groups": 5, "learning_rate": 4.340990257669732e-06, "iter_time_s": 490.36693838005885, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.286, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.209, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 6, "q_gen_attempts": 6, "q_gen_valid": 6, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6972, "q_quality_rate": 1.0, "q_topic_match": 0.5742, "q_difficulty_fit": 0.4754, "q_clarity": 1.0, "q_novelty": 0.4493, "q_solvability": 0.9625}
+{"iteration": 22, "loss": 0.000578732604299148, "mean_reward": 0.9175190043251262, "std_reward": 0.12424225720214971, "batch_accuracy": 0.984313725490196, "grounded_accuracy": 0.9852941176470589, "gt_match_rate": 0.9044, "step_accuracy": 0.9647345301757068, "lccp": 0.9284886681945506, "n_groups": 20, "skipped_groups": 6, "learning_rate": 4.2420259810417895e-06, "iter_time_s": 611.8722857821267, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.321, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.145, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 6, "q_gen_attempts": 6, "q_gen_valid": 6, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6998, "q_quality_rate": 1.0, "q_topic_match": 0.6189, "q_difficulty_fit": 0.3856, "q_clarity": 1.0, "q_novelty": 0.4571, "q_solvability": 0.979}
+{"iteration": 23, "loss": 0.0006137362383419208, "mean_reward": 0.9206978778568132, "std_reward": 0.14741914089456262, "batch_accuracy": 0.9770114942528736, "grounded_accuracy": 0.9508196721311475, "gt_match_rate": 0.8033, "step_accuracy": 0.9075003548364204, "lccp": 0.847631466893762, "n_groups": 18, "skipped_groups": 9, "learning_rate": 4.137617463414222e-06, "iter_time_s": 444.32088500098325, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.357, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.143, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 7, "q_gen_attempts": 7, "q_gen_valid": 7, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.726, "q_quality_rate": 1.0, "q_topic_match": 0.5621, "q_difficulty_fit": 0.6634, "q_clarity": 1.0, "q_novelty": 0.4412, "q_solvability": 0.9885}
+{"iteration": 24, "loss": -0.00021296025724950595, "mean_reward": 0.8795895609748888, "std_reward": 0.1733128827089799, "batch_accuracy": 0.9357142857142857, "grounded_accuracy": 0.9333333333333333, "gt_match_rate": 0.7917, "step_accuracy": 0.8988194444444446, "lccp": 0.8122916666666666, "n_groups": 20, "skipped_groups": 8, "learning_rate": 4.0281456801451e-06, "iter_time_s": 471.6989622868132, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.393, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.082, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 8, "q_gen_attempts": 8, "q_gen_valid": 8, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6621, "q_quality_rate": 1.0, "q_topic_match": 0.5344, "q_difficulty_fit": 0.3108, "q_clarity": 1.0, "q_novelty": 0.4408, "q_solvability": 0.9688}
+{"iteration": 25, "loss": 0.0003441530472758002, "mean_reward": 0.8445275205076134, "std_reward": 0.20865777545087066, "batch_accuracy": 0.9273356401384083, "grounded_accuracy": 0.8532110091743119, "gt_match_rate": 0.6055, "step_accuracy": 0.9198, "lccp": 0.8468, "n_groups": 28, "skipped_groups": 1, "learning_rate": 3.9140100818997275e-06, "iter_time_s": 524.655717118876, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.429, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.127, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 9, "q_gen_attempts": 9, "q_gen_valid": 9, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6471, "q_quality_rate": 1.0, "q_topic_match": 0.505, "q_difficulty_fit": 0.2634, "q_clarity": 1.0, "q_novelty": 0.4394, "q_solvability": 0.9672, "accuracy": 0.9221, "combined_score": 0.9221, "correct_rate": 0.7933, "prm_mean": 0.9034, "prm_final": 0.9329, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 26, "loss": 0.0004209962865808428, "mean_reward": 0.8666489827432893, "std_reward": 0.1796360842988206, "batch_accuracy": 0.9204152249134948, "grounded_accuracy": 0.926605504587156, "gt_match_rate": 0.789, "step_accuracy": 0.8898463666812292, "lccp": 0.7943024610455803, "n_groups": 26, "skipped_groups": 3, "learning_rate": 3.795627137098479e-06, "iter_time_s": 509.6774504878558, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.464, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.065, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 9, "q_gen_attempts": 9, "q_gen_valid": 9, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6792, "q_quality_rate": 1.0, "q_topic_match": 0.6639, "q_difficulty_fit": 0.2476, "q_clarity": 1.0, "q_novelty": 0.4488, "q_solvability": 0.9317}
+{"iteration": 27, "loss": -0.00022697661013808103, "mean_reward": 0.877933982604161, "std_reward": 0.1628662024521015, "batch_accuracy": 0.9563758389261745, "grounded_accuracy": 0.9393939393939394, "gt_match_rate": 0.6869, "step_accuracy": 0.8616281866281865, "lccp": 0.7406565656565657, "n_groups": 25, "skipped_groups": 5, "learning_rate": 3.673428812268702e-06, "iter_time_s": 597.5212381640449, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.5, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.067, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 10, "q_gen_attempts": 10, "q_gen_valid": 10, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6831, "q_quality_rate": 1.0, "q_topic_match": 0.5699, "q_difficulty_fit": 0.3583, "q_clarity": 1.0, "q_novelty": 0.4584, "q_solvability": 0.9759}
+{"iteration": 28, "loss": 4.199455770111822e-05, "mean_reward": 0.8695997487614422, "std_reward": 0.15915376074701193, "batch_accuracy": 0.9419354838709677, "grounded_accuracy": 0.8777777777777778, "gt_match_rate": 0.6556, "step_accuracy": 0.8334434828062279, "lccp": 0.6186230200445887, "n_groups": 29, "skipped_groups": 2, "learning_rate": 3.5478609958457035e-06, "iter_time_s": 603.0997926741838, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.536, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.047, "extraction_success_rate": 0.8, "chain_scoring_active": 0, "n_self_play_groups": 11, "q_gen_attempts": 11, "q_gen_valid": 11, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6693, "q_quality_rate": 1.0, "q_topic_match": 0.5931, "q_difficulty_fit": 0.23, "q_clarity": 1.0, "q_novelty": 0.4489, "q_solvability": 0.9836}
+{"iteration": 29, "loss": 0.0003765096731578004, "mean_reward": 0.8674408392873937, "std_reward": 0.17082623284979875, "batch_accuracy": 0.9470198675496688, "grounded_accuracy": 0.8928571428571429, "gt_match_rate": 0.7262, "step_accuracy": 0.8674065194639727, "lccp": 0.7603936306964257, "n_groups": 28, "skipped_groups": 3, "learning_rate": 3.419381871174205e-06, "iter_time_s": 579.6904674370307, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.571, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.123, "extraction_success_rate": 0.84, "chain_scoring_active": 0, "n_self_play_groups": 11, "q_gen_attempts": 11, "q_gen_valid": 11, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6496, "q_quality_rate": 1.0, "q_topic_match": 0.5636, "q_difficulty_fit": 0.1695, "q_clarity": 1.0, "q_novelty": 0.4425, "q_solvability": 0.9739}
+{"iteration": 30, "loss": -0.00029927124827130075, "mean_reward": 0.8705812118012987, "std_reward": 0.16025951815561293, "batch_accuracy": 0.9655172413793104, "grounded_accuracy": 0.95, "gt_match_rate": 0.8, "step_accuracy": 0.9232, "lccp": 0.85, "n_groups": 27, "skipped_groups": 5, "learning_rate": 3.2884602446470037e-06, "iter_time_s": 503.08798154001124, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.607, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.099, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 12, "q_gen_attempts": 12, "q_gen_valid": 12, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6764, "q_quality_rate": 1.0, "q_topic_match": 0.6752, "q_difficulty_fit": 0.1485, "q_clarity": 1.0, "q_novelty": 0.4566, "q_solvability": 0.9699, "accuracy": 0.9204, "combined_score": 0.9204, "correct_rate": 0.7933, "prm_mean": 0.9044, "prm_final": 0.9289, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}

models.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Data models for the AxiomForgeAI math RL environment.
+The AxiomForgeAI environment presents math questions drawn from an adaptive
+curriculum; external agents submit step-by-step solutions and receive scored
+observations.  The environment integrates with the GRPO training pipeline
+defined in scripts/run_grpo_training.py.
+"""
+from openenv.core.env_server.types import Action, Observation
+from pydantic import Field
+class AxiomforgeaiAction(Action):
+    """Action for the AxiomForgeAI math environment.
+    The agent submits a step-by-step solution to the current question.
+    Solutions should follow the format::
+        Step 1: <reasoning>
+        Step 2: <reasoning>
+        ...
+        Final Answer: <numeric value>
+    """
+    solution: str = Field(
+        default="",
+        description=(
+            "Step-by-step solution to the current math question. "
+            "Use 'Step N: ...' lines and end with 'Final Answer: <value>'."
+        ),
+    )
+class AxiomforgeaiObservation(Observation):
+    """Observation from the AxiomForgeAI math environment.
+    On reset the question is populated and reward/feedback are empty.
+    After a step the reward and feedback reflect the quality of the submitted
+    solution; done=True signals the end of the single-step episode.
+    """
+    question: str = Field(
+        default="",
+        description="Math question the agent must solve.",
+    )
+    topic: str = Field(
+        default="",
+        description="Mathematical topic of the question (e.g. 'algebra', 'geometry').",
+    )
+    difficulty: float = Field(
+        default=0.5,
+        description="Estimated difficulty of the question in [0, 1].",
+    )
+    feedback: str = Field(
+        default="",
+        description=(
+            "Human-readable feedback on the submitted solution "
+            "(empty on reset, populated after step)."
+        ),
+    )

openenv.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+spec_version: 1
+name: AxiomForgeAI
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

pyproject.toml ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-AxiomForgeAI"
+version = "0.1.0"
+description = "Axiomforgeai environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+    # install from github
+    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+    "openenv-core[core]>=0.2.2",
+    # Environment-specific dependencies
+    # Add all dependencies needed for your environment here
+    # Examples:
+    # "numpy>=1.19.0",
+    # "torch>=2.0.0",
+    # "gymnasium>=0.29.0",
+    # "openspiel>=1.0.0",
+    # "smolagents>=1.22.0,<2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m AxiomForgeAI.server.app
+server = "AxiomForgeAI.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = [
+    "AxiomForgeAI",
+    "AxiomForgeAI.server",
+    "src",
+    "src.config",
+    "src.rl",
+    "src.sft",
+    "src.utils",
+    "src.self_play",
+    "scripts",
+]
+package-dir = { "AxiomForgeAI" = ".", "AxiomForgeAI.server" = "server" }

requirements.txt ADDED Viewed

	@@ -0,0 +1,160 @@

+accelerate==1.2.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.5
+aiohttp-cors==0.8.1
+aiosignal==1.4.0
+airportsdata==20260315
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+astor==0.8.1
+attrs==26.1.0
+bitsandbytes==0.44.1
+blake3==1.0.8
+certifi==2026.4.22
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.2
+cloudpickle==3.1.2
+colorful==0.5.8
+compressed-tensors==0.9.0
+cryptography==46.0.7
+datasets==3.2.0
+depyf==0.18.0
+dill==0.3.8
+diskcache==5.6.3
+distlib==0.4.0
+distro==1.9.0
+einops==0.8.2
+fastapi==0.136.0
+filelock==3.29.0
+frozenlist==1.8.0
+fsspec==2024.9.0
+gguf==0.10.0
+google-api-core==2.30.3
+google-auth==2.49.2
+googleapis-common-protos==1.74.0
+grpcio==1.80.0
+h11==0.16.0
+hf-xet==1.4.3
+hjson==3.1.0
+httpcore==1.0.9
+httptools==0.7.1
+httpx==0.28.1
+huggingface-hub==0.36.2
+idna==3.12
+importlib-metadata==9.0.0
+interegular==0.3.3
+jinja2==3.1.6
+jiter==0.14.0
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+lark==1.2.2
+linkify-it-py==2.1.0
+lm-format-enforcer==0.10.12
+markdown-it-py==4.0.0
+markupsafe==3.0.3
+mdit-py-plugins==0.5.0
+mdurl==0.1.2
+memray==1.19.3
+mistral-common==1.11.0
+mpmath==1.3.0
+msgpack==1.1.2
+msgspec==0.21.1
+multidict==6.7.1
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+networkx==3.6.1
+ninja==1.13.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-ml-py==13.595.45
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==2.32.0
+opencensus==0.11.4
+opencensus-context==0.1.3
+opencv-python-headless==4.11.0.86
+outlines==0.1.11
+outlines-core==0.1.26
+packaging==26.1
+pandas==3.0.2
+partial-json-parser==0.2.1.1.post7
+peft==0.19.1
+pillow==12.2.0
+platformdirs==4.9.6
+prometheus-client==0.25.0
+prometheus-fastapi-instrumentator==7.1.0
+propcache==0.4.1
+proto-plus==1.27.2
+protobuf==7.34.1
+psutil==7.2.2
+py-cpuinfo==9.0.0
+py-spy==0.4.1
+pyarrow==24.0.0
+pyasn1==0.6.3
+pyasn1-modules==0.4.2
+pycountry==26.2.16
+pycparser==3.0
+pydantic==2.13.3
+pydantic-core==2.46.3
+pydantic-extra-types==2.11.1
+pygments==2.20.0
+python-dateutil==2.9.0.post0
+python-discovery==1.2.2
+python-dotenv==1.2.2
+pyyaml==6.0.3
+pyzmq==27.1.0
+ray==2.39.0
+referencing==0.37.0
+regex==2026.4.4
+requests==2.33.1
+rich==15.0.0
+rpds-py==0.30.0
+safetensors==0.7.0
+scipy>=1.14.0
+sentencepiece==0.2.1
+setuptools==82.0.1
+six==1.17.0
+smart-open==7.6.0
+sniffio==1.3.1
+starlette==0.52.1
+sympy==1.13.1
+textual==8.2.4
+tiktoken==0.12.0
+tokenizers==0.20.3
+torch==2.5.1
+torchaudio==2.5.1
+torchvision==0.20.1
+tqdm==4.67.3
+transformers==4.46.3
+triton==3.1.0
+trl==0.12.1
+typing-extensions==4.15.0
+typing-inspection==0.4.2
+uc-micro-py==2.0.0
+urllib3==2.6.3
+uvicorn==0.45.0
+uvloop==0.22.1
+virtualenv==21.2.4
+vllm==0.7.0
+watchfiles==1.1.1
+websockets==16.0
+wrapt==2.1.2
+xformers==0.0.28.post3
+xgrammar==0.1.33
+xxhash==3.6.0
+yarl==1.23.0
+zipp==3.23.1
+matplotlib==3.10.9
+flash-attn==2.8.3
+gradio>=4.44.0

scripts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Training and evaluation scripts for math reasoning models."""

scripts/convert_gsm8k_to_sft.py ADDED Viewed

	@@ -0,0 +1,193 @@

+#!/usr/bin/env python3
+"""
+Convert OpenAI GSM8K to SFT JSONL aligned with MathAgent solver format:
+  Step 1: ...
+  Step 2: ...
+  ...
+  Final Answer: <integer>
+Each record uses a chat messages list for Qwen-style fine-tuning.
+Usage
+-----
+  # From Hugging Face (default; same data as in test.ipynb)
+  python scripts/convert_gsm8k_to_sft.py \\
+      --output data/sft/gsm8k_sft.jsonl \\
+      --splits train test
+  # From a saved JSONL with columns \"question\" and \"answer\" (GSM8K schema)
+  python scripts/convert_gsm8k_to_sft.py \\
+      --source jsonl \\
+      --input path/to/file.jsonl \\
+      --output data/sft/gsm8k_sft.jsonl
+Requires: pip install datasets (and datasets will pull pyarrow as needed)
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Any, Iterator
+# Keep in sync with src.agent.math_agent.SOLVER_SYSTEM_PROMPT
+SOLVER_SYSTEM_PROMPT = (
+    "You are a step-by-step math solver. "
+    "Solve the given problem one step at a time. "
+    "Each step must be on its own line, starting with 'Step N:'. "
+    "End with a line starting with 'Final Answer:'. "
+    "Write every mathematical expression in Python/SymPy syntax "
+    "so it can be verified programmatically."
+)
+USER_WRAPPER = (
+    "Solve the following problem. Show your reasoning as numbered steps, "
+    "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+def parse_gsm8k_answer(raw_answer: str) -> tuple[str, str]:
+    """
+    Split GSM8K 'answer' field into reasoning text and final integer string.
+    GSM8K ends solutions with a line like: #### 42
+    """
+    text = raw_answer.strip()
+    parts = re.split(r"\s*####\s*", text, maxsplit=1)
+    reasoning = parts[0].strip()
+    final = parts[1].strip() if len(parts) > 1 else ""
+    # Normalize final (sometimes extra whitespace or commas)
+    final = re.sub(r"[,\s]+", "", final)
+    final_match = re.search(r"-?\d+", final)
+    final_clean = final_match.group(0) if final_match else final
+    return reasoning, final_clean
+def reasoning_to_step_lines(reasoning: str) -> list[str]:
+    """Turn reasoning into non-empty lines; each line becomes one Step N:."""
+    lines: list[str] = []
+    for raw in reasoning.splitlines():
+        line = raw.strip()
+        if line:
+            lines.append(line)
+    if not lines:
+        # Rare: single blob without newlines — split on sentence boundaries lightly
+        blob = reasoning.strip()
+        if blob:
+            chunks = re.split(r"(?<=[.!?])\s+", blob)
+            lines = [c.strip() for c in chunks if c.strip()]
+    return lines
+def build_assistant_content(reasoning: str, final_answer: str) -> str:
+    lines = reasoning_to_step_lines(reasoning)
+    out_parts: list[str] = []
+    for i, line in enumerate(lines, start=1):
+        # Prefer SymPy-friendly numerics: ** not ^, ascii-friendly
+        cleaned = line.replace("^", "**")
+        out_parts.append(f"Step {i}: {cleaned}")
+    body = "\n".join(out_parts)
+    if final_answer:
+        body = f"{body}\nFinal Answer: {final_answer}" if body else f"Final Answer: {final_answer}"
+    return body
+def row_to_record(
+    question: str,
+    answer: str,
+    example_id: str,
+    split: str,
+) -> dict[str, Any] | None:
+    reasoning, final_answer = parse_gsm8k_answer(answer)
+    if not final_answer and "####" not in answer:
+        return None
+    assistant = build_assistant_content(reasoning, final_answer)
+    if not assistant.strip():
+        return None
+    user_content = USER_WRAPPER.format(question=question.strip())
+    return {
+        "id": f"gsm8k_{example_id}",
+        "skill_id": "gsm8k_grade_school",
+        "source": "openai/gsm8k",
+        "split": split,
+        "messages": [
+            {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+            {"role": "user", "content": user_content},
+            {"role": "assistant", "content": assistant},
+        ],
+        # Convenience for non-chat trainers
+        "text": f"<|system|>\n{SOLVER_SYSTEM_PROMPT}\n<|user|>\n{user_content}\n<|assistant|>\n{assistant}",
+    }
+def iter_hf_rows(dataset_name: str, config: str, splits: list[str]) -> Iterator[tuple[str, str, dict]]:
+    from datasets import load_dataset
+    ds = load_dataset(dataset_name, config)
+    for split in splits:
+        if split not in ds:
+            raise KeyError(f"Split {split!r} not in dataset. Available: {list(ds.keys())}")
+        for i, row in enumerate(ds[split]):
+            yield f"{split}_{i}", split, row
+def main() -> None:
+    p = argparse.ArgumentParser(description="Convert GSM8K to SFT JSONL (chat messages).")
+    p.add_argument(
+        "--source",
+        choices=("hf", "jsonl"),
+        default="hf",
+        help="Load from Hugging Face dataset or a local JSONL file.",
+    )
+    p.add_argument("--dataset", default="openai/gsm8k", help="HF dataset id when --source hf.")
+    p.add_argument("--config", default="main", help="HF config name when --source hf.")
+    p.add_argument("--splits", nargs="+", default=["train", "test"], help="HF splits to export.")
+    p.add_argument("--input", type=Path, help="Local JSONL path when --source jsonl.")
+    p.add_argument(
+        "--output",
+        type=Path,
+        default=Path("data/sft/gsm8k_sft.jsonl"),
+        help="Output JSONL path.",
+    )
+    args = p.parse_args()
+    if args.source == "jsonl" and not args.input:
+        raise SystemExit("--input is required when --source jsonl")
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    n_ok, n_skip = 0, 0
+    def process(example_id: str, split: str, row: dict) -> None:
+        nonlocal n_ok, n_skip
+        q = row.get("question", "")
+        a = row.get("answer", "")
+        rec = row_to_record(q, a, example_id, split)
+        if rec is None:
+            n_skip += 1
+            return
+        out_f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        n_ok += 1
+    with args.output.open("w", encoding="utf-8") as out_f:
+        if args.source == "hf":
+            for example_id, split, row in iter_hf_rows(args.dataset, args.config, args.splits):
+                process(example_id, split, row)
+        else:
+            for i, line in enumerate(args.input.open(encoding="utf-8")):
+                line = line.strip()
+                if not line:
+                    continue
+                row = json.loads(line)
+                process(str(i), "jsonl", row)
+    print(f"Wrote {n_ok} examples to {args.output} ({n_skip} skipped).")
+if __name__ == "__main__":
+    main()

scripts/create_dual_task_dataset.py ADDED Viewed

	@@ -0,0 +1,321 @@

+#!/usr/bin/env python3
+"""
+Create dual-task training dataset by mixing question-generation and solution-generation examples.
+This script:
+1. Loads existing solution data (GSM8K format)
+2. Loads question-generation data (synthetic)
+3. Adds task prefixes to distinguish tasks
+4. Mixes datasets according to specified ratio
+5. Shuffles and splits into train/validation
+Usage:
+    python scripts/create_dual_task_dataset.py \
+        --solution-data data/sft/gsm8k_sft.jsonl \
+        --question-data data/sft/question_generation.jsonl \
+        --output-train data/sft/dual_task_train.jsonl \
+        --output-val data/sft/dual_task_val.jsonl \
+        --mix-ratio 0.8 \
+        --val-split 0.1
+"""
+from __future__ import annotations
+import argparse
+import json
+import random
+import sys
+from pathlib import Path
+from typing import Any
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from src.config.prompts import SOLVE_TASK_PREFIX, GENERATE_TASK_PREFIX
+def load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Load JSONL file into list of records."""
+    records = []
+    with path.open(encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                records.append(json.loads(line))
+    return records
+def add_solve_prefix(record: dict[str, Any]) -> dict[str, Any]:
+    """
+    Add 'Solve Problem' task prefix to user message.
+    This signals the model to generate a step-by-step solution.
+    """
+    modified = record.copy()
+    modified["messages"] = []
+    for msg in record["messages"]:
+        new_msg = msg.copy()
+        if msg["role"] == "user":
+            # Add task prefix to user content
+            content = msg["content"]
+            if not content.startswith(SOLVE_TASK_PREFIX):
+                new_msg["content"] = SOLVE_TASK_PREFIX + content
+        modified["messages"].append(new_msg)
+    # Update text field if present
+    if "text" in modified:
+        # Find and update user section
+        text = modified["text"]
+        if "<|user|>" in text:
+            parts = text.split("<|user|>")
+            if len(parts) > 1:
+                user_part = parts[1]
+                if not user_part.strip().startswith(SOLVE_TASK_PREFIX):
+                    parts[1] = f"\n{SOLVE_TASK_PREFIX}" + user_part
+                    modified["text"] = "<|user|>".join(parts)
+    # Mark as solve task
+    modified["task_type"] = "solve"
+    return modified
+def verify_question_prefix(record: dict[str, Any]) -> dict[str, Any]:
+    """
+    Verify question generation record has proper prefix.
+    Should already have it from generation script, but double-check.
+    """
+    modified = record.copy()
+    modified["messages"] = []
+    for msg in record["messages"]:
+        new_msg = msg.copy()
+        if msg["role"] == "user":
+            content = msg["content"]
+            if not content.startswith(GENERATE_TASK_PREFIX):
+                new_msg["content"] = GENERATE_TASK_PREFIX + content
+        modified["messages"].append(new_msg)
+    # Update text field if present
+    if "text" in modified:
+        text = modified["text"]
+        if "<|user|>" in text:
+            parts = text.split("<|user|>")
+            if len(parts) > 1:
+                user_part = parts[1]
+                if not user_part.strip().startswith(GENERATE_TASK_PREFIX):
+                    parts[1] = f"\n{GENERATE_TASK_PREFIX}" + user_part
+                    modified["text"] = "<|user|>".join(parts)
+    # Mark as question generation task
+    modified["task_type"] = "generate"
+    return modified
+def sample_with_ratio(
+    solution_records: list[dict[str, Any]],
+    question_records: list[dict[str, Any]],
+    mix_ratio: float,
+    target_total: int | None = None,
+) -> list[dict[str, Any]]:
+    """
+    Sample and mix datasets according to specified ratio.
+    Args:
+        solution_records: Solution examples
+        question_records: Question generation examples
+        mix_ratio: Fraction of solutions in final dataset (0.8 = 80% solutions, 20% questions)
+        target_total: Target total examples (None = use all available data)
+    Returns:
+        Mixed dataset
+    """
+    n_solutions = len(solution_records)
+    n_questions = len(question_records)
+    if target_total is None:
+        # Use all available data
+        target_total = n_solutions + n_questions
+    # Calculate target counts
+    n_sol_target = int(target_total * mix_ratio)
+    n_q_target = target_total - n_sol_target
+    # Check availability
+    if n_sol_target > n_solutions:
+        print(f"Warning: Requested {n_sol_target} solutions but only {n_solutions} available.")
+        n_sol_target = n_solutions
+    if n_q_target > n_questions:
+        print(f"Warning: Requested {n_q_target} questions but only {n_questions} available.")
+        n_q_target = n_questions
+    # Sample
+    selected_solutions = random.sample(solution_records, n_sol_target)
+    selected_questions = random.sample(question_records, n_q_target)
+    print(f"Sampled {n_sol_target} solutions and {n_q_target} questions")
+    print(f"Actual ratio: {n_sol_target/(n_sol_target+n_q_target):.2%} solutions, "
+          f"{n_q_target/(n_sol_target+n_q_target):.2%} questions")
+    return selected_solutions + selected_questions
+def write_jsonl(records: list[dict[str, Any]], path: Path) -> None:
+    """Write records to JSONL file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for record in records:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Create dual-task training dataset from solution and question-generation examples."
+    )
+    parser.add_argument(
+        "--solution-data",
+        type=Path,
+        required=True,
+        help="Path to solution training data (GSM8K format)",
+    )
+    parser.add_argument(
+        "--question-data",
+        type=Path,
+        required=True,
+        help="Path to question-generation training data",
+    )
+    parser.add_argument(
+        "--output-train",
+        type=Path,
+        required=True,
+        help="Output path for training split",
+    )
+    parser.add_argument(
+        "--output-val",
+        type=Path,
+        required=True,
+        help="Output path for validation split",
+    )
+    parser.add_argument(
+        "--mix-ratio",
+        type=float,
+        default=0.8,
+        help="Fraction of solutions in mixed dataset (default: 0.8 = 80%% solutions)",
+    )
+    parser.add_argument(
+        "--val-split",
+        type=float,
+        default=0.1,
+        help="Fraction of data to use for validation (default: 0.1 = 10%%)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for reproducibility",
+    )
+    parser.add_argument(
+        "--max-total",
+        type=int,
+        default=None,
+        help="Maximum total examples to include (None = use all available)",
+    )
+    args = parser.parse_args()
+    # Validate inputs
+    if not args.solution_data.exists():
+        raise SystemExit(f"Error: Solution data not found at {args.solution_data}")
+    if not args.question_data.exists():
+        raise SystemExit(f"Error: Question data not found at {args.question_data}")
+    if not (0 < args.mix_ratio < 1):
+        raise SystemExit("Error: --mix-ratio must be between 0 and 1")
+    if not (0 < args.val_split < 1):
+        raise SystemExit("Error: --val-split must be between 0 and 1")
+    # Set random seed
+    random.seed(args.seed)
+    print("=" * 60)
+    print("Dual-Task Dataset Creation")
+    print("=" * 60)
+    # Load data
+    print("\n1. Loading data...")
+    print(f"   Solution data: {args.solution_data}")
+    solution_records = load_jsonl(args.solution_data)
+    print(f"   Loaded {len(solution_records)} solution examples")
+    print(f"   Question data: {args.question_data}")
+    question_records = load_jsonl(args.question_data)
+    print(f"   Loaded {len(question_records)} question-generation examples")
+    # Add task prefixes
+    print("\n2. Adding task prefixes...")
+    print("   Adding 'Solve Problem' prefix to solution examples...")
+    solution_records = [add_solve_prefix(r) for r in solution_records]
+    print("   Verifying 'Generate Question' prefix on question examples...")
+    question_records = [verify_question_prefix(r) for r in question_records]
+    # Mix datasets
+    print(f"\n3. Mixing datasets (ratio: {args.mix_ratio:.0%} solutions, {1-args.mix_ratio:.0%} questions)...")
+    mixed_records = sample_with_ratio(
+        solution_records=solution_records,
+        question_records=question_records,
+        mix_ratio=args.mix_ratio,
+        target_total=args.max_total,
+    )
+    # Shuffle
+    print(f"\n4. Shuffling {len(mixed_records)} total examples...")
+    random.shuffle(mixed_records)
+    # Split train/val
+    n_val = int(len(mixed_records) * args.val_split)
+    n_train = len(mixed_records) - n_val
+    train_records = mixed_records[:n_train]
+    val_records = mixed_records[n_train:]
+    print(f"\n5. Splitting data:")
+    print(f"   Training: {len(train_records)} examples ({len(train_records)/len(mixed_records):.1%})")
+    print(f"   Validation: {len(val_records)} examples ({len(val_records)/len(mixed_records):.1%})")
+    # Verify split composition
+    train_solve = sum(1 for r in train_records if r.get("task_type") == "solve")
+    train_gen = sum(1 for r in train_records if r.get("task_type") == "generate")
+    val_solve = sum(1 for r in val_records if r.get("task_type") == "solve")
+    val_gen = sum(1 for r in val_records if r.get("task_type") == "generate")
+    print(f"\n   Train composition:")
+    print(f"     Solve: {train_solve} ({train_solve/len(train_records):.1%})")
+    print(f"     Generate: {train_gen} ({train_gen/len(train_records):.1%})")
+    print(f"   Val composition:")
+    print(f"     Solve: {val_solve} ({val_solve/len(val_records):.1%})")
+    print(f"     Generate: {val_gen} ({val_gen/len(val_records):.1%})")
+    # Write outputs
+    print(f"\n6. Writing output files...")
+    print(f"   Training data: {args.output_train}")
+    write_jsonl(train_records, args.output_train)
+    print(f"   Validation data: {args.output_val}")
+    write_jsonl(val_records, args.output_val)
+    print("\n" + "=" * 60)
+    print("Dual-task dataset creation complete!")
+    print("=" * 60)
+    print(f"\nOutput files:")
+    print(f"  Train: {args.output_train} ({len(train_records)} examples)")
+    print(f"  Val:   {args.output_val} ({len(val_records)} examples)")
+    print(f"\nNext step: Train dual-task model using these files")
+if __name__ == "__main__":
+    main()

scripts/demo_before_after.py ADDED Viewed

	@@ -0,0 +1,591 @@

+"""Before / after demo — baseline vs GRPO-trained policy.
+Designed for hackathon judges: loads both models, runs greedy evaluation on
+a fixed problem set, and prints a clean side-by-side comparison with full
+solution text for the most interesting examples.
+Features
+--------
+* Handles all checkpoint types: HF model IDs, GRPO full-weight saves,
+  PEFT/LoRA adapter directories.
+* Automatically loads the chat template from the base model when the
+  checkpoint tokenizer doesn't have one (fixes the 0% accuracy bug that
+  silently swallows TemplateErrors).
+* Reads ``metrics.jsonl`` (if present) and prints the full accuracy curve,
+  showing judges the training progression at a glance.
+* Saves machine-readable JSON (for grading scripts) and prints a human-
+  readable Markdown table.
+* Shows full solution text for the best wins and worst regressions.
+Quick-start
+-----------
+After a GRPO run, point at ``best_policy/``::
+    python scripts/demo_before_after.py \\
+        --baseline-model checkpoints/dual_task_v1 \\
+        --trained-model  checkpoints/grpo/<run>/best_policy \\
+        --problems       data/sft/gsm8k_sft.jsonl \\
+        --max-samples    100
+Include the training curve::
+    python scripts/demo_before_after.py \\
+        --baseline-model checkpoints/dual_task_v1 \\
+        --trained-model  checkpoints/grpo/<run>/best_policy \\
+        --metrics-jsonl  checkpoints/grpo/<run>/metrics.jsonl \\
+        --problems       data/sft/gsm8k_sft.jsonl \\
+        --max-samples    100 \\
+        --records-out    results/demo.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import re
+import sys
+import time
+import types
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import torch
+from peft import PeftModel
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from src.sft.solution_format import extract_final_answer_numeric_str
+from src.utils.attn_backend import select_attn_implementation
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-8s %(name)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+_SEP = "=" * 78
+_SEP2 = "-" * 78
+# ---------------------------------------------------------------------------
+# Data
+# ---------------------------------------------------------------------------
+@dataclass
+class Problem:
+    question: str
+    gold_final: str
+def _parse_gold(answer: str) -> str:
+    m = re.search(r"####\s*([-0-9.,/ ]+)", answer)
+    if m:
+        return m.group(1).strip().replace(",", "")
+    return answer.strip().splitlines()[-1].strip()
+def _load_problems(path: Path, max_samples: int) -> List[Problem]:
+    """Accept GSM8K ``{question, answer}`` or SFT ``{messages}`` JSONL."""
+    out: List[Problem] = []
+    with path.open(encoding="utf-8") as fh:
+        for line in fh:
+            if max_samples > 0 and len(out) >= max_samples:
+                break
+            line = line.strip()
+            if not line:
+                continue
+            obj = json.loads(line)
+            if "question" in obj and "answer" in obj:
+                out.append(Problem(
+                    question=obj["question"].strip(),
+                    gold_final=_parse_gold(obj["answer"]),
+                ))
+            elif "messages" in obj:
+                user = next(
+                    (m["content"] for m in obj["messages"] if m.get("role") == "user"), ""
+                ).strip()
+                asst = next(
+                    (m["content"] for m in obj["messages"] if m.get("role") == "assistant"), ""
+                )
+                gold = extract_final_answer_numeric_str(asst) or ""
+                out.append(Problem(question=user, gold_final=gold.strip()))
+    return out
+# ---------------------------------------------------------------------------
+# Model loading — handles HF IDs, full-weight saves, and PEFT adapters
+# ---------------------------------------------------------------------------
+def _ensure_chat_template(
+    tokenizer: AutoTokenizer,
+    fallback_model: str = "Qwen/Qwen2.5-Math-1.5B-Instruct",
+) -> None:
+    """Load chat template from *fallback_model* when the checkpoint lacks one.
+    SFT adapter checkpoints often omit the chat_template from their tokenizer
+    config.  Without it, ``apply_chat_template`` raises a TemplateError that
+    is silently swallowed inside ``evaluate_gsm8k``, returning 0% accuracy.
+    """
+    if tokenizer.chat_template is not None:
+        return
+    logger.info("Tokenizer missing chat_template — loading from %s", fallback_model)
+    try:
+        _base_tok = AutoTokenizer.from_pretrained(fallback_model, trust_remote_code=True)
+        if _base_tok.chat_template is not None:
+            tokenizer.chat_template = _base_tok.chat_template
+            logger.info("Chat template loaded.")
+    except Exception as exc:
+        logger.warning("Could not load chat template: %s", exc)
+def _load_model(
+    checkpoint: str,
+    base_model_id: str,
+    device: torch.device,
+    dtype: torch.dtype,
+    attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+    """Load model + tokenizer from any checkpoint style.
+    Handles:
+    * HuggingFace model ID  (e.g. ``Qwen/Qwen2.5-Math-1.5B-Instruct``)
+    * GRPO full-weight save (directory with ``model.safetensors`` / pytorch_model*)
+    * PEFT/LoRA adapter dir (directory with ``adapter_config.json``)
+    """
+    # PEFT shim — prevents crash in merge_and_unload on some versions.
+    if "transformers.integrations.tensor_parallel" not in sys.modules:
+        sys.modules["transformers.integrations.tensor_parallel"] = types.ModuleType(
+            "tensor_parallel"
+        )
+    ckpt_path = Path(checkpoint)
+    is_adapter = ckpt_path.is_dir() and (ckpt_path / "adapter_config.json").exists()
+    is_local_full = ckpt_path.is_dir() and not is_adapter
+    # Tokenizer
+    tok_src = checkpoint if (ckpt_path.is_dir() and (ckpt_path / "tokenizer_config.json").exists()) else base_model_id
+    tokenizer = AutoTokenizer.from_pretrained(tok_src, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "left"  # standard for generation
+    _ensure_chat_template(tokenizer, fallback_model=base_model_id)
+    load_kw = dict(
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        device_map={"": device},
+        trust_remote_code=True,
+        attn_implementation=attn_impl,
+    )
+    if is_adapter:
+        # Read base model from pipeline_meta.json if present
+        meta_file = ckpt_path / "pipeline_meta.json"
+        _base = base_model_id
+        if meta_file.exists():
+            _base = json.loads(meta_file.read_text()).get("base_model", _base)
+        logger.info("PEFT adapter — loading base %s then merging %s", _base, checkpoint)
+        _base_mdl = AutoModelForCausalLM.from_pretrained(_base, **load_kw)
+        model = PeftModel.from_pretrained(_base_mdl, checkpoint).merge_and_unload()
+        model = model.to(device)
+    else:
+        # Full weights (GRPO save) or HF model ID
+        src = checkpoint if is_local_full else checkpoint
+        logger.info("Loading full-weight model from %s", src)
+        model = AutoModelForCausalLM.from_pretrained(src, **load_kw)
+    # Re-enable requires_grad isn't needed for eval, but ensure eval mode.
+    model.eval()
+    n = sum(p.numel() for p in model.parameters())
+    logger.info("Loaded: %s  (%.2fB params, %.1f GB VRAM est.)",
+                checkpoint, n / 1e9, n * 2 / 1e9)
+    return model, tokenizer
+# ---------------------------------------------------------------------------
+# Generation
+# ---------------------------------------------------------------------------
+def _build_prompt(tokenizer: AutoTokenizer, question: str) -> str:
+    """Format question using the model's chat template (matches training format)."""
+    if tokenizer.chat_template is None:
+        return question
+    msgs = [
+        {"role": "system", "content": "You are a helpful math assistant. Solve the problem step-by-step and end with 'Final Answer: <number>'."},
+        {"role": "user",   "content": question},
+    ]
+    try:
+        return tokenizer.apply_chat_template(
+            msgs, tokenize=False, add_generation_prompt=True
+        )
+    except Exception:
+        return question
+def _stop_ids(tokenizer: AutoTokenizer) -> List[int]:
+    ids = []
+    if tokenizer.eos_token_id is not None:
+        ids.append(tokenizer.eos_token_id)
+    im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    if isinstance(im_end, int) and im_end not in ids:
+        ids.append(im_end)
+    return ids or None  # type: ignore[return-value]
+@torch.no_grad()
+def _generate(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    question: str,
+    max_new_tokens: int,
+    device: torch.device,
+) -> str:
+    prompt = _build_prompt(tokenizer, question)
+    enc = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=1024,
+    ).to(device)
+    prompt_len = enc["input_ids"].shape[1]
+    out = model.generate(
+        input_ids=enc["input_ids"],
+        attention_mask=enc["attention_mask"],
+        max_new_tokens=max_new_tokens,
+        do_sample=False,       # greedy — deterministic for reproducibility
+        temperature=1.0,
+        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+        eos_token_id=_stop_ids(tokenizer),
+        use_cache=True,
+    )
+    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+def _normalize(x: str) -> str:
+    if not x:
+        return ""
+    s = x.strip().replace(",", "").replace("$", "").strip()
+    try:
+        f = float(s)
+        return f"{int(f)}" if f == int(f) else f"{f}"
+    except ValueError:
+        return s
+@dataclass
+class Record:
+    question: str
+    gold: str
+    pred: str
+    correct: bool
+    solution_text: str
+def _score_model(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    problems: List[Problem],
+    max_new_tokens: int,
+    device: torch.device,
+    label: str,
+) -> Tuple[int, List[Record]]:
+    records: List[Record] = []
+    correct = 0
+    for prob in tqdm(problems, desc=f"Scoring {label}", unit="q", dynamic_ncols=True):
+        try:
+            text = _generate(model, tokenizer, prob.question, max_new_tokens, device)
+        except Exception as exc:
+            text = f"[generation error: {exc}]"
+        pred = extract_final_answer_numeric_str(text) or ""
+        ok = bool(pred) and _normalize(pred) == _normalize(prob.gold_final)
+        if ok:
+            correct += 1
+        records.append(Record(
+            question=prob.question,
+            gold=prob.gold_final,
+            pred=pred,
+            correct=ok,
+            solution_text=text,
+        ))
+    return correct, records
+# ---------------------------------------------------------------------------
+# Metrics curve
+# ---------------------------------------------------------------------------
+def _load_metrics_curve(path: Path) -> List[Dict]:
+    """Read metrics.jsonl and return rows that contain GSM8K accuracy."""
+    rows = []
+    if not path.exists():
+        return rows
+    with path.open(encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+                if "accuracy" in obj or "iteration" in obj:
+                    rows.append(obj)
+            except json.JSONDecodeError:
+                pass
+    return rows
+def _print_curve(rows: List[Dict]) -> None:
+    if not rows:
+        return
+    print(f"\n{_SEP}")
+    print("TRAINING ACCURACY CURVE  (from metrics.jsonl)")
+    print(_SEP)
+    print(f"{'Iter':>5}  {'GSM8K%':>7}  {'Reward':>7}  {'Batch%':>7}  {'LR':>10}  {'Time(s)':>8}")
+    print(_SEP2)
+    for r in rows:
+        it  = r.get("iteration", "")
+        acc = r.get("accuracy", None)
+        rwd = r.get("mean_reward", None)
+        bat = r.get("batch_accuracy", None)
+        lr  = r.get("learning_rate", None)
+        ts  = r.get("iter_time_s", None)
+        acc_s = f"{100*acc:.1f}%" if acc is not None else "—"
+        rwd_s = f"{rwd:.3f}"      if rwd is not None else "—"
+        bat_s = f"{100*bat:.1f}%" if bat is not None else "—"
+        lr_s  = f"{lr:.2e}"       if lr  is not None else "—"
+        ts_s  = f"{ts:.1f}"       if ts  is not None else "—"
+        print(f"{it:>5}  {acc_s:>7}  {rwd_s:>7}  {bat_s:>7}  {lr_s:>10}  {ts_s:>8}")
+    print()
+# ---------------------------------------------------------------------------
+# Output
+# ---------------------------------------------------------------------------
+def _print_summary(
+    base_correct: int,
+    tr_correct: int,
+    base_records: List[Record],
+    tr_records: List[Record],
+    baseline_name: str,
+    trained_name: str,
+    n_solutions: int = 3,
+) -> None:
+    n = len(base_records)
+    wins   = [(p, b, t) for p, b, t in zip(base_records, base_records, tr_records) if not b.correct and t.correct]
+    losses = [(p, b, t) for p, b, t in zip(base_records, base_records, tr_records) if b.correct and not t.correct]
+    both_wrong  = sum(1 for b, t in zip(base_records, tr_records) if not b.correct and not t.correct)
+    both_right  = sum(1 for b, t in zip(base_records, tr_records) if b.correct and t.correct)
+    delta = tr_correct - base_correct
+    sign  = "+" if delta >= 0 else ""
+    print(f"\n{_SEP}")
+    print("BEFORE  vs  AFTER — GSM8K accuracy (greedy decoding, fixed seed)")
+    print(_SEP)
+    print(f"  Baseline  : {baseline_name}")
+    print(f"  Trained   : {trained_name}")
+    print(_SEP2)
+    print(f"  Baseline accuracy  : {base_correct}/{n}  ({100*base_correct/n:.1f}%)")
+    print(f"  Trained  accuracy  : {tr_correct}/{n}  ({100*tr_correct/n:.1f}%)")
+    print(f"  Delta              : {sign}{delta} problems  ({sign}{100*delta/n:.1f} pp)")
+    print(_SEP2)
+    print(f"  Newly correct (wins)   : {len(wins)}")
+    print(f"  Newly wrong  (losses)  : {len(losses)}")
+    print(f"  Both correct           : {both_right}")
+    print(f"  Both wrong             : {both_wrong}")
+    print(_SEP)
+    if wins:
+        print(f"\n{'='*78}")
+        print(f"WINS — problems the RL model now solves that the baseline could not")
+        print(f"{'='*78}")
+        for i, (_, base_r, tr_r) in enumerate(wins[:n_solutions]):
+            print(f"\n[Win {i+1}/{min(n_solutions, len(wins))}]")
+            _print_problem(base_r, tr_r)
+    if losses:
+        print(f"\n{'='*78}")
+        print(f"REGRESSIONS — problems the baseline solved but the RL model now misses")
+        print(f"{'='*78}")
+        for i, (_, base_r, tr_r) in enumerate(losses[:min(2, len(losses))]):
+            print(f"\n[Regression {i+1}/{min(2, len(losses))}]")
+            _print_problem(base_r, tr_r, is_regression=True)
+    print(f"\n{_SEP}")
+    pct_gain = 100 * delta / max(n - base_correct, 1)
+    print(f"SUMMARY: RL training fixed {len(wins)} problems, regressed {len(losses)}.")
+    print(f"         Net: {sign}{delta} pts.  Relative gain on previously-wrong: {pct_gain:+.1f}%")
+    print(_SEP)
+def _print_problem(base_r: Record, tr_r: Record, is_regression: bool = False) -> None:
+    q = base_r.question
+    # Truncate long questions
+    if len(q) > 250:
+        q = q[:247] + "..."
+    print(f"  Q : {q}")
+    print(f"  Gold   : {base_r.gold}")
+    if not is_regression:
+        print(f"  Before : {base_r.pred!r:30s}  ✗")
+        print(f"  After  : {tr_r.pred!r:30s}  ✓")
+        # Show trained solution (truncated)
+        sol = tr_r.solution_text.strip()
+        if sol:
+            lines = sol.splitlines()
+            show = "\n    ".join(lines[:12])
+            if len(lines) > 12:
+                show += f"\n    ... ({len(lines)-12} more lines)"
+            print(f"\n  Solution (trained model):\n    {show}")
+    else:
+        print(f"  Before : {base_r.pred!r:30s}  ✓")
+        print(f"  After  : {tr_r.pred!r:30s}  ✗")
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--baseline-model", default="checkpoints/dual_task_v1",
+        help="Pre-RL checkpoint. HF model ID, full-weight dir, or PEFT adapter dir.",
+    )
+    parser.add_argument(
+        "--trained-model", required=True,
+        help="Post-RL checkpoint (GRPO best_policy/ dir, or iteration checkpoint).",
+    )
+    parser.add_argument(
+        "--base-model-for-adapter", default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+        help="Base model used when loading a PEFT adapter checkpoint.",
+    )
+    parser.add_argument(
+        "--problems", type=Path, default=Path("data/sft/gsm8k_sft.jsonl"),
+        help="JSONL eval set. Defaults to GSM8K training split (first --max-samples rows).",
+    )
+    parser.add_argument("--max-samples", type=int, default=100)
+    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument(
+        "--metrics-jsonl", type=Path, default=None,
+        help="Path to metrics.jsonl from a GRPO run — prints the accuracy curve.",
+    )
+    parser.add_argument(
+        "--n-solutions", type=int, default=3,
+        help="Number of win/loss examples to print in full.",
+    )
+    parser.add_argument(
+        "--records-out", type=Path, default=None,
+        help="Save full per-problem JSON records here (for judge grading scripts).",
+    )
+    parser.add_argument(
+        "--device", default="cuda" if torch.cuda.is_available() else "cpu",
+    )
+    parser.add_argument(
+        "--dtype", default="bfloat16",
+        choices=["float32", "float16", "bfloat16"],
+    )
+    args = parser.parse_args()
+    if not args.problems.is_file():
+        logger.error("Problems file not found: %s", args.problems)
+        return 2
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    dtype  = dtype_map[args.dtype]
+    device = torch.device(args.device)
+    attn   = select_attn_implementation()
+    logger.info("Device: %s | dtype: %s | attn: %s", device, args.dtype, attn)
+    # Print training curve if available
+    if args.metrics_jsonl:
+        curve = _load_metrics_curve(args.metrics_jsonl)
+        _print_curve(curve)
+    problems = _load_problems(args.problems, args.max_samples)
+    if not problems:
+        logger.error("No problems loaded from %s", args.problems)
+        return 2
+    logger.info("Evaluating on %d problems from %s", len(problems), args.problems)
+    # ── Baseline ──────────────────────────────────────────────────────────
+    logger.info("%s\nScoring BASELINE: %s\n%s", _SEP, args.baseline_model, _SEP)
+    t0 = time.perf_counter()
+    base_model, base_tok = _load_model(
+        args.baseline_model, args.base_model_for_adapter, device, dtype, attn
+    )
+    base_correct, base_records = _score_model(
+        base_model, base_tok, problems, args.max_new_tokens, device, "baseline"
+    )
+    del base_model
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    logger.info("Baseline done in %.1fs — accuracy: %d/%d (%.1f%%)",
+                time.perf_counter() - t0,
+                base_correct, len(problems),
+                100 * base_correct / len(problems))
+    # ── Trained ───────────────────────────────────────────────────────────
+    logger.info("%s\nScoring TRAINED: %s\n%s", _SEP, args.trained_model, _SEP)
+    t0 = time.perf_counter()
+    tr_model, tr_tok = _load_model(
+        args.trained_model, args.base_model_for_adapter, device, dtype, attn
+    )
+    tr_correct, tr_records = _score_model(
+        tr_model, tr_tok, problems, args.max_new_tokens, device, "trained"
+    )
+    del tr_model
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    logger.info("Trained done in %.1fs — accuracy: %d/%d (%.1f%%)",
+                time.perf_counter() - t0,
+                tr_correct, len(problems),
+                100 * tr_correct / len(problems))
+    # ── Summary ───────────────────────────────────────────────────────────
+    _print_summary(
+        base_correct, tr_correct,
+        base_records, tr_records,
+        baseline_name=args.baseline_model,
+        trained_name=args.trained_model,
+        n_solutions=args.n_solutions,
+    )
+    # ── Save records ──────────────────────────────────────────────────────
+    if args.records_out:
+        args.records_out.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            "baseline_model": args.baseline_model,
+            "trained_model":  args.trained_model,
+            "n_problems":     len(problems),
+            "baseline": {
+                "correct": base_correct,
+                "accuracy": base_correct / len(problems),
+                "records": [vars(r) for r in base_records],
+            },
+            "trained": {
+                "correct": tr_correct,
+                "accuracy": tr_correct / len(problems),
+                "records": [vars(r) for r in tr_records],
+            },
+        }
+        args.records_out.write_text(
+            json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
+        )
+        logger.info("Per-problem records saved to %s", args.records_out)
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

scripts/dual_task_sft_pipeline.py ADDED Viewed

	@@ -0,0 +1,390 @@

+"""
+Dual-task SFT pipeline: train model on both question generation and solution tasks.
+This pipeline trains a single model that can:
+1. Generate math questions when prompted with "### Task: Generate Question"
+2. Solve math problems when prompted with "### Task: Solve Problem"
+Examples
+--------
+  # Train dual-task model
+  python scripts/dual_task_sft_pipeline.py train \\
+      --data data/sft/dual_task_train.jsonl \\
+      --output-dir checkpoints/dual_task_v1 \\
+      --epochs 2
+  # Infer - Question Generation
+  python scripts/dual_task_sft_pipeline.py infer \\
+      --adapter checkpoints/dual_task_v1 \\
+      --task generate \\
+      --prompt "Create a word problem about fractions and money requiring 3 steps."
+  # Infer - Solution Generation
+  python scripts/dual_task_sft_pipeline.py infer \\
+      --adapter checkpoints/dual_task_v1 \\
+      --task solve \\
+      --problem "Janet has 16 eggs. She eats 3. How many are left?"
+Dependencies: torch, transformers, peft, datasets, accelerate, bitsandbytes, trl
+"""
+from __future__ import annotations
+import os
+if "HF_HUB_DISABLE_XET" not in os.environ:
+    os.environ["HF_HUB_DISABLE_XET"] = "1"
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from src.config.prompts import (
+    SOLVE_TASK_PREFIX,
+    GENERATE_TASK_PREFIX,
+    SOLVER_SYSTEM_PROMPT,
+    GENERATOR_SYSTEM_PROMPT,
+)
+def _warmup_steps_from_ratio(
+    num_examples: int,
+    per_device_train_batch_size: int,
+    gradient_accumulation_steps: int,
+    num_train_epochs: float,
+    warmup_ratio: float,
+) -> int:
+    """Calculate warmup steps from ratio."""
+    if warmup_ratio <= 0:
+        return 0
+    num_batches = max(
+        1,
+        (num_examples + per_device_train_batch_size - 1) // per_device_train_batch_size,
+    )
+    num_update_steps_per_epoch = max(1, num_batches // gradient_accumulation_steps)
+    total_optimizer_steps = max(1, math.ceil(num_train_epochs * num_update_steps_per_epoch))
+    return min(total_optimizer_steps, int(total_optimizer_steps * warmup_ratio))
+def cmd_train(args: argparse.Namespace) -> None:
+    try:
+        import torch
+        from datasets import load_dataset
+        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+        from trl import SFTConfig, SFTTrainer
+    except ImportError as e:
+        raise SystemExit(
+            "Missing dependency for training. Install:\n"
+            "  pip install torch transformers peft datasets accelerate bitsandbytes trl\n"
+            f"Original error: {e}"
+        ) from e
+    data_path = Path(args.data)
+    if not data_path.is_file():
+        raise SystemExit(f"Data file not found: {data_path}")
+    out_dir = Path(args.output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    compute_dtype = getattr(torch, args.bnb_compute_dtype)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    print(f"Loading model {args.model} …")
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        dtype=compute_dtype,
+    )
+    model = prepare_model_for_kbit_training(model)
+    peft = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=list(args.target_modules.split(",")),
+    )
+    model = get_peft_model(model, peft)
+    model.config.use_cache = False
+    model.print_trainable_parameters()
+    print(f"Loading dual-task dataset from {data_path} …")
+    ds = load_dataset("json", data_files=str(data_path), split="train")
+    if args.max_samples and args.max_samples > 0:
+        ds = ds.select(range(min(args.max_samples, len(ds))))
+    task_counts = {"solve": 0, "generate": 0, "unknown": 0}
+    for example in ds:
+        task_type = example.get("task_type", "unknown")
+        task_counts[task_type] = task_counts.get(task_type, 0) + 1
+    print(f"Dataset composition:")
+    print(f"  Total examples: {len(ds)}")
+    print(f"  Solve tasks: {task_counts['solve']} ({task_counts['solve']/len(ds):.1%})")
+    print(f"  Generate tasks: {task_counts['generate']} ({task_counts['generate']/len(ds):.1%})")
+    if task_counts['unknown'] > 0:
+        print(f"  Unknown tasks: {task_counts['unknown']}")
+    def formatting_func(example):
+        return tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+    if args.warmup_steps is not None:
+        warmup_steps = max(0, args.warmup_steps)
+    else:
+        warmup_steps = _warmup_steps_from_ratio(
+            len(ds),
+            args.batch_size,
+            args.grad_accum,
+            args.epochs,
+            args.warmup_ratio,
+        )
+    sft_args = SFTConfig(
+        output_dir=str(out_dir),
+        num_train_epochs=args.epochs,
+        per_device_train_batch_size=args.batch_size,
+        gradient_accumulation_steps=args.grad_accum,
+        learning_rate=args.learning_rate,
+        logging_steps=args.logging_steps,
+        save_steps=args.save_steps,
+        save_total_limit=3,
+        bf16=args.bf16 and torch.cuda.is_available(),
+        fp16=args.fp16 and torch.cuda.is_available() and not args.bf16,
+        max_length=args.max_seq_length,
+        warmup_steps=warmup_steps,
+        lr_scheduler_type="cosine",
+        report_to="none",
+        gradient_checkpointing=True,
+    )
+    print("\nStarting dual-task training...")
+    trainer = SFTTrainer(
+        model=model,
+        args=sft_args,
+        train_dataset=ds,
+        processing_class=tokenizer,
+        formatting_func=formatting_func,
+    )
+    trainer.train()
+    trainer.save_model(str(out_dir))
+    tokenizer.save_pretrained(str(out_dir))
+    with (out_dir / "pipeline_meta.json").open("w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "pipeline_type": "dual_task",
+                "base_model": args.model,
+                "data": str(data_path),
+                "lora_rank": args.lora_rank,
+                "epochs": args.epochs,
+                "task_distribution": task_counts,
+            },
+            f,
+            indent=2,
+        )
+    print(f"\nSaved dual-task adapter and tokenizer to {out_dir}")
+def cmd_infer(args: argparse.Namespace) -> None:
+    import torch
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+    adapter = Path(args.adapter)
+    meta_path = adapter / "pipeline_meta.json"
+    base_model = args.base_model
+    if meta_path.is_file():
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+        base_model = meta.get("base_model", base_model)
+        pipeline_type = meta.get("pipeline_type", "unknown")
+        if pipeline_type != "dual_task":
+            print(f"Warning: Adapter trained with pipeline_type='{pipeline_type}', expected 'dual_task'")
+    compute_dtype = getattr(torch, args.bnb_compute_dtype)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(adapter, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    print(f"Loading base {base_model} + adapter {adapter} …")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    model = PeftModel.from_pretrained(base, str(adapter))
+    model.eval()
+    if args.task == "solve":
+        system_prompt = SOLVER_SYSTEM_PROMPT
+        user_content = (
+            f"{SOLVE_TASK_PREFIX}"
+            "Solve the following problem. Show your reasoning as numbered steps, "
+            "then give the final numeric answer on the last line.\n\n"
+            f"Problem:\n{args.problem.strip()}"
+        )
+    elif args.task == "generate":
+        system_prompt = GENERATOR_SYSTEM_PROMPT
+        user_content = f"{GENERATE_TASK_PREFIX}{args.prompt.strip()}"
+    else:
+        raise ValueError(f"Unknown task: {args.task}. Must be 'solve' or 'generate'")
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_content},
+    ]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    print(f"\nTask: {args.task}")
+    print(f"Prompt length: {inputs['input_ids'].shape[1]} tokens")
+    print("\nGenerating...")
+    with torch.no_grad():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            do_sample=not args.greedy,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+    gen_ids = out[0, inputs["input_ids"].shape[1] :]
+    text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+    print("\n" + "=" * 60)
+    print("Generated Output")
+    print("=" * 60)
+    print(text)
+    print("=" * 60)
+    if args.task == "solve":
+        print("\n--- Format Validation ---")
+        from src.sft.solution_format import validate_sympy_solution_format
+        r = validate_sympy_solution_format(text)
+        print(json.dumps(r.__dict__, indent=2))
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(description="Dual-task SFT pipeline (train / infer)")
+    sub = p.add_subparsers(dest="command", required=True)
+    tr = sub.add_parser("train", help="Train dual-task model on mixed dataset")
+    tr.add_argument("--data", type=str, required=True, help="Dual-task training JSONL")
+    tr.add_argument("--output-dir", type=str, required=True, help="Output directory for adapter")
+    tr.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct", help="Base model")
+    tr.add_argument("--epochs", type=float, default=2.0, help="Training epochs (default: 2.0 for dual-task)")
+    tr.add_argument("--batch-size", type=int, default=1)
+    tr.add_argument("--grad-accum", type=int, default=8)
+    tr.add_argument("--learning-rate", type=float, default=2e-4)
+    tr.add_argument("--max-samples", type=int, default=0, help="0 = use full dataset")
+    tr.add_argument("--lora-rank", type=int, default=16)
+    tr.add_argument("--lora-alpha", type=int, default=32)
+    tr.add_argument("--lora-dropout", type=float, default=0.05)
+    tr.add_argument(
+        "--target-modules",
+        type=str,
+        default="q_proj,v_proj,o_proj,gate_proj",
+    )
+    tr.add_argument("--max-seq-length", type=int, default=2048)
+    tr.add_argument("--save-steps", type=int, default=200)
+    tr.add_argument("--logging-steps", type=int, default=10)
+    tr.add_argument("--warmup-ratio", type=float, default=0.03)
+    tr.add_argument("--warmup-steps", type=int, default=None)
+    tr.add_argument("--bf16", action="store_true", default=True)
+    tr.add_argument("--no-bf16", dest="bf16", action="store_false")
+    tr.add_argument("--fp16", action="store_true")
+    tr.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    tr.set_defaults(func=cmd_train)
+    inf = sub.add_parser("infer", help="Generate with dual-task model")
+    inf.add_argument("--adapter", type=str, required=True, help="Adapter directory")
+    inf.add_argument(
+        "--base-model",
+        type=str,
+        default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+        help="Base model (auto-detected from pipeline_meta.json if present)",
+    )
+    inf.add_argument(
+        "--task",
+        type=str,
+        required=True,
+        choices=["solve", "generate"],
+        help="Task type: 'solve' for problem solving, 'generate' for question generation",
+    )
+    inf.add_argument(
+        "--problem",
+        type=str,
+        default="",
+        help="Math problem to solve (required if --task solve)",
+    )
+    inf.add_argument(
+        "--prompt",
+        type=str,
+        default="",
+        help="Question generation prompt (required if --task generate)",
+    )
+    inf.add_argument("--max-new-tokens", type=int, default=1024)
+    inf.add_argument("--temperature", type=float, default=0.7)
+    inf.add_argument("--top-p", type=float, default=0.95)
+    inf.add_argument("--greedy", action="store_true", help="Use greedy decoding")
+    inf.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    inf.set_defaults(func=cmd_infer)
+    return p
+def main() -> None:
+    parser = build_parser()
+    args = parser.parse_args()
+    if args.command == "infer":
+        if args.task == "solve" and not args.problem:
+            raise SystemExit("Error: --problem is required when --task solve")
+        if args.task == "generate" and not args.prompt:
+            raise SystemExit("Error: --prompt is required when --task generate")
+    if str(ROOT) not in sys.path:
+        sys.path.insert(0, str(ROOT))
+    args.func(args)
+if __name__ == "__main__":
+    main()

scripts/eval_sft_inference.py ADDED Viewed

	@@ -0,0 +1,565 @@

+#!/usr/bin/env python3
+"""
+Run batch inference for a trained QLoRA adapter and report quality metrics.
+This helps decide whether another SFT epoch is needed before RL.
+Examples
+--------
+  # Evaluate on GSM8K test split (first 100 samples)
+  python scripts/eval_sft_inference.py \
+      --adapter checkpoints/gsm8k_sft \
+      --max-samples 100
+  # Evaluate on local JSONL with {question, answer} rows
+  python scripts/eval_sft_inference.py \
+      --adapter checkpoints/gsm8k_sft \
+      --source jsonl \
+      --input data/raw/gsm8k_test.jsonl \
+      --max-samples 50 \
+      --output-json reports/sft_eval.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any, Optional
+# Prefer classic HTTP Hub downloads by default.
+if "HF_HUB_DISABLE_XET" not in os.environ:
+    os.environ["HF_HUB_DISABLE_XET"] = "1"
+# Ensure project-root imports work when invoked as `python scripts/...`.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+import torch
+from datasets import load_dataset
+from peft import PeftModel
+from sympy import simplify
+from sympy.parsing.sympy_parser import parse_expr
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from scripts.convert_gsm8k_to_sft import parse_gsm8k_answer
+from src.config.prompts import create_solver_messages
+from src.sft.solution_format import extract_final_answer_numeric_str, validate_sympy_solution_format
+from src.sft.sympy_normalize import normalize_for_parse_expr
+@dataclass
+class EvalRow:
+    index: int
+    question: str
+    gold_final: str
+    pred_final: str
+    exact_match: Optional[bool]
+    format_ok: bool
+    step_count: int
+    scratchpad_leak: bool
+    output_text: str
+def _norm_expr(s: str) -> str:
+    s = s.strip()
+    s = s.replace("^", "**")
+    s = re.sub(r"[,$€£\s]+", "", s)
+    return s
+def _equiv_expr(a: str, b: str) -> Optional[bool]:
+    """Check if two answer strings are mathematically equivalent.
+    Uses the same normalization as CurriculumMathEnvironment._answers_equivalent
+    so eval and training agree on what counts as "correct".
+    """
+    if not a or not b:
+        return None
+    a_n = normalize_for_parse_expr(_norm_expr(a))
+    b_n = normalize_for_parse_expr(_norm_expr(b))
+    try:
+        return bool(simplify(parse_expr(a_n) - parse_expr(b_n)) == 0)
+    except Exception:
+        return a_n == b_n
+def _iter_examples(args: argparse.Namespace) -> list[dict[str, str]]:
+    rows: list[dict[str, str]] = []
+    if args.source == "hf":
+        ds = load_dataset(args.dataset, args.config, split=args.split)
+        if args.max_samples > 0:
+            ds = ds.select(range(min(args.max_samples, len(ds))))
+        for row in ds:
+            _, final = parse_gsm8k_answer(row["answer"])
+            rows.append({"question": row["question"].strip(), "gold_final": final})
+        return rows
+    in_path = Path(args.input)
+    if not in_path.is_file():
+        raise SystemExit(f"Input JSONL not found: {in_path}")
+    with in_path.open(encoding="utf-8") as f:
+        for line in f:
+            if args.max_samples > 0 and len(rows) >= args.max_samples:
+                break
+            line = line.strip()
+            if not line:
+                continue
+            o = json.loads(line)
+            if "question" in o and "answer" in o:
+                _, final = parse_gsm8k_answer(o["answer"])
+                rows.append({"question": o["question"].strip(), "gold_final": final})
+                continue
+            if "messages" in o:
+                user = next((m["content"] for m in o["messages"] if m.get("role") == "user"), "").strip()
+                asst = next((m["content"] for m in o["messages"] if m.get("role") == "assistant"), "")
+                gold = extract_final_answer_numeric_str(asst) or ""
+                user = re.sub(r"^Solve the following problem\..*?Problem:\n", "", user, flags=re.S)
+                rows.append({"question": user.strip(), "gold_final": gold.strip()})
+                continue
+            raise SystemExit("JSONL rows must contain either {question, answer} or {messages}.")
+    return rows
+def _generate(
+    model: Any,
+    tokenizer: Any,
+    problem: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+    greedy: bool,
+) -> str:
+    # Use the canonical solver prompt (same system + user format as GRPO training)
+    # so eval measures the model under the exact distribution it was trained on.
+    messages = create_solver_messages(problem.strip())
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # HuggingFace warns once-per-call when `temperature`/`top_p` are passed
+    # alongside `do_sample=False`.  Skip those kwargs entirely in greedy mode
+    # so long eval loops don't spam the log.
+    gen_kwargs = {
+        "max_new_tokens": max_new_tokens,
+        "do_sample": not greedy,
+        "pad_token_id": tokenizer.pad_token_id,
+    }
+    if not greedy:
+        gen_kwargs["temperature"] = temperature
+        gen_kwargs["top_p"] = top_p
+    with torch.no_grad():
+        out = model.generate(**inputs, **gen_kwargs)
+    gen_ids = out[0, inputs["input_ids"].shape[1] :]
+    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+def main() -> None:
+    p = argparse.ArgumentParser(description="Batch eval for SFT adapter inference.")
+    p.add_argument("--adapter", type=Path, required=True, help="Adapter directory from training step.")
+    p.add_argument("--base-model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+    p.add_argument("--source", choices=("hf", "jsonl"), default="hf")
+    p.add_argument("--dataset", type=str, default="openai/gsm8k")
+    p.add_argument("--config", type=str, default="main")
+    p.add_argument("--split", type=str, default="test")
+    p.add_argument("--input", type=Path, help="JSONL path for --source jsonl")
+    p.add_argument("--max-samples", type=int, default=100)
+    p.add_argument("--max-new-tokens", type=int, default=512)
+    p.add_argument("--temperature", type=float, default=0.0)
+    p.add_argument("--top-p", type=float, default=1.0)
+    p.add_argument("--greedy", action="store_true", default=True)
+    p.add_argument("--no-greedy", dest="greedy", action="store_false")
+    p.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    p.add_argument("--show-samples", type=int, default=3)
+    p.add_argument("--output-json", type=Path, default=None)
+    args = p.parse_args()
+    if args.source == "jsonl" and not args.input:
+        raise SystemExit("--input is required when --source jsonl")
+    meta_path = args.adapter / "pipeline_meta.json"
+    base_model = args.base_model
+    if meta_path.is_file():
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+        base_model = meta.get("base_model", base_model)
+    rows = _iter_examples(args)
+    if not rows:
+        raise SystemExit("No evaluation examples loaded.")
+    print(f"Loaded {len(rows)} evaluation examples.")
+    compute_dtype = getattr(torch, args.bnb_compute_dtype)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+    print(f"Loading base {base_model} + adapter {args.adapter} …")
+    tokenizer = AutoTokenizer.from_pretrained(args.adapter, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    model = PeftModel.from_pretrained(base, str(args.adapter))
+    model.eval()
+    results: list[EvalRow] = []
+    for i, row in enumerate(rows):
+        text = _generate(
+            model=model,
+            tokenizer=tokenizer,
+            problem=row["question"],
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            greedy=args.greedy,
+        )
+        fmt = validate_sympy_solution_format(text)
+        pred_final = extract_final_answer_numeric_str(text) or ""
+        exact = _equiv_expr(pred_final, row["gold_final"])
+        results.append(
+            EvalRow(
+                index=i,
+                question=row["question"],
+                gold_final=row["gold_final"],
+                pred_final=pred_final,
+                exact_match=exact,
+                format_ok=fmt.ok,
+                step_count=fmt.step_count,
+                scratchpad_leak=("<<" in text and ">>" in text),
+                output_text=text,
+            )
+        )
+        if i < args.show_samples:
+            print(f"\n=== Sample {i} ===")
+            print("Q:", row["question"])
+            print("Gold:", row["gold_final"])
+            print("Pred:", pred_final)
+            print("Format OK:", fmt.ok, "| Steps:", fmt.step_count)
+            print(text)
+    n = len(results)
+    n_format_ok = sum(1 for r in results if r.format_ok)
+    n_scratch = sum(1 for r in results if r.scratchpad_leak)
+    em_scored = [r for r in results if r.exact_match is not None]
+    n_em = sum(1 for r in em_scored if r.exact_match)
+    print("\n=== Summary ===")
+    print(f"Samples: {n}")
+    print(f"Format OK: {n_format_ok}/{n} ({100.0 * n_format_ok / n:.2f}%)")
+    print(f"Scratchpad leakage (<< >>): {n_scratch}/{n} ({100.0 * n_scratch / n:.2f}%)")
+    if em_scored:
+        print(f"Exact match (final answer): {n_em}/{len(em_scored)} ({100.0 * n_em / len(em_scored):.2f}%)")
+    else:
+        print("Exact match (final answer): N/A (missing gold labels)")
+    if args.output_json is not None:
+        args.output_json.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            "summary": {
+                "samples": n,
+                "format_ok": n_format_ok,
+                "format_ok_rate": n_format_ok / n,
+                "scratchpad_leakage": n_scratch,
+                "scratchpad_leakage_rate": n_scratch / n,
+                "exact_match_scored": len(em_scored),
+                "exact_match": n_em,
+                "exact_match_rate": (n_em / len(em_scored)) if em_scored else None,
+            },
+            "results": [asdict(r) for r in results],
+        }
+        args.output_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
+        print(f"Wrote detailed report to {args.output_json}")
+def _infer_dataset_name(data_path: str) -> str:
+    """Derive a short human-readable dataset label from the file path."""
+    stem = Path(data_path).stem.lower()   # e.g. "aqua_validation", "gsm8k_test"
+    if "aqua" in stem:
+        return "AQuA-RAT"
+    if "math" in stem:
+        return "MATH"
+    if "gsm" in stem:
+        return "GSM8K"
+    return Path(data_path).stem          # fallback: raw filename stem
+def evaluate_gsm8k(
+    model: Any,
+    tokenizer: Any,
+    data_path: str = "data/sft/gsm8k_test.jsonl",
+    max_samples: int = 500,
+    max_new_tokens: int = 512,
+    temperature: float = 0.0,
+    top_p: float = 1.0,
+    reward_fn: Any = None,
+    pass_at_k: int = 0,
+    dataset_name: str = "",
+    pass_at_k_temperature: float = 0.8,
+) -> dict:
+    """
+    Evaluate *model* on a math JSONL file using the SAME scoring
+    function used during GRPO training.
+    Args:
+        model        : AutoModelForCausalLM (already on correct device).
+        tokenizer    : Matching AutoTokenizer.
+        data_path    : Path to JSONL with {question, answer} rows.
+        max_samples  : Evaluation cap.
+        max_new_tokens / temperature / top_p : generation hyper-params.
+        reward_fn    : callable(question: str, solution: str, gold: str) -> dict
+                       Must return at minimum {"combined_score": float} and
+                       optionally {"gt_match": bool, "prm_mean_score": float,
+                       "sympy_score": float, "format_score": float}.
+                       When supplied the primary accuracy metric becomes the
+                       mean combined_score — identical to the GRPO training
+                       objective — so every component (correctness, PRM step
+                       quality, SymPy verification, format) contributes and
+                       improvements in any of them show up immediately.
+                       When None the function falls back to final-answer
+                       exact-match accuracy (coarse binary).
+    Returns dict keys:
+        accuracy          – mean combined_score per solution (or exact-match if no reward_fn)
+        combined_score    – same as accuracy (alias)
+        correct_rate      – fraction of solutions with gt_match == True
+        prm_mean          – mean PRM step-quality score per solution
+        sympy_mean        – mean SymPy verification score
+        format_mean       – mean format compliance score
+        n_scored          – solutions successfully scored by reward_fn
+        total             – total solutions evaluated
+        # fallback (no reward_fn):
+        exact_match_rate  – fraction of final answers matching gold
+    """
+    import logging as _logging
+    _logger = _logging.getLogger(__name__)
+    greedy = temperature < 1e-6
+    rows: list[dict] = []
+    p = Path(data_path)
+    if p.is_file():
+        with p.open(encoding="utf-8") as fh:
+            for line in fh:
+                if max_samples > 0 and len(rows) >= max_samples:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if "question" in obj and "gold_final" in obj and obj["gold_final"]:
+                    # Pre-extracted format (our gsm8k_test.jsonl)
+                    rows.append({"question": obj["question"].strip(), "gold_final": obj["gold_final"].strip()})
+                elif "question" in obj and "answer" in obj:
+                    _, final = parse_gsm8k_answer(obj["answer"])
+                    if final:
+                        rows.append({"question": obj["question"].strip(), "gold_final": final})
+                elif "messages" in obj:
+                    task_type = obj.get("task_type", "solve")
+                    if task_type != "solve":
+                        continue   # skip question-generation entries
+                    user = next(
+                        (m["content"] for m in obj["messages"] if m.get("role") == "user"), ""
+                    ).strip()
+                    asst = next(
+                        (m["content"] for m in obj["messages"] if m.get("role") == "assistant"), ""
+                    )
+                    gold = extract_final_answer_numeric_str(asst) or ""
+                    if not gold:
+                        continue   # skip entries with no parseable gold answer
+                    user = re.sub(r"^Solve the following problem\..*?Problem:\n", "", user, flags=re.S)
+                    rows.append({"question": user.strip(), "gold_final": gold.strip()})
+    else:
+        _logger.warning(
+            f"evaluate_gsm8k: {data_path} not found; loading openai/gsm8k from Hub."
+        )
+        try:
+            ds = load_dataset("openai/gsm8k", "main", split="test")
+            if max_samples > 0:
+                ds = ds.select(range(min(max_samples, len(ds))))
+            for row in ds:
+                _, final = parse_gsm8k_answer(row["answer"])
+                rows.append({"question": row["question"].strip(), "gold_final": final})
+        except Exception as exc:
+            _logger.error(f"Could not load GSM8K: {exc}")
+            return {"accuracy": 0.0, "correct": 0, "total": 0, "exact_match_rate": 0.0}
+    if not rows:
+        return {"accuracy": 0.0, "correct": 0, "total": 0, "exact_match_rate": 0.0}
+    correct = 0
+    total   = len(rows)
+    _n_errors = 0
+    _MAX_ERROR_WARNINGS = 3
+    # Per-solution reward accumulators (populated when reward_fn is supplied).
+    _combined:  list[float] = []
+    _gt_match:  list[float] = []
+    _prm_comp:  list[float] = []
+    _prm_final: list[float] = []
+    _step_acc:  list[float] = []   # fraction of steps rated correct by PRM (>0.5)
+    _lccp:      list[float] = []   # longest correct consecutive prefix ratio
+    _sympy_comp:list[float] = []
+    _fmt_comp:  list[float] = []
+    # Pass@K accumulators: for each problem, did ANY of K samples get it right?
+    _pak_any_correct: list[int] = []   # 1 if any of K samples correct, else 0
+    _eval_label = dataset_name or _infer_dataset_name(data_path)
+    pbar = tqdm(
+        rows, total=total, desc=f"{_eval_label} eval",
+        unit="q", dynamic_ncols=True, leave=True,
+    )
+    for i, row in enumerate(pbar):
+        pred_text = ""
+        try:
+            pred_text = _generate(
+                model=model, tokenizer=tokenizer,
+                problem=row["question"],
+                max_new_tokens=max_new_tokens,
+                temperature=temperature, top_p=top_p, greedy=greedy,
+            )
+            pred_final = extract_final_answer_numeric_str(pred_text) or ""
+            if _equiv_expr(pred_final, row["gold_final"]):
+                correct += 1
+        except Exception as exc:
+            _n_errors += 1
+            if _n_errors <= _MAX_ERROR_WARNINGS:
+                _logger.warning(
+                    "evaluate_gsm8k: sample %d raised %s: %s. "
+                    "If all fail check that tokenizer has a chat_template.",
+                    i, type(exc).__name__, exc,
+                )
+            elif _n_errors == _MAX_ERROR_WARNINGS + 1:
+                _logger.warning(
+                    "evaluate_gsm8k: suppressing further errors (%d so far).",
+                    _n_errors,
+                )
+            _logger.debug("Sample %d error: %s", i, exc, exc_info=True)
+        # ── Pass@K: sample K solutions at T=0.8 and check if any is correct ─
+        # This is the fair comparison to batch_acc during training (also K samples
+        # at T=0.8). Greedy (pass@1) is pessimistic; pass@k shows the upper bound
+        # the model can achieve with sampling, matching the training regime.
+        if pass_at_k > 1 and row.get("gold_final"):
+            _any = 0
+            for _ in range(pass_at_k):
+                try:
+                    s = _generate(
+                        model=model, tokenizer=tokenizer,
+                        problem=row["question"],
+                        max_new_tokens=max_new_tokens,
+                        temperature=pass_at_k_temperature,
+                        top_p=top_p, greedy=False,
+                    )
+                    pf = extract_final_answer_numeric_str(s) or ""
+                    if _equiv_expr(pf, row["gold_final"]):
+                        _any = 1
+                        break
+                except Exception:
+                    pass
+            _pak_any_correct.append(_any)
+        # ── Apply the SAME reward function used during GRPO training ──────────
+        if reward_fn is not None and pred_text:
+            try:
+                r = reward_fn(row["question"], pred_text, row["gold_final"])
+                _combined.append(float(r.get("combined_score",   0.0)))
+                _gt_match.append(1.0 if r.get("gt_match", False) else 0.0)
+                _prm_comp.append(float(r.get("prm_mean_score",   0.0)))
+                _prm_final.append(float(r.get("prm_final_score", 0.0)))
+                _step_acc.append(float(r.get("step_accuracy",    0.0)))
+                _lccp.append(float(r.get("lccp",                 0.0)))
+                _sympy_comp.append(float(r.get("sympy_score",    0.0)))
+                _fmt_comp.append(float(r.get("format_score",     0.0)))
+            except Exception as rfn_exc:
+                _logger.debug("reward_fn failed for sample %d: %s", i, rfn_exc)
+        done = i + 1
+        # Periodically flush the CUDA allocator's free-block pool so that
+        # fragmentation from large KV-cache + PRM tensors doesn't accumulate
+        # and cause per-sample allocation time to grow throughout the run.
+        if done % 20 == 0:
+            import gc; gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        # Live bar: show training-objective score when available, else acc.
+        if _combined:
+            _pf: dict = dict(
+                score=f"{sum(_combined) / len(_combined):.3f}",
+                correct=f"{sum(_gt_match):.0f}/{len(_combined)}",
+                step_acc=f"{sum(_step_acc)/len(_step_acc):.1%}" if _step_acc else "—",
+                lccp=f"{sum(_lccp)/len(_lccp):.1%}" if _lccp else "—",
+            )
+        else:
+            _pf = dict(acc=f"{correct / done:.1%}", correct=f"{correct}/{done}")
+        pbar.set_postfix(**_pf, refresh=False)
+    # ── Aggregate ──────────────────────────────────────────────────────────
+    n_scored = len(_combined)
+    _avg = lambda lst: round(sum(lst) / len(lst), 4) if lst else 0.0
+    # Pass@K: fraction of problems where any of K sampled solutions was correct.
+    pass_at_k_score = _avg(_pak_any_correct) if _pak_any_correct else None
+    if reward_fn is not None:
+        combined_score = _avg(_combined)
+        result: dict = {
+            # PRIMARY: mean training-objective score.
+            # Formula: 0.50×correct + 0.40×process(prm_final, prm_mean) + 0.10×format
+            "accuracy":       combined_score,
+            "combined_score": combined_score,
+            # PROCESS metrics — improve before correct_rate does
+            "step_accuracy":  _avg(_step_acc),
+            "lccp":           _avg(_lccp),   # chain integrity: how far into solution stays correct
+            # Answer correctness
+            "correct_rate":   _avg(_gt_match),
+            # PRM components
+            "prm_mean":       _avg(_prm_comp),
+            "prm_final":      _avg(_prm_final),
+            # Format / SymPy (informational)
+            "sympy_mean":     _avg(_sympy_comp),
+            "format_mean":    _avg(_fmt_comp),
+            "n_scored":       n_scored,
+            "total":          total,
+            "final_answer_correct":  correct,
+            "final_answer_accuracy": correct / total if total else 0.0,
+        }
+    else:
+        _logger.warning(
+            "evaluate_gsm8k: no reward_fn provided — using final-answer accuracy. "
+            "Pass reward_fn=math_env.compute_grounded_reward for full training-objective eval."
+        )
+        fa_acc = correct / total if total else 0.0
+        result = {
+            "accuracy":              fa_acc,
+            "combined_score":        fa_acc,
+            "correct_rate":          fa_acc,
+            "prm_mean":              0.0,
+            "sympy_mean":            0.0,
+            "format_mean":           0.0,
+            "n_scored":              0,
+            "total":                 total,
+            "final_answer_correct":  correct,
+            "final_answer_accuracy": fa_acc,
+        }
+    # Attach pass@k if it was computed
+    if pass_at_k_score is not None:
+        result["pass_at_k"]     = pass_at_k_score
+        result["pass_at_k_k"]   = pass_at_k
+    return result
+if __name__ == "__main__":
+    main()

scripts/gsm8k_sft_pipeline.py ADDED Viewed

	@@ -0,0 +1,475 @@

+#!/usr/bin/env python3
+"""
+End-to-end GSM8K pipeline: prepare JSONL → QLoRA SFT → save adapter → inference.
+The trained model follows ``Step N:`` / ``Final Answer:`` formatting with SymPy-friendly
+expressions (see ``src.agent.math_agent.SOLVER_SYSTEM_PROMPT``).
+Examples
+--------
+  # 1) Only build training JSONL from Hugging Face GSM8K
+  python scripts/gsm8k_sft_pipeline.py prepare --output data/sft/gsm8k_sft.jsonl
+  # 2) Fine-tune (requires GPU recommended)
+  python scripts/gsm8k_sft_pipeline.py train \\
+      --data data/sft/gsm8k_sft.jsonl \\
+      --output-dir checkpoints/gsm8k_sft
+  # 3) Run inference with saved adapter
+  python scripts/gsm8k_sft_pipeline.py infer \\
+      --adapter checkpoints/gsm8k_sft \\
+      --problem \"Janet has 16 eggs. She eats 3. How many are left?\"
+  # Full chain
+  python scripts/gsm8k_sft_pipeline.py all --output-dir checkpoints/gsm8k_sft
+Dependencies: torch, transformers, peft, datasets, accelerate, bitsandbytes, trl, sympy
+Tip: if downloads fail with XET / "Background writer channel closed", export ``HF_HUB_DISABLE_XET=1``
+before running (this script sets it by default unless already set).
+"""
+from __future__ import annotations
+import os
+# hf-xet can error or segfault on interrupted/large shards; classic HTTP download is more robust.
+if "HF_HUB_DISABLE_XET" not in os.environ:
+    os.environ["HF_HUB_DISABLE_XET"] = "1"
+import argparse
+import json
+import math
+import subprocess
+import sys
+from pathlib import Path
+# Project root (…/Maths_LLM)
+ROOT = Path(__file__).resolve().parents[1]
+def cmd_prepare(args: argparse.Namespace) -> None:
+    cmd = [
+        sys.executable,
+        str(ROOT / "scripts" / "convert_gsm8k_to_sft.py"),
+        "--output",
+        str(Path(args.output)),
+        "--splits",
+        *args.splits,
+    ]
+    if args.source == "jsonl":
+        cmd.extend(["--source", "jsonl", "--input", str(args.input)])
+    print("Running:", " ".join(cmd))
+    subprocess.check_call(cmd, cwd=str(ROOT))
+    if args.strip_scratchpads:
+        _rewrite_jsonl_strip_scratchpads(Path(args.output))
+def _rewrite_jsonl_strip_scratchpads(jsonl_path: Path) -> None:
+    from src.sft.solution_format import strip_gsm8k_scratchpads
+    tmp = jsonl_path.with_suffix(".jsonl.tmp")
+    n = 0
+    with jsonl_path.open(encoding="utf-8") as fin, tmp.open("w", encoding="utf-8") as fout:
+        for line in fin:
+            o = json.loads(line)
+            for m in o.get("messages", []):
+                if m.get("role") == "assistant":
+                    m["content"] = strip_gsm8k_scratchpads(m["content"])
+            if "text" in o:
+                sys_p = next(x["content"] for x in o["messages"] if x["role"] == "system")
+                usr = next(x["content"] for x in o["messages"] if x["role"] == "user")
+                asst = next(x["content"] for x in o["messages"] if x["role"] == "assistant")
+                o["text"] = (
+                    f"<|system|>\n{sys_p}\n<|user|>\n{usr}\n<|assistant|>\n{asst}"
+                )
+            fout.write(json.dumps(o, ensure_ascii=False) + "\n")
+            n += 1
+    tmp.replace(jsonl_path)
+    print(f"Stripped <<>> scratchpads in {n} records → {jsonl_path}")
+def _warmup_steps_from_ratio(
+    num_examples: int,
+    per_device_train_batch_size: int,
+    gradient_accumulation_steps: int,
+    num_train_epochs: float,
+    warmup_ratio: float,
+) -> int:
+    """Approximate HF Trainer optimizer steps; used to map legacy warmup_ratio → warmup_steps."""
+    if warmup_ratio <= 0:
+        return 0
+    num_batches = max(
+        1,
+        (num_examples + per_device_train_batch_size - 1) // per_device_train_batch_size,
+    )
+    num_update_steps_per_epoch = max(1, num_batches // gradient_accumulation_steps)
+    total_optimizer_steps = max(1, math.ceil(num_train_epochs * num_update_steps_per_epoch))
+    return min(total_optimizer_steps, int(total_optimizer_steps * warmup_ratio))
+def cmd_train(args: argparse.Namespace) -> None:
+    try:
+        import torch
+        from datasets import load_dataset
+        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+        from trl import SFTConfig, SFTTrainer
+    except ImportError as e:
+        raise SystemExit(
+            "Missing dependency for training. Install:\n"
+            "  pip install torch transformers peft datasets accelerate bitsandbytes trl sympy\n"
+            f"Original error: {e}"
+        ) from e
+    data_path = Path(args.data)
+    if not data_path.is_file():
+        raise SystemExit(f"Data file not found: {data_path}")
+    out_dir = Path(args.output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    compute_dtype = getattr(torch, args.bnb_compute_dtype)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+    print(f"Loading model {args.model} …")
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        dtype=compute_dtype,
+    )
+    model = prepare_model_for_kbit_training(model)
+    peft = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=list(args.target_modules.split(",")),
+    )
+    model = get_peft_model(model, peft)
+    model.config.use_cache = False
+    model.print_trainable_parameters()
+    ds = load_dataset("json", data_files=str(data_path), split="train")
+    if args.max_samples and args.max_samples > 0:
+        ds = ds.select(range(min(args.max_samples, len(ds))))
+    def formatting_func(example):
+        return tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+    if args.warmup_steps is not None:
+        warmup_steps = max(0, args.warmup_steps)
+    else:
+        warmup_steps = _warmup_steps_from_ratio(
+            len(ds),
+            args.batch_size,
+            args.grad_accum,
+            args.epochs,
+            args.warmup_ratio,
+        )
+    sft_args = SFTConfig(
+        output_dir=str(out_dir),
+        num_train_epochs=args.epochs,
+        per_device_train_batch_size=args.batch_size,
+        gradient_accumulation_steps=args.grad_accum,
+        learning_rate=args.learning_rate,
+        logging_steps=args.logging_steps,
+        save_steps=args.save_steps,
+        save_total_limit=3,
+        bf16=args.bf16 and torch.cuda.is_available(),
+        fp16=args.fp16 and torch.cuda.is_available() and not args.bf16,
+        max_length=args.max_seq_length,
+        warmup_steps=warmup_steps,
+        lr_scheduler_type="cosine",
+        report_to="none",
+        gradient_checkpointing=True,
+    )
+    trainer = SFTTrainer(
+        model=model,
+        args=sft_args,
+        train_dataset=ds,
+        processing_class=tokenizer,
+        formatting_func=formatting_func,
+    )
+    trainer.train()
+    trainer.save_model(str(out_dir))
+    tokenizer.save_pretrained(str(out_dir))
+    with (out_dir / "pipeline_meta.json").open("w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "base_model": args.model,
+                "data": str(data_path),
+                "lora_rank": args.lora_rank,
+                "epochs": args.epochs,
+            },
+            f,
+            indent=2,
+        )
+    print(f"Saved adapter and tokenizer to {out_dir}")
+def cmd_infer(args: argparse.Namespace) -> None:
+    import torch
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+    from src.agent.math_agent import SOLVER_SYSTEM_PROMPT
+    adapter = Path(args.adapter)
+    meta_path = adapter / "pipeline_meta.json"
+    base_model = args.base_model
+    if meta_path.is_file():
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+        base_model = meta.get("base_model", base_model)
+    compute_dtype = getattr(torch, args.bnb_compute_dtype)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(adapter, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    print(f"Loading base {base_model} + adapter {adapter} …")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    model = PeftModel.from_pretrained(base, str(adapter))
+    model.eval()
+    user_content = (
+        "Solve the following problem. Show your reasoning as numbered steps, "
+        "then give the final numeric answer on the last line.\n\n"
+        f"Problem:\n{args.problem.strip()}"
+    )
+    messages = [
+        {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            do_sample=not args.greedy,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+    gen_ids = out[0, inputs["input_ids"].shape[1] :]
+    text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+    print("\n--- Generated ---\n")
+    print(text)
+    print("\n--- Format check ---")
+    from src.sft.solution_format import validate_sympy_solution_format
+    r = validate_sympy_solution_format(text)
+    print(json.dumps(r.__dict__, indent=2))
+def cmd_all(args: argparse.Namespace) -> None:
+    out_jsonl = Path(args.data) if args.data else ROOT / "data" / "sft" / "gsm8k_sft.jsonl"
+    ns = argparse.Namespace(
+        output=out_jsonl,
+        source=args.prepare_source,
+        input=args.input,
+        splits=args.splits,
+        strip_scratchpads=args.strip_scratchpads,
+    )
+    cmd_prepare(ns)
+    train_ns = argparse.Namespace(
+        data=str(out_jsonl),
+        output_dir=args.output_dir,
+        model=args.model,
+        epochs=args.epochs,
+        batch_size=args.batch_size,
+        grad_accum=args.grad_accum,
+        learning_rate=args.learning_rate,
+        max_samples=args.max_samples,
+        lora_rank=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        target_modules=args.target_modules,
+        max_seq_length=args.max_seq_length,
+        save_steps=args.save_steps,
+        logging_steps=args.logging_steps,
+        warmup_ratio=args.warmup_ratio,
+        warmup_steps=args.warmup_steps,
+        bf16=args.bf16,
+        fp16=args.fp16,
+        bnb_compute_dtype=args.bnb_compute_dtype,
+    )
+    cmd_train(train_ns)
+    if args.problem:
+        infer_ns = argparse.Namespace(
+            adapter=Path(args.output_dir),
+            base_model=args.model,
+            problem=args.problem,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            greedy=args.greedy,
+            bnb_compute_dtype=args.bnb_compute_dtype,
+        )
+        cmd_infer(infer_ns)
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(description="GSM8K SFT pipeline (prepare / train / infer / all)")
+    sub = p.add_subparsers(dest="command", required=True)
+    pr = sub.add_parser("prepare", help="Run convert_gsm8k_to_sft.py")
+    pr.add_argument("--output", type=str, default=str(ROOT / "data" / "sft" / "gsm8k_sft.jsonl"))
+    pr.add_argument("--source", choices=("hf", "jsonl"), default="hf")
+    pr.add_argument("--input", type=str, help="JSONL path for --source jsonl")
+    pr.add_argument("--splits", nargs="+", default=["train", "test"])
+    pr.add_argument(
+        "--strip-scratchpads",
+        action="store_true",
+        help="Remove GSM8K <<...>> traces from assistant text after conversion.",
+    )
+    pr.set_defaults(func=cmd_prepare)
+    tr = sub.add_parser("train", help="QLoRA SFT on JSONL with messages field")
+    tr.add_argument("--data", type=str, required=True, help="JSONL from prepare step")
+    tr.add_argument("--output-dir", type=str, required=True)
+    tr.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+    tr.add_argument("--epochs", type=float, default=1.0)
+    tr.add_argument("--batch-size", type=int, default=1)
+    tr.add_argument("--grad-accum", type=int, default=8)
+    tr.add_argument("--learning-rate", type=float, default=2e-4)
+    tr.add_argument("--max-samples", type=int, default=0, help="0 = use full dataset")
+    tr.add_argument("--lora-rank", type=int, default=16)
+    tr.add_argument("--lora-alpha", type=int, default=32)
+    tr.add_argument("--lora-dropout", type=float, default=0.05)
+    tr.add_argument(
+        "--target-modules",
+        type=str,
+        default="q_proj,v_proj,o_proj,gate_proj",
+    )
+    tr.add_argument("--max-seq-length", type=int, default=2048)
+    tr.add_argument("--save-steps", type=int, default=200)
+    tr.add_argument("--logging-steps", type=int, default=10)
+    tr.add_argument(
+        "--warmup-ratio",
+        type=float,
+        default=0.03,
+        help="Used only if --warmup-steps is not set; converted to warmup_steps.",
+    )
+    tr.add_argument(
+        "--warmup-steps",
+        type=int,
+        default=None,
+        help="LR warmup steps; if set, overrides --warmup-ratio.",
+    )
+    tr.add_argument("--bf16", action="store_true", default=True)
+    tr.add_argument("--no-bf16", dest="bf16", action="store_false")
+    tr.add_argument("--fp16", action="store_true")
+    tr.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    tr.set_defaults(func=cmd_train)
+    inf = sub.add_parser("infer", help="Generate with saved adapter")
+    inf.add_argument("--adapter", type=str, required=True, help="Directory from train step")
+    inf.add_argument(
+        "--base-model",
+        type=str,
+        default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+        help="Must match base used in training if no pipeline_meta.json",
+    )
+    inf.add_argument("--problem", type=str, required=True)
+    inf.add_argument("--max-new-tokens", type=int, default=1024)
+    inf.add_argument("--temperature", type=float, default=0.7)
+    inf.add_argument("--top-p", type=float, default=0.95)
+    inf.add_argument("--greedy", action="store_true")
+    inf.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    inf.set_defaults(func=cmd_infer)
+    al = sub.add_parser("all", help="prepare + train [+ infer if --problem]")
+    al.add_argument("--data", type=str, default=None, help="Output JSONL path (default data/sft/gsm8k_sft.jsonl)")
+    al.add_argument("--prepare-source", choices=("hf", "jsonl"), default="hf")
+    al.add_argument("--input", type=str, help="For jsonl prepare")
+    al.add_argument("--splits", nargs="+", default=["train", "test"])
+    al.add_argument("--strip-scratchpads", action="store_true")
+    al.add_argument("--output-dir", type=str, required=True)
+    al.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+    al.add_argument("--epochs", type=float, default=1.0)
+    al.add_argument("--batch-size", type=int, default=1)
+    al.add_argument("--grad-accum", type=int, default=8)
+    al.add_argument("--learning-rate", type=float, default=2e-4)
+    al.add_argument("--max-samples", type=int, default=0)
+    al.add_argument("--lora-rank", type=int, default=16)
+    al.add_argument("--lora-alpha", type=int, default=32)
+    al.add_argument("--lora-dropout", type=float, default=0.05)
+    al.add_argument("--target-modules", type=str, default="q_proj,v_proj,o_proj,gate_proj")
+    al.add_argument("--max-seq-length", type=int, default=2048)
+    al.add_argument("--save-steps", type=int, default=200)
+    al.add_argument("--logging-steps", type=int, default=10)
+    al.add_argument(
+        "--warmup-ratio",
+        type=float,
+        default=0.03,
+        help="Used only if --warmup-steps is not set; converted to warmup_steps.",
+    )
+    al.add_argument(
+        "--warmup-steps",
+        type=int,
+        default=None,
+        help="LR warmup steps; if set, overrides --warmup-ratio.",
+    )
+    al.add_argument("--bf16", action="store_true", default=True)
+    al.add_argument("--no-bf16", dest="bf16", action="store_false")
+    al.add_argument("--fp16", action="store_true")
+    al.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+    al.add_argument("--problem", type=str, default="", help="If set, run infer after train")
+    al.add_argument("--max-new-tokens", type=int, default=1024)
+    al.add_argument("--temperature", type=float, default=0.7)
+    al.add_argument("--top-p", type=float, default=0.95)
+    al.add_argument("--greedy", action="store_true")
+    al.set_defaults(func=cmd_all)
+    return p
+def main() -> None:
+    parser = build_parser()
+    args = parser.parse_args()
+    if str(ROOT) not in sys.path:
+        sys.path.insert(0, str(ROOT))
+    args.func(args)
+if __name__ == "__main__":
+    main()

scripts/launch_grpo.sh ADDED Viewed

	@@ -0,0 +1,127 @@

+set -euo pipefail
+# ── Flash-Attention 2 install (if missing) ────────────────────────────────────
+# flash-attn requires (torch version, CUDA version, Python version) alignment.
+# MAX_JOBS caps parallel compilation; prebuilt wheel installs in <30 s.
+# In the prior run (grpo_20260425_151304), flash-attn was absent → SDPA fallback
+# → iter times of 262-330 s once question-gen started (vs ~150 s with Flash).
+if ! python -c "import flash_attn; assert int(flash_attn.__version__.split('.')[0]) >= 2" 2>/dev/null; then
+    echo "[launch] flash-attn not found or < v2 — installing now …"
+    MAX_JOBS=4 pip install flash-attn --no-build-isolation -q
+    echo "[launch] flash-attn installed."
+else
+    FLASH_VER=$(python -c "import flash_attn; print(flash_attn.__version__)" 2>/dev/null)
+    echo "[launch] flash-attn ${FLASH_VER} already installed — skipping install."
+fi
+# ── GPU / allocator ───────────────────────────────────────────────────────────
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+# expandable_segments: recovers 2-4 GB fragmented VRAM during long Flash+HF runs
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+# ── CPU / threading ───────────────────────────────────────────────────────────
+export OMP_NUM_THREADS=${OMP_NUM_THREADS:-8}
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-8}
+export TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM:-false}
+# ── Triton / Flash-Attn compilation cache ─────────────────────────────────────
+# Persists JIT kernels across runs — avoids ~30 s recompile each launch.
+export TRITON_CACHE_DIR=${TRITON_CACHE_DIR:-/tmp/triton_cache}
+export FLASH_ATTENTION_SKIP_CUDA_BUILD=${FLASH_ATTENTION_SKIP_CUDA_BUILD:-FALSE}
+# ── HuggingFace hub robustness ────────────────────────────────────────────────
+export HF_HUB_DISABLE_XET=${HF_HUB_DISABLE_XET:-1}
+export HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER:-0}
+export TRANSFORMERS_VERBOSITY=${TRANSFORMERS_VERBOSITY:-warning}
+# ── Python path ───────────────────────────────────────────────────────────────
+export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
+# ── Pre-flight: GPU info ───────────────────────────────────────────────────────
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "─── nvidia-smi ───────────────────────────────────────────────────"
+    nvidia-smi --query-gpu=name,memory.total,memory.free,driver_version \
+               --format=csv,noheader || true
+    echo "──────────────────────────────────────────────────────────────────"
+fi
+# ── Confirm attention backend ─────────────────────────────────────────────────
+python - <<'PYEOF'
+import sys; sys.path.insert(0, '.')
+from src.utils.attn_backend import select_attn_implementation
+impl = select_attn_implementation()
+tag = {
+    "flash_attention_2": "FAST   — Flash-Attn 2 active (O(T) memory, ~1.5-2× faster)",
+    "sdpa":              "OK     — SDPA active (install flash-attn for ~2× speedup)",
+    "eager":             "SLOW   — Eager fallback (install flash-attn for best speed)",
+}.get(impl, impl)
+print(f"[launch] attn_backend = {tag}")
+PYEOF
+# ── Log tee ───────────────────────────────────────────────────────────────────
+RUN_NAME="grpo_$(date +%Y%m%d_%H%M%S)"
+LOG_DIR="logs/grpo"
+mkdir -p "$LOG_DIR"
+LOG_FILE="$LOG_DIR/${RUN_NAME}.log"
+echo "[launch] run_name       = $RUN_NAME"
+echo "[launch] base_model     = checkpoints/dual_task_v1"
+echo "[launch] train_data     = data/sft/gsm8k_sft.jsonl + data/math/math_numeric.jsonl"
+echo "[launch] eval_data      = data/sft/gsm8k_test.jsonl"
+echo "[launch] log_file       = $LOG_FILE"
+echo "[launch] architecture   = Two-phase self-play (K_q=2, K=10, N=20)"
+echo "[launch] fixes_applied  = min-warmup↑12, selfplay-gt-thresh↑0.65, kl-coef↑0.06,"
+echo "[launch]                  math-ramp-start↑18, group-size↑10, num-iters↑60"
+echo "[launch] wall-time      ≈ 3.3 h (Flash active) / 4.5 h (SDPA fallback)"
+# ── Train ─────────────────────────────────────────────────────────────────────
+python -u scripts/run_grpo_training.py \
+    --base-model            checkpoints/dual_task_v1 \
+    --output-dir            checkpoints/grpo \
+    --gsm8k-data            data/sft/gsm8k_sft.jsonl \
+    --eval-data-path        data/sft/gsm8k_test.jsonl \
+    \
+    --num-iterations        60 \
+    --group-size            10 \
+    --q-group-size          2 \
+    --questions-per-iter    20 \
+    \
+    --learning-rate         5e-6 \
+    --max-new-tokens        1000 \
+    --temperature           0.8 \
+    --max-grad-norm         0.5 \
+    --clip-eps              0.2 \
+    --kl-coef               0.06 \
+    --warmup-iters          8 \
+    --min-lr-ratio          0.1 \
+    \
+    --difficulty-alpha      3.5 \
+    --self-play-ratio       0.70 \
+    \
+    --math-mix-ratio        0.30 \
+    --math-mix-ratio-late   0.50 \
+    --math-ramp-start       18 \
+    --math-max-difficulty   3 \
+    \
+    --overlong-filter \
+    --min-warmup            12 \
+    --selfplay-gt-thresh    0.65 \
+    --selfplay-grounded-thresh 0.65 \
+    --selfplay-step-thresh  0.68 \
+    --selfplay-ramp-iters   28 \
+    --grounded-floor        0.55 \
+    \
+    --extractor-model       Qwen/Qwen2.5-0.5B-Instruct \
+    --extraction-cache      data/extraction_cache.json \
+    \
+    --eval-every            5 \
+    --eval-max-samples      150 \
+    --eval-max-new-tokens   1000 \
+    --eval-pass-at-k        0 \
+    --save-every            5 \
+    --keep-last             4 \
+    \
+    --use-prm \
+    --prm-model             Qwen/Qwen2.5-Math-PRM-7B \
+    --run-name              "$RUN_NAME" \
+    "$@" 2>&1 | tee "$LOG_FILE"

scripts/plot_grpo_run.py ADDED Viewed

	@@ -0,0 +1,425 @@

+#!/usr/bin/env python3
+"""
+Generate demo-quality plots from a completed (or in-progress) GRPO run.
+Usage
+-----
+    # from the run output directory
+    python scripts/plot_grpo_run.py checkpoints/grpo/<run_name>/metrics.jsonl
+    # auto-discover the latest run
+    python scripts/plot_grpo_run.py --latest
+    # custom output directory
+    python scripts/plot_grpo_run.py metrics.jsonl --out-dir plots/my_run
+Output
+------
+Six PNG files saved next to the JSONL (or --out-dir if given):
+  01_training_objective.png   – combined_score vs iteration (PRIMARY demo plot)
+  02_reward_components.png    – 4-panel breakdown: correct / PRM / SymPy / format
+  03_training_dynamics.png    – GRPO loss + batch reward + batch accuracy
+  04_reward_vs_eval.png       – training reward vs eval score on same axis
+  05_component_area.png       – stacked-area chart of the 4 weighted components
+  06_summary_card.png         – single-panel card: all key metrics in one view
+All figures use a clean dark-on-white academic style.  They are saved at
+300 dpi so they look sharp in slides and posters.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import matplotlib
+matplotlib.use("Agg")   # headless — no display needed on training servers
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import numpy as np
+# ── Style ────────────────────────────────────────────────────────────────────
+PALETTE = {
+    "combined":  "#2563EB",   # blue  — training objective
+    "correct":   "#16A34A",   # green — correctness
+    "prm":       "#DC2626",   # red   — PRM step quality
+    "sympy":     "#D97706",   # amber — SymPy verification
+    "fmt":       "#7C3AED",   # violet — format
+    "reward":    "#0891B2",   # cyan  — mean batch reward
+    "loss":      "#64748B",   # slate — loss
+    "batch_acc": "#059669",   # emerald — batch accuracy
+}
+plt.rcParams.update({
+    "figure.dpi":          150,
+    "savefig.dpi":         300,
+    "font.family":         "DejaVu Sans",
+    "axes.spines.top":     False,
+    "axes.spines.right":   False,
+    "axes.grid":           True,
+    "grid.alpha":          0.3,
+    "grid.linestyle":      "--",
+    "axes.labelsize":      11,
+    "axes.titlesize":      13,
+    "legend.fontsize":     9,
+    "xtick.labelsize":     9,
+    "ytick.labelsize":     9,
+})
+# ── Data loading ─────────────────────────────────────────────────────────────
+def _load(path: Path) -> List[Dict[str, Any]]:
+    rows = []
+    with path.open(encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if line:
+                rows.append(json.loads(line))
+    return rows
+def _field(rows: List[Dict], key: str) -> Tuple[List[int], List[float]]:
+    """Return (iterations, values) for rows that have a non-empty key."""
+    iters, vals = [], []
+    for r in rows:
+        v = r.get(key)
+        if v is not None and v != "" and not (isinstance(v, float) and np.isnan(v)):
+            try:
+                iters.append(int(r["iteration"]))
+                vals.append(float(v))
+            except (TypeError, ValueError):
+                pass
+    return iters, vals
+# ── Individual plots ─────────────────────────────────────────────────────────
+def plot_training_objective(rows: List[Dict], out: Path) -> None:
+    """Plot 01: combined_score — the single most important demo plot."""
+    xi, xv = _field(rows, "combined_score")
+    if not xi:
+        return
+    fig, ax = plt.subplots(figsize=(9, 5))
+    ax.plot(xi, xv, color=PALETTE["combined"], linewidth=2.5,
+            marker="o", markersize=5, label="Training-objective score")
+    ax.fill_between(xi, xv, alpha=0.12, color=PALETTE["combined"])
+    # annotate first and last eval points
+    ax.annotate(f"{xv[0]:.3f}", (xi[0], xv[0]), textcoords="offset points",
+                xytext=(8, 6), fontsize=8, color=PALETTE["combined"])
+    ax.annotate(f"{xv[-1]:.3f}", (xi[-1], xv[-1]), textcoords="offset points",
+                xytext=(8, 6), fontsize=8, color=PALETTE["combined"])
+    ax.set_xlabel("Iteration")
+    ax.set_ylabel("Score  (0 – 1)")
+    ax.set_title(
+        "GRPO Training — Combined Reward Score\n"
+        "0.60 × correct + 0.15 × PRM + 0.15 × SymPy + 0.10 × format",
+        fontsize=12,
+    )
+    ax.set_ylim(0, 1.05)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+    ax.legend(loc="lower right")
+    fig.tight_layout()
+    fig.savefig(out)
+    plt.close(fig)
+    print(f"  saved {out.name}")
+def plot_reward_components(rows: List[Dict], out: Path) -> None:
+    """Plot 02: four-panel breakdown of each reward component."""
+    specs = [
+        ("correct_rate",   "correct",  "Correctness (gt_match)",           "60 %"),
+        ("prm_mean",       "prm",      "PRM Step Quality",                  "15 %"),
+        ("sympy_mean",     "sympy",    "SymPy Verification",                "15 %"),
+        ("format_mean",    "fmt",      "Format Compliance",                 "10 %"),
+    ]
+    fig, axes = plt.subplots(2, 2, figsize=(12, 7), sharex=False)
+    axes = axes.flatten()
+    for ax, (key, pal, title, weight) in zip(axes, specs):
+        xi, xv = _field(rows, key)
+        if not xi:
+            ax.set_visible(False)
+            continue
+        ax.plot(xi, xv, color=PALETTE[pal], linewidth=2,
+                marker="o", markersize=4)
+        ax.fill_between(xi, xv, alpha=0.12, color=PALETTE[pal])
+        ax.set_title(f"{title}  (weight {weight})", fontsize=11)
+        ax.set_xlabel("Iteration")
+        ax.set_ylabel("Score")
+        ax.set_ylim(0, 1.05)
+        ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+        if xv:
+            delta = xv[-1] - xv[0]
+            sign = "+" if delta >= 0 else ""
+            ax.set_title(
+                f"{title}  (weight {weight})  Δ={sign}{delta:+.1%}",
+                fontsize=10,
+            )
+    fig.suptitle("Reward Component Breakdown over Training", fontsize=13, y=1.01)
+    fig.tight_layout()
+    fig.savefig(out, bbox_inches="tight")
+    plt.close(fig)
+    print(f"  saved {out.name}")
+def plot_training_dynamics(rows: List[Dict], out: Path) -> None:
+    """Plot 03: loss, mean_reward, batch_accuracy over all iterations."""
+    li, lv = _field(rows, "loss")
+    ri, rv = _field(rows, "mean_reward")
+    bi, bv = _field(rows, "batch_accuracy")
+    fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
+    if lv:
+        axes[0].plot(li, lv, color=PALETTE["loss"], linewidth=1.8)
+        axes[0].fill_between(li, lv, alpha=0.1, color=PALETTE["loss"])
+        axes[0].set_ylabel("GRPO Loss")
+        axes[0].set_title("Training Loss", fontsize=11)
+        axes[0].axhline(0, color="black", linewidth=0.8, linestyle="--", alpha=0.4)
+    if rv:
+        axes[1].plot(ri, rv, color=PALETTE["reward"], linewidth=1.8)
+        axes[1].fill_between(ri, rv, alpha=0.1, color=PALETTE["reward"])
+        axes[1].set_ylabel("Reward")
+        axes[1].set_ylim(0, 1.05)
+        axes[1].set_title("Mean Batch Reward", fontsize=11)
+        axes[1].yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+    if bv:
+        axes[2].plot(bi, bv, color=PALETTE["batch_acc"], linewidth=1.8)
+        axes[2].fill_between(bi, bv, alpha=0.1, color=PALETTE["batch_acc"])
+        axes[2].set_ylabel("Accuracy")
+        axes[2].set_ylim(0, 1.05)
+        axes[2].set_title("Batch Accuracy (training rollouts)", fontsize=11)
+        axes[2].yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+    for ax in axes:
+        ax.set_xlabel("Iteration")
+    fig.suptitle("GRPO Training Dynamics", fontsize=13)
+    fig.tight_layout()
+    fig.savefig(out)
+    plt.close(fig)
+    print(f"  saved {out.name}")
+def plot_reward_vs_eval(rows: List[Dict], out: Path) -> None:
+    """Plot 04: mean_reward (all iters) + combined_score (eval iters) overlaid."""
+    ri, rv = _field(rows, "mean_reward")
+    ei, ev = _field(rows, "combined_score")
+    fig, ax = plt.subplots(figsize=(10, 5))
+    if rv:
+        ax.plot(ri, rv, color=PALETTE["reward"], linewidth=1.4, alpha=0.7,
+                label="Batch reward (training)")
+        ax.fill_between(ri, rv, alpha=0.06, color=PALETTE["reward"])
+    if ev:
+        ax.plot(ei, ev, color=PALETTE["combined"], linewidth=2.5,
+                marker="D", markersize=6, label="Eval score (held-out GSM8K)")
+        for x, y in zip(ei, ev):
+            ax.annotate(f"{y:.3f}", (x, y), textcoords="offset points",
+                        xytext=(0, 8), ha="center", fontsize=7,
+                        color=PALETTE["combined"])
+    ax.set_xlabel("Iteration")
+    ax.set_ylabel("Score  (0 – 1)")
+    ax.set_ylim(0, 1.05)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+    ax.set_title("Training Reward vs Held-Out Eval Score", fontsize=12)
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(out)
+    plt.close(fig)
+    print(f"  saved {out.name}")
+def plot_component_area(rows: List[Dict], out: Path) -> None:
+    """Plot 05: stacked-area of the four WEIGHTED components summing to combined_score."""
+    ei, ev_combined = _field(rows, "combined_score")
+    if not ei:
+        return
+    # Build per-component weighted series aligned to eval iterations
+    iter_set = set(ei)
+    aligned: Dict[str, List[float]] = {k: [] for k in ("correct", "prm", "sympy", "fmt")}
+    weights = {"correct": 0.60, "prm": 0.15, "sympy": 0.15, "fmt": 0.10}
+    keys    = {"correct": "correct_rate", "prm": "prm_mean",
+               "sympy": "sympy_mean",     "fmt": "format_mean"}
+    # Build lookup per iteration
+    it_map: Dict[int, Dict] = {r["iteration"]: r for r in rows if r["iteration"] in iter_set}
+    iters_sorted = sorted(iter_set)
+    for it in iters_sorted:
+        row = it_map.get(it, {})
+        for comp, field in keys.items():
+            v = row.get(field)
+            if v is not None and v != "":
+                aligned[comp].append(float(v) * weights[comp])
+            else:
+                aligned[comp].append(0.0)
+    x = np.array(iters_sorted)
+    arr = np.array([aligned["correct"], aligned["prm"],
+                    aligned["sympy"],   aligned["fmt"]])
+    fig, ax = plt.subplots(figsize=(10, 5))
+    labels  = ["Correct (×0.60)", "PRM (×0.15)", "SymPy (×0.15)", "Format (×0.10)"]
+    colors  = [PALETTE[k] for k in ("correct", "prm", "sympy", "fmt")]
+    ax.stackplot(x, arr, labels=labels, colors=colors, alpha=0.75)
+    ax.plot(x, ev_combined, color="black", linewidth=1.5,
+            linestyle="--", label="Combined score", zorder=5)
+    ax.set_xlabel("Iteration")
+    ax.set_ylabel("Weighted contribution to score")
+    ax.set_ylim(0, 1.0)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+    ax.set_title("Contribution of Each Reward Component (Stacked)", fontsize=12)
+    ax.legend(loc="lower right", ncol=2)
+    fig.tight_layout()
+    fig.savefig(out)
+    plt.close(fig)
+    print(f"  saved {out.name}")
+def plot_summary_card(rows: List[Dict], run_name: str, out: Path) -> None:
+    """Plot 06: all key metrics on a single clean card — ideal for poster / slide."""
+    ei, ev = _field(rows, "combined_score")
+    _, crv  = _field(rows, "correct_rate")
+    _, prmv = _field(rows, "prm_mean")
+    _, syv  = _field(rows, "sympy_mean")
+    _, fmv  = _field(rows, "format_mean")
+    _, lv   = _field(rows, "loss")
+    _, rv   = _field(rows, "mean_reward")
+    li      = _field(rows, "loss")[0]
+    ri      = _field(rows, "mean_reward")[0]
+    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
+    axes = axes.flatten()
+    def _panel(ax, iters, vals, color, title, pct=True):
+        if not iters:
+            ax.set_visible(False)
+            return
+        ax.plot(iters, vals, color=color, linewidth=2, marker="o", markersize=4)
+        ax.fill_between(iters, vals, alpha=0.12, color=color)
+        ax.set_title(title, fontsize=11, fontweight="bold")
+        ax.set_xlabel("Iteration", fontsize=9)
+        if pct:
+            ax.set_ylim(0, 1.05)
+            ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+        if vals:
+            ax.annotate(f"{vals[-1]:.3f}", (iters[-1], vals[-1]),
+                        textcoords="offset points", xytext=(6, 4),
+                        fontsize=8, color=color)
+    _panel(axes[0], ei,  ev,   PALETTE["combined"],  "Training-Objective Score")
+    _panel(axes[1], ei,  crv,  PALETTE["correct"],   "Correctness Rate")
+    _panel(axes[2], ei,  prmv, PALETTE["prm"],       "PRM Step Quality")
+    _panel(axes[3], ei,  syv,  PALETTE["sympy"],     "SymPy Verification")
+    _panel(axes[4], ei,  fmv,  PALETTE["fmt"],       "Format Compliance")
+    _panel(axes[5], li,  lv,   PALETTE["loss"],      "GRPO Loss", pct=False)
+    fig.suptitle(f"GRPO Training Summary — {run_name}", fontsize=14, fontweight="bold")
+    fig.tight_layout()
+    fig.savefig(out, bbox_inches="tight")
+    plt.close(fig)
+    print(f"  saved {out.name}")
+# ── CLI ──────────────────────────────────────────────────────────────────────
+def find_latest_metrics() -> Optional[Path]:
+    """Find the most recently modified metrics.jsonl under checkpoints/grpo/."""
+    ckpt = Path("checkpoints/grpo")
+    if not ckpt.exists():
+        return None
+    candidates = sorted(
+        ckpt.rglob("metrics.jsonl"),
+        key=lambda p: p.stat().st_mtime,
+    )
+    return candidates[-1] if candidates else None
+def generate_plots(metrics_path: Path, out_dir: Optional[Path] = None) -> Path:
+    """Generate all six plots and return the output directory."""
+    rows = _load(metrics_path)
+    if not rows:
+        print(f"[plot] No data in {metrics_path}", file=sys.stderr)
+        return metrics_path.parent
+    out_dir = out_dir or metrics_path.parent / "plots"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    # Derive run name from the directory name two levels up
+    run_name = metrics_path.parent.name
+    print(f"[plot] Generating plots for run '{run_name}'  ({len(rows)} iterations)")
+    print(f"[plot] Output → {out_dir}")
+    plot_training_objective(rows, out_dir / "01_training_objective.png")
+    plot_reward_components(rows,  out_dir / "02_reward_components.png")
+    plot_training_dynamics(rows,  out_dir / "03_training_dynamics.png")
+    plot_reward_vs_eval(rows,     out_dir / "04_reward_vs_eval.png")
+    plot_component_area(rows,     out_dir / "05_component_area.png")
+    plot_summary_card(rows, run_name, out_dir / "06_summary_card.png")
+    print(f"[plot] Done — {len(list(out_dir.glob('*.png')))} PNGs in {out_dir}")
+    return out_dir
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Generate demo plots from a GRPO metrics.jsonl file."
+    )
+    parser.add_argument(
+        "metrics_jsonl", nargs="?", type=Path, default=None,
+        help="Path to metrics.jsonl produced by run_grpo_training.py",
+    )
+    parser.add_argument(
+        "--latest", action="store_true",
+        help="Auto-discover the most recent metrics.jsonl under checkpoints/grpo/",
+    )
+    parser.add_argument(
+        "--out-dir", type=Path, default=None,
+        help="Directory to write PNG files (default: <metrics_dir>/plots/)",
+    )
+    args = parser.parse_args()
+    if args.latest:
+        path = find_latest_metrics()
+        if path is None:
+            print("No metrics.jsonl found under checkpoints/grpo/", file=sys.stderr)
+            sys.exit(1)
+        print(f"[plot] Auto-selected {path}")
+    elif args.metrics_jsonl:
+        path = args.metrics_jsonl
+    else:
+        parser.print_help()
+        sys.exit(1)
+    if not path.exists():
+        print(f"File not found: {path}", file=sys.stderr)
+        sys.exit(1)
+    generate_plots(path, args.out_dir)
+if __name__ == "__main__":
+    main()

scripts/plot_training_results.py ADDED Viewed

	@@ -0,0 +1,521 @@

+#!/usr/bin/env python3
+"""
+AxiomForgeAI — Training Results Plots
+======================================
+Reads the metrics CSV from a GRPO training run and generates five focused plots
+that tell the story of what improved, how self-play was earned, and why step-level
+reasoning quality matters as much as final-answer accuracy.
+All plots are saved to images/ as high-resolution PNGs.
+Usage
+-----
+  python scripts/plot_training_results.py
+  python scripts/plot_training_results.py --metrics logs/grpo/grpo_20260426_032827/metrics.csv
+  python scripts/plot_training_results.py --out images/
+"""
+from __future__ import annotations
+import argparse
+import csv
+from pathlib import Path
+from typing import Dict, List
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+# ── Style ──────────────────────────────────────────────────────────────────────
+PALETTE = {
+    "indigo":    "#6366f1",
+    "pink":      "#ec4899",
+    "cyan":      "#06b6d4",
+    "amber":     "#f59e0b",
+    "emerald":   "#10b981",
+    "slate":     "#94a3b8",
+    "red":       "#ef4444",
+    "violet":    "#8b5cf6",
+    "white":     "#f8fafc",
+    "bg":        "#0f172a",
+    "bg2":       "#1e293b",
+    "gridline":  "#1e293b",
+}
+plt.rcParams.update({
+    "figure.facecolor":  PALETTE["bg"],
+    "axes.facecolor":    PALETTE["bg"],
+    "axes.edgecolor":    PALETTE["slate"],
+    "axes.labelcolor":   PALETTE["white"],
+    "axes.titlecolor":   PALETTE["white"],
+    "axes.titlesize":    13,
+    "axes.labelsize":    11,
+    "axes.grid":         True,
+    "grid.color":        "#1e293b",
+    "grid.linewidth":    0.8,
+    "xtick.color":       PALETTE["slate"],
+    "ytick.color":       PALETTE["slate"],
+    "xtick.labelsize":   9,
+    "ytick.labelsize":   9,
+    "legend.facecolor":  "#1e293b",
+    "legend.edgecolor":  PALETTE["slate"],
+    "legend.labelcolor": PALETTE["white"],
+    "legend.fontsize":   9,
+    "text.color":        PALETTE["white"],
+    "font.family":       "sans-serif",
+    "lines.linewidth":   2.0,
+})
+PHASE_COLORS = {
+    "GROUNDED_ONLY":  ("#6366f120", "#6366f1"),
+    "SELFPLAY_RAMP":  ("#10b98120", "#10b981"),
+}
+DPI = 160
+IMAGES_DIR = Path("images")
+DEFAULT_METRICS = (
+    "logs/grpo/grpo_20260426_032827/metrics.csv"
+)
+# ── Helpers ────────────────────────────────────────────────────────────────────
+def load_csv(path: str) -> List[Dict]:
+    rows = []
+    with open(path, encoding="utf-8") as f:
+        for r in csv.DictReader(f):
+            rows.append({k: v for k, v in r.items()})
+    return rows
+def f(row: Dict, key: str, default: float = float("nan")) -> float:
+    v = row.get(key, "")
+    try:
+        return float(v) if v != "" else default
+    except (ValueError, TypeError):
+        return default
+def moving_avg(values: List[float], w: int = 3) -> List[float]:
+    result = []
+    for i in range(len(values)):
+        lo = max(0, i - w + 1)
+        chunk = [v for v in values[lo : i + 1] if not np.isnan(v)]
+        result.append(float(np.mean(chunk)) if chunk else float("nan"))
+    return result
+def shade_phases(ax, iters, phases):
+    """Draw translucent background rectangles for each training phase."""
+    prev_phase, start = None, iters[0]
+    for it, ph in zip(iters, phases):
+        if ph != prev_phase:
+            if prev_phase is not None:
+                bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
+                ax.axvspan(start - 0.5, it - 0.5, facecolor=bg, linewidth=0, zorder=0)
+            prev_phase, start = ph, it
+    if prev_phase is not None:
+        bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
+        ax.axvspan(start - 0.5, iters[-1] + 0.5, facecolor=bg, linewidth=0, zorder=0)
+def phase_legend_patches(phases):
+    seen = []
+    patches = []
+    for ph in phases:
+        if ph not in seen:
+            seen.append(ph)
+            _, edge = PHASE_COLORS.get(ph, ("#ffffff10", "#ffffff"))
+            label = ph.replace("_", " ").title()
+            patches.append(mpatches.Patch(facecolor=edge + "40", edgecolor=edge,
+                                          linewidth=1.2, label=label))
+    return patches
+def annotate_transition(ax, x_iter, label, ypos=0.97, color="#94a3b8"):
+    ax.axvline(x=x_iter - 0.5, color=color, linewidth=1, linestyle="--", alpha=0.7)
+    ax.text(x_iter, ypos, label, transform=ax.get_xaxis_transform(),
+            fontsize=7.5, color=color, ha="left", va="top",
+            bbox=dict(facecolor=PALETTE["bg2"], edgecolor="none", pad=2))
+def save(fig: plt.Figure, name: str, out: Path):
+    out.mkdir(parents=True, exist_ok=True)
+    path = out / name
+    fig.savefig(path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
+    print(f"  ✓  {path}")
+    plt.close(fig)
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 1 — Hero: Reasoning quality at evaluation checkpoints
+# Shows four signals together: GSM8K accuracy, combined score, step accuracy,
+# and LCCP.  The message: the model doesn't just get more answers right —
+# every step of the reasoning chain gets better.
+# ══════════════════════════════════════════════════════════════════════════════
+def plot_eval_quality(rows: List[Dict], out: Path):
+    eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
+    iters     = [int(r["iteration"]) for r in eval_rows]
+    gsm8k_acc  = [f(r, "eval_correct_rt") * 100 for r in eval_rows]
+    combined   = [f(r, "eval_combined") * 100    for r in eval_rows]
+    step_acc   = [f(r, "eval_step_acc") * 100    for r in eval_rows]
+    lccp       = [f(r, "eval_lccp") * 100        for r in eval_rows]
+    prm        = [f(r, "eval_prm") * 100         for r in eval_rows]
+    fig, ax = plt.subplots(figsize=(9, 5))
+    fig.suptitle("Evaluation Quality Over Training — AxiomForgeAI",
+                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+    # --- lines
+    ax.plot(iters, gsm8k_acc, "o-",  color=PALETTE["pink"],    label="GSM8K Accuracy (final answer)", ms=7, zorder=5)
+    ax.plot(iters, combined,  "s-",  color=PALETTE["indigo"],  label="Combined Score",                 ms=6, zorder=5)
+    ax.plot(iters, step_acc,  "^-",  color=PALETTE["cyan"],    label="Step Accuracy (reasoning chain)", ms=6, zorder=5)
+    ax.plot(iters, lccp,      "D-",  color=PALETTE["emerald"], label="LCCP (chain integrity)",          ms=6, zorder=5)
+    ax.plot(iters, prm,       "v--", color=PALETTE["amber"],   label="PRM Mean Score",                  ms=5, alpha=0.8, zorder=4)
+    # annotate best GSM8K
+    best_gsm = max(gsm8k_acc)
+    bi = gsm8k_acc.index(best_gsm)
+    ax.annotate(f"  {best_gsm:.1f}%",
+                xy=(iters[bi], best_gsm), fontsize=9, color=PALETTE["pink"],
+                va="bottom", ha="left")
+    # annotate best combined
+    best_c = max(combined)
+    bci = combined.index(best_c)
+    ax.annotate(f"  {best_c:.1f}",
+                xy=(iters[bci], best_c), fontsize=9, color=PALETTE["indigo"],
+                va="top", ha="left")
+    ax.set_xlabel("Training Iteration")
+    ax.set_ylabel("Score (%)")
+    ax.set_xticks(iters)
+    ax.set_ylim(78, 96)
+    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax.legend(loc="lower right", framealpha=0.8)
+    ax.set_title(
+        "Four angles on quality — answer correctness, holistic score, per-step reasoning, and chain integrity",
+        fontsize=9, color=PALETTE["slate"], pad=6,
+    )
+    fig.tight_layout()
+    save(fig, "plot1_eval_quality.png", out)
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 2 — Training Journey: full 30-iteration timeline with phase shading
+# Shows mean reward, GT match rate, and step accuracy over every iteration.
+# Phase backgrounds show when self-play unlocked and the curriculum ramped.
+# ══════════════════════════════════════════════════════════════════════════════
+def plot_training_journey(rows: List[Dict], out: Path):
+    iters      = [int(r["iteration"]) for r in rows]
+    phases     = [r["training_phase"] for r in rows]
+    mean_r     = [f(r, "mean_reward") * 100      for r in rows]
+    gt_match   = [f(r, "gt_match_rate") * 100    for r in rows]
+    step_acc   = [f(r, "step_accuracy") * 100    for r in rows]
+    batch_acc  = [f(r, "batch_accuracy") * 100   for r in rows]
+    ma_reward = moving_avg(mean_r,   w=4)
+    ma_gt     = moving_avg(gt_match, w=4)
+    ma_step   = moving_avg(step_acc, w=4)
+    fig, ax = plt.subplots(figsize=(11, 5))
+    shade_phases(ax, iters, phases)
+    # raw (faint)
+    ax.plot(iters, mean_r,   alpha=0.25, color=PALETTE["indigo"],  linewidth=1)
+    ax.plot(iters, gt_match, alpha=0.25, color=PALETTE["pink"],    linewidth=1)
+    ax.plot(iters, step_acc, alpha=0.25, color=PALETTE["cyan"],    linewidth=1)
+    # smoothed (bold)
+    ax.plot(iters, ma_reward, color=PALETTE["indigo"],  linewidth=2.5, label="Mean Reward (smooth)")
+    ax.plot(iters, ma_gt,     color=PALETTE["pink"],    linewidth=2.5, label="GT Match Rate (smooth)")
+    ax.plot(iters, ma_step,   color=PALETTE["cyan"],    linewidth=2.5, label="Step Accuracy (smooth)")
+    # self-play transition annotation
+    sp_start = next(i for i, p in enumerate(phases) if p == "SELFPLAY_RAMP")
+    annotate_transition(ax, iters[sp_start], "Self-play\nunlocked", ypos=0.98,
+                        color=PALETTE["emerald"])
+    ax.set_xlabel("Training Iteration")
+    ax.set_ylabel("Score (%)")
+    ax.set_ylim(55, 105)
+    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax.set_xticks(range(1, max(iters) + 1, 2))
+    ax.set_title("30-Iteration GRPO Training Timeline  |  Faint = raw  ·  Bold = 4-iter moving average",
+                 fontsize=9, color=PALETTE["slate"], pad=6)
+    fig.suptitle("Training Journey — Reward, GT Match & Step Accuracy",
+                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+    legend_patches = phase_legend_patches(phases)
+    h, l = ax.get_legend_handles_labels()
+    ax.legend(handles=h + legend_patches, loc="lower right", framealpha=0.8, ncol=2)
+    fig.tight_layout()
+    save(fig, "plot2_training_journey.png", out)
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 3 — Self-Play Success: the curriculum earning its right to generate
+# Shows the self-play ratio ramping up while question quality stays high.
+# The headline: by iteration 30 more than 60% of training is model-generated,
+# and those questions are 95-100% solvable and genuinely novel.
+# ══════════════════════════════════════════════════════════════════════════════
+def plot_selfplay_success(rows: List[Dict], out: Path):
+    sp_rows = [r for r in rows if f(r, "q_reward") > 0]
+    iters   = [int(r["iteration"]) for r in sp_rows]
+    sp_rat  = [f(r, "sp_ratio") * 100      for r in sp_rows]
+    q_sol   = [f(r, "q_solvability") * 100 for r in sp_rows]
+    q_nov   = [f(r, "q_novelty") * 100     for r in sp_rows]
+    q_rew   = [f(r, "q_reward") * 100      for r in sp_rows]
+    fig, ax1 = plt.subplots(figsize=(10, 5))
+    ax2 = ax1.twinx()
+    ax2.tick_params(axis="y", labelcolor=PALETTE["slate"])
+    ax2.spines["right"].set_color(PALETTE["slate"])
+    # self-play ramp (left axis)
+    ax1.fill_between(iters, sp_rat, alpha=0.18, color=PALETTE["emerald"])
+    ax1.plot(iters, sp_rat, "o-", color=PALETTE["emerald"], ms=6,
+             label="Self-play ratio", linewidth=2.5)
+    ax1.set_ylabel("Self-play share of training (%)", color=PALETTE["emerald"])
+    ax1.tick_params(axis="y", labelcolor=PALETTE["emerald"])
+    ax1.set_ylim(0, 80)
+    # question quality (right axis)
+    ax2.plot(iters, q_sol, "s--", color=PALETTE["cyan"],    ms=5, label="Solvability",   linewidth=1.8)
+    ax2.plot(iters, q_nov, "^--", color=PALETTE["amber"],   ms=5, label="Novelty",        linewidth=1.8)
+    ax2.plot(iters, q_rew, "D--", color=PALETTE["pink"],    ms=5, label="Q-Reward",       linewidth=1.8)
+    ax2.set_ylabel("Question quality score (%)", color=PALETTE["slate"])
+    ax2.set_ylim(0, 115)
+    # merge legends
+    h1, l1 = ax1.get_legend_handles_labels()
+    h2, l2 = ax2.get_legend_handles_labels()
+    ax1.legend(h1 + h2, l1 + l2, loc="upper left", framealpha=0.8)
+    ax1.set_xlabel("Training Iteration")
+    ax1.set_xticks(iters)
+    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    # annotate final sp ratio
+    ax1.annotate(f"  {sp_rat[-1]:.0f}% self-play\n  by iter {iters[-1]}",
+                 xy=(iters[-1], sp_rat[-1]), fontsize=9, color=PALETTE["emerald"],
+                 va="center", ha="left")
+    fig.suptitle("Self-Play Curriculum — The Model Earns Its Own Training Data",
+                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+    ax1.set_title(
+        "Self-play ratio ramps from 0 → 61%  ·  Generated questions stay 93-100% solvable throughout",
+        fontsize=9, color=PALETTE["slate"], pad=6,
+    )
+    fig.tight_layout()
+    save(fig, "plot3_selfplay_success.png", out)
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 4 — Reward Signal Tightening: mean ± std over 30 iterations
+# As the policy learns what "good" looks like, the spread between the best
+# and worst solutions in a group narrows.  Lower variance = more consistent
+# reasoning, not lucky guessing.
+# ══════════════════════════════════════════════���═══════════════════════════════
+def plot_reward_confidence(rows: List[Dict], out: Path):
+    iters   = [int(r["iteration"]) for r in rows]
+    phases  = [r["training_phase"]  for r in rows]
+    mean_r  = np.array([f(r, "mean_reward")  for r in rows])
+    std_r   = np.array([f(r, "std_reward")   for r in rows])
+    skipped = np.array([f(r, "skipped_groups", 0) for r in rows])
+    n_grps  = np.array([f(r, "n_groups", 1)        for r in rows])
+    skip_rt = skipped / np.maximum(n_grps, 1) * 100
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 7), sharex=True,
+                                    gridspec_kw={"height_ratios": [3, 1.2]})
+    fig.suptitle("Reward Confidence — Mean ± Std  &  Skipped Groups Over 30 Iterations",
+                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+    shade_phases(ax1, iters, phases)
+    ax1.fill_between(iters, (mean_r - std_r) * 100, (mean_r + std_r) * 100,
+                     alpha=0.20, color=PALETTE["indigo"])
+    ax1.plot(iters, mean_r * 100, color=PALETTE["indigo"], linewidth=2.5, label="Mean reward")
+    ax1.plot(iters, (mean_r - std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
+             alpha=0.6, label="±1 std")
+    ax1.plot(iters, (mean_r + std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
+             alpha=0.6)
+    # highlight the two tight-cluster peaks
+    for special_iter, label in [(11, "iter 11\nstd=0.098"), (22, "iter 22\nstd=0.124")]:
+        si = iters.index(special_iter)
+        ax1.annotate(label,
+                     xy=(special_iter, (mean_r[si] + std_r[si]) * 100),
+                     xytext=(special_iter + 1, (mean_r[si] + std_r[si]) * 100 + 2),
+                     fontsize=8, color=PALETTE["amber"],
+                     arrowprops=dict(arrowstyle="->", color=PALETTE["amber"], lw=1.2))
+    ax1.set_ylabel("Reward (%)")
+    ax1.set_ylim(55, 115)
+    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    h1, l1 = ax1.get_legend_handles_labels()
+    ax1.legend(handles=h1 + phase_legend_patches(phases), framealpha=0.8, ncol=3)
+    # skip-rate bar chart (bottom panel)
+    shade_phases(ax2, iters, phases)
+    ax2.bar(iters, skip_rt, color=PALETTE["red"], alpha=0.7, width=0.7, label="Skipped groups %")
+    ax2.set_ylabel("Skipped\ngroups (%)")
+    ax2.set_xlabel("Training Iteration")
+    ax2.set_ylim(0, 75)
+    ax2.set_xticks(range(1, max(iters) + 1, 2))
+    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax2.legend(loc="upper right", framealpha=0.8)
+    fig.tight_layout()
+    save(fig, "plot4_reward_confidence.png", out)
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 5 — Step-Level Reasoning Quality: train vs eval
+# Breaks down the two signals that measure HOW the model thinks (not just
+# whether it gets the final answer right): step accuracy and LCCP.
+# Train lines are noisy; eval lines show clean upward trends.
+# ══════════════════════════════════════════════════════════════════════════════
+def plot_reasoning_quality(rows: List[Dict], out: Path):
+    iters   = [int(r["iteration"]) for r in rows]
+    phases  = [r["training_phase"] for r in rows]
+    # training
+    t_step  = [f(r, "step_accuracy") * 100 for r in rows]
+    t_lccp  = [f(r, "lccp") * 100          for r in rows]
+    t_gt    = [f(r, "gt_match_rate") * 100 for r in rows]
+    # eval (only at checkpoint iters)
+    eval_rows  = [r for r in rows if r.get("eval_combined", "") != ""]
+    e_iters    = [int(r["iteration"])        for r in eval_rows]
+    e_step     = [f(r, "eval_step_acc") * 100 for r in eval_rows]
+    e_lccp     = [f(r, "eval_lccp") * 100     for r in eval_rows]
+    # moving averages
+    ma_step = moving_avg(t_step, w=4)
+    ma_lccp = moving_avg(t_lccp, w=4)
+    ma_gt   = moving_avg(t_gt,   w=4)
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5))
+    fig.suptitle("Step-Level Reasoning Quality — Training vs Held-Out Evaluation",
+                 fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+    # ── LEFT: step accuracy ──
+    shade_phases(ax1, iters, phases)
+    ax1.plot(iters, t_step,  alpha=0.2,  color=PALETTE["cyan"],  linewidth=1)
+    ax1.plot(iters, ma_step, color=PALETTE["cyan"],  linewidth=2.5, label="Train step acc (smooth)")
+    ax1.plot(iters, t_gt,    alpha=0.15, color=PALETTE["pink"],  linewidth=1)
+    ax1.plot(iters, ma_gt,   color=PALETTE["pink"],  linewidth=2.5, label="Train GT match (smooth)")
+    ax1.plot(e_iters, e_step, "o-", color=PALETTE["white"], ms=8, linewidth=2,
+             label="Eval step accuracy", zorder=6)
+    # annotate eval start/end
+    ax1.annotate(f"{e_step[0]:.1f}%",  xy=(e_iters[0], e_step[0]),
+                 xytext=(e_iters[0] - 0.3, e_step[0] - 1.2), fontsize=8.5,
+                 color=PALETTE["white"], ha="right")
+    ax1.annotate(f"{e_step[-1]:.1f}%", xy=(e_iters[-1], e_step[-1]),
+                 xytext=(e_iters[-1] + 0.3, e_step[-1] + 0.5), fontsize=8.5,
+                 color=PALETTE["white"])
+    ax1.annotate("", xy=(e_iters[-1], e_step[-1]),
+                 xytext=(e_iters[0], e_step[0]),
+                 arrowprops=dict(arrowstyle="->", color=PALETTE["cyan"], lw=1.5,
+                                 connectionstyle="arc3,rad=-0.3"))
+    ax1.set_title("Step Accuracy  —  Did each reasoning step hold up?",
+                  fontsize=9.5, color=PALETTE["slate"], pad=5)
+    ax1.set_xlabel("Training Iteration")
+    ax1.set_ylabel("Score (%)")
+    ax1.set_ylim(55, 105)
+    ax1.set_xticks(range(1, max(iters) + 1, 3))
+    ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax1.legend(handles=ax1.get_legend_handles_labels()[0] + phase_legend_patches(phases),
+               framealpha=0.8, ncol=1, loc="lower right")
+    # ── RIGHT: LCCP ──
+    shade_phases(ax2, iters, phases)
+    ax2.plot(iters, t_lccp,  alpha=0.2,  color=PALETTE["emerald"], linewidth=1)
+    ax2.plot(iters, ma_lccp, color=PALETTE["emerald"], linewidth=2.5, label="Train LCCP (smooth)")
+    ax2.plot(e_iters, e_lccp, "o-", color=PALETTE["white"], ms=8, linewidth=2,
+             label="Eval LCCP", zorder=6)
+    ax2.annotate(f"{e_lccp[0]:.1f}%",  xy=(e_iters[0], e_lccp[0]),
+                 xytext=(e_iters[0] - 0.3, e_lccp[0] - 1.5), fontsize=8.5,
+                 color=PALETTE["white"], ha="right")
+    ax2.annotate(f"{e_lccp[-1]:.1f}%", xy=(e_iters[-1], e_lccp[-1]),
+                 xytext=(e_iters[-1] + 0.3, e_lccp[-1] + 0.5), fontsize=8.5,
+                 color=PALETTE["white"])
+    # show LCCP delta
+    delta = e_lccp[-1] - e_lccp[0]
+    ax2.text(0.97, 0.06,
+             f"Eval LCCP  Δ = +{delta:.2f}pp\n(iter {e_iters[0]} → {e_iters[-1]})",
+             transform=ax2.transAxes, ha="right", va="bottom",
+             fontsize=8.5, color=PALETTE["emerald"],
+             bbox=dict(facecolor=PALETTE["bg2"], edgecolor=PALETTE["emerald"],
+                       linewidth=0.8, pad=5))
+    ax2.set_title("LCCP  —  Did the chain of reasoning stay correct until the first error?",
+                  fontsize=9.5, color=PALETTE["slate"], pad=5)
+    ax2.set_xlabel("Training Iteration")
+    ax2.set_ylabel("LCCP (%)")
+    ax2.set_ylim(55, 100)
+    ax2.set_xticks(range(1, max(iters) + 1, 3))
+    ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+    ax2.legend(handles=ax2.get_legend_handles_labels()[0] + phase_legend_patches(phases),
+               framealpha=0.8, ncol=1, loc="lower right")
+    fig.tight_layout()
+    save(fig, "plot5_reasoning_quality.png", out)
+# ══════════════════════════════════════════════════════════════════════════════
+# Main
+# ══════════════════════════════════════════════════════════════════════════════
+def parse_args():
+    p = argparse.ArgumentParser(description="Generate AxiomForgeAI training plots")
+    p.add_argument("--metrics", default=DEFAULT_METRICS,
+                   help=f"Path to metrics.csv  (default: {DEFAULT_METRICS})")
+    p.add_argument("--out", default="images",
+                   help="Output directory for PNGs  (default: images/)")
+    return p.parse_args()
+def main():
+    args = parse_args()
+    out  = Path(args.out)
+    print(f"Loading metrics from  : {args.metrics}")
+    print(f"Saving plots to       : {out}/")
+    print()
+    rows = load_csv(args.metrics)
+    print(f"Loaded {len(rows)} iterations.\n")
+    print("Generating plots …")
+    plot_eval_quality(rows, out)
+    plot_training_journey(rows, out)
+    plot_selfplay_success(rows, out)
+    plot_reward_confidence(rows, out)
+    plot_reasoning_quality(rows, out)
+    print(f"\n✅  All 5 plots saved to {out}/")
+    print("\nFiles:")
+    for p in sorted(out.glob("plot*.png")):
+        print(f"  {p}  ({p.stat().st_size // 1024} KB)")
+if __name__ == "__main__":
+    main()

scripts/precompute_extraction_cache.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""
+Offline step-chain extraction cache builder.
+Run this once before training to pre-extract structured step chains from all
+grounded training data (GSM8K + MATH).  The resulting cache file is passed to
+run_grpo_training.py via --extraction-cache so the extractor LLM is never
+called for fixed training examples — only novel self-play solutions require
+live extraction during training.
+Usage
+-----
+    python scripts/precompute_extraction_cache.py \\
+        --gsm8k-data  data/sft/gsm8k_sft.jsonl \\
+        --math-data   data/sft/math_sft.jsonl \\
+        --output-cache data/extraction_cache.json \\
+        --extractor-model Qwen/Qwen2.5-0.5B-Instruct \\
+        --device cuda
+Cache key: md5(question + "\\n" + solution) — keying on both prevents
+collisions when two MATH problems share identical solution text.
+Entries for solutions the extractor cannot parse are stored with
+success=False so training never re-attempts and correctly penalises them.
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import pathlib
+import sys
+from typing import List, Tuple
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s  %(levelname)-8s  %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger(__name__)
+def load_jsonl(path: str) -> list[dict]:
+    records: list[dict] = []
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                try:
+                    records.append(json.loads(line))
+                except json.JSONDecodeError:
+                    pass
+    return records
+def collect_qa_pairs(records: list[dict]) -> List[Tuple[str, str]]:
+    """
+    Extract (question, solution) pairs from dataset records.
+    Returns pairs where both fields are non-empty.  Falls back to empty
+    string for the question when only the solution field is present.
+    """
+    pairs: List[Tuple[str, str]] = []
+    for rec in records:
+        sol = (
+            rec.get("solution")
+            or rec.get("output")
+            or rec.get("response")
+            or ""
+        )
+        q = (
+            rec.get("question")
+            or rec.get("problem")
+            or rec.get("input")
+            or ""
+        )
+        if sol.strip():
+            pairs.append((q.strip(), sol.strip()))
+    return pairs
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Pre-extract step chains for grounded training data."
+    )
+    parser.add_argument(
+        "--gsm8k-data", required=True,
+        help="Path to GSM8K training JSONL (e.g. data/sft/gsm8k_sft.jsonl).",
+    )
+    parser.add_argument(
+        "--math-data", default=None,
+        help="Optional path to MATH training JSONL. If provided, those solutions "
+             "are also extracted and added to the cache.",
+    )
+    parser.add_argument(
+        "--output-cache", required=True,
+        help="Destination JSON file for the extraction cache.",
+    )
+    parser.add_argument(
+        "--extractor-model", default="Qwen/Qwen2.5-0.5B-Instruct",
+        help="HuggingFace model ID for the step chain extractor. Default Qwen/Qwen2.5-0.5B-Instruct.",
+    )
+    parser.add_argument(
+        "--device", default="cuda",
+        help="Device for the extractor model (default: cuda).",
+    )
+    parser.add_argument(
+        "--batch-size", type=int, default=1,
+        help="Reserved for future batched extraction. Currently always 1.",
+    )
+    args = parser.parse_args()
+    # ── Load data ─────────────────────────────────────────────────────────────
+    logger.info("Loading GSM8K data from: %s", args.gsm8k_data)
+    gsm8k_records = load_jsonl(args.gsm8k_data)
+    qa_pairs = collect_qa_pairs(gsm8k_records)
+    logger.info("GSM8K: %d (question, solution) pairs", len(qa_pairs))
+    if args.math_data:
+        logger.info("Loading MATH data from: %s", args.math_data)
+        math_records = load_jsonl(args.math_data)
+        math_pairs = collect_qa_pairs(math_records)
+        logger.info("MATH: %d (question, solution) pairs", len(math_pairs))
+        qa_pairs += math_pairs
+    if not qa_pairs:
+        logger.error(
+            "No solutions found in provided files. "
+            "Check field names (question/problem/input + solution/output/response)."
+        )
+        sys.exit(1)
+    # Deduplicate by (question, solution) content
+    # Two different MATH problems can have identical solution text but different
+    # questions — the question+solution key keeps them distinct in the cache.
+    seen: set = set()
+    unique_pairs: List[Tuple[str, str]] = []
+    for q, sol in qa_pairs:
+        key = (q, sol)
+        if key not in seen:
+            seen.add(key)
+            unique_pairs.append((q, sol))
+    logger.info(
+        "Total: %d pairs (%d unique after dedup)", len(qa_pairs), len(unique_pairs)
+    )
+    # ── Load extractor ────────────────────────────────────────────────────────
+    sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
+    from src.rl.unified_accuracy import StepChainExtractor
+    extractor = StepChainExtractor(
+        model_name=args.extractor_model,
+        device=args.device,
+        cache_path=args.output_cache,   # load existing cache if present (resume)
+    )
+    # ── Build cache ───────────────────────────────────────────────────────────
+    already_cached = len(extractor._cache)
+    if already_cached:
+        logger.info("Resuming: %d entries already in cache", already_cached)
+    extractor.build_cache(unique_pairs)
+    # ── Save ──────────────────────────────────────────────────────────────────
+    extractor.save_cache()
+    logger.info(
+        "Done. Cache contains %d entries → %s",
+        len(extractor._cache),
+        args.output_cache,
+    )
+if __name__ == "__main__":
+    main()

scripts/prepare_aqua_dataset.py ADDED Viewed

	@@ -0,0 +1,265 @@

+#!/usr/bin/env python3
+"""
+Download Chinar/AQuA-RAT from HuggingFace and convert it to the same JSONL
+format used by gsm8k_sft.jsonl so the GRPO training script can consume it
+directly via --gsm8k-data.
+Chinar/AQuA-RAT schema (processed version)
+-------------------------------------------
+  prompt     : str  — the math question
+  completion : str  — step-by-step reasoning ending with:
+                      "The answer is X . Therefore, the correct answer is:  <value>"
+Output schema (messages format expected by load_gsm8k)
+-------------------------------------------------------
+  {
+    "id": "aqua_<idx>",
+    "skill_id": "aqua_rat_algebra",
+    "source": "Chinar/AQuA-RAT",
+    "split": "train" | "validation",
+    "messages": [
+        {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+        {"role": "user",   "content": "Solve ... Problem:\\n<question>"},
+        {"role": "assistant", "content": "Step 1: ...\\nFinal Answer: <value>"}
+    ]
+  }
+The dataset has only a 'train' split — we reserve the last 500 rows as
+a validation set and use the rest for training.
+Usage
+-----
+  python scripts/prepare_aqua_dataset.py
+  python scripts/prepare_aqua_dataset.py --val-size 300 --dry-run
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any, Optional
+# ---------------------------------------------------------------------------
+# Prompt constants (kept in sync with src/config/prompts.py)
+# ---------------------------------------------------------------------------
+SOLVER_SYSTEM_PROMPT = (
+    "You are a step-by-step math solver. "
+    "Solve the given problem one step at a time. "
+    "Each step must be on its own line, starting with 'Step N:'. "
+    "End with a line starting with 'Final Answer:'. "
+    "Write every mathematical expression in Python/SymPy syntax "
+    "so it can be verified programmatically."
+)
+USER_WRAPPER = (
+    "Solve the following problem. Show your reasoning as numbered steps, "
+    "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+# ---------------------------------------------------------------------------
+# Answer extraction
+# ---------------------------------------------------------------------------
+# The completion always ends with a variant of:
+#   "The answer is E . Therefore, the correct answer is:  23"
+_ANSWER_TAIL = re.compile(
+    r"(?:The answer is\s+[A-Ea-e]\s*[.\-]?\s*)?"
+    r"Therefore,?\s+the correct answer is\s*:?\s*(.+)$",
+    re.IGNORECASE,
+)
+def _extract_answer_and_rationale(completion: str) -> Optional[tuple[str, str]]:
+    """
+    Split the completion into (rationale_lines, final_answer_str).
+    Returns None if no extractable numeric answer is found.
+    """
+    # Find the tail marker
+    m = _ANSWER_TAIL.search(completion)
+    if not m:
+        return None
+    raw_answer = m.group(1).strip()
+    # Everything before the tail is the rationale
+    rationale = completion[: m.start()].strip()
+    # Also strip a standalone "The answer is X ." line at the end of rationale
+    rationale = re.sub(r"\s*The answer is\s+[A-Ea-e]\s*[.\-]?\s*$", "", rationale, flags=re.IGNORECASE).strip()
+    # Normalise the answer to a clean numeric string
+    final_answer = _normalise_answer(raw_answer)
+    if final_answer is None:
+        return None
+    return rationale, final_answer
+def _normalise_answer(raw: str) -> Optional[str]:
+    """
+    Extract a single numeric value from an answer string.
+    "23"          → "23"
+    "$ 1600"      → "1600"
+    "8 seconds"   → "8"
+    "5 and 1"     → None  (multi-value — skip)
+    "I and II"    → None  (non-numeric — skip)
+    "− 3 ≤ x ≤ 4" → None  (inequality — skip)
+    """
+    text = raw.strip()
+    # Remove currency / whitespace
+    text = text.replace("$", "").replace("Rs.", "").replace("Rs", "").replace(",", "").strip()
+    # Handle unicode minus
+    text = text.replace("\u2212", "-").replace("−", "-")
+    # Skip if "and" still present (multi-value like "5 and 1")
+    if re.search(r"\band\b", text, re.IGNORECASE):
+        return None
+    # Skip inequalities / expressions with variables
+    if re.search(r"[a-zA-Z≤≥<>]", text):
+        return None
+    # Single number (integer or decimal, optionally negative)
+    m = re.fullmatch(r"\s*(-?\d+(?:\.\d+)?)\s*(?:[a-zA-Z%°].*)?", text)
+    if m:
+        val_str = m.group(1)
+        try:
+            val = float(val_str)
+            return str(int(val)) if val == int(val) else val_str
+        except ValueError:
+            pass
+    return None
+# ---------------------------------------------------------------------------
+# Rationale → Step N: format
+# ---------------------------------------------------------------------------
+def _rationale_to_steps(rationale: str) -> list[str]:
+    lines: list[str] = []
+    for raw in rationale.splitlines():
+        line = raw.strip()
+        if line:
+            line = line.replace("^", "**")
+            lines.append(line)
+    if not lines and rationale.strip():
+        sentences = re.split(r"(?<=[.!?])\s+", rationale.strip())
+        lines = [s.strip() for s in sentences if s.strip()]
+    return lines
+def _build_assistant(rationale: str, final_answer: str) -> str:
+    steps = _rationale_to_steps(rationale)
+    parts = [f"Step {i}: {line}" for i, line in enumerate(steps, 1)]
+    body = "\n".join(parts)
+    return f"{body}\nFinal Answer: {final_answer}" if body else f"Final Answer: {final_answer}"
+# ---------------------------------------------------------------------------
+# Row conversion
+# ---------------------------------------------------------------------------
+def convert_row(row: dict[str, Any], idx: int, split: str) -> Optional[dict[str, Any]]:
+    question   = (row.get("prompt") or "").strip()
+    completion = (row.get("completion") or "").strip()
+    if not question or not completion:
+        return None
+    result = _extract_answer_and_rationale(completion)
+    if result is None:
+        return None
+    rationale, final_answer = result
+    assistant_text = _build_assistant(rationale, final_answer)
+    return {
+        "id": f"aqua_{split}_{idx}",
+        "skill_id": "aqua_rat_algebra",
+        "source": "Chinar/AQuA-RAT",
+        "split": split,
+        "messages": [
+            {"role": "system",    "content": SOLVER_SYSTEM_PROMPT},
+            {"role": "user",      "content": USER_WRAPPER.format(question=question)},
+            {"role": "assistant", "content": assistant_text},
+        ],
+    }
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output-dir", default="data/sft")
+    parser.add_argument("--val-size",   type=int, default=500,
+                        help="How many rows from the end of the dataset to use as validation.")
+    parser.add_argument("--dry-run",    action="store_true")
+    parser.add_argument("--max-samples", type=int, default=None)
+    args = parser.parse_args()
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print("ERROR: pip install datasets", file=sys.stderr)
+        sys.exit(1)
+    print("Downloading Chinar/AQuA-RAT …")
+    ds = load_dataset("Chinar/AQuA-RAT")
+    all_rows = list(ds["train"])
+    total = len(all_rows)
+    print(f"  Total rows: {total:,}")
+    val_rows   = all_rows[-args.val_size:]
+    train_rows = all_rows[: -args.val_size]
+    splits = {
+        "train":      train_rows,
+        "validation": val_rows,
+    }
+    out_dir = Path(args.output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    for split, rows in splits.items():
+        if args.max_samples:
+            rows = rows[: args.max_samples]
+        records: list[dict] = []
+        skipped = 0
+        for idx, row in enumerate(rows):
+            rec = convert_row(row, idx, split)
+            if rec is None:
+                skipped += 1
+            else:
+                records.append(rec)
+        skip_pct = 100.0 * skipped / max(1, len(rows))
+        if args.dry_run:
+            print(f"\n── {split}: {len(records)} valid / {skipped} skipped ({skip_pct:.1f}%) ──")
+            for rec in records[:3]:
+                print(json.dumps(rec, indent=2))
+            continue
+        out_path = out_dir / f"aqua_{split}.jsonl"
+        with out_path.open("w", encoding="utf-8") as f:
+            for rec in records:
+                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        print(f"  [{split:12s}]  {len(records):6,d} valid  {skipped:5,d} skipped ({skip_pct:.1f}%)  →  {out_path}")
+    if not args.dry_run:
+        print("\nDone.  Launch continuation training with:")
+        print("  bash launch_grpo_aqua.sh")
+if __name__ == "__main__":
+    main()

scripts/prepare_combined_dataset.py ADDED Viewed

	@@ -0,0 +1,711 @@

+#!/usr/bin/env python3
+"""
+Combined dataset pipeline — NuminaMath-CoT + OpenMathInstruct-2
+================================================================
+Downloads, filters, normalises, and merges two large math datasets into a single
+JSONL file (train / val / test) that the GRPO training script can consume directly
+via --gsm8k-data.
+Why these two datasets
+----------------------
+  NuminaMath-CoT  (AI-MO/NuminaMath-CoT)
+      860 K problems.  Clean \\boxed{} answers.  7 rich topic categories that map
+      directly to ZPD skill_ids.  Sources span AMC, AIME, Chinese HS, olympiads,
+      and synthetic — giving natural difficulty diversity.
+  OpenMathInstruct-2  (nvidia/OpenMathInstruct-2)
+      14 M synthetic problems with step-level CoT.  `expected_answer` is pre-verified.
+      Diverse surface forms prevent pattern memorisation.  We skip any row whose
+      problem_source is "gsm8k" (already in prior training).
+Output schema (identical to gsm8k_sft.jsonl / aqua_train.jsonl)
+---------------------------------------------------------------
+  {
+    "id":       "<source>_<split>_<idx>",
+    "skill_id": "<topic_slug>",        ← used by ZPD CurriculumManager
+    "source":   "<hf_dataset_name>",
+    "split":    "train" | "val" | "test",
+    "difficulty": 1 | 2 | 3,          ← 1=easy 2=medium 3=hard (for ZPD)
+    "task_type": "solve",
+    "messages": [
+        {"role": "system",    "content": SOLVER_SYSTEM_PROMPT},
+        {"role": "user",      "content": "Solve ... Problem:\\n<question>"},
+        {"role": "assistant", "content": "Step 1: ...\\nFinal Answer: <answer>"}
+    ]
+  }
+Usage
+-----
+  # Quick test (no download, just show stats)
+  python scripts/prepare_combined_dataset.py --dry-run
+  # Full pipeline (default caps: 20 K numina + 15 K openmath)
+  python scripts/prepare_combined_dataset.py
+  # Larger run
+  python scripts/prepare_combined_dataset.py --max-numina 40000 --max-openmath 30000
+  # Only one source
+  python scripts/prepare_combined_dataset.py --skip-openmath
+  python scripts/prepare_combined_dataset.py --skip-numina
+  # Custom output dir
+  python scripts/prepare_combined_dataset.py --output-dir data/sft/combined
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import logging
+import math
+import random
+import re
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s  %(levelname)-8s  %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Constants — kept in sync with src/config/prompts.py
+# ---------------------------------------------------------------------------
+SOLVER_SYSTEM_PROMPT = (
+    "You are a step-by-step math solver. "
+    "Solve the given problem one step at a time. "
+    "Each step must be on its own line, starting with 'Step N:'. "
+    "End with a line starting with 'Final Answer:'. "
+    "Write every mathematical expression in Python/SymPy syntax "
+    "so it can be verified programmatically."
+)
+USER_WRAPPER = (
+    "Solve the following problem. Show your reasoning as numbered steps, "
+    "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+# ---------------------------------------------------------------------------
+# Skill-ID mappings  (drives ZPD CurriculumManager per-topic mastery)
+# ---------------------------------------------------------------------------
+# NuminaMath-CoT  `type` field → skill_id
+NUMINA_TYPE_TO_SKILL: Dict[str, str] = {
+    "algebra":                   "numina_algebra",
+    "intermediate_algebra":      "numina_algebra",
+    "prealgebra":                "numina_prealgebra",
+    "number_theory":             "numina_number_theory",
+    "geometry":                  "numina_geometry",
+    "counting_and_probability":  "numina_combinatorics",
+    "precalculus":               "numina_calculus",
+    "calculus":                  "numina_calculus",
+    "statistics":                "numina_statistics",
+    "probability":               "numina_statistics",
+    # competition-source buckets (fallback when type not in map above)
+    "cn_k12":                    "numina_algebra",
+    "olympiads":                 "numina_olympiad",
+    "amc_aime":                  "numina_competition",
+    "synthetic_math":            "numina_synthetic",
+}
+# NuminaMath source → approximate difficulty (1=easy 2=medium 3=hard)
+NUMINA_SOURCE_DIFFICULTY: Dict[str, int] = {
+    "cn_k12":        1,
+    "synthetic_math": 2,
+    "amc_aime":       2,
+    "olympiads":      3,
+}
+# OpenMathInstruct-2 problem_source → skill_id / difficulty
+OPENMATH_SOURCE_TO_SKILL: Dict[str, str] = {
+    "math":                  "openmath_algebra",   # overridden per-row by subject
+    "amc_aime_1983_2024":    "openmath_competition",
+    "synthetic_math":        "openmath_synthetic",
+    "number_theory":         "openmath_number_theory",
+}
+OPENMATH_SOURCE_DIFFICULTY: Dict[str, int] = {
+    "math":                 2,
+    "amc_aime_1983_2024":   3,
+    "synthetic_math":       1,
+}
+# OpenMathInstruct MATH-subject → skill_id (when problem_source == "math")
+OPENMATH_MATH_SUBJECT_SKILL: Dict[str, str] = {
+    "Algebra":                   "openmath_algebra",
+    "Number Theory":             "openmath_number_theory",
+    "Geometry":                  "openmath_geometry",
+    "Counting & Probability":    "openmath_combinatorics",
+    "Intermediate Algebra":      "openmath_algebra",
+    "Prealgebra":                "openmath_prealgebra",
+    "Precalculus":               "openmath_calculus",
+    "Calculus":                  "openmath_calculus",
+}
+# ---------------------------------------------------------------------------
+# Answer normalisation
+# ---------------------------------------------------------------------------
+_BOXED_RE = re.compile(r"\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}")
+_LATEX_FRAC = re.compile(r"\\frac\{(\d+)\}\{(\d+)\}")
+_PLAIN_FRAC = re.compile(r"^(-?\d+)\s*/\s*(\d+)$")
+_CURRENCY    = re.compile(r"(?:Rs\.?|USD|\$|€|£)\s*", re.IGNORECASE)
+_UNICODE_MINUS = str.maketrans({"\u2212": "-", "−": "-"})
+def extract_boxed(text: str) -> Optional[str]:
+    """Return the last \\boxed{} contents from a solution string."""
+    matches = _BOXED_RE.findall(text)
+    return matches[-1].strip() if matches else None
+def normalise_numeric(raw: str) -> Optional[str]:
+    """
+    Convert a raw answer string to a clean numeric string.
+    Returns None for:
+      - multi-value answers ("3 and 5")
+      - symbolic expressions ("3\\sqrt{2}", "x+1")
+      - inequalities
+      - fractions where num/den exceed safe range
+    """
+    text = raw.strip()
+    # Remove currency symbols and commas in numbers
+    text = _CURRENCY.sub("", text)
+    text = text.replace(",", "").translate(_UNICODE_MINUS).strip()
+    # Skip if still contains words other than units
+    if re.search(r"\b(and|or|none|no solution|undefined)\b", text, re.IGNORECASE):
+        return None
+    # Skip if contains letters (symbolic)
+    if re.search(r"[a-zA-Z]", text):
+        return None
+    # Skip inequalities / ranges
+    if re.search(r"[≤≥<>]", text):
+        return None
+    # Handle LaTeX fractions: \frac{3}{4}
+    m = _LATEX_FRAC.fullmatch(text)
+    if m:
+        num, den = int(m.group(1)), int(m.group(2))
+        if den:
+            v = num / den
+            return str(int(v)) if v == int(v) else f"{v:.4f}"
+        return None
+    # Handle plain fractions: 3/4
+    m = _PLAIN_FRAC.match(text)
+    if m:
+        num, den = int(m.group(1)), int(m.group(2))
+        if den:
+            v = num / den
+            return str(int(v)) if v == int(v) else f"{v:.4f}"
+        return None
+    # Handle percentage → decimal
+    pct = re.fullmatch(r"(-?\d+(?:\.\d+)?)\s*%", text)
+    if pct:
+        v = float(pct.group(1))
+        return str(int(v)) if v == int(v) else f"{v:.4f}"
+    # Plain integer or decimal (possibly negative, possibly with trailing unit like "km")
+    m = re.match(r"^\s*(-?\d+(?:\.\d+)?)\s*(?:[^0-9.\s].*)?\s*$", text)
+    if m:
+        val_str = m.group(1)
+        try:
+            v = float(val_str)
+            return str(int(v)) if v == int(v) else val_str
+        except ValueError:
+            pass
+    return None
+# ---------------------------------------------------------------------------
+# Solution → Step N: format
+# ---------------------------------------------------------------------------
+_SKIP_LINE_RE = re.compile(
+    r"^\s*("
+    r"\\boxed\{|"
+    r"(Therefore|Thus|Hence|So),?\s+(the\s+)?(final\s+)?answer\s+is|"
+    r"The\s+(final\s+)?answer\s+is|"
+    r"Answer\s*[:=]"
+    r")",
+    re.IGNORECASE,
+)
+def solution_to_steps(solution: str, final_answer: str, max_steps: int = 18) -> str:
+    """
+    Convert an arbitrary CoT solution to the pipeline's Step N: format.
+    Strategy:
+      1. Split on newlines.
+      2. Drop blank lines and lines that just announce the final answer
+         (those are replaced by the explicit Final Answer: line).
+      3. Strip any existing "Step N:" prefix to avoid double-numbering.
+      4. Re-number as "Step 1:", "Step 2:", …
+      5. Append "Final Answer: <answer>".
+    """
+    raw_lines = [l.strip() for l in solution.split("\n") if l.strip()]
+    clean: List[str] = []
+    for line in raw_lines:
+        if _SKIP_LINE_RE.match(line):
+            continue
+        # Strip old step prefix
+        line = re.sub(r"^Step\s*\d+\s*[:.)]\s*", "", line)
+        if line:
+            clean.append(line)
+    # Cap to max_steps to keep token count reasonable
+    clean = clean[:max_steps]
+    if not clean:
+        return f"Final Answer: {final_answer}"
+    parts = [f"Step {i}: {line}" for i, line in enumerate(clean, 1)]
+    return "\n".join(parts) + f"\nFinal Answer: {final_answer}"
+# ---------------------------------------------------------------------------
+# Record builders
+# ---------------------------------------------------------------------------
+def build_record(
+    idx: int,
+    split: str,
+    source_name: str,
+    skill_id: str,
+    difficulty: int,
+    question: str,
+    solution_text: str,
+    final_answer: str,
+) -> Dict[str, Any]:
+    assistant_content = solution_to_steps(solution_text, final_answer)
+    return {
+        "id":         f"{source_name.replace('/', '_')}_{split}_{idx}",
+        "skill_id":   skill_id,
+        "source":     source_name,
+        "split":      split,
+        "difficulty": difficulty,
+        "task_type":  "solve",
+        "messages": [
+            {"role": "system",    "content": SOLVER_SYSTEM_PROMPT},
+            {"role": "user",      "content": USER_WRAPPER.format(question=question.strip())},
+            {"role": "assistant", "content": assistant_content},
+        ],
+    }
+# ---------------------------------------------------------------------------
+# Deduplication
+# ---------------------------------------------------------------------------
+def problem_hash(text: str) -> str:
+    """Fast 16-char hash for near-dedup (exact-match on normalised text)."""
+    normalised = re.sub(r"\s+", " ", text.strip().lower())
+    return hashlib.md5(normalised.encode()).hexdigest()[:16]
+# ---------------------------------------------------------------------------
+# NuminaMath-CoT processing
+# ---------------------------------------------------------------------------
+def _numina_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
+    topic = (row.get("type") or "").lower().strip()
+    source = (row.get("source") or "").lower().strip()
+    skill = NUMINA_TYPE_TO_SKILL.get(topic)
+    if skill is None:
+        skill = NUMINA_TYPE_TO_SKILL.get(source, "numina_general")
+    difficulty = NUMINA_SOURCE_DIFFICULTY.get(source, 2)
+    return skill, difficulty
+def iter_numina(
+    max_samples: int,
+    per_skill_cap: int,
+    skip_olympiad: bool,
+    seed: int,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Stream NuminaMath-CoT from HuggingFace and yield cleaned records.
+    Uses per-skill quota to guarantee topic diversity.
+    """
+    try:
+        from datasets import load_dataset  # type: ignore
+    except ImportError:
+        log.error("pip install datasets huggingface_hub")
+        sys.exit(1)
+    log.info("Streaming AI-MO/NuminaMath-CoT …")
+    ds = load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True,
+                      trust_remote_code=True)
+    skill_counts: Counter = Counter()
+    seen_hashes: set = set()
+    total_yielded = 0
+    rng = random.Random(seed)
+    for row in ds:
+        if total_yielded >= max_samples:
+            break
+        problem  = (row.get("problem") or "").strip()
+        solution = (row.get("solution") or "").strip()
+        if not problem or not solution:
+            continue
+        # Extract and normalise answer from \boxed{}
+        raw_answer = extract_boxed(solution)
+        if raw_answer is None:
+            continue
+        final_answer = normalise_numeric(raw_answer)
+        if final_answer is None:
+            continue
+        skill, difficulty = _numina_skill_and_difficulty(row)
+        # Optionally skip very hard olympiad problems
+        if skip_olympiad and skill == "numina_olympiad":
+            continue
+        # Per-skill cap to guarantee diversity
+        if skill_counts[skill] >= per_skill_cap:
+            continue
+        # Dedup
+        h = problem_hash(problem)
+        if h in seen_hashes:
+            continue
+        seen_hashes.add(h)
+        skill_counts[skill] += 1
+        total_yielded += 1
+        yield build_record(
+            idx=total_yielded,
+            split="__assign__",
+            source_name="AI-MO/NuminaMath-CoT",
+            skill_id=skill,
+            difficulty=difficulty,
+            question=problem,
+            solution_text=solution,
+            final_answer=final_answer,
+        )
+    log.info("NuminaMath-CoT: yielded %d records | skill dist: %s",
+             total_yielded, dict(skill_counts.most_common()))
+# ---------------------------------------------------------------------------
+# OpenMathInstruct-2 processing
+# ---------------------------------------------------------------------------
+def _openmath_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
+    src    = (row.get("problem_source") or "").lower().strip()
+    subj   = (row.get("subject") or "").strip()
+    if src == "math" and subj:
+        skill = OPENMATH_MATH_SUBJECT_SKILL.get(subj, "openmath_algebra")
+    else:
+        skill = OPENMATH_SOURCE_TO_SKILL.get(src, "openmath_general")
+    difficulty = OPENMATH_SOURCE_DIFFICULTY.get(src, 2)
+    return skill, difficulty
+def iter_openmath(
+    max_samples: int,
+    per_skill_cap: int,
+    skip_gsm8k: bool,
+    seed: int,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Stream OpenMathInstruct-2 from HuggingFace and yield cleaned records.
+    Only yields rows where `is_correct_solution` is True (pre-verified by NVIDIA).
+    """
+    try:
+        from datasets import load_dataset  # type: ignore
+    except ImportError:
+        log.error("pip install datasets huggingface_hub")
+        sys.exit(1)
+    log.info("Streaming nvidia/OpenMathInstruct-2 (this may take a moment) …")
+    ds = load_dataset(
+        "nvidia/OpenMathInstruct-2",
+        split="train",
+        streaming=True,
+        trust_remote_code=True,
+    )
+    skill_counts: Counter = Counter()
+    seen_hashes: set = set()
+    total_yielded = 0
+    for row in ds:
+        if total_yielded >= max_samples:
+            break
+        # Filter: skip gsm8k (contamination risk)
+        problem_src = (row.get("problem_source") or "").lower()
+        if skip_gsm8k and "gsm8k" in problem_src:
+            continue
+        # Filter: only verified correct solutions
+        if not row.get("is_correct_solution", True):
+            continue
+        problem  = (row.get("problem") or "").strip()
+        solution = (row.get("generated_solution") or "").strip()
+        expected = (row.get("expected_answer") or "").strip()
+        if not problem or not solution or not expected:
+            continue
+        # Normalise the pre-extracted answer
+        final_answer = normalise_numeric(expected)
+        if final_answer is None:
+            continue
+        skill, difficulty = _openmath_skill_and_difficulty(row)
+        # Per-skill cap
+        if skill_counts[skill] >= per_skill_cap:
+            continue
+        # Dedup
+        h = problem_hash(problem)
+        if h in seen_hashes:
+            continue
+        seen_hashes.add(h)
+        skill_counts[skill] += 1
+        total_yielded += 1
+        yield build_record(
+            idx=total_yielded,
+            split="__assign__",
+            source_name="nvidia/OpenMathInstruct-2",
+            skill_id=skill,
+            difficulty=difficulty,
+            question=problem,
+            solution_text=solution,
+            final_answer=final_answer,
+        )
+    log.info("OpenMathInstruct-2: yielded %d records | skill dist: %s",
+             total_yielded, dict(skill_counts.most_common()))
+# ---------------------------------------------------------------------------
+# Dataset stats printer
+# ---------------------------------------------------------------------------
+def print_stats(records: List[Dict], label: str) -> None:
+    skill_c: Counter = Counter(r["skill_id"] for r in records)
+    diff_c:  Counter = Counter(r["difficulty"] for r in records)
+    src_c:   Counter = Counter(r["source"] for r in records)
+    split_c: Counter = Counter(r["split"] for r in records)
+    log.info("─── %s  (%d records) ───────────────────────────────", label, len(records))
+    log.info("  by split:      %s", dict(split_c))
+    log.info("  by source:     %s", dict(src_c))
+    log.info("  by difficulty: %s", dict(sorted(diff_c.items())))
+    log.info("  by skill_id:")
+    for sk, cnt in skill_c.most_common():
+        log.info("    %-40s  %5d", sk, cnt)
+# ---------------------------------------------------------------------------
+# Write JSONL
+# ---------------------------------------------------------------------------
+def write_jsonl(records: List[Dict], path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for rec in records:
+            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+    log.info("Wrote %d records → %s", len(records), path)
+# ---------------------------------------------------------------------------
+# Train / val / test split  (stratified by skill_id)
+# ---------------------------------------------------------------------------
+def stratified_split(
+    records: List[Dict],
+    train_frac: float = 0.85,
+    val_frac:   float = 0.10,
+    seed:       int   = 42,
+) -> Tuple[List[Dict], List[Dict], List[Dict]]:
+    """
+    Stratified split by skill_id so every skill appears in all three sets.
+    Remaining fraction after train+val goes to test.
+    """
+    rng = random.Random(seed)
+    by_skill: Dict[str, List[Dict]] = defaultdict(list)
+    for r in records:
+        by_skill[r["skill_id"]].append(r)
+    train_, val_, test_ = [], [], []
+    for skill, items in by_skill.items():
+        rng.shuffle(items)
+        n = len(items)
+        n_train = math.floor(n * train_frac)
+        n_val   = math.floor(n * val_frac)
+        train_ += items[:n_train]
+        val_   += items[n_train: n_train + n_val]
+        test_  += items[n_train + n_val:]
+    for r in train_: r["split"] = "train"
+    for r in val_:   r["split"] = "val"
+    for r in test_:  r["split"] = "test"
+    # Shuffle each split so skill interleaves during training
+    rng.shuffle(train_)
+    rng.shuffle(val_)
+    rng.shuffle(test_)
+    return train_, val_, test_
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Build combined NuminaMath + OpenMathInstruct-2 training data."
+    )
+    p.add_argument("--output-dir",    default="data/sft",
+                   help="Directory for output JSONL files.")
+    p.add_argument("--max-numina",    type=int, default=20_000,
+                   help="Max records from NuminaMath-CoT (default 20 000).")
+    p.add_argument("--max-openmath",  type=int, default=15_000,
+                   help="Max records from OpenMathInstruct-2 (default 15 000).")
+    p.add_argument("--per-skill-cap", type=int, default=4_000,
+                   help="Max records per skill_id to guarantee topic diversity.")
+    p.add_argument("--skip-numina",   action="store_true",
+                   help="Skip NuminaMath-CoT entirely.")
+    p.add_argument("--skip-openmath", action="store_true",
+                   help="Skip OpenMathInstruct-2 entirely.")
+    p.add_argument("--skip-olympiad", action="store_true", default=True,
+                   help="Skip numina_olympiad problems (too hard for 1.5B; default: True).")
+    p.add_argument("--no-skip-olympiad", dest="skip_olympiad", action="store_false",
+                   help="Include olympiad-level problems.")
+    p.add_argument("--train-frac", type=float, default=0.85)
+    p.add_argument("--val-frac",   type=float, default=0.10)
+    p.add_argument("--seed",       type=int,   default=42)
+    p.add_argument("--dry-run",    action="store_true",
+                   help="Process only 500 rows from each source and show stats (no write).")
+    return p.parse_args()
+def main() -> None:
+    args = parse_args()
+    rng  = random.Random(args.seed)
+    if args.dry_run:
+        args.max_numina   = min(args.max_numina,   500)
+        args.max_openmath = min(args.max_openmath, 500)
+        log.info("DRY RUN — capped at 500 samples per source, nothing written to disk.")
+    all_records: List[Dict] = []
+    # ── NuminaMath-CoT ────────────────────────────────────────────────────
+    if not args.skip_numina:
+        numina_recs = list(iter_numina(
+            max_samples   = args.max_numina,
+            per_skill_cap = args.per_skill_cap,
+            skip_olympiad = args.skip_olympiad,
+            seed          = args.seed,
+        ))
+        all_records.extend(numina_recs)
+        log.info("NuminaMath-CoT collected: %d records", len(numina_recs))
+    else:
+        log.info("Skipping NuminaMath-CoT (--skip-numina).")
+    # ── OpenMathInstruct-2 ────────────────────────────────────────────────
+    if not args.skip_openmath:
+        openmath_recs = list(iter_openmath(
+            max_samples   = args.max_openmath,
+            per_skill_cap = args.per_skill_cap,
+            skip_gsm8k    = True,
+            seed          = args.seed,
+        ))
+        all_records.extend(openmath_recs)
+        log.info("OpenMathInstruct-2 collected: %d records", len(openmath_recs))
+    else:
+        log.info("Skipping OpenMathInstruct-2 (--skip-openmath).")
+    if not all_records:
+        log.error("No records collected — check dataset availability.")
+        sys.exit(1)
+    # ── Deduplicate across sources ─────────────────────────────────────────
+    seen: set = set()
+    deduped: List[Dict] = []
+    for r in all_records:
+        question = r["messages"][1]["content"]
+        h = problem_hash(question)
+        if h not in seen:
+            seen.add(h)
+            deduped.append(r)
+    log.info("After cross-source dedup: %d → %d records  (removed %d dupes)",
+             len(all_records), len(deduped), len(all_records) - len(deduped))
+    # ── Stratified split ──────────────────────────────────────────────────
+    train_recs, val_recs, test_recs = stratified_split(
+        deduped, args.train_frac, args.val_frac, args.seed
+    )
+    print_stats(train_recs + val_recs + test_recs, "COMBINED DATASET")
+    # ── Write outputs ─────────────────────────────────────────────────────
+    if args.dry_run:
+        log.info("DRY RUN complete — no files written.")
+        log.info("  would write: combined_train.jsonl  (%d rows)", len(train_recs))
+        log.info("  would write: combined_val.jsonl    (%d rows)", len(val_recs))
+        log.info("  would write: combined_test.jsonl   (%d rows)", len(test_recs))
+        log.info("Sample record:")
+        print(json.dumps(train_recs[0], indent=2, ensure_ascii=False))
+        return
+    out = Path(args.output_dir)
+    write_jsonl(train_recs, out / "combined_train.jsonl")
+    write_jsonl(val_recs,   out / "combined_val.jsonl")
+    write_jsonl(test_recs,  out / "combined_test.jsonl")
+    log.info("")
+    log.info("╔══════════════════════════════════════════════════════════════╗")
+    log.info("║  Pipeline complete.  Next step:                              ║")
+    log.info("║    bash launch_grpo_combined.sh                              ║")
+    log.info("╚══════════════════════════════════════════════════════════════╝")
+    log.info("  train : %6d rows  → %s/combined_train.jsonl", len(train_recs), out)
+    log.info("  val   : %6d rows  → %s/combined_val.jsonl",   len(val_recs),   out)
+    log.info("  test  : %6d rows  → %s/combined_test.jsonl",  len(test_recs),  out)
+    log.info("")
+    log.info("Skill coverage (for ZPD CurriculumManager):")
+    skill_c = Counter(r["skill_id"] for r in train_recs)
+    for sk, cnt in sorted(skill_c.items()):
+        log.info("  %-40s  %5d train samples", sk, cnt)
+if __name__ == "__main__":
+    main()

scripts/run_grpo_training.py ADDED Viewed

The diff for this file is too large to render. See raw diff

scripts/run_inference.py ADDED Viewed

	@@ -0,0 +1,502 @@

+#!/usr/bin/env python3
+"""
+Inference pipeline: Base Qwen2.5-Math-1.5B-Instruct vs RL fine-tuned checkpoint.
+For each sampled GSM8K question, both models generate a step-by-step solution.
+Results are saved to reports/<run_name>/ as JSON files for the Gradio demo.
+Usage
+-----
+  # Full run (50 questions, both models):
+  python scripts/run_inference.py \\
+      --checkpoint checkpoints/grpo_run_v1 \\
+      --num-questions 50 \\
+      --run-name comparison_v1
+  # Quick smoke test (10 questions, no RL model):
+  python scripts/run_inference.py \\
+      --num-questions 10 \\
+      --base-only \\
+      --run-name smoke
+  # Custom data source:
+  python scripts/run_inference.py \\
+      --checkpoint checkpoints/grpo_run_v1 \\
+      --data data/sft/gsm8k_test.jsonl \\
+      --num-questions 30
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import random
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.config.prompts import create_solver_messages
+from src.sft.solution_format import extract_final_answer_numeric_str
+from src.utils.attn_backend import select_attn_implementation
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-8s %(name)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+REPORTS_DIR   = Path("reports")
+# ── Data loading ──────────────────────────────────────────────────────────────
+def load_gsm8k_questions(
+    data_path: Optional[str],
+    num_questions: int,
+    seed: int = 42,
+) -> List[Dict[str, str]]:
+    """
+    Load GSM8K questions from a local JSONL file or fall back to HuggingFace.
+    Each returned record has keys: ``question``, ``gold_final``, ``answer``.
+    """
+    # ── Try local JSONL first ────────────────────────────────────────────────
+    candidates = [data_path] if data_path else []
+    candidates += [
+        "data/sft/gsm8k_test.jsonl",
+        "data/sft/gsm8k_sft.jsonl",
+    ]
+    for path in candidates:
+        if path and Path(path).exists():
+            logger.info("Loading GSM8K from local file: %s", path)
+            rows: List[Dict] = []
+            with open(path, encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if line:
+                        rows.append(json.loads(line))
+            rng = random.Random(seed)
+            sample = rng.sample(rows, min(num_questions, len(rows)))
+            logger.info("Sampled %d / %d questions.", len(sample), len(rows))
+            return sample
+    # ── Fall back to HuggingFace datasets ────────────────────────────────────
+    logger.info("No local file found — downloading GSM8K from HuggingFace…")
+    try:
+        from datasets import load_dataset
+        ds = load_dataset("openai/gsm8k", "main", split="test")
+    except Exception as e:
+        raise RuntimeError(
+            "Could not load GSM8K. Provide --data or install datasets: pip install datasets"
+        ) from e
+    rows = []
+    for item in ds:
+        q = item["question"].strip()
+        a = item["answer"].strip()
+        # GSM8K answers end with "#### <number>"
+        gold = a.split("####")[-1].strip() if "####" in a else ""
+        rows.append({"question": q, "gold_final": gold, "answer": a})
+    rng = random.Random(seed)
+    sample = rng.sample(rows, min(num_questions, len(rows)))
+    logger.info("Sampled %d questions from HF GSM8K test split.", len(sample))
+    return sample
+# ── Model loading ─────────────────────────────────────────────────────────────
+def load_base_model(
+    device: torch.device,
+    attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+    logger.info("Loading base model: %s", BASE_MODEL_ID)
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "left"
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map={"": device},
+        trust_remote_code=True,
+        attn_implementation=attn_impl,
+    )
+    model.eval()
+    logger.info("Base model loaded.")
+    return model, tokenizer
+def load_rl_model(
+    checkpoint: str,
+    base_model: AutoModelForCausalLM,
+    base_tokenizer: AutoTokenizer,
+    device: torch.device,
+    attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+    """
+    Load the RL fine-tuned checkpoint for comparison against the raw base model.
+    Two checkpoint formats are supported:
+    PEFT / LoRA adapter (has adapter_config.json)
+        The already-loaded base model weights are deep-copied in CPU memory,
+        the adapter is applied on top, then merged and unloaded.
+        This avoids downloading the 1.5B base weights from HuggingFace a
+        second time — the base model is downloaded only once per run.
+    Full saved model (has config.json, no adapter_config.json)
+        Loaded directly from disk with from_pretrained.
+    """
+    import copy
+    ckpt_path = Path(checkpoint)
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint not found: {checkpoint}")
+    is_peft = (ckpt_path / "adapter_config.json").exists()
+    if is_peft:
+        logger.info(
+            "Loading PEFT adapter from %s  (reusing base weights — no second HF download)",
+            checkpoint,
+        )
+        from peft import PeftModel
+        # Deep-copy the already-loaded base model so the base remains untouched
+        # for side-by-side comparison.  For a 1.5B bfloat16 model this takes
+        # ~1-2 s and avoids re-downloading ~3 GB from HuggingFace.
+        base_copy = copy.deepcopy(base_model)
+        model = PeftModel.from_pretrained(base_copy, checkpoint)
+        model = model.merge_and_unload()
+        model = model.to(device)
+    else:
+        logger.info("Loading full model checkpoint from %s", checkpoint)
+        model = AutoModelForCausalLM.from_pretrained(
+            checkpoint,
+            torch_dtype=torch.bfloat16,
+            device_map={"": device},
+            trust_remote_code=True,
+            attn_implementation=attn_impl,
+        )
+    # Patch chat_template from base tokenizer if missing
+    tokenizer = AutoTokenizer.from_pretrained(
+        checkpoint if (ckpt_path / "tokenizer_config.json").exists() else BASE_MODEL_ID,
+        trust_remote_code=True,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "left"
+    if tokenizer.chat_template is None and base_tokenizer.chat_template:
+        tokenizer.chat_template = base_tokenizer.chat_template
+    model.eval()
+    logger.info("RL model loaded.")
+    return model, tokenizer
+# ── Inference ─────────────────────────────────────────────────────────────────
+def generate_solution(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    question: str,
+    device: torch.device,
+    max_new_tokens: int = 512,
+    temperature: float = 0.1,
+) -> Tuple[str, float]:
+    """
+    Generate a step-by-step solution for ``question``.
+    Returns ``(solution_text, elapsed_seconds)``.
+    Low temperature (0.1) for deterministic, greedy-like output during eval.
+    """
+    messages = create_solver_messages(question)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    prompt_len = inputs["input_ids"].shape[1]
+    stop_ids = [tokenizer.eos_token_id]
+    im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    if isinstance(im_end, int) and im_end not in stop_ids:
+        stop_ids.append(im_end)
+    t0 = time.time()
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=temperature > 0.05,
+            temperature=temperature if temperature > 0.05 else None,
+            top_p=0.95 if temperature > 0.05 else None,
+            eos_token_id=stop_ids,
+            pad_token_id=tokenizer.pad_token_id,
+            use_cache=True,
+        )
+    elapsed = time.time() - t0
+    response_ids = output[0][prompt_len:]
+    solution = tokenizer.decode(response_ids, skip_special_tokens=True).strip()
+    return solution, elapsed
+def score_answer(solution: str, gold_final: str) -> Dict[str, Any]:
+    """
+    Extract the predicted final answer and compare with gold.
+    Returns a dict with ``predicted``, ``gold``, ``correct``, ``match_type``.
+    """
+    predicted_raw = extract_final_answer_numeric_str(solution)
+    if predicted_raw is None:
+        return {
+            "predicted": None,
+            "gold": gold_final,
+            "correct": False,
+            "match_type": "no_answer_found",
+        }
+    # Normalise: strip whitespace, remove commas (e.g. "1,200" → "1200")
+    def _norm(s: str) -> str:
+        return s.strip().replace(",", "").rstrip(".").lower()
+    pred_n = _norm(predicted_raw)
+    gold_n = _norm(gold_final)
+    # Direct string match
+    if pred_n == gold_n:
+        return {
+            "predicted": predicted_raw,
+            "gold": gold_final,
+            "correct": True,
+            "match_type": "exact",
+        }
+    # Numeric match (handles float/int equivalence)
+    try:
+        pred_f = float(pred_n)
+        gold_f = float(gold_n)
+        if abs(pred_f - gold_f) < 1e-6:
+            return {
+                "predicted": predicted_raw,
+                "gold": gold_final,
+                "correct": True,
+                "match_type": "numeric",
+            }
+    except (ValueError, TypeError):
+        pass
+    return {
+        "predicted": predicted_raw,
+        "gold": gold_final,
+        "correct": False,
+        "match_type": "wrong",
+    }
+# ── Report serialisation ──────────────────────────────────────────────────────
+def save_question_report(
+    report_dir: Path,
+    idx: int,
+    question: str,
+    gold_final: str,
+    base_result: Dict[str, Any],
+    rl_result: Optional[Dict[str, Any]],
+) -> Path:
+    record = {
+        "idx": idx,
+        "question": question,
+        "gold_final": gold_final,
+        "base_model": base_result,
+        "rl_model": rl_result,
+    }
+    out = report_dir / f"q_{idx:04d}.json"
+    out.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8")
+    return out
+def save_summary(
+    report_dir: Path,
+    run_name: str,
+    checkpoint: Optional[str],
+    base_correct: int,
+    rl_correct: Optional[int],
+    total: int,
+    total_time_s: float,
+    args_dict: Dict,
+) -> None:
+    summary = {
+        "run_name": run_name,
+        "timestamp": datetime.now().isoformat(),
+        "base_model": BASE_MODEL_ID,
+        "rl_checkpoint": checkpoint,
+        "num_questions": total,
+        "base_accuracy": round(base_correct / total, 4) if total else 0,
+        "rl_accuracy": round(rl_correct / total, 4) if (rl_correct is not None and total) else None,
+        "base_correct": base_correct,
+        "rl_correct": rl_correct,
+        "total_time_s": round(total_time_s, 1),
+        "args": args_dict,
+    }
+    out = report_dir / "summary.json"
+    out.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    logger.info("Summary saved → %s", out)
+# ── Main ──────────────────────────────────────────────────────────────────────
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="Run inference: base vs RL model on GSM8K")
+    p.add_argument("--checkpoint",      type=str,  default=None,
+                   help="Path to RL fine-tuned model or PEFT adapter. "
+                        "If omitted, only the base model is run.")
+    p.add_argument("--data",            type=str,  default=None,
+                   help="Path to local GSM8K JSONL file. "
+                        "Defaults to data/sft/gsm8k_test.jsonl or HuggingFace.")
+    p.add_argument("--num-questions",   type=int,  default=50)
+    p.add_argument("--seed",            type=int,  default=42)
+    p.add_argument("--max-new-tokens",  type=int,  default=512)
+    p.add_argument("--temperature",     type=float, default=0.1)
+    p.add_argument("--run-name",        type=str,  default=None,
+                   help="Report sub-folder name. Defaults to timestamp.")
+    p.add_argument("--base-only",       action="store_true",
+                   help="Skip RL model; only run the base model.")
+    p.add_argument("--reports-dir",     type=str,  default="reports")
+    return p.parse_args()
+def main() -> None:
+    args = parse_args()
+    run_name   = args.run_name or f"run_{datetime.now():%Y%m%d_%H%M%S}"
+    report_dir = Path(args.reports_dir) / run_name
+    report_dir.mkdir(parents=True, exist_ok=True)
+    logger.info("Reports → %s", report_dir)
+    # ── Device ────────────────────────────────────────────────────────────────
+    device   = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    attn_impl = select_attn_implementation()
+    logger.info("Device: %s | attn: %s", device, attn_impl)
+    if torch.cuda.is_available():
+        g = torch.cuda.get_device_properties(0)
+        logger.info("GPU: %s | %.1f GB", g.name, g.total_memory / 1e9)
+    # ── Data ──────────────────────────────────────────────────────────────────
+    questions = load_gsm8k_questions(args.data, args.num_questions, args.seed)
+    # ── Models ────────────────────────────────────────────────────────────────
+    base_model, base_tokenizer = load_base_model(device, attn_impl)
+    rl_model, rl_tokenizer = None, None
+    if not args.base_only and args.checkpoint:
+        rl_model, rl_tokenizer = load_rl_model(
+            args.checkpoint, base_model, base_tokenizer, device, attn_impl
+        )
+    elif not args.base_only and not args.checkpoint:
+        logger.warning("No --checkpoint provided. Running base model only.")
+    # ── Inference loop ────────────────────────────────────────────────────────
+    base_correct = 0
+    rl_correct   = 0 if rl_model else None
+    t_total_start = time.time()
+    for idx, row in enumerate(tqdm(questions, desc="Inference")):
+        question   = row["question"]
+        gold_final = row.get("gold_final", "").strip()
+        # Base model
+        base_solution, base_time = generate_solution(
+            base_model, base_tokenizer, question, device,
+            args.max_new_tokens, args.temperature,
+        )
+        base_score = score_answer(base_solution, gold_final)
+        if base_score["correct"]:
+            base_correct += 1
+        base_result = {
+            "solution":  base_solution,
+            "predicted": base_score["predicted"],
+            "correct":   base_score["correct"],
+            "match_type": base_score["match_type"],
+            "time_s":    round(base_time, 2),
+            "num_tokens": len(base_tokenizer.encode(base_solution)),
+        }
+        # RL model
+        rl_result = None
+        if rl_model is not None:
+            rl_solution, rl_time = generate_solution(
+                rl_model, rl_tokenizer, question, device,
+                args.max_new_tokens, args.temperature,
+            )
+            rl_score = score_answer(rl_solution, gold_final)
+            if rl_score["correct"]:
+                rl_correct += 1
+            rl_result = {
+                "solution":  rl_solution,
+                "predicted": rl_score["predicted"],
+                "correct":   rl_score["correct"],
+                "match_type": rl_score["match_type"],
+                "time_s":    round(rl_time, 2),
+                "num_tokens": len(rl_tokenizer.encode(rl_solution)),
+            }
+        save_question_report(report_dir, idx, question, gold_final, base_result, rl_result)
+        # Live progress log every 10 questions
+        if (idx + 1) % 10 == 0 or idx == len(questions) - 1:
+            done = idx + 1
+            b_acc = base_correct / done
+            log_str = f"[{done}/{len(questions)}] Base acc: {b_acc:.1%}"
+            if rl_correct is not None:
+                log_str += f"  |  RL acc: {rl_correct / done:.1%}"
+            logger.info(log_str)
+    total_time = time.time() - t_total_start
+    # ── Summary ───────────────────────────────────────────────────────────────
+    save_summary(
+        report_dir=report_dir,
+        run_name=run_name,
+        checkpoint=args.checkpoint,
+        base_correct=base_correct,
+        rl_correct=rl_correct,
+        total=len(questions),
+        total_time_s=total_time,
+        args_dict=vars(args),
+    )
+    logger.info("=" * 60)
+    logger.info("Run complete: %s", run_name)
+    logger.info("Base accuracy : %d / %d = %.1f%%",
+                base_correct, len(questions), 100 * base_correct / len(questions))
+    if rl_correct is not None:
+        logger.info("RL accuracy   : %d / %d = %.1f%%",
+                    rl_correct, len(questions), 100 * rl_correct / len(questions))
+        delta = rl_correct - base_correct
+        sign  = "+" if delta >= 0 else ""
+        logger.info("Delta         : %s%d questions (%s%.1f%%)",
+                    sign, delta, sign, 100 * delta / len(questions))
+    logger.info("Reports       : %s", report_dir)
+    logger.info("=" * 60)
+if __name__ == "__main__":
+    main()