diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c03432d118679b731f1563288db6167506be3379
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,73 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+.nox/
+coverage.xml
+*.cover
+*.py,cover
+
+# Type checkers / static analyzers
+.mypy_cache/
+.pyre/
+.ruff_cache/
+.pytype/
+
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+
+# Local environment files
+.env
+.env.*
+*.local
+
+# IDE / editor files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS files
+.DS_Store
+Thumbs.db
+data/
+
+*/ui
+images/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..ef079d6e7a152b326245c486645867afc70a5b27
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,130 @@
+# AxiomForgeAI — GRPO Training Image
+# ─────────────────────────────────────────────────────────────────────────────
+# Hardware target : 1× A100 PCIE 80 GB | AMD EPYC 7V13 | NVMe 300 GB
+#
+# CUDA driver : >= 13.0 (enforced at container start via entrypoint)
+# CUDA toolkit : 12.4.1 (backward-compatible with driver 13.x)
+# PyTorch : 2.5.1+cu124 (pinned in requirements.txt)
+# Flash-Attn : 2.8.3 (pinned in requirements.txt)
+#
+# All Python package versions are taken exclusively from requirements.txt.
+# No versions are hard-coded in this file.
+#
+# ── Build ─────────────────────────────────────────────────────────────────────
+# docker build -t axiomforgeai-train:latest .
+#
+# ── Interactive shell ─────────────────────────────────────────────────────────
+# docker run --gpus all --ipc=host --ulimit memlock=-1 \
+# -v $(pwd)/data:/workspace/data \
+# -v $(pwd)/checkpoints:/workspace/checkpoints \
+# -v $(pwd)/logs:/workspace/logs \
+# -it axiomforgeai-train:latest bash
+#
+# ── GRPO training (one-shot) ──────────────────────────────────────────────────
+# docker run --gpus all --ipc=host --ulimit memlock=-1 \
+# -v $(pwd)/data:/workspace/data \
+# -v $(pwd)/checkpoints:/workspace/checkpoints \
+# -v $(pwd)/logs:/workspace/logs \
+# axiomforgeai-train:latest \
+# python scripts/run_grpo_training.py \
+# --base-model checkpoints/dual_task_v1 \
+# --gsm8k-data data/sft/gsm8k_sft.jsonl \
+# --num-iterations 30 --group-size 8 --questions-per-iter 16
+# ─────────────────────────────────────────────────────────────────────────────
+
+# CUDA toolkit 12.4.1 — matches the cu124 wheels in requirements.txt and is
+# fully compatible with the A100's CUDA 13.2 driver (driver is always ≥ toolkit).
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
+
+LABEL org.opencontainers.image.title="AxiomForgeAI Training" \
+ cuda.driver.minimum="13.0" \
+ cuda.toolkit="12.4.1" \
+ torch.version="2.5.1+cu124" \
+ flash_attn.version="2.8.3"
+
+# ── System packages ────────────────────────────────────────────────────────────
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3.11 \
+ python3.11-dev \
+ python3-pip \
+ python3.11-venv \
+ git \
+ git-lfs \
+ curl \
+ wget \
+ build-essential \
+ ninja-build \
+ pkg-config \
+ libssl-dev \
+ libffi-dev \
+ ca-certificates \
+ && ln -sf /usr/bin/python3.11 /usr/bin/python3 \
+ && ln -sf /usr/bin/python3 /usr/bin/python \
+ && rm -rf /var/lib/apt/lists/*
+
+# ── Upgrade pip + build tooling ───────────────────────────────────────────────
+RUN python -m pip install --upgrade --no-cache-dir pip setuptools wheel
+
+# ── PyTorch (CUDA 12.4 wheels) ────────────────────────────────────────────────
+# Must be installed before flash-attn because flash-attn runs a torch version
+# check at install time. The cu124 index is also used for all CUDA-linked wheels.
+# Version is taken from requirements.txt — the --constraint flag keeps pip from
+# re-resolving to a different version when requirements.txt is processed next.
+RUN pip install --no-cache-dir \
+ --extra-index-url https://download.pytorch.org/whl/cu124 \
+ "torch==2.5.1" "torchvision==0.20.1" "torchaudio==2.5.1"
+
+# ── All remaining pinned requirements (from requirements.txt) ─────────────────
+# flash-attn, xformers, vllm, triton, bitsandbytes, transformers, accelerate,
+# peft, ray, sympy, scipy, numpy, openenv-core, fastapi, uvicorn, … are all
+# installed here at the exact versions pinned in requirements.txt.
+# The cu124 index is provided so CUDA-linked wheels resolve correctly.
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir \
+ --extra-index-url https://download.pytorch.org/whl/cu124 \
+ -r /tmp/requirements.txt
+
+# ── Project source ────────────────────────────────────────────────────────────
+WORKDIR /workspace
+COPY . /workspace/
+
+# ── Environment variables ─────────────────────────────────────────────────────
+# Repo root on PYTHONPATH so `from src.rl.X import Y` works without editable install
+ENV PYTHONPATH="/workspace:$PYTHONPATH"
+
+# HuggingFace model cache — mount a host path here to persist model downloads:
+# -v /host/hf_cache:/workspace/.hf_cache
+ENV HF_HOME="/workspace/.hf_cache"
+ENV TRANSFORMERS_CACHE="/workspace/.hf_cache"
+
+# A100 CUDA / NCCL tuning
+ENV CUDA_DEVICE_MAX_CONNECTIONS=1
+ENV NCCL_P2P_DISABLE=0
+ENV NCCL_IB_DISABLE=0
+# Required for Flash-Attn 2 with bfloat16 on Ampere
+ENV TORCH_CUDNN_V8_API_ENABLED=1
+
+# ── Runtime entrypoint: enforce CUDA driver >= 13.0 ──────────────────────────
+# nvidia-smi is injected at runtime via --gpus, so this check runs when the
+# container starts, not at build time.
+RUN printf '%s\n' \
+ '#!/bin/sh' \
+ 'if command -v nvidia-smi >/dev/null 2>&1; then' \
+ ' CUDA_VER=$(nvidia-smi 2>/dev/null | grep -oP "CUDA Version: \K[0-9.]+" || echo "0.0")' \
+ ' MAJOR=$(echo "$CUDA_VER" | cut -d. -f1)' \
+ ' echo "[AxiomForgeAI] CUDA driver reports toolkit: $CUDA_VER"' \
+ ' if [ "${MAJOR:-0}" -lt 13 ] 2>/dev/null; then' \
+ ' echo "[ERROR] CUDA driver >= 13.0 required; detected $CUDA_VER. Upgrade your NVIDIA driver."' \
+ ' exit 1' \
+ ' fi' \
+ ' echo "[AxiomForgeAI] CUDA $CUDA_VER >= 13.0 — OK"' \
+ 'else' \
+ ' echo "[WARNING] nvidia-smi not found — CUDA driver version check skipped."' \
+ 'fi' \
+ 'exec "$@"' \
+ > /usr/local/bin/entrypoint.sh \
+ && chmod +x /usr/local/bin/entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["bash"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae70e605674d272b5ed2ffe4c1ad09e63f685481
--- /dev/null
+++ b/README.md
@@ -0,0 +1,132 @@
+---
+title: AxiomForgeAI Environment Server
+emoji: 🌌
+colorFrom: indigo
+colorTo: pink
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+ - openenv
+---
+
+# AxiomForgeAI
+
+[](https://github.com/meta-pytorch/OpenEnv)
+
+*A self-improving math environment where a model practices on verified problems, generates new challenges when ready, and learns from solution attempts whose reasoning steps and final answers agree.*
+
+## The Problem
+
+Math reasoning models can fail in two different ways. Sometimes the setup, arithmetic, and algebraic steps look reasonable, but the final answer is wrong. Sometimes the final answer is right, but the reasoning that produced it is incomplete, inconsistent, or hard to trust.
+
+For a math user, both failures matter. Checking only the final answer misses where the solution went off track. Checking only the steps misses whether the work actually reaches the right result. The useful signal is the agreement between the reasoning path and the final answer.
+
+This project builds a practice loop around that signal. The model first works on problems with known answers, gets feedback on both the chain of reasoning and the final result, and only then starts generating new challenges for itself. The constraint is intentionally small: a 1.5B math model.
+
+## The Environment
+
+The environment is a practice loop for math reasoning. Each training group starts with one problem, asks the model for multiple solution attempts, scores those attempts from several angles, and uses GRPO to reinforce the attempts that are stronger than the rest of the group.
+
+
+
+The environment has two task sources:
+
+- **Grounded source:** A dataset problem from GSM8K / MATH comes with a known final answer. This gives the environment a reliable anchor for checking whether the model actually reached the right result.
+- **Self-play source:** The curriculum selects a target skill and difficulty. The model writes a new question, then samples multiple solutions to that question. This adds practice beyond static datasets, but only after the grounded signal is stable enough.
+
+Both sources feed the same scoring and update loop. For every selected problem, the model samples `K` candidate solutions. The environment checks final-answer correctness when a gold answer exists, scores reasoning quality with a PRM, checks chain consistency and symbolic arithmetic where possible, checks answer formatting, and scores self-generated questions for clarity, novelty, difficulty fit, and solvability.
+
+GRPO then compares the `K` attempts against each other. The model is not rewarded for a solution in isolation; the strongest attempt in the group becomes the direction for learning. Training starts grounded-only, gradually mixes in self-play groups, and falls back to grounded practice if generated-question quality or answer correctness drops.
+
+## How Self-Improvement Works
+
+Self-improvement comes from turning each problem into a small comparison. The model does not produce one solution and move on; the environment samples several attempts, scores each attempt, and asks which reasoning path was strongest.
+
+GRPO uses that within-group comparison as the learning signal. Attempts with correct answers, stronger reasoning chains, and cleaner final-answer format are reinforced. Attempts with broken chains or unsupported answers become weaker examples.
+
+```text
+practice -> sample attempts -> verify steps and answer -> compare -> reinforce -> adjust difficulty
+```
+
+## Reward System
+
+The reward is designed to avoid a common math-training failure: optimizing for either the final answer or the reasoning trace alone. A good solution should reach the right answer, explain the path clearly, and keep the final result consistent with the steps that produced it.
+
+| Signal | What it checks | Why it matters |
+| --- | --- | --- |
+| Final answer | Matches the gold answer when one exists | Keeps grounded problems tied to objective correctness |
+| Process score | PRM score over the reasoning steps | Rewards clear mathematical progress, not just the last line |
+| Chain consistency | Correct-prefix and step-answer consistency signals | Gives partial learning signal when a solution goes wrong midway |
+| Format | Parseable final answer and clean response structure | Makes automatic grading reliable |
+| Question quality | Topic fit, difficulty fit, clarity, novelty, and solvability | Keeps self-play from generating vague or useless practice tasks |
+
+Grounded problems use the gold answer as the anchor. Self-play problems add a question-quality score before the solution reward is trusted. Both paths produce one combined score for each sampled attempt, and GRPO uses those scores only in comparison with the other attempts from the same problem.
+
+```text
+grounded: answer correctness + process score + chain consistency + format
+self-play: question quality + solution quality
+both -> one combined score per attempt -> GRPO compares attempts within the group
+```
+
+## Training Phases
+
+Training follows a simple three-phase schedule. It starts with grounded-only practice so the model learns to keep answers and reasoning stable on problems with known solutions. Self-play is then introduced gradually, while grounded questions remain as an anchor. Once both are stable, training continues with a mixed task source and falls back to grounded-only batches if answer quality drops.
+
+
+
+## Training Script
+
+The GRPO training loop is available in two forms:
+
+- [`scripts/launch_grpo.sh`](scripts/launch_grpo.sh) — the primary launch script; sets CUDA/threading env vars, verifies Flash-Attention, and calls `run_grpo_training.py` with the full parameter set.
+
+ ```bash
+ bash scripts/launch_grpo.sh
+ ```
+- [`train_grpo.ipynb`](train_grpo.ipynb) — notebook version with the same parameters, structured around `env.reset / env.step / env.state / env.close` for interactive inspection.
+
+
+## Results
+
+These plots come from a single GPU training run and focus on the core question: did the model get better at making its reasoning and final answer agree?
+
+### Evaluation Quality Over Training
+
+
+
+The environment tracks final correctness, solution quality, step validity, and how long the reasoning chain stays correct. All four move upward together, which suggests the model is not just finding better final answers. It is also producing reasoning that holds up longer.
+
+### Training Journey
+
+
+
+Training starts with grounded practice on problems with known answers. Self-play is introduced only after the grounded signal is stable, so the model does not train on its own generated problems too early. The transition is conditional, not just a timer.
+
+### Self-Play Curriculum
+
+
+
+By the end of training, most practice came from self-play. The important part is that generated problems stayed solvable and novel even after self-play became a larger share of training. That makes the ramp meaningful: self-play added useful practice instead of recycled noise.
+
+### Reward Confidence
+
+
+
+The reward spread shows how much contrast exists between the model's best and worst attempts. Wide spread gives GRPO something to learn from. Skipped groups are cases where attempts are too similar to compare usefully. That rate falls as harder material enters the curriculum, which suggests the comparison signal stays useful.
+
+### Step-Level Reasoning Quality
+
+
+
+Step accuracy checks whether each line of reasoning is valid. Chain integrity checks whether those valid steps form an unbroken path to the answer. Both improve together, which means the model is building solutions that hold together more often instead of only producing better-looking outputs.
+
+## Why It Matters
+
+Reliable math reasoning needs more than fluent explanations or lucky final answers. A system that can separate correct reasoning from unsupported answers gives the model a better training target: not just "get the number," but build a chain of logic that reaches the number.
+
+AxiomForgeAI matters because it turns that target into an environment. The same pattern can extend beyond math to other verifiable domains where attempts can be checked, compared, and improved: code, logic, structured data transformations, and scientific problem solving.
+
+---
+*Engineered for the OpenEnv Hackathon India 2026*
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9435d08b0b3fe528eef1006b512980e96c220c
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Axiomforgeai Environment."""
+
+from .client import AxiomforgeaiEnv
+from .models import AxiomforgeaiAction, AxiomforgeaiObservation
+
+__all__ = [
+ "AxiomforgeaiAction",
+ "AxiomforgeaiObservation",
+ "AxiomforgeaiEnv",
+]
diff --git a/blog.md b/blog.md
new file mode 100644
index 0000000000000000000000000000000000000000..f59bd22db56fe3111257f694db30669c2fbb0ceb
--- /dev/null
+++ b/blog.md
@@ -0,0 +1,94 @@
+# AxiomForgeAI: Self-Improving Math Models Need More Than the Final Answer
+
+Math models have a strange failure mode.
+
+They can write a solution that looks careful, step-by-step, and confident, then end with the wrong answer. They can also produce the right final number with reasoning that is incomplete, inconsistent, or impossible to trust.
+
+For math, that gap matters. The final answer is not enough. A proof, derivation, or word-problem solution only becomes useful when the path and the answer support each other.
+
+AxiomForgeAI is built around that idea.
+
+Instead of treating math reasoning as a one-shot generation problem, AxiomForgeAI turns it into a practice environment. The model does not simply answer a question and move on. It attempts the same problem multiple ways, receives feedback on both the reasoning path and the final answer, and learns from the attempts where the two agree.
+
+## The Architecture
+
+
+
+AxiomForgeAI is a training loop around one simple idea: a math solution should be judged by whether the reasoning path and the final answer support each other.
+
+The environment first selects one task. It can come from a grounded dataset problem with a known answer, or from a self-play question written from a curriculum target. Only after that task is selected does the model sample `K` candidate solutions. The environment scores each attempt, and GRPO compares the attempts within that same problem group.
+
+That is the important part. The model is not rewarded for sounding fluent. It is rewarded when the chain of reasoning and the final answer line up.
+
+## Where Practice Comes From
+
+
+
+The environment uses two sources of problems.
+
+Grounded practice starts with dataset problems from sources like GSM8K or MATH. These problems come with known final answers, so the environment has a reliable anchor for correctness.
+
+Self-play starts later. The curriculum selects a skill and difficulty, and the model writes a new question. That question is only useful if it is clear, solvable, on-topic, and appropriately difficult. This keeps self-play from becoming random problem generation.
+
+Both sources eventually become the same interface: one selected problem. From there, the model samples multiple candidate solutions and the environment compares the resulting reasoning paths.
+
+## What Gets Checked
+
+
+
+AxiomForgeAI does not rely on a single reward signal. A final answer check is useful, but it is not enough. A process score is useful, but it is also not enough. The environment combines several signals so that a polished but wrong solution does not look good, and a lucky answer with weak reasoning does not look good either.
+
+For grounded problems, the gold answer anchors correctness. For all attempts, the environment also looks at reasoning quality, chain consistency, symbolic arithmetic where possible, and whether the answer can be parsed cleanly. For self-play, the generated question itself is scored before the solution reward is trusted.
+
+The result is one score per attempt. That score is not the end of training. It becomes useful because there are other attempts for the same problem.
+
+## Why GRPO Fits
+
+
+
+GRPO turns a problem into a small comparison game. The model samples several attempts for the same prompt. Some are wrong, some are partially right, and one may be clearly better because the answer follows from the steps.
+
+Instead of asking whether an attempt is good in isolation, GRPO asks which attempts are stronger relative to the rest of the group. That relative signal is exactly what this project needs. The model learns from contrast: this reasoning path held together better than the others.
+
+After the update, the improved model goes back into the environment for the next batch. The curriculum can keep it grounded, introduce more self-play, or fall back to grounded-only practice if quality drops.
+
+## Why the 1.5B Constraint Matters
+
+AxiomForgeAI is intentionally built around a compact math model.
+
+That constraint makes the loop easier to see. A smaller model cannot hide every reasoning mistake behind scale. If the setup is wrong, if the arithmetic drifts, or if the final answer does not follow from the steps, the environment has to catch it and turn it into feedback.
+
+The point is not that a compact model magically solves math. The point is that improvement has to come from better practice, better verification, and better selection of reasoning paths.
+
+## What the Model Learns From
+
+AxiomForgeAI rewards attempts that are mathematically useful, not just polished.
+
+The model learns to solve problems with reasoning that supports the answer. It also learns, during self-play, to generate practice problems that are worth solving. A useful self-generated problem should be clear, solvable, on-topic, appropriately difficult, and not just a duplicate of what the model has already seen.
+
+That makes the loop different from ordinary fine-tuning. The model is not only seeing more answers. It is practicing, being checked, and learning from the solution paths that survived verification.
+
+## Where Examples Will Go
+
+This section will include real model responses from the run.
+
+- an example where the model had good steps but a wrong final answer
+- an example where the model guessed correctly but the reasoning was weak
+- an example after training where the reasoning chain and final answer agree
+- a self-generated problem that passed the quality checks
+
+These examples are important because the project is not only about a metric. The clearest evidence is seeing the model become better at making the path and the answer line up.
+
+## Why This Matters
+
+Math is a good starting point because mistakes are often checkable. Arithmetic can be verified. Final answers can be compared. Reasoning steps can be scored. That makes math a clean domain for building self-improvement loops.
+
+But the pattern is bigger than math.
+
+Many useful AI tasks have the same structure. Generate an attempt, check it, compare it against alternatives, and reinforce the better path. Code, logic, structured data transformation, and scientific problem solving all benefit from environments where progress can be verified.
+
+AxiomForgeAI is one version of that pattern. It asks a simple question.
+
+> What if a model could practice until its reasoning and answers agreed?
+
+That is the loop this project builds.
diff --git a/client.py b/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3b759f08724417cc2813615477b1d75738a20f
--- /dev/null
+++ b/client.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""AxiomForgeAI Math RL Environment Client."""
+
+from typing import Any, Dict, Optional
+
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+
+from .models import AxiomforgeaiAction, AxiomforgeaiObservation
+
+
+class AxiomforgeaiEnv(
+ EnvClient[AxiomforgeaiAction, AxiomforgeaiObservation, State]
+):
+ """
+ Client for the AxiomForgeAI math RL environment.
+
+ Maintains a persistent WebSocket connection to the environment server.
+ Each client instance gets its own session with independent episode state.
+
+ Episode flow::
+
+ with AxiomforgeaiEnv(base_url="http://localhost:8000") as env:
+ # 1. Reset — receive a math question
+ result = env.reset()
+ question = result.observation.question
+
+ # 2. Step — submit a solution, receive reward + feedback
+ solution = "Step 1: ... Final Answer: 42"
+ result = env.step(AxiomforgeaiAction(solution=solution))
+ print(result.reward, result.observation.feedback)
+
+ Example with Docker::
+
+ client = AxiomforgeaiEnv.from_docker_image("axiomforgeai-env:latest")
+ try:
+ result = client.reset()
+ result = client.step(AxiomforgeaiAction(solution="Final Answer: 17"))
+ finally:
+ client.close()
+ """
+
+ def _step_payload(self, action: AxiomforgeaiAction) -> Dict[str, Any]:
+ """Convert AxiomforgeaiAction to JSON payload for the step endpoint."""
+ return {"solution": action.solution}
+
+ def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AxiomforgeaiObservation]:
+ """Parse the server's step response into a StepResult."""
+ obs_data: Dict[str, Any] = payload.get("observation", {})
+ observation = AxiomforgeaiObservation(
+ question=obs_data.get("question", ""),
+ topic=obs_data.get("topic", ""),
+ difficulty=float(obs_data.get("difficulty", 0.5)),
+ feedback=obs_data.get("feedback", ""),
+ done=payload.get("done", False),
+ reward=payload.get("reward"),
+ metadata=obs_data.get("metadata"),
+ )
+ return StepResult(
+ observation=observation,
+ reward=payload.get("reward"),
+ done=payload.get("done", False),
+ )
+
+ def _parse_state(self, payload: Dict[str, Any]) -> State:
+ """Parse the server's state response into a State object."""
+ return State(
+ episode_id=payload.get("episode_id"),
+ step_count=payload.get("step_count", 0),
+ )
diff --git a/docs/environment-overview.puml b/docs/environment-overview.puml
new file mode 100644
index 0000000000000000000000000000000000000000..ac82c40a34fd9629d7c4d6b11192fd279ac18dc7
--- /dev/null
+++ b/docs/environment-overview.puml
@@ -0,0 +1,69 @@
+@startuml environment_overview
+!theme plain
+top to bottom direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 42
+skinparam ranksep 42
+
+title AxiomForgeAI - Phase-Controlled Math Reasoning Loop
+
+rectangle "Small Math Model\n1.5B parameters" as MODEL #DBEAFE
+
+rectangle "Phase Controller\nwarmup: grounded only\nramp: gradual self-play\ncontinuous: capped mix + fallback" as PHASE #E2E8F0
+
+rectangle "Task Source\nfor each GRPO group" as SELECT #E2E8F0
+
+rectangle "Grounded Source\nKnown-answer practice" as GLANE #ECFDF5 {
+ rectangle "Dataset problem\nGSM8K / MATH" as GQ #CCFBF1
+ rectangle "Gold answer\navailable" as GOLD #CCFBF1
+ rectangle "Model samples\nK solutions" as GSOL #CCFBF1
+}
+
+rectangle "Self-Play Source\nModel-made challenges" as SLANE #EEF2FF {
+ rectangle "Curriculum picks\nskill + difficulty" as CURRIC #E0E7FF
+ rectangle "Model writes\na new question" as SQ #E0E7FF
+ rectangle "Model samples\nK solutions" as SSOL #E0E7FF
+}
+
+rectangle "Shared Grading\nanswer, steps, arithmetic, format\n+ question quality for self-play" as GRADERS #F1F5F9
+
+rectangle "Group Comparison\nWhich attempts worked best?" as COMPARE #EDE9FE
+rectangle "GRPO Update\nReinforce stronger reasoning" as GRPO #DDD6FE
+rectangle "Improved Model\nfor the next round" as NEXT #DBEAFE
+
+MODEL -down-> PHASE
+PHASE -down-> SELECT
+
+note right of PHASE
+ sets mix
+end note
+
+SELECT -left-> GQ : grounded slot
+GQ --> GOLD
+GOLD --> GSOL
+
+SELECT -right-> CURRIC : self-play slot
+CURRIC --> SQ
+SQ --> SSOL
+
+GSOL -down-> GRADERS
+SSOL -down-> GRADERS
+GRADERS -right-> COMPARE
+COMPARE -right-> GRPO
+GRPO -right-> NEXT
+NEXT -up-> MODEL : repeat
+
+note bottom of SELECT
+ Each batch is randomly interleaved.
+ Phase 1 uses grounded only.
+ Later phases add self-play slots by ratio.
+end note
+@enduml
diff --git a/docs/reward-system.puml b/docs/reward-system.puml
new file mode 100644
index 0000000000000000000000000000000000000000..028dfda63fbed34e51e75c2951d1910d6d456971
--- /dev/null
+++ b/docs/reward-system.puml
@@ -0,0 +1,51 @@
+@startuml reward_system
+!theme plain
+top to bottom direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 54
+skinparam ranksep 60
+
+title AxiomForgeAI - Reward System
+
+rectangle "Sampled Solution Attempt" as ATTEMPT #DBEAFE
+
+rectangle "Grounded Reward\nknown-answer problem" as GROUNDED #ECFDF5 {
+ rectangle "Final answer\nmatches gold" as GOLD #CCFBF1
+ rectangle "PRM process score\nreasoning quality" as GPRM #CCFBF1
+ rectangle "Chain consistency\ncorrect prefix + final check" as GCHAIN #CCFBF1
+ rectangle "Format score\nparseable final answer" as GFORMAT #CCFBF1
+}
+
+rectangle "Self-Play Reward\ngenerated challenge" as SELFPLAY #EEF2FF {
+ rectangle "Question quality\nclarity, novelty, solvability" as QUALITY #E0E7FF
+ rectangle "Solution quality\nPRM + chain checks" as SOLUTION #E0E7FF
+ rectangle "Format score\nparseable final answer" as SFORMAT #E0E7FF
+}
+
+rectangle "Combined Reward\none score per attempt" as SCORE #F1F5F9
+rectangle "GRPO Group Comparison\nrank attempts within the same problem" as COMPARE #EDE9FE
+rectangle "Step-Answer Alignment\nreward paths where reasoning supports the result" as ALIGN #DDD6FE
+
+ATTEMPT -left-> GROUNDED : grounded
+ATTEMPT -right-> SELFPLAY : self-play
+
+GOLD --> GPRM
+GPRM --> GCHAIN
+GCHAIN --> GFORMAT
+
+QUALITY --> SOLUTION
+SOLUTION --> SFORMAT
+
+GFORMAT -down-> SCORE
+SFORMAT -down-> SCORE
+SCORE -right-> COMPARE
+COMPARE -right-> ALIGN
+@enduml
diff --git a/docs/training-phases.puml b/docs/training-phases.puml
new file mode 100644
index 0000000000000000000000000000000000000000..e796620d5090dc7030d417738d46cc40adc3c5cb
--- /dev/null
+++ b/docs/training-phases.puml
@@ -0,0 +1,27 @@
+@startuml training_phases
+!theme plain
+left to right direction
+skinparam backgroundColor #FEFEFE
+skinparam defaultFontName Arial
+skinparam defaultFontSize 14
+skinparam ArrowColor #334155
+skinparam RectangleBorderColor #64748B
+skinparam RectangleFontColor #0F172A
+skinparam roundcorner 10
+skinparam linetype ortho
+skinparam packageStyle rectangle
+skinparam nodesep 42
+skinparam ranksep 42
+
+title AxiomForgeAI - Training Phases
+
+rectangle "Phase 1\nGrounded Only" as Warmup #ECFDF5
+rectangle "Phase 2\nSelf-Play Ramp" as Ramp #EEF2FF
+rectangle "Phase 3\nMixed Training" as Improve #F1F5F9
+rectangle "Fallback\nGrounded Recovery" as Fallback #EDE9FE
+
+Warmup --> Ramp
+Ramp --> Improve
+Improve --> Fallback : if quality drops
+Fallback --> Improve : recover
+@enduml
diff --git a/images/axiomforgeai_scenes/scene_01.svg b/images/axiomforgeai_scenes/scene_01.svg
new file mode 100644
index 0000000000000000000000000000000000000000..19eb40a0582bb4a87ae3509f90de8051fa738a70
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_01.svg
@@ -0,0 +1,52 @@
+
diff --git a/images/axiomforgeai_scenes/scene_02.svg b/images/axiomforgeai_scenes/scene_02.svg
new file mode 100644
index 0000000000000000000000000000000000000000..ad17ed0f891eacafeed9bfade6da1544dc1e77b0
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_02.svg
@@ -0,0 +1,72 @@
+
diff --git a/images/axiomforgeai_scenes/scene_03.svg b/images/axiomforgeai_scenes/scene_03.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d3f2beb29f24eff376594315f807f35cd38d4f02
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_03.svg
@@ -0,0 +1,67 @@
+
diff --git a/images/axiomforgeai_scenes/scene_04.svg b/images/axiomforgeai_scenes/scene_04.svg
new file mode 100644
index 0000000000000000000000000000000000000000..55ed1eed2d0181b0fbc80115a48b33cda91a3fff
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_04.svg
@@ -0,0 +1,78 @@
+
diff --git a/images/axiomforgeai_scenes/scene_05.svg b/images/axiomforgeai_scenes/scene_05.svg
new file mode 100644
index 0000000000000000000000000000000000000000..28df83570bd87c1c655dee82a7afd8947a617fa4
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_05.svg
@@ -0,0 +1,66 @@
+
diff --git a/images/axiomforgeai_scenes/scene_06.svg b/images/axiomforgeai_scenes/scene_06.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a919f404683d9c995d96ecd6a6aa5ac90a70ab11
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_06.svg
@@ -0,0 +1,79 @@
+
diff --git a/images/axiomforgeai_scenes/scene_07.svg b/images/axiomforgeai_scenes/scene_07.svg
new file mode 100644
index 0000000000000000000000000000000000000000..4d4acf4547411ea86ca58ce4426c98a2129ba438
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_07.svg
@@ -0,0 +1,66 @@
+
diff --git a/images/axiomforgeai_scenes/scene_08.svg b/images/axiomforgeai_scenes/scene_08.svg
new file mode 100644
index 0000000000000000000000000000000000000000..058e2b67de12ef31a6b04ba0be4518b362f93f60
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_08.svg
@@ -0,0 +1,74 @@
+
diff --git a/images/axiomforgeai_scenes/scene_09.svg b/images/axiomforgeai_scenes/scene_09.svg
new file mode 100644
index 0000000000000000000000000000000000000000..9d0475b2350438777f0448a772969c477abfb7a1
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_09.svg
@@ -0,0 +1,61 @@
+
diff --git a/images/axiomforgeai_scenes/scene_10.svg b/images/axiomforgeai_scenes/scene_10.svg
new file mode 100644
index 0000000000000000000000000000000000000000..692824421e615ffb081001ececcfca99c264a40e
--- /dev/null
+++ b/images/axiomforgeai_scenes/scene_10.svg
@@ -0,0 +1,86 @@
+
diff --git a/images/blog_flow/architecture.svg b/images/blog_flow/architecture.svg
new file mode 100644
index 0000000000000000000000000000000000000000..04badf989bfcf00b67d7e67f4185654f5fccb8a3
--- /dev/null
+++ b/images/blog_flow/architecture.svg
@@ -0,0 +1,50 @@
+
diff --git a/images/blog_flow/grading.svg b/images/blog_flow/grading.svg
new file mode 100644
index 0000000000000000000000000000000000000000..bedc939bfaeec1aca6d60a0ca2de876c57bf6c60
--- /dev/null
+++ b/images/blog_flow/grading.svg
@@ -0,0 +1,45 @@
+
diff --git a/images/blog_flow/grpo-loop.svg b/images/blog_flow/grpo-loop.svg
new file mode 100644
index 0000000000000000000000000000000000000000..e30ca9a8b6b12f0caa3b79d028af5654bd24346f
--- /dev/null
+++ b/images/blog_flow/grpo-loop.svg
@@ -0,0 +1,44 @@
+
diff --git a/images/blog_flow/task-sources.svg b/images/blog_flow/task-sources.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1246732ff2d925dd7a29913272c18a7a449bd38a
--- /dev/null
+++ b/images/blog_flow/task-sources.svg
@@ -0,0 +1,35 @@
+
diff --git a/images/environment_overview.svg b/images/environment_overview.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a911680aebab233ad808484b44e7fc05b471213f
--- /dev/null
+++ b/images/environment_overview.svg
@@ -0,0 +1,2582 @@
+
+
diff --git a/images/training_phases.svg b/images/training_phases.svg
new file mode 100644
index 0000000000000000000000000000000000000000..cd4a6474087d482c556baadecf85055342f3867e
--- /dev/null
+++ b/images/training_phases.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/logs/grpo/grpo_20260426_024029.log b/logs/grpo/grpo_20260426_024029.log
new file mode 100644
index 0000000000000000000000000000000000000000..022c572006632835f0d32be0ad203b0afac27f3c
--- /dev/null
+++ b/logs/grpo/grpo_20260426_024029.log
@@ -0,0 +1,44 @@
+2026-04-26 02:40:33,617 INFO __main__ - ======================================================================
+2026-04-26 02:40:33,617 INFO __main__ - GRPO run: grpo_20260426_024029
+2026-04-26 02:40:33,617 INFO __main__ - Checkpoints : checkpoints/grpo/grpo_20260426_024029
+2026-04-26 02:40:33,618 INFO __main__ - Logs : logs/grpo/grpo_20260426_024029
+2026-04-26 02:40:33,618 INFO __main__ - Console log : logs/grpo/grpo_20260426_024029/console_output.log
+2026-04-26 02:40:33,618 INFO __main__ - ======================================================================
+2026-04-26 02:40:33,736 INFO src.utils.attn_backend - Attention backend selected: flash_attention_2
+2026-04-26 02:40:33,736 INFO __main__ - Device: cuda:0 | attn: flash_attention_2
+2026-04-26 02:40:33,753 INFO __main__ - GPU: NVIDIA A100 80GB PCIe | 85.1 GB VRAM | capability sm_80
+2026-04-26 02:40:33,753 INFO __main__ - Run config: K=8 K_q=2 N=16 lr=5.0e-06 T=0.80 max_new=800 | clip_eps=0.20 kl_coef=0.0400 warmup=6 | diff_alpha=3.0 | self_play=70% grounded=30% | math_mix=30% math_maxdiff=3 | overlong_filter=True | eval_every=5 eval_N=100 | grad_clip=0.50 save_every=5 keep_last=3 | question_GRPO=ENABLED (K_q=2)
+2026-04-26 02:40:33,753 INFO __main__ - Loading model from checkpoints/dual_task_v1 ...
+2026-04-26 02:40:34,405 INFO __main__ - Tokenizer has no chat_template; loading from base model Qwen/Qwen2.5-Math-1.5B-Instruct
+2026-04-26 02:40:34,731 INFO __main__ - Chat template loaded successfully.
+2026-04-26 02:40:34,731 INFO __main__ - Detected PEFT adapter — loading base Qwen/Qwen2.5-Math-1.5B-Instruct then merging checkpoints/dual_task_v1
+2026-04-26 02:40:36,242 WARNING __main__ - All parameters were frozen on load (PEFT merge_and_unload bug). Re-enabled requires_grad — any prior frozen runs were training nothing.
+2026-04-26 02:40:36,242 INFO __main__ - Flash-Attn 2 active — gradient checkpointing OFF (Flash already gives O(T) attention memory).
+2026-04-26 02:40:36,243 INFO __main__ - Trainable parameters: 1,543,714,304 / 1,543,714,304 (100.0%)
+2026-04-26 02:40:36,244 INFO __main__ - Creating frozen reference policy (kl_coef=0.0400, ~3.1 GB VRAM)...
+2026-04-26 02:40:36,305 INFO __main__ - Reference policy ready.
+2026-04-26 02:40:36,306 INFO __main__ - LR schedule: 5.0e-06 warmup(6 iters) → cosine decay(24 iters, min=5.0e-07)
+2026-04-26 02:40:36,415 INFO __main__ - Loaded 8792 QA pairs from data/sft/gsm8k_sft.jsonl
+2026-04-26 02:40:36,424 INFO __main__ - Loaded 4072 MATH pairs from data/math/math_numeric.jsonl
+2026-04-26 02:40:36,424 INFO __main__ - MATH mixing: 30% MATH (4072 problems) + 70% GSM8K (8792 problems)
+2026-04-26 02:40:36,424 INFO src.rl.prm_scorer - Loading PRM Qwen/Qwen2.5-Math-PRM-7B (4-bit=True, dtype=torch.bfloat16) on cuda:0 …
+
Loading checkpoint shards: 0%| | 0/4 [00:00, ?it/s]
Loading checkpoint shards: 25%|##5 | 1/4 [00:00<00:02, 1.19it/s]
Loading checkpoint shards: 50%|##### | 2/4 [00:01<00:01, 1.25it/s]
Loading checkpoint shards: 75%|#######5 | 3/4 [00:02<00:00, 1.25it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:02<00:00, 1.42it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:02<00:00, 1.34it/s]
+Some weights of the model checkpoint at Qwen/Qwen2.5-Math-PRM-7B were not used when initializing Qwen2ForProcessRewardModel: ['lm_head.weight']
+- This IS expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+2026-04-26 02:40:40,150 INFO src.rl.prm_scorer - PRM ready. GPU memory allocated: 9.97 GB step_sep_id=151651
+2026-04-26 02:40:40,151 INFO __main__ - PRM loaded: Qwen/Qwen2.5-Math-PRM-7B (4-bit)
+2026-04-26 02:40:40,154 INFO src.rl.unified_accuracy - Extraction cache not found at data/extraction_cache.json — will build on first use
+2026-04-26 02:40:40,154 INFO __main__ - Unified accuracy calculator ready (extractor=Qwen/Qwen2.5-0.5B-Instruct, cache=data/extraction_cache.json)
+2026-04-26 02:40:40,154 INFO __main__ - Warming up step-chain extractor (eager load)...
+2026-04-26 02:40:40,154 INFO src.rl.unified_accuracy - Loading step chain extractor: Qwen/Qwen2.5-0.5B-Instruct
+2026-04-26 02:40:41,033 INFO src.rl.unified_accuracy - Step chain extractor loaded
+2026-04-26 02:40:41,034 INFO __main__ - Extractor warmup complete
+2026-04-26 02:40:41,034 INFO src.rl.llm_question_classifier - LLMQuestionClassifier ready (model=Qwen2ForCausalLM, cache=10000, topics=24)
+2026-04-26 02:40:42,571 INFO __main__ - Detected structured dataset (8792 records) — bootstrapping curriculum from skill_ids instead of keyword classifier.
+2026-04-26 02:40:42,575 INFO src.rl.curriculum_manager - Curriculum bootstrapped from 8792 records across 1 topics
+2026-04-26 02:40:42,575 INFO __main__ - ======================================================================
+2026-04-26 02:40:42,575 INFO __main__ - INITIAL EVALUATION (Iteration 0)
+2026-04-26 02:40:42,575 INFO __main__ - ======================================================================
+
GSM8K eval: 0%| | 0/100 [00:00, ?q/s]2026-04-26 02:40:44,922 WARNING transformers_modules.Qwen.Qwen2.5-Math-PRM-7B.0610740060112df12585d00a1c5f4624d2f59051.modeling_qwen2_rm - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+
GSM8K eval: 1%|1 | 1/100 [00:02<04:03, 2.46s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 2%|2 | 2/100 [00:07<06:21, 3.89s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|3 | 3/100 [00:09<05:18, 3.28s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 4%|4 | 4/100 [00:12<04:33, 2.85s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 5%|5 | 5/100 [00:13<03:46, 2.38s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 6%|6 | 6/100 [00:18<05:15, 3.36s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 7%|7 | 7/100 [00:22<05:08, 3.32s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 8%|8 | 8/100 [00:24<04:36, 3.00s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 9%|9 | 9/100 [00:27<04:37, 3.04s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 10%|# | 10/100 [00:32<05:21, 3.57s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 11%|#1 | 11/100 [00:35<04:56, 3.33s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 12%|#2 | 12/100 [00:37<04:17, 2.93s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 13%|#3 | 13/100 [00:39<04:08, 2.86s/q, correct=10/13, lccp=80.5%, score=0.925, step_acc=90.3%]
GSM8K eval: 14%|#4 | 14/100 [00:44<04:45, 3.32s/q, correct=11/14, lccp=81.9%, score=0.930, step_acc=91.0%]
GSM8K eval: 15%|#5 | 15/100 [00:46<04:19, 3.05s/q, correct=12/15, lccp=83.1%, score=0.935, step_acc=91.6%]
GSM8K eval: 16%|#6 | 16/100 [00:49<03:58, 2.84s/q, correct=12/16, lccp=84.2%, score=0.911, step_acc=92.1%]
GSM8K eval: 17%|#7 | 17/100 [00:51<03:54, 2.83s/q, correct=13/17, lccp=85.1%, score=0.916, step_acc=92.5%]
GSM8K eval: 18%|#8 | 18/100 [00:57<04:57, 3.62s/q, correct=13/18, lccp=81.1%, score=0.904, step_acc=90.2%]
GSM8K eval: 19%|#9 | 19/100 [00:59<04:26, 3.29s/q, correct=14/19, lccp=82.1%, score=0.909, step_acc=90.7%]
GSM8K eval: 20%|## | 20/100 [01:03<04:25, 3.32s/q, correct=15/20, lccp=83.0%, score=0.914, step_acc=91.2%]
GSM8K eval: 21%|##1 | 21/100 [01:05<04:00, 3.04s/q, correct=16/21, lccp=83.8%, score=0.918, step_acc=91.6%]
GSM8K eval: 22%|##2 | 22/100 [01:10<04:43, 3.63s/q, correct=17/22, lccp=81.9%, score=0.919, step_acc=90.0%]
GSM8K eval: 23%|##3 | 23/100 [01:14<04:49, 3.76s/q, correct=18/23, lccp=82.7%, score=0.923, step_acc=90.5%]
GSM8K eval: 24%|##4 | 24/100 [01:17<04:15, 3.37s/q, correct=18/24, lccp=80.3%, score=0.906, step_acc=87.7%]
GSM8K eval: 25%|##5 | 25/100 [01:19<03:55, 3.14s/q, correct=18/25, lccp=78.1%, score=0.902, step_acc=87.2%]
GSM8K eval: 26%|##6 | 26/100 [01:23<04:16, 3.46s/q, correct=19/26, lccp=78.9%, score=0.905, step_acc=87.7%]
GSM8K eval: 27%|##7 | 27/100 [01:26<03:54, 3.22s/q, correct=19/27, lccp=79.7%, score=0.900, step_acc=88.2%]
GSM8K eval: 28%|##8 | 28/100 [01:28<03:27, 2.88s/q, correct=20/28, lccp=80.4%, score=0.904, step_acc=88.6%]
GSM8K eval: 29%|##9 | 29/100 [01:31<03:19, 2.81s/q, correct=21/29, lccp=81.1%, score=0.907, step_acc=89.0%]
GSM8K eval: 30%|### | 30/100 [01:34<03:34, 3.06s/q, correct=22/30, lccp=81.7%, score=0.910, step_acc=89.3%]
GSM8K eval: 31%|###1 | 31/100 [01:37<03:18, 2.88s/q, correct=23/31, lccp=82.3%, score=0.913, step_acc=89.7%]
GSM8K eval: 32%|###2 | 32/100 [01:39<02:51, 2.53s/q, correct=24/32, lccp=82.9%, score=0.915, step_acc=90.0%]
GSM8K eval: 33%|###3 | 33/100 [01:41<02:51, 2.56s/q, correct=25/33, lccp=83.4%, score=0.917, step_acc=90.3%]
GSM8K eval: 34%|###4 | 34/100 [01:43<02:37, 2.38s/q, correct=26/34, lccp=83.9%, score=0.920, step_acc=90.6%]
GSM8K eval: 35%|###5 | 35/100 [01:46<02:35, 2.40s/q, correct=27/35, lccp=84.3%, score=0.922, step_acc=90.9%]
GSM8K eval: 36%|###6 | 36/100 [01:49<02:50, 2.67s/q, correct=28/36, lccp=84.8%, score=0.924, step_acc=91.1%]
GSM8K eval: 37%|###7 | 37/100 [01:51<02:33, 2.44s/q, correct=29/37, lccp=85.2%, score=0.925, step_acc=91.4%]
GSM8K eval: 38%|###8 | 38/100 [01:54<02:39, 2.57s/q, correct=30/38, lccp=85.6%, score=0.927, step_acc=91.6%]
GSM8K eval: 39%|###9 | 39/100 [01:59<03:20, 3.29s/q, correct=31/39, lccp=85.9%, score=0.929, step_acc=91.8%]
GSM8K eval: 40%|#### | 40/100 [02:05<04:06, 4.10s/q, correct=32/40, lccp=86.3%, score=0.931, step_acc=92.0%]
GSM8K eval: 41%|####1 | 41/100 [02:08<03:40, 3.74s/q, correct=32/41, lccp=86.6%, score=0.930, step_acc=92.2%]
GSM8K eval: 42%|####2 | 42/100 [02:13<04:01, 4.16s/q, correct=33/42, lccp=85.4%, score=0.931, step_acc=92.0%]
GSM8K eval: 43%|####3 | 43/100 [02:15<03:17, 3.47s/q, correct=34/43, lccp=85.7%, score=0.933, step_acc=92.2%]
GSM8K eval: 44%|####4 | 44/100 [02:21<04:01, 4.31s/q, correct=35/44, lccp=86.0%, score=0.934, step_acc=92.4%]
GSM8K eval: 45%|####5 | 45/100 [02:24<03:36, 3.94s/q, correct=36/45, lccp=86.3%, score=0.936, step_acc=92.5%]
GSM8K eval: 46%|####6 | 46/100 [02:29<03:47, 4.21s/q, correct=36/46, lccp=84.5%, score=0.931, step_acc=92.4%]
\ No newline at end of file
diff --git a/logs/grpo/grpo_20260426_032827.log b/logs/grpo/grpo_20260426_032827.log
new file mode 100644
index 0000000000000000000000000000000000000000..c3337d8a8f0fc13feea94609bbb19f05b17bb359
--- /dev/null
+++ b/logs/grpo/grpo_20260426_032827.log
@@ -0,0 +1,7428 @@
+2026-04-26 03:28:31,607 INFO __main__ - ======================================================================
+2026-04-26 03:28:31,607 INFO __main__ - GRPO run: grpo_20260426_032827
+2026-04-26 03:28:31,607 INFO __main__ - Checkpoints : checkpoints/grpo/grpo_20260426_032827
+2026-04-26 03:28:31,607 INFO __main__ - Logs : logs/grpo/grpo_20260426_032827
+2026-04-26 03:28:31,608 INFO __main__ - Console log : logs/grpo/grpo_20260426_032827/console_output.log
+2026-04-26 03:28:31,608 INFO __main__ - ======================================================================
+2026-04-26 03:28:31,727 INFO src.utils.attn_backend - Attention backend selected: flash_attention_2
+2026-04-26 03:28:31,727 INFO __main__ - Device: cuda:0 | attn: flash_attention_2
+2026-04-26 03:28:31,745 INFO __main__ - GPU: NVIDIA A100 80GB PCIe | 85.1 GB VRAM | capability sm_80
+2026-04-26 03:28:31,745 INFO __main__ - Run config: K=10 K_q=2 N=20 lr=5.0e-06 T=0.80 max_new=1000 | clip_eps=0.20 kl_coef=0.0600 warmup=8 | diff_alpha=3.5 | self_play=70% grounded=30% | math_mix=30% math_maxdiff=3 | overlong_filter=True | eval_every=5 eval_N=150 | grad_clip=0.50 save_every=5 keep_last=4 | question_GRPO=ENABLED (K_q=2)
+2026-04-26 03:28:31,745 INFO __main__ - Loading model from checkpoints/dual_task_v1 ...
+2026-04-26 03:28:32,465 INFO __main__ - Tokenizer has no chat_template; loading from base model Qwen/Qwen2.5-Math-1.5B-Instruct
+2026-04-26 03:28:32,841 INFO __main__ - Chat template loaded successfully.
+2026-04-26 03:28:32,842 INFO __main__ - Detected PEFT adapter — loading base Qwen/Qwen2.5-Math-1.5B-Instruct then merging checkpoints/dual_task_v1
+2026-04-26 03:28:34,358 WARNING __main__ - All parameters were frozen on load (PEFT merge_and_unload bug). Re-enabled requires_grad — any prior frozen runs were training nothing.
+2026-04-26 03:28:34,358 INFO __main__ - Flash-Attn 2 active — gradient checkpointing OFF (Flash already gives O(T) attention memory).
+2026-04-26 03:28:34,359 INFO __main__ - Trainable parameters: 1,543,714,304 / 1,543,714,304 (100.0%)
+2026-04-26 03:28:34,360 INFO __main__ - Creating frozen reference policy (kl_coef=0.0600, ~3.1 GB VRAM)...
+2026-04-26 03:28:34,425 INFO __main__ - Reference policy ready.
+2026-04-26 03:28:34,426 INFO __main__ - LR schedule: 5.0e-06 warmup(8 iters) → cosine decay(52 iters, min=5.0e-07)
+2026-04-26 03:28:34,538 INFO __main__ - Loaded 8792 QA pairs from data/sft/gsm8k_sft.jsonl
+2026-04-26 03:28:34,546 INFO __main__ - Loaded 4072 MATH pairs from data/math/math_numeric.jsonl
+2026-04-26 03:28:34,546 INFO __main__ - MATH mixing: 30% MATH (4072 problems) + 70% GSM8K (8792 problems)
+2026-04-26 03:28:34,546 INFO src.rl.prm_scorer - Loading PRM Qwen/Qwen2.5-Math-PRM-7B (4-bit=True, dtype=torch.bfloat16) on cuda:0 …
+
Loading checkpoint shards: 0%| | 0/4 [00:00, ?it/s]
Loading checkpoint shards: 25%|##5 | 1/4 [00:00<00:02, 1.17it/s]
Loading checkpoint shards: 50%|##### | 2/4 [00:01<00:01, 1.20it/s]
Loading checkpoint shards: 75%|#######5 | 3/4 [00:02<00:00, 1.20it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:03<00:00, 1.35it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:03<00:00, 1.29it/s]
+Some weights of the model checkpoint at Qwen/Qwen2.5-Math-PRM-7B were not used when initializing Qwen2ForProcessRewardModel: ['lm_head.weight']
+- This IS expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+2026-04-26 03:28:38,414 INFO src.rl.prm_scorer - PRM ready. GPU memory allocated: 9.97 GB step_sep_id=151651
+2026-04-26 03:28:38,414 INFO __main__ - PRM loaded: Qwen/Qwen2.5-Math-PRM-7B (4-bit)
+2026-04-26 03:28:38,416 INFO src.rl.unified_accuracy - Extraction cache not found at data/extraction_cache.json — will build on first use
+2026-04-26 03:28:38,417 INFO __main__ - Unified accuracy calculator ready (extractor=Qwen/Qwen2.5-0.5B-Instruct, cache=data/extraction_cache.json)
+2026-04-26 03:28:38,417 INFO __main__ - Warming up step-chain extractor (eager load)...
+2026-04-26 03:28:38,417 INFO src.rl.unified_accuracy - Loading step chain extractor: Qwen/Qwen2.5-0.5B-Instruct
+2026-04-26 03:28:39,348 INFO src.rl.unified_accuracy - Step chain extractor loaded
+2026-04-26 03:28:39,348 INFO __main__ - Extractor warmup complete
+2026-04-26 03:28:39,349 INFO src.rl.llm_question_classifier - LLMQuestionClassifier ready (model=Qwen2ForCausalLM, cache=10000, topics=24)
+2026-04-26 03:28:40,949 INFO __main__ - Detected structured dataset (8792 records) — bootstrapping curriculum from skill_ids instead of keyword classifier.
+2026-04-26 03:28:40,954 INFO src.rl.curriculum_manager - Curriculum bootstrapped from 8792 records across 1 topics
+2026-04-26 03:28:40,954 INFO __main__ - ======================================================================
+2026-04-26 03:28:40,954 INFO __main__ - INITIAL EVALUATION (Iteration 0)
+2026-04-26 03:28:40,954 INFO __main__ - ======================================================================
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]2026-04-26 03:28:43,322 WARNING transformers_modules.Qwen.Qwen2.5-Math-PRM-7B.0610740060112df12585d00a1c5f4624d2f59051.modeling_qwen2_rm - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+
GSM8K eval: 1%| | 1/150 [00:02<06:08, 2.48s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:29, 3.85s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:59, 3.26s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<06:52, 2.83s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:44, 2.37s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:18<08:09, 3.40s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:03, 3.38s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:13, 3.05s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:27<07:16, 3.10s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:32<08:29, 3.64s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:35<07:51, 3.40s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:37<06:51, 2.98s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:40<06:41, 2.93s/q, correct=10/13, lccp=80.5%, score=0.925, step_acc=90.3%]
GSM8K eval: 9%|9 | 14/150 [00:44<07:45, 3.42s/q, correct=11/14, lccp=81.9%, score=0.930, step_acc=91.0%]
GSM8K eval: 10%|# | 15/150 [00:47<07:05, 3.15s/q, correct=12/15, lccp=83.1%, score=0.935, step_acc=91.6%]
GSM8K eval: 11%|# | 16/150 [00:49<06:32, 2.93s/q, correct=12/16, lccp=84.2%, score=0.911, step_acc=92.1%]
GSM8K eval: 11%|#1 | 17/150 [00:52<06:25, 2.90s/q, correct=13/17, lccp=85.1%, score=0.916, step_acc=92.5%]
GSM8K eval: 12%|#2 | 18/150 [00:58<08:10, 3.72s/q, correct=13/18, lccp=81.1%, score=0.904, step_acc=90.2%]
GSM8K eval: 13%|#2 | 19/150 [01:00<07:20, 3.36s/q, correct=14/19, lccp=82.1%, score=0.909, step_acc=90.7%]
GSM8K eval: 13%|#3 | 20/150 [01:04<07:19, 3.38s/q, correct=15/20, lccp=83.0%, score=0.914, step_acc=91.2%]
GSM8K eval: 14%|#4 | 21/150 [01:06<06:40, 3.10s/q, correct=16/21, lccp=83.8%, score=0.918, step_acc=91.6%]
GSM8K eval: 15%|#4 | 22/150 [01:11<07:53, 3.70s/q, correct=17/22, lccp=81.9%, score=0.919, step_acc=90.0%]
GSM8K eval: 15%|#5 | 23/150 [01:15<08:02, 3.80s/q, correct=18/23, lccp=82.7%, score=0.923, step_acc=90.5%]
GSM8K eval: 16%|#6 | 24/150 [01:18<07:09, 3.41s/q, correct=18/24, lccp=80.3%, score=0.906, step_acc=87.7%]
GSM8K eval: 17%|#6 | 25/150 [01:21<06:38, 3.19s/q, correct=18/25, lccp=78.1%, score=0.902, step_acc=87.2%]
GSM8K eval: 17%|#7 | 26/150 [01:25<07:13, 3.50s/q, correct=19/26, lccp=78.9%, score=0.905, step_acc=87.7%]
GSM8K eval: 18%|#8 | 27/150 [01:27<06:40, 3.25s/q, correct=19/27, lccp=79.7%, score=0.900, step_acc=88.2%]
GSM8K eval: 19%|#8 | 28/150 [01:30<05:58, 2.94s/q, correct=20/28, lccp=80.4%, score=0.904, step_acc=88.6%]
GSM8K eval: 19%|#9 | 29/150 [01:32<05:47, 2.88s/q, correct=21/29, lccp=81.1%, score=0.907, step_acc=89.0%]
GSM8K eval: 20%|## | 30/150 [01:36<06:17, 3.14s/q, correct=22/30, lccp=81.7%, score=0.910, step_acc=89.3%]
GSM8K eval: 21%|## | 31/150 [01:39<05:51, 2.95s/q, correct=23/31, lccp=82.3%, score=0.913, step_acc=89.7%]
GSM8K eval: 21%|##1 | 32/150 [01:40<05:04, 2.58s/q, correct=24/32, lccp=82.9%, score=0.915, step_acc=90.0%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:10, 2.65s/q, correct=25/33, lccp=83.4%, score=0.917, step_acc=90.3%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:45, 2.46s/q, correct=26/34, lccp=83.9%, score=0.920, step_acc=90.6%]
GSM8K eval: 23%|##3 | 35/150 [01:48<04:48, 2.50s/q, correct=27/35, lccp=84.3%, score=0.922, step_acc=90.9%]
GSM8K eval: 24%|##4 | 36/150 [01:51<05:19, 2.81s/q, correct=28/36, lccp=84.8%, score=0.924, step_acc=91.1%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:48, 2.55s/q, correct=29/37, lccp=85.2%, score=0.925, step_acc=91.4%]
GSM8K eval: 25%|##5 | 38/150 [01:56<05:01, 2.69s/q, correct=30/38, lccp=85.6%, score=0.927, step_acc=91.6%]
GSM8K eval: 26%|##6 | 39/150 [02:01<06:16, 3.39s/q, correct=31/39, lccp=85.9%, score=0.929, step_acc=91.8%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:34, 4.13s/q, correct=32/40, lccp=86.3%, score=0.931, step_acc=92.0%]
GSM8K eval: 27%|##7 | 41/150 [02:10<06:48, 3.75s/q, correct=32/41, lccp=86.6%, score=0.930, step_acc=92.2%]
GSM8K eval: 28%|##8 | 42/150 [02:15<07:28, 4.15s/q, correct=33/42, lccp=85.4%, score=0.931, step_acc=92.0%]
GSM8K eval: 29%|##8 | 43/150 [02:17<06:09, 3.45s/q, correct=34/43, lccp=85.7%, score=0.933, step_acc=92.2%]
GSM8K eval: 29%|##9 | 44/150 [02:23<07:30, 4.25s/q, correct=35/44, lccp=86.0%, score=0.934, step_acc=92.4%]
GSM8K eval: 30%|### | 45/150 [02:26<06:47, 3.88s/q, correct=36/45, lccp=86.3%, score=0.936, step_acc=92.5%]
GSM8K eval: 31%|### | 46/150 [02:31<07:09, 4.13s/q, correct=36/46, lccp=84.5%, score=0.931, step_acc=92.4%]
GSM8K eval: 31%|###1 | 47/150 [02:34<06:27, 3.77s/q, correct=37/47, lccp=84.8%, score=0.932, step_acc=92.6%]
GSM8K eval: 32%|###2 | 48/150 [02:35<05:21, 3.15s/q, correct=38/48, lccp=85.1%, score=0.933, step_acc=92.8%]
GSM8K eval: 33%|###2 | 49/150 [02:42<07:00, 4.16s/q, correct=38/49, lccp=84.0%, score=0.919, step_acc=91.5%]
GSM8K eval: 33%|###3 | 50/150 [02:45<06:22, 3.83s/q, correct=38/50, lccp=83.3%, score=0.911, step_acc=90.6%]
GSM8K eval: 34%|###4 | 51/150 [02:46<05:05, 3.09s/q, correct=39/51, lccp=83.6%, score=0.913, step_acc=90.8%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:32, 3.40s/q, correct=39/52, lccp=82.0%, score=0.912, step_acc=90.7%]
GSM8K eval: 35%|###5 | 53/150 [02:55<06:00, 3.72s/q, correct=39/53, lccp=81.6%, score=0.905, step_acc=90.1%]
GSM8K eval: 36%|###6 | 54/150 [02:57<05:14, 3.28s/q, correct=40/54, lccp=81.9%, score=0.907, step_acc=90.3%]
GSM8K eval: 37%|###6 | 55/150 [03:00<04:56, 3.12s/q, correct=41/55, lccp=82.3%, score=0.908, step_acc=90.4%]
GSM8K eval: 37%|###7 | 56/150 [03:03<05:02, 3.22s/q, correct=42/56, lccp=82.6%, score=0.910, step_acc=90.6%]
GSM8K eval: 38%|###8 | 57/150 [03:06<04:31, 2.92s/q, correct=43/57, lccp=82.9%, score=0.911, step_acc=90.8%]
GSM8K eval: 39%|###8 | 58/150 [03:10<04:57, 3.23s/q, correct=44/58, lccp=83.2%, score=0.913, step_acc=90.9%]
GSM8K eval: 39%|###9 | 59/150 [03:13<05:00, 3.30s/q, correct=44/59, lccp=81.8%, score=0.904, step_acc=89.7%]
GSM8K eval: 40%|#### | 60/150 [03:18<05:35, 3.73s/q, correct=45/60, lccp=82.1%, score=0.906, step_acc=89.9%]
GSM8K eval: 41%|#### | 61/150 [03:20<04:51, 3.28s/q, correct=46/61, lccp=82.4%, score=0.908, step_acc=90.1%]
GSM8K eval: 41%|####1 | 62/150 [03:23<04:41, 3.20s/q, correct=47/62, lccp=82.6%, score=0.909, step_acc=90.2%]
GSM8K eval: 42%|####2 | 63/150 [03:26<04:39, 3.22s/q, correct=47/63, lccp=82.4%, score=0.903, step_acc=89.9%]
GSM8K eval: 43%|####2 | 64/150 [03:29<04:25, 3.08s/q, correct=48/64, lccp=82.7%, score=0.905, step_acc=90.0%]
GSM8K eval: 43%|####3 | 65/150 [03:32<04:12, 2.97s/q, correct=49/65, lccp=82.9%, score=0.906, step_acc=90.2%]
GSM8K eval: 44%|####4 | 66/150 [03:33<03:30, 2.50s/q, correct=50/66, lccp=83.2%, score=0.907, step_acc=90.3%]
GSM8K eval: 45%|####4 | 67/150 [03:35<03:19, 2.41s/q, correct=51/67, lccp=83.4%, score=0.909, step_acc=90.5%]
GSM8K eval: 45%|####5 | 68/150 [03:38<03:21, 2.46s/q, correct=52/68, lccp=83.7%, score=0.910, step_acc=90.6%]
GSM8K eval: 46%|####6 | 69/150 [03:39<02:55, 2.17s/q, correct=53/69, lccp=83.9%, score=0.911, step_acc=90.7%]
GSM8K eval: 47%|####6 | 70/150 [03:42<03:10, 2.38s/q, correct=54/70, lccp=82.7%, score=0.912, step_acc=90.6%]
GSM8K eval: 47%|####7 | 71/150 [03:45<03:23, 2.57s/q, correct=55/71, lccp=81.6%, score=0.913, step_acc=90.4%]
GSM8K eval: 48%|####8 | 72/150 [03:47<02:53, 2.22s/q, correct=56/72, lccp=81.8%, score=0.914, step_acc=90.6%]
GSM8K eval: 49%|####8 | 73/150 [03:48<02:37, 2.04s/q, correct=57/73, lccp=82.1%, score=0.915, step_acc=90.7%]
GSM8K eval: 49%|####9 | 74/150 [03:52<03:06, 2.45s/q, correct=58/74, lccp=82.3%, score=0.917, step_acc=90.8%]
GSM8K eval: 50%|##### | 75/150 [03:53<02:46, 2.22s/q, correct=59/75, lccp=82.5%, score=0.918, step_acc=91.0%]
GSM8K eval: 51%|##### | 76/150 [04:00<04:14, 3.44s/q, correct=59/76, lccp=82.6%, score=0.913, step_acc=90.9%]
GSM8K eval: 51%|#####1 | 77/150 [04:03<04:18, 3.54s/q, correct=60/77, lccp=82.8%, score=0.914, step_acc=91.0%]
GSM8K eval: 52%|#####2 | 78/150 [04:06<03:48, 3.18s/q, correct=61/78, lccp=83.1%, score=0.915, step_acc=91.1%]
GSM8K eval: 53%|#####2 | 79/150 [04:09<03:36, 3.05s/q, correct=62/79, lccp=82.8%, score=0.914, step_acc=91.0%]
GSM8K eval: 53%|#####3 | 80/150 [04:11<03:27, 2.97s/q, correct=63/80, lccp=83.0%, score=0.915, step_acc=91.1%]
GSM8K eval: 54%|#####4 | 81/150 [04:14<03:10, 2.76s/q, correct=64/81, lccp=83.2%, score=0.916, step_acc=91.2%]
GSM8K eval: 55%|#####4 | 82/150 [04:16<03:08, 2.77s/q, correct=65/82, lccp=83.4%, score=0.917, step_acc=91.3%]
GSM8K eval: 55%|#####5 | 83/150 [04:19<03:03, 2.73s/q, correct=66/83, lccp=83.6%, score=0.918, step_acc=91.4%]
GSM8K eval: 56%|#####6 | 84/150 [04:22<02:57, 2.68s/q, correct=67/84, lccp=83.8%, score=0.919, step_acc=91.5%]
GSM8K eval: 57%|#####6 | 85/150 [04:25<03:15, 3.01s/q, correct=68/85, lccp=84.0%, score=0.920, step_acc=91.6%]
GSM8K eval: 57%|#####7 | 86/150 [04:29<03:17, 3.09s/q, correct=69/86, lccp=84.2%, score=0.921, step_acc=91.7%]
GSM8K eval: 58%|#####8 | 87/150 [04:34<03:55, 3.74s/q, correct=70/87, lccp=84.3%, score=0.922, step_acc=91.8%]
GSM8K eval: 59%|#####8 | 88/150 [04:36<03:15, 3.16s/q, correct=71/88, lccp=84.5%, score=0.922, step_acc=91.9%]
GSM8K eval: 59%|#####9 | 89/150 [04:38<03:03, 3.01s/q, correct=72/89, lccp=84.7%, score=0.923, step_acc=92.0%]
GSM8K eval: 60%|###### | 90/150 [04:41<02:48, 2.80s/q, correct=73/90, lccp=84.9%, score=0.924, step_acc=92.1%]
GSM8K eval: 61%|###### | 91/150 [04:45<03:09, 3.21s/q, correct=74/91, lccp=85.0%, score=0.925, step_acc=92.2%]
GSM8K eval: 61%|######1 | 92/150 [04:48<03:00, 3.12s/q, correct=75/92, lccp=85.2%, score=0.926, step_acc=92.3%]
GSM8K eval: 62%|######2 | 93/150 [04:54<03:58, 4.18s/q, correct=76/93, lccp=85.4%, score=0.926, step_acc=92.4%]
GSM8K eval: 63%|######2 | 94/150 [04:58<03:46, 4.04s/q, correct=76/94, lccp=84.4%, score=0.924, step_acc=91.9%]
GSM8K eval: 63%|######3 | 95/150 [05:03<03:58, 4.34s/q, correct=77/95, lccp=83.6%, score=0.924, step_acc=91.5%]
GSM8K eval: 64%|######4 | 96/150 [05:06<03:32, 3.93s/q, correct=77/96, lccp=83.0%, score=0.919, step_acc=90.9%]
GSM8K eval: 65%|######4 | 97/150 [05:09<03:05, 3.51s/q, correct=77/97, lccp=82.7%, score=0.917, step_acc=90.7%]
GSM8K eval: 65%|######5 | 98/150 [05:13<03:09, 3.65s/q, correct=77/98, lccp=82.3%, score=0.913, step_acc=90.5%]
GSM8K eval: 66%|######6 | 99/150 [05:15<02:44, 3.23s/q, correct=78/99, lccp=82.5%, score=0.914, step_acc=90.6%]
GSM8K eval: 67%|######6 | 100/150 [05:17<02:19, 2.79s/q, correct=79/100, lccp=81.6%, score=0.914, step_acc=90.4%]
GSM8K eval: 67%|######7 | 101/150 [05:20<02:17, 2.80s/q, correct=79/101, lccp=81.3%, score=0.911, step_acc=90.2%]
GSM8K eval: 68%|######8 | 102/150 [05:21<01:54, 2.39s/q, correct=80/102, lccp=81.5%, score=0.911, step_acc=90.3%]
GSM8K eval: 69%|######8 | 103/150 [05:23<01:46, 2.26s/q, correct=81/103, lccp=81.7%, score=0.912, step_acc=90.4%]
GSM8K eval: 69%|######9 | 104/150 [05:27<02:15, 2.95s/q, correct=82/104, lccp=81.9%, score=0.913, step_acc=90.5%]
GSM8K eval: 70%|####### | 105/150 [05:30<02:04, 2.77s/q, correct=83/105, lccp=82.0%, score=0.914, step_acc=90.6%]
GSM8K eval: 71%|####### | 106/150 [05:31<01:44, 2.37s/q, correct=84/106, lccp=82.2%, score=0.914, step_acc=90.7%]
GSM8K eval: 71%|#######1 | 107/150 [05:33<01:29, 2.09s/q, correct=85/107, lccp=82.4%, score=0.915, step_acc=90.7%]
GSM8K eval: 72%|#######2 | 108/150 [05:35<01:33, 2.23s/q, correct=86/108, lccp=82.5%, score=0.916, step_acc=90.8%]
GSM8K eval: 73%|#######2 | 109/150 [05:40<02:02, 2.99s/q, correct=86/109, lccp=82.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 73%|#######3 | 110/150 [05:42<01:49, 2.73s/q, correct=87/110, lccp=82.3%, score=0.915, step_acc=90.9%]
GSM8K eval: 74%|#######4 | 111/150 [05:44<01:33, 2.39s/q, correct=88/111, lccp=82.4%, score=0.916, step_acc=90.9%]
GSM8K eval: 75%|#######4 | 112/150 [05:49<01:58, 3.13s/q, correct=88/112, lccp=82.6%, score=0.916, step_acc=91.0%]
GSM8K eval: 75%|#######5 | 113/150 [05:50<01:39, 2.70s/q, correct=89/113, lccp=82.7%, score=0.916, step_acc=91.1%]
GSM8K eval: 76%|#######6 | 114/150 [05:53<01:42, 2.86s/q, correct=89/114, lccp=82.3%, score=0.913, step_acc=90.6%]
GSM8K eval: 77%|#######6 | 115/150 [05:56<01:34, 2.69s/q, correct=90/115, lccp=82.5%, score=0.913, step_acc=90.7%]
GSM8K eval: 77%|#######7 | 116/150 [05:58<01:30, 2.68s/q, correct=91/116, lccp=82.7%, score=0.914, step_acc=90.8%]
GSM8K eval: 78%|#######8 | 117/150 [06:04<01:54, 3.45s/q, correct=92/117, lccp=82.8%, score=0.915, step_acc=90.9%]
GSM8K eval: 79%|#######8 | 118/150 [06:07<01:50, 3.46s/q, correct=92/118, lccp=82.1%, score=0.912, step_acc=90.6%]
GSM8K eval: 79%|#######9 | 119/150 [06:11<01:46, 3.45s/q, correct=92/119, lccp=82.2%, score=0.910, step_acc=90.7%]
GSM8K eval: 80%|######## | 120/150 [06:13<01:36, 3.22s/q, correct=93/120, lccp=82.4%, score=0.911, step_acc=90.8%]
GSM8K eval: 81%|######## | 121/150 [06:16<01:30, 3.12s/q, correct=94/121, lccp=82.5%, score=0.912, step_acc=90.9%]
GSM8K eval: 81%|########1 | 122/150 [06:19<01:25, 3.05s/q, correct=95/122, lccp=82.7%, score=0.912, step_acc=90.9%]
GSM8K eval: 82%|########2 | 123/150 [06:22<01:23, 3.09s/q, correct=95/123, lccp=82.3%, score=0.912, step_acc=90.8%]
GSM8K eval: 83%|########2 | 124/150 [06:24<01:12, 2.78s/q, correct=96/124, lccp=82.5%, score=0.913, step_acc=90.9%]
GSM8K eval: 83%|########3 | 125/150 [06:26<01:03, 2.53s/q, correct=97/125, lccp=82.6%, score=0.914, step_acc=91.0%]
GSM8K eval: 84%|########4 | 126/150 [06:29<01:01, 2.54s/q, correct=98/126, lccp=82.8%, score=0.914, step_acc=91.1%]
GSM8K eval: 85%|########4 | 127/150 [06:33<01:09, 3.04s/q, correct=99/127, lccp=82.9%, score=0.915, step_acc=91.1%]
GSM8K eval: 85%|########5 | 128/150 [06:36<01:05, 2.97s/q, correct=100/128, lccp=83.0%, score=0.916, step_acc=91.2%]
GSM8K eval: 86%|########6 | 129/150 [06:39<01:04, 3.09s/q, correct=101/129, lccp=83.2%, score=0.916, step_acc=91.3%]
GSM8K eval: 87%|########6 | 130/150 [06:41<00:53, 2.68s/q, correct=102/130, lccp=83.3%, score=0.917, step_acc=91.3%]
GSM8K eval: 87%|########7 | 131/150 [06:45<01:00, 3.20s/q, correct=103/131, lccp=83.4%, score=0.917, step_acc=91.4%]
GSM8K eval: 88%|########8 | 132/150 [06:47<00:48, 2.70s/q, correct=104/132, lccp=83.5%, score=0.918, step_acc=91.5%]
GSM8K eval: 89%|########8 | 133/150 [06:50<00:46, 2.71s/q, correct=105/133, lccp=83.7%, score=0.919, step_acc=91.5%]
GSM8K eval: 89%|########9 | 134/150 [06:54<00:50, 3.16s/q, correct=106/134, lccp=83.8%, score=0.919, step_acc=91.6%]
GSM8K eval: 90%|######### | 135/150 [06:57<00:46, 3.09s/q, correct=107/135, lccp=83.9%, score=0.920, step_acc=91.7%]
GSM8K eval: 91%|######### | 136/150 [07:01<00:48, 3.49s/q, correct=107/136, lccp=83.5%, score=0.918, step_acc=91.3%]
GSM8K eval: 91%|#########1| 137/150 [07:08<00:57, 4.39s/q, correct=108/137, lccp=83.7%, score=0.919, step_acc=91.4%]
GSM8K eval: 92%|#########2| 138/150 [07:11<00:50, 4.22s/q, correct=109/138, lccp=83.8%, score=0.919, step_acc=91.5%]
GSM8K eval: 93%|#########2| 139/150 [07:14<00:40, 3.64s/q, correct=110/139, lccp=83.9%, score=0.920, step_acc=91.5%]
GSM8K eval: 93%|#########3| 140/150 [07:18<00:37, 3.76s/q, correct=110/140, lccp=83.8%, score=0.916, step_acc=91.4%]
GSM8K eval: 94%|#########3| 141/150 [07:21<00:33, 3.74s/q, correct=111/141, lccp=83.9%, score=0.917, step_acc=91.4%]
GSM8K eval: 95%|#########4| 142/150 [07:25<00:28, 3.56s/q, correct=112/142, lccp=84.0%, score=0.918, step_acc=91.5%]
GSM8K eval: 95%|#########5| 143/150 [07:27<00:21, 3.13s/q, correct=113/143, lccp=84.1%, score=0.918, step_acc=91.5%]
GSM8K eval: 96%|#########6| 144/150 [07:29<00:17, 2.86s/q, correct=114/144, lccp=84.2%, score=0.919, step_acc=91.6%]
GSM8K eval: 97%|#########6| 145/150 [07:32<00:14, 2.89s/q, correct=114/145, lccp=83.6%, score=0.915, step_acc=91.1%]
GSM8K eval: 97%|#########7| 146/150 [07:35<00:11, 2.89s/q, correct=115/146, lccp=83.8%, score=0.916, step_acc=91.1%]
GSM8K eval: 98%|#########8| 147/150 [07:38<00:09, 3.06s/q, correct=116/147, lccp=83.9%, score=0.917, step_acc=91.2%]
GSM8K eval: 99%|#########8| 148/150 [07:42<00:06, 3.20s/q, correct=117/148, lccp=84.0%, score=0.917, step_acc=91.3%]
GSM8K eval: 99%|#########9| 149/150 [07:45<00:03, 3.24s/q, correct=118/149, lccp=84.1%, score=0.918, step_acc=91.3%]
GSM8K eval: 100%|##########| 150/150 [07:50<00:00, 3.65s/q, correct=118/150, lccp=83.9%, score=0.916, step_acc=91.1%]
GSM8K eval: 100%|##########| 150/150 [07:50<00:00, 3.14s/q, correct=118/150, lccp=83.9%, score=0.916, step_acc=91.1%]
+2026-04-26 03:36:31,226 INFO __main__ - Training Score [INITIAL (iter 0)]: 0.9162 | n=150
+2026-04-26 03:36:31,227 INFO __main__ - Components : 0.50×correct(78.7%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 03:36:31,227 INFO __main__ - Process score : prm_mean=0.899 prm_final=0.927 → weighted=0.916
+2026-04-26 03:36:31,227 INFO __main__ - Step accuracy : 91.1% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 03:36:31,227 INFO __main__ - Chain integrity (LCCP): 83.9% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 03:36:31,227 INFO __main__ - (debug) final-answer accuracy: 78.7%
+2026-04-26 03:36:31,227 INFO __main__ - ======================================================================
+2026-04-26 03:36:31,227 INFO __main__ - GRPO ITERATION 1/60
+2026-04-26 03:36:31,228 INFO __main__ - ======================================================================
+2026-04-26 03:36:31,246 INFO __main__ - LR this iteration: 5.00e-07 | T=0.800 | MATH ratio=30%
+
Iter 1 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:36:36,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:36,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.09(prox=0.09) + 0.40×proc(0.606[fin=0.74,mean=0.40]) + 0.10×fmt(1.000) | pred='2470' gold='390' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:36:36,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:36,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.913 = 0.50×0.85(prox=0.85) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='370' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:36,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:37,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:37,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=0.98,mean=0.95]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 0%| | 0/20 [00:07, ?q/s, loss=-0.0005, mean_r=0.928, skip=0]
Iter 1 GRPO groups: 5%|5 | 1/20 [00:07<02:30, 7.94s/q, loss=-0.0005, mean_r=0.928, skip=0]2026-04-26 03:36:46,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,306 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:36:46,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:46,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,873 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 5%|5 | 1/20 [00:15<02:30, 7.94s/q, loss=0var, mean_r=0.986, skip=1]
Iter 1 GRPO groups: 10%|# | 2/20 [00:15<02:21, 7.84s/q, loss=0var, mean_r=0.986, skip=1]2026-04-26 03:36:53,060 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:53,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,308 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:53,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:53,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 10%|# | 2/20 [00:22<02:21, 7.84s/q, loss=0var, mean_r=0.998, skip=2]
Iter 1 GRPO groups: 15%|#5 | 3/20 [00:22<02:05, 7.39s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 03:37:00,254 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.67(prox=0.67) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.685 = 0.50×0.50(prox=0.50) + 0.40×proc(0.837[fin=0.97,mean=0.63]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 03:37:00,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.50(prox=0.50) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:37:00,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:37:00,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:00,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:00,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.40(prox=0.40) + 0.40×proc(0.350[fin=0.28,mean=0.46]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:37:00,915 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 15%|#5 | 3/20 [00:31<02:05, 7.39s/q, loss=0.0006, mean_r=0.866, skip=2]
Iter 1 GRPO groups: 20%|## | 4/20 [00:31<02:06, 7.88s/q, loss=0.0006, mean_r=0.866, skip=2]2026-04-26 03:37:06,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,478 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.60(prox=0.60) + 0.40×proc(0.674[fin=0.80,mean=0.48]) + 0.10×fmt(1.000) | pred='240' gold='360' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:06,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.712 = 0.50×0.60(prox=0.60) + 0.40×proc(0.780[fin=0.93,mean=0.55]) + 0.10×fmt(1.000) | pred='240' gold='360' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:37:06,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:07,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:07,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 20%|## | 4/20 [00:37<02:06, 7.88s/q, loss=0.0003, mean_r=0.938, skip=2]
Iter 1 GRPO groups: 25%|##5 | 5/20 [00:37<01:48, 7.24s/q, loss=0.0003, mean_r=0.938, skip=2]2026-04-26 03:37:13,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.55(prox=0.55) + 0.40×proc(0.262[fin=0.00,mean=0.65]) + 0.10×fmt(1.000) | pred='754' gold='1274' | step_acc=67% lccp=67% (chain=2/3 ok_count=2) n_steps=3
+2026-04-26 03:37:13,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.24(prox=0.24) + 0.40×proc(0.695[fin=0.79,mean=0.56]) + 0.10×fmt(1.000) | pred='-726' gold='1274' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:37:13,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 25%|##5 | 5/20 [00:43<01:48, 7.24s/q, loss=0.0012, mean_r=0.899, skip=2]
Iter 1 GRPO groups: 30%|### | 6/20 [00:43<01:38, 7.03s/q, loss=0.0012, mean_r=0.899, skip=2]2026-04-26 03:37:20,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.76(prox=0.76) + 0.40×proc(0.631[fin=0.73,mean=0.48]) + 0.10×fmt(1.000) | pred='11' gold='13' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 03:37:20,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.862[fin=0.91,mean=0.80]) + 0.10×fmt(1.000) | pred='22' gold='13' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:37:20,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.729 = 0.50×0.68(prox=0.68) + 0.40×proc(0.716[fin=0.81,mean=0.58]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:21,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.635 = 0.50×0.68(prox=0.68) + 0.40×proc(0.482[fin=0.43,mean=0.56]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:37:21,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.76(prox=0.76) + 0.40×proc(0.622[fin=0.80,mean=0.35]) + 0.10×fmt(1.000) | pred='11' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:21,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.916 = 0.50×1.00(exact) + 0.40×proc(0.791[fin=0.96,mean=0.53]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:21,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.68(prox=0.68) + 0.40×proc(0.366[fin=0.28,mean=0.50]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:37:21,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.827[fin=1.00,mean=0.57]) + 0.10×fmt(1.000) | pred='24' gold='13' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:37:21,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.45(prox=0.45) + 0.40×proc(0.254[fin=0.14,mean=0.42]) + 0.10×fmt(1.000) | pred='5' gold='13' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:21,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.901 = 0.50×1.00(exact) + 0.40×proc(0.753[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 1 GRPO groups: 30%|### | 6/20 [00:51<01:38, 7.03s/q, loss=0.0000, mean_r=0.680, skip=2]
Iter 1 GRPO groups: 35%|###5 | 7/20 [00:51<01:34, 7.30s/q, loss=0.0000, mean_r=0.680, skip=2]2026-04-26 03:37:25,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,306 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 35%|###5 | 7/20 [00:55<01:34, 7.30s/q, loss=0var, mean_r=0.998, skip=3]
Iter 1 GRPO groups: 40%|#### | 8/20 [00:55<01:13, 6.15s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 03:37:31,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.656 = 0.50×0.50(prox=0.50) + 0.40×proc(0.764[fin=0.87,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='140' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:37:31,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.341 = 0.50×0.25(prox=0.25) + 0.40×proc(0.289[fin=0.32,mean=0.24]) + 0.10×fmt(1.000) | pred='-70' gold='140' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:37:31,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:32,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 40%|#### | 8/20 [01:02<01:13, 6.15s/q, loss=-0.0002, mean_r=0.898, skip=3]
Iter 1 GRPO groups: 45%|####5 | 9/20 [01:02<01:09, 6.33s/q, loss=-0.0002, mean_r=0.898, skip=3]2026-04-26 03:37:40,582 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.73(prox=0.73) + 0.40×proc(0.306[fin=0.10,mean=0.62]) + 0.10×fmt(1.000) | pred='885' gold='1085' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:37:40,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:37:40,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:40,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:40,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.630 = 0.50×0.46(prox=0.46) + 0.40×proc(0.751[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='445' gold='1085' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:37:41,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:41,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.81(prox=0.81) + 0.40×proc(0.786[fin=0.96,mean=0.52]) + 0.10×fmt(1.000) | pred='960' gold='1085' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:41,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:37:41,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:41,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 1 GRPO groups: 45%|####5 | 9/20 [01:11<01:09, 6.33s/q, loss=0.0013, mean_r=0.902, skip=3]
Iter 1 GRPO groups: 50%|##### | 10/20 [01:11<01:12, 7.26s/q, loss=0.0013, mean_r=0.902, skip=3]2026-04-26 03:37:45,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:45,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 50%|##### | 10/20 [01:14<01:12, 7.26s/q, loss=0var, mean_r=0.998, skip=4]
Iter 1 GRPO groups: 55%|#####5 | 11/20 [01:14<00:53, 5.97s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 03:37:49,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:49,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 55%|#####5 | 11/20 [01:19<00:53, 5.97s/q, loss=0var, mean_r=0.999, skip=5]
Iter 1 GRPO groups: 60%|###### | 12/20 [01:19<00:44, 5.62s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 03:37:54,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=1.00,mean=0.75]) + 0.10×fmt(0.650) | pred='78' gold='78' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:37:54,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×1.00(exact) + 0.40×proc(0.577[fin=0.75,mean=0.31]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:54,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.35(prox=0.35) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='150' gold='78' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:37:54,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.308 = 0.50×0.00(prox=0.00) + 0.40×proc(0.425[fin=0.46,mean=0.37]) + 0.10×fmt(1.000) | pred='475 5/6' gold='78' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:37:54,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.608 = 0.50×0.85(prox=0.85) + 0.40×proc(0.208[fin=0.06,mean=0.43]) + 0.10×fmt(1.000) | pred='84' gold='78' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:37:55,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:55,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 60%|###### | 12/20 [01:25<00:44, 5.62s/q, loss=0.0008, mean_r=0.829, skip=5]
Iter 1 GRPO groups: 65%|######5 | 13/20 [01:25<00:40, 5.77s/q, loss=0.0008, mean_r=0.829, skip=5]2026-04-26 03:38:00,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:01,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 65%|######5 | 13/20 [01:29<00:40, 5.77s/q, loss=0var, mean_r=0.999, skip=6]
Iter 1 GRPO groups: 70%|####### | 14/20 [01:29<00:31, 5.32s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 03:38:06,665 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:06,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:38:06,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:06,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.929 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:06,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.372 = 0.50×0.48(prox=0.48) + 0.40×proc(0.076[fin=0.10,mean=0.04]) + 0.10×fmt(1.000) | pred='2993' gold='6435' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:07,076 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.914 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:38:07,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=0.99,mean=0.85]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 1 GRPO groups: 70%|####### | 14/20 [01:37<00:31, 5.32s/q, loss=-0.0011, mean_r=0.894, skip=6]
Iter 1 GRPO groups: 75%|#######5 | 15/20 [01:37<00:30, 6.04s/q, loss=-0.0011, mean_r=0.894, skip=6]2026-04-26 03:38:12,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(1.000) | pred='120' gold='480' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:12,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.50(prox=0.50) + 0.40×proc(0.376[fin=0.42,mean=0.32]) + 0.10×fmt(1.000) | pred='240' gold='480' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:12,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.722 = 0.50×0.50(prox=0.50) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='240' gold='480' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:12,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.303 = 0.50×0.34(prox=0.34) + 0.40×proc(0.082[fin=0.01,mean=0.18]) + 0.10×fmt(1.000) | pred='16' gold='480' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:12,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1920' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:13,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.33(prox=0.33) + 0.40×proc(0.574[fin=0.67,mean=0.43]) + 0.10×fmt(1.000) | pred='960' gold='480' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 1 GRPO groups: 75%|#######5 | 15/20 [01:43<00:30, 6.04s/q, loss=0.0027, mean_r=0.711, skip=6]
Iter 1 GRPO groups: 80%|######## | 16/20 [01:43<00:23, 5.96s/q, loss=0.0027, mean_r=0.711, skip=6]2026-04-26 03:38:21,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.832 = 0.50×0.71(prox=0.71) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.71(prox=0.71) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.71(prox=0.71) + 0.40×proc(0.885[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,382 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.905[fin=0.96,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='400' | step_acc=89% lccp=22% (chain=2/9 ok_count=8) n_steps=9
+2026-04-26 03:38:21,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.782 = 0.50×0.71(prox=0.71) + 0.40×proc(0.812[fin=0.99,mean=0.54]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:38:21,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.71(prox=0.71) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.840 = 0.50×0.71(prox=0.71) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:21,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.71(prox=0.71) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:21,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.71(prox=0.71) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.839 = 0.50×0.71(prox=0.71) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 80%|######## | 16/20 [01:52<00:23, 5.96s/q, loss=-0.0013, mean_r=0.797, skip=6]
Iter 1 GRPO groups: 85%|########5 | 17/20 [01:52<00:20, 6.80s/q, loss=-0.0013, mean_r=0.797, skip=6]2026-04-26 03:38:25,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:26,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 85%|########5 | 17/20 [01:55<00:20, 6.80s/q, loss=0var, mean_r=0.994, skip=7]
Iter 1 GRPO groups: 90%|######### | 18/20 [01:55<00:11, 5.69s/q, loss=0var, mean_r=0.994, skip=7]2026-04-26 03:38:30,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:30,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:30,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:30,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:31,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:31,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 90%|######### | 18/20 [01:59<00:11, 5.69s/q, loss=0var, mean_r=0.997, skip=8]
Iter 1 GRPO groups: 95%|#########5| 19/20 [01:59<00:05, 5.40s/q, loss=0var, mean_r=0.997, skip=8]2026-04-26 03:38:36,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:36,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:36,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.761 = 0.50×0.60(prox=0.60) + 0.40×proc(0.901[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:38:36,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:36,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:37,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:37,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:38:37,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:38:37,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:37,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 1 GRPO groups: 95%|#########5| 19/20 [02:07<00:05, 5.40s/q, loss=0.0036, mean_r=0.976, skip=8]
Iter 1 GRPO groups: 100%|##########| 20/20 [02:07<00:00, 6.09s/q, loss=0.0036, mean_r=0.976, skip=8]
Iter 1 GRPO groups: 100%|##########| 20/20 [02:07<00:00, 6.38s/q, loss=0.0036, mean_r=0.976, skip=8]
+2026-04-26 03:38:38,866 INFO __main__ - Iter 1 | loss=0.0006 | reward mean=0.914 std=0.164 | gt_match=78.0% | grounded_acc=96.0% | step_acc=89.5% | lccp=81.4% | batch_acc=96.0% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=1.06e-06 | 127.6s
+2026-04-26 03:38:38,866 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=96.0% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:38:38,867 INFO __main__ - ======================================================================
+2026-04-26 03:38:38,867 INFO __main__ - GRPO ITERATION 2/60
+2026-04-26 03:38:38,867 INFO __main__ - ======================================================================
+2026-04-26 03:38:38,884 INFO __main__ - LR this iteration: 1.06e-06 | T=0.793 | MATH ratio=30%
+
Iter 2 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:38:43,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:38:43,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.769 = 0.50×0.62(prox=0.62) + 0.40×proc(0.892[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='13' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:38:43,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.62(prox=0.62) + 0.40×proc(0.844[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='13' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:38:43,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.71(prox=0.71) + 0.40×proc(0.218[fin=0.05,mean=0.47]) + 0.10×fmt(1.000) | pred='12' gold='10' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:38:44,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:44,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:44,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0.0004, mean_r=0.906, skip=0]
Iter 2 GRPO groups: 5%|5 | 1/20 [00:06<02:07, 6.72s/q, loss=0.0004, mean_r=0.906, skip=0]2026-04-26 03:38:49,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:50,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:50,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.20(prox=0.20) + 0.40×proc(0.633[fin=0.55,mean=0.76]) + 0.10×fmt(1.000) | pred='-193' gold='193' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 2 GRPO groups: 5%|5 | 1/20 [00:13<02:07, 6.72s/q, loss=-0.0008, mean_r=0.945, skip=0]
Iter 2 GRPO groups: 10%|# | 2/20 [00:13<01:58, 6.58s/q, loss=-0.0008, mean_r=0.945, skip=0]2026-04-26 03:38:59,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2,870' gold='2870' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:59,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:59,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:59,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:59,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.54(prox=0.54) + 0.40×proc(0.945[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='1670' gold='2870' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:00,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.570 = 0.50×0.38(prox=0.38) + 0.40×proc(0.697[fin=0.90,mean=0.39]) + 0.10×fmt(1.000) | pred='550' gold='2870' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 03:39:00,114 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:39:00,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.859 = 0.50×0.85(prox=0.85) + 0.40×proc(0.840[fin=0.94,mean=0.69]) + 0.10×fmt(1.000) | pred='3130' gold='2870' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:39:00,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:00,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.59(prox=0.59) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 10%|# | 2/20 [00:22<01:58, 6.58s/q, loss=0.0006, mean_r=0.894, skip=0]
Iter 2 GRPO groups: 15%|#5 | 3/20 [00:22<02:16, 8.02s/q, loss=0.0006, mean_r=0.894, skip=0]2026-04-26 03:39:08,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.94,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.67(prox=0.67) + 0.40×proc(0.894[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.525 = 0.50×0.00(prox=0.00) + 0.40×proc(0.875[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='5/2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:39:08,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.466 = 0.50×0.00(prox=0.00) + 0.40×proc(0.822[fin=0.98,mean=0.59]) + 0.10×fmt(1.000) | pred='14/3' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:39:09,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.00(prox=0.00) + 0.40×proc(0.911[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='11/6' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:39:09,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 2 GRPO groups: 15%|#5 | 3/20 [00:31<02:16, 8.02s/q, loss=0.0004, mean_r=0.820, skip=0]
Iter 2 GRPO groups: 20%|## | 4/20 [00:31<02:12, 8.29s/q, loss=0.0004, mean_r=0.820, skip=0]2026-04-26 03:39:14,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:14,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.35(prox=0.35) + 0.40×proc(0.205[fin=0.02,mean=0.49]) + 0.10×fmt(1.000) | pred='3' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:39:14,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:14,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.668[fin=0.78,mean=0.50]) + 0.10×fmt(1.000) | pred='72' gold='45' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:39:14,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:14,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.595 = 0.50×0.43(prox=0.43) + 0.40×proc(0.702[fin=0.93,mean=0.36]) + 0.10×fmt(1.000) | pred='15' gold='45' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:39:14,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.23(prox=0.23) + 0.40×proc(0.818[fin=0.95,mean=0.62]) + 0.10×fmt(1.000) | pred='120' gold='45' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:39:14,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:14,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.56(prox=0.56) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='27' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:14,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.411 = 0.50×0.41(prox=0.41) + 0.40×proc(0.168[fin=0.08,mean=0.31]) + 0.10×fmt(1.000) | pred='13' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 2 GRPO groups: 20%|## | 4/20 [00:37<02:12, 8.29s/q, loss=-0.0006, mean_r=0.722, skip=0]
Iter 2 GRPO groups: 25%|##5 | 5/20 [00:37<01:50, 7.36s/q, loss=-0.0006, mean_r=0.722, skip=0]2026-04-26 03:39:49,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.601 = 0.50×0.75(prox=0.75) + 0.40×proc(0.315[fin=0.17,mean=0.54]) + 0.10×fmt(1.000) | pred='5' gold='6' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 03:39:50,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.75(prox=0.75) + 0.40×proc(0.872[fin=0.93,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 03:39:50,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.564 = 0.50×0.33(prox=0.33) + 0.40×proc(0.743[fin=0.77,mean=0.70]) + 0.10×fmt(1.000) | pred='12' gold='6' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:39:50,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.75(prox=0.75) + 0.40×proc(0.701[fin=0.77,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:39:50,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:50,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.932 = 0.50×1.00(exact) + 0.40×proc(0.830[fin=0.97,mean=0.62]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:39:50,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×1.00(exact) + 0.40×proc(0.244[fin=0.01,mean=0.59]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:39:50,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.75(prox=0.75) + 0.40×proc(0.378[fin=0.34,mean=0.43]) + 0.10×fmt(1.000) | pred='5' gold='6' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 2 GRPO groups: 25%|##5 | 5/20 [01:12<01:50, 7.36s/q, loss=0.0004, mean_r=0.748, skip=0]
Iter 2 GRPO groups: 30%|### | 6/20 [01:12<03:57, 16.97s/q, loss=0.0004, mean_r=0.748, skip=0]2026-04-26 03:39:58,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,616 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×0.85(prox=0.85) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7.60' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:39:58,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×0.85(prox=0.85) + 0.40×proc(0.983[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='7.6' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:59,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:59,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:59,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 2 GRPO groups: 30%|### | 6/20 [01:21<03:57, 16.97s/q, loss=0.0005, mean_r=0.984, skip=0]
Iter 2 GRPO groups: 35%|###5 | 7/20 [01:21<03:05, 14.29s/q, loss=0.0005, mean_r=0.984, skip=0]2026-04-26 03:40:20,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 03:40:20,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.430 = 0.50×0.15(prox=0.15) + 0.40×proc(0.354[fin=0.09,mean=0.75]) + 0.10×fmt(1.000) | pred='290' gold='-150' | step_acc=77% lccp=77% (chain=10/13 ok_count=10) n_steps=13
+2026-04-26 03:40:20,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='150' gold='-150' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:40:20,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.389 = 0.50×0.33(prox=0.33) + 0.40×proc(0.253[fin=0.21,mean=0.32]) + 0.10×fmt(1.000) | pred='0' gold='-150' | step_acc=14% lccp=14% (chain=1/7 ok_count=1) n_steps=7
+2026-04-26 03:40:20,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.309 = 0.50×0.00(prox=0.00) + 0.40×proc(0.288[fin=0.03,mean=0.68]) + 0.10×fmt(1.000) | pred='78680' gold='-150' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 03:40:20,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:21,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:21,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.907[fin=0.93,mean=0.87]) + 0.10×fmt(1.000) | pred='0' gold='-150' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 03:40:21,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.02(prox=0.02) + 0.40×proc(0.756[fin=0.87,mean=0.58]) + 0.10×fmt(1.000) | pred='3975' gold='-150' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:40:21,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.02(prox=0.02) + 0.40×proc(0.927[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='4000' gold='-150' | step_acc=90% lccp=0% (chain=0/10 ok_count=9) n_steps=10
+
Iter 2 GRPO groups: 35%|###5 | 7/20 [01:43<03:05, 14.29s/q, loss=-0.0016, mean_r=0.610, skip=0]
Iter 2 GRPO groups: 40%|#### | 8/20 [01:43<03:21, 16.79s/q, loss=-0.0016, mean_r=0.610, skip=0]2026-04-26 03:40:25,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 40%|#### | 8/20 [01:47<03:21, 16.79s/q, loss=0var, mean_r=1.000, skip=1]
Iter 2 GRPO groups: 45%|####5 | 9/20 [01:47<02:18, 12.60s/q, loss=0var, mean_r=1.000, skip=1]2026-04-26 03:40:34,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:34,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:34,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.366 = 0.50×0.16(prox=0.16) + 0.40×proc(0.276[fin=0.12,mean=0.51]) + 0.10×fmt(1.000) | pred='453' gold='126' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:40:35,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.486 = 0.50×0.72(prox=0.72) + 0.40×proc(0.071[fin=0.05,mean=0.11]) + 0.10×fmt(1.000) | pred='151' gold='126' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:40:35,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.60(prox=0.60) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='168' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:40:35,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 45%|####5 | 9/20 [01:58<02:18, 12.60s/q, loss=0.0001, mean_r=0.854, skip=1]
Iter 2 GRPO groups: 50%|##### | 10/20 [01:58<02:00, 12.07s/q, loss=0.0001, mean_r=0.854, skip=1]2026-04-26 03:40:41,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.840 = 0.50×0.71(prox=0.71) + 0.40×proc(0.958[fin=0.97,mean=0.94]) + 0.10×fmt(1.000) | pred='200' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:41,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:40:41,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:41,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:41,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:42,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:42,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:42,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:40:42,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.831[fin=0.95,mean=0.66]) + 0.10×fmt(1.000) | pred='750' gold='250' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:40:42,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.56(prox=0.56) + 0.40×proc(0.248[fin=0.15,mean=0.39]) + 0.10×fmt(1.000) | pred='150' gold='250' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 2 GRPO groups: 50%|##### | 10/20 [02:04<02:00, 12.07s/q, loss=0.0008, mean_r=0.876, skip=1]
Iter 2 GRPO groups: 55%|#####5 | 11/20 [02:04<01:33, 10.42s/q, loss=0.0008, mean_r=0.876, skip=1]2026-04-26 03:40:48,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.295 = 0.50×0.14(prox=0.14) + 0.40×proc(0.216[fin=0.15,mean=0.32]) + 0.10×fmt(1.000) | pred='108' gold='27' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:40:48,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.521 = 0.50×0.08(prox=0.08) + 0.40×proc(0.833[fin=0.92,mean=0.70]) + 0.10×fmt(1.000) | pred='192' gold='27' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:40:48,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.07(prox=0.07) + 0.40×proc(0.847[fin=0.99,mean=0.63]) + 0.10×fmt(1.000) | pred='216' gold='27' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.842[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='64' gold='27' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.36(prox=0.36) + 0.40×proc(0.506[fin=0.52,mean=0.49]) + 0.10×fmt(1.000) | pred='3.375' gold='27' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:40:48,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 55%|#####5 | 11/20 [02:11<01:33, 10.42s/q, loss=-0.0015, mean_r=0.736, skip=1]
Iter 2 GRPO groups: 60%|###### | 12/20 [02:11<01:13, 9.22s/q, loss=-0.0015, mean_r=0.736, skip=1]2026-04-26 03:40:54,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.173 = 0.50×0.07(prox=0.07) + 0.40×proc(0.093[fin=0.09,mean=0.09]) + 0.10×fmt(1.000) | pred='300' gold='40' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:40:54,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 60%|###### | 12/20 [02:17<01:13, 9.22s/q, loss=0.0003, mean_r=0.917, skip=1]
Iter 2 GRPO groups: 65%|######5 | 13/20 [02:17<00:58, 8.30s/q, loss=0.0003, mean_r=0.917, skip=1]2026-04-26 03:41:03,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:41:03,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.65(prox=0.65) + 0.40×proc(0.281[fin=0.02,mean=0.68]) + 0.10×fmt(1.000) | pred='11' gold='15' | step_acc=70% lccp=10% (chain=1/10 ok_count=7) n_steps=10
+2026-04-26 03:41:03,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:41:03,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:04,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:41:04,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:41:04,312 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 2 GRPO groups: 65%|######5 | 13/20 [02:27<00:58, 8.30s/q, loss=0.0011, mean_r=0.953, skip=1]
Iter 2 GRPO groups: 70%|####### | 14/20 [02:27<00:51, 8.67s/q, loss=0.0011, mean_r=0.953, skip=1]2026-04-26 03:41:09,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.881 = 0.50×0.76(prox=0.76) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:09,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:09,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:09,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:41:09,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 70%|####### | 14/20 [02:32<00:51, 8.67s/q, loss=0.0001, mean_r=0.984, skip=1]
Iter 2 GRPO groups: 75%|#######5 | 15/20 [02:32<00:39, 7.83s/q, loss=0.0001, mean_r=0.984, skip=1]2026-04-26 03:41:30,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='154' gold='154' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:30,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.901[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='66' gold='154' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:41:30,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.858[fin=0.94,mean=0.73]) + 0.10×fmt(1.000) | pred='25' gold='154' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 03:41:30,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.945[fin=0.93,mean=0.97]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:31,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.972[fin=0.99,mean=0.95]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:31,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×0.85(prox=0.85) + 0.40×proc(0.849[fin=0.87,mean=0.81]) + 0.10×fmt(1.000) | pred='158' gold='154' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:41:31,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.00(prox=0.00) + 0.40×proc(0.710[fin=0.65,mean=0.80]) + 0.10×fmt(1.000) | pred='84 + 285i' gold='154' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:41:31,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='154' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:41:31,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.676[fin=0.79,mean=0.51]) + 0.10×fmt(1.000) | pred='338' gold='154' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:41:31,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.00(prox=0.00) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+
Iter 2 GRPO groups: 75%|#######5 | 15/20 [02:54<00:39, 7.83s/q, loss=-0.0003, mean_r=0.613, skip=1]
Iter 2 GRPO groups: 80%|######## | 16/20 [02:54<00:47, 11.85s/q, loss=-0.0003, mean_r=0.613, skip=1]2026-04-26 03:41:39,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:39,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.532 = 0.50×0.34(prox=0.34) + 0.40×proc(0.339[fin=0.02,mean=0.82]) + 0.10×fmt(1.000) | pred='1' gold='23' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 03:41:39,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 80%|######## | 16/20 [03:02<00:47, 11.85s/q, loss=-0.0009, mean_r=0.951, skip=1]
Iter 2 GRPO groups: 85%|########5 | 17/20 [03:02<00:32, 10.90s/q, loss=-0.0009, mean_r=0.951, skip=1]2026-04-26 03:41:43,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,467 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 2 GRPO groups: 85%|########5 | 17/20 [03:05<00:32, 10.90s/q, loss=0var, mean_r=0.999, skip=2]
Iter 2 GRPO groups: 90%|######### | 18/20 [03:05<00:17, 8.54s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 03:41:49,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=0.98,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.15(prox=0.15) + 0.40×proc(0.966[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 03:41:49,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:49,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.15(prox=0.15) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='30' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:49,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:50,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:50,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.494 = 0.50×0.15(prox=0.15) + 0.40×proc(0.570[fin=0.47,mean=0.71]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+
Iter 2 GRPO groups: 90%|######### | 18/20 [03:12<00:17, 8.54s/q, loss=0.0016, mean_r=0.808, skip=2]
Iter 2 GRPO groups: 95%|#########5| 19/20 [03:12<00:08, 8.03s/q, loss=0.0016, mean_r=0.808, skip=2]2026-04-26 03:41:56,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.604 = 0.50×0.50(prox=0.50) + 0.40×proc(0.635[fin=0.73,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.580 = 0.50×0.50(prox=0.50) + 0.40×proc(0.576[fin=0.77,mean=0.28]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.582 = 0.50×0.50(prox=0.50) + 0.40×proc(0.581[fin=0.77,mean=0.30]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.665 = 0.50×0.50(prox=0.50) + 0.40×proc(0.787[fin=0.98,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.33(prox=0.33) + 0.40×proc(0.762[fin=0.92,mean=0.52]) + 0.10×fmt(1.000) | pred='8' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.771[fin=0.97,mean=0.47]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.616 = 0.50×0.50(prox=0.50) + 0.40×proc(0.665[fin=0.84,mean=0.40]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.652 = 0.50×0.50(prox=0.50) + 0.40×proc(0.755[fin=0.93,mean=0.49]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.639 = 0.50×0.50(prox=0.50) + 0.40×proc(0.723[fin=0.91,mean=0.44]) + 0.10×fmt(1.000) | pred='6' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.366 = 0.50×0.05(prox=0.05) + 0.40×proc(0.530[fin=0.65,mean=0.35]) + 0.10×fmt(1.000) | pred='44' gold='4' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 2 GRPO groups: 95%|#########5| 19/20 [03:19<00:08, 8.03s/q, loss=-0.0012, mean_r=0.618, skip=2]
Iter 2 GRPO groups: 100%|##########| 20/20 [03:19<00:00, 7.67s/q, loss=-0.0012, mean_r=0.618, skip=2]
Iter 2 GRPO groups: 100%|##########| 20/20 [03:19<00:00, 9.97s/q, loss=-0.0012, mean_r=0.618, skip=2]
+2026-04-26 03:41:58,386 INFO __main__ - Iter 2 | loss=-0.0000 | reward mean=0.848 std=0.216 | gt_match=65.2% | grounded_acc=91.4% | step_acc=86.7% | lccp=76.5% | batch_acc=91.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=18 skipped=2(0var=2) | lr=1.63e-06 | 199.5s
+2026-04-26 03:41:58,386 INFO __main__ - ======================================================================
+2026-04-26 03:41:58,386 INFO __main__ - GRPO ITERATION 3/60
+2026-04-26 03:41:58,386 INFO __main__ - ======================================================================
+2026-04-26 03:41:58,404 INFO __main__ - LR this iteration: 1.63e-06 | T=0.786 | MATH ratio=30%
+
Iter 3 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:42:02,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.56(prox=0.56) + 0.40×proc(0.197[fin=0.09,mean=0.36]) + 0.10×fmt(1.000) | pred='56' gold='92' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:42:02,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:42:02,167 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.56(prox=0.56) + 0.40×proc(0.206[fin=0.10,mean=0.37]) + 0.10×fmt(1.000) | pred='56' gold='92' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:42:02,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 0%| | 0/20 [00:05, ?q/s, loss=0.0026, mean_r=0.883, skip=0]
Iter 3 GRPO groups: 5%|5 | 1/20 [00:05<01:48, 5.72s/q, loss=0.0026, mean_r=0.883, skip=0]2026-04-26 03:42:37,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:37,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.56(prox=0.56) + 0.40×proc(0.847[fin=0.89,mean=0.78]) + 0.10×fmt(1.000) | pred='15' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:42:37,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:42:38,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:38,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.38(prox=0.38) + 0.40×proc(0.221[fin=0.01,mean=0.54]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:42:38,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.306 = 0.50×0.38(prox=0.38) + 0.40×proc(0.035[fin=0.03,mean=0.04]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:42:38,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:38,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.548 = 0.50×0.56(prox=0.56) + 0.40×proc(0.426[fin=0.41,mean=0.45]) + 0.10×fmt(1.000) | pred='15' gold='25' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:42:38,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 3 GRPO groups: 5%|5 | 1/20 [00:41<01:48, 5.72s/q, loss=-0.0003, mean_r=0.776, skip=0]
Iter 3 GRPO groups: 10%|# | 2/20 [00:41<07:00, 23.33s/q, loss=-0.0003, mean_r=0.776, skip=0]2026-04-26 03:42:43,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:43,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.67(prox=0.67) + 0.40×proc(0.777[fin=0.95,mean=0.52]) + 0.10×fmt(1.000) | pred='15' gold='12' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:42:43,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:43,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.686 = 0.50×0.50(prox=0.50) + 0.40×proc(0.839[fin=0.97,mean=0.64]) + 0.10×fmt(1.000) | pred='6' gold='12' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:43,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.942[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 10%|# | 2/20 [00:46<07:00, 23.33s/q, loss=0.0006, mean_r=0.761, skip=0]
Iter 3 GRPO groups: 15%|#5 | 3/20 [00:46<04:17, 15.14s/q, loss=0.0006, mean_r=0.761, skip=0]2026-04-26 03:42:47,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:47,860 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:47,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 15%|#5 | 3/20 [00:50<04:17, 15.14s/q, loss=0var, mean_r=0.998, skip=1]
Iter 3 GRPO groups: 20%|## | 4/20 [00:50<02:47, 10.47s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:42:53,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×0.74(prox=0.74) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='46' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:53,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.844 = 0.50×0.76(prox=0.76) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='65' gold='56' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:42:53,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:42:53,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.364[fin=0.18,mean=0.64]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 03:42:53,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:54,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.672 = 0.50×0.40(prox=0.40) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:54,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.82(prox=0.82) + 0.40×proc(0.748[fin=0.96,mean=0.43]) + 0.10×fmt(1.000) | pred='62' gold='56' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:42:54,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.725 = 0.50×0.50(prox=0.50) + 0.40×proc(0.936[fin=0.99,mean=0.85]) + 0.10×fmt(1.000) | pred='28' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:54,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.67(prox=0.67) + 0.40×proc(0.886[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='42' gold='56' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:54,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 20%|## | 4/20 [00:57<02:47, 10.47s/q, loss=0.0008, mean_r=0.780, skip=1]
Iter 3 GRPO groups: 25%|##5 | 5/20 [00:57<02:19, 9.32s/q, loss=0.0008, mean_r=0.780, skip=1]2026-04-26 03:42:58,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:58,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:59,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 25%|##5 | 5/20 [01:01<02:19, 9.32s/q, loss=0var, mean_r=0.972, skip=2]
Iter 3 GRPO groups: 30%|### | 6/20 [01:01<01:43, 7.39s/q, loss=0var, mean_r=0.972, skip=2]2026-04-26 03:43:08,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.732 = 0.50×0.50(prox=0.50) + 0.40×proc(0.954[fin=0.94,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.719 = 0.50×0.50(prox=0.50) + 0.40×proc(0.922[fin=0.89,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:43:08,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 3 GRPO groups: 30%|### | 6/20 [01:12<01:43, 7.39s/q, loss=0.0005, mean_r=0.944, skip=2]
Iter 3 GRPO groups: 35%|###5 | 7/20 [01:12<01:52, 8.69s/q, loss=0.0005, mean_r=0.944, skip=2]2026-04-26 03:43:44,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:43:44,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:43:44,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,616 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.276 = 0.50×0.03(prox=0.03) + 0.40×proc(0.212[fin=0.03,mean=0.49]) + 0.10×fmt(1.000) | pred='190' gold='12' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:43:44,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:45,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 35%|###5 | 7/20 [01:47<01:52, 8.69s/q, loss=-0.0011, mean_r=0.901, skip=2]
Iter 3 GRPO groups: 40%|#### | 8/20 [01:47<03:27, 17.25s/q, loss=-0.0011, mean_r=0.901, skip=2]2026-04-26 03:43:52,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.85(prox=0.85) + 0.40×proc(0.638[fin=0.74,mean=0.49]) + 0.10×fmt(1.000) | pred='1980' gold='2020' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:43:52,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×0.75(prox=0.75) + 0.40×proc(0.561[fin=0.60,mean=0.50]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:43:52,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.851 = 0.50×0.81(prox=0.81) + 0.40×proc(0.868[fin=0.98,mean=0.69]) + 0.10×fmt(1.000) | pred='1780' gold='2020' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:43:52,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.75(prox=0.75) + 0.40×proc(0.853[fin=0.96,mean=0.69]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:43:53,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.864[fin=0.89,mean=0.83]) + 0.10×fmt(1.000) | pred='2180' gold='2020' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:43:53,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.823 = 0.50×0.81(prox=0.81) + 0.40×proc(0.796[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='1780' gold='2020' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:43:53,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.741[fin=0.79,mean=0.67]) + 0.10×fmt(1.000) | pred='3428' gold='2020' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:43:53,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.77(prox=0.77) + 0.40×proc(0.715[fin=0.85,mean=0.51]) + 0.10×fmt(1.000) | pred='1720' gold='2020' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 03:43:53,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.697 = 0.50×0.56(prox=0.56) + 0.40×proc(0.796[fin=0.81,mean=0.78]) + 0.10×fmt(1.000) | pred='1220' gold='2020' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:43:53,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.775 = 0.50×0.75(prox=0.75) + 0.40×proc(0.752[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 3 GRPO groups: 40%|#### | 8/20 [01:56<03:27, 17.25s/q, loss=-0.0006, mean_r=0.763, skip=2]
Iter 3 GRPO groups: 45%|####5 | 9/20 [01:56<02:39, 14.52s/q, loss=-0.0006, mean_r=0.763, skip=2]2026-04-26 03:43:58,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:58,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:43:59,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:59,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:59,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 45%|####5 | 9/20 [02:01<02:39, 14.52s/q, loss=0var, mean_r=0.977, skip=3]
Iter 3 GRPO groups: 50%|##### | 10/20 [02:01<01:54, 11.48s/q, loss=0var, mean_r=0.977, skip=3]2026-04-26 03:44:02,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:02,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 50%|##### | 10/20 [02:04<01:54, 11.48s/q, loss=0var, mean_r=0.993, skip=4]
Iter 3 GRPO groups: 55%|#####5 | 11/20 [02:04<01:21, 9.01s/q, loss=0var, mean_r=0.993, skip=4]2026-04-26 03:44:07,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 3 GRPO groups: 55%|#####5 | 11/20 [02:10<01:21, 9.01s/q, loss=0var, mean_r=0.998, skip=5]
Iter 3 GRPO groups: 60%|###### | 12/20 [02:10<01:04, 8.01s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 03:44:15,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.905 = 0.50×0.85(prox=0.85) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.85(prox=0.85) + 0.40×proc(0.896[fin=0.96,mean=0.80]) + 0.10×fmt(1.000) | pred='7600' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.910 = 0.50×0.85(prox=0.85) + 0.40×proc(0.962[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:15,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×0.85(prox=0.85) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8000' gold='7945' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:15,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.916 = 0.50×0.85(prox=0.85) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:44:15,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×0.85(prox=0.85) + 0.40×proc(0.850[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:44:15,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.910 = 0.50×0.85(prox=0.85) + 0.40×proc(0.963[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='7920' gold='7945' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+2026-04-26 03:44:15,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×0.85(prox=0.85) + 0.40×proc(0.908[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='8000' gold='7945' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:44:16,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.54(prox=0.54) + 0.40×proc(0.734[fin=0.83,mean=0.59]) + 0.10×fmt(1.000) | pred='4500' gold='7945' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 3 GRPO groups: 60%|###### | 12/20 [02:19<01:04, 8.01s/q, loss=-0.0006, mean_r=0.876, skip=5]
Iter 3 GRPO groups: 65%|######5 | 13/20 [02:19<00:57, 8.26s/q, loss=-0.0006, mean_r=0.876, skip=5]2026-04-26 03:44:25,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.703 = 0.50×0.50(prox=0.50) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='24' gold='16' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 03:44:25,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.50(prox=0.50) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='16' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:44:25,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:44:26,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 03:44:26,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 03:44:26,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.432 = 0.50×0.33(prox=0.33) + 0.40×proc(0.302[fin=0.11,mean=0.59]) + 0.10×fmt(1.000) | pred='0' gold='16' | step_acc=60% lccp=30% (chain=3/10 ok_count=6) n_steps=10
+2026-04-26 03:44:26,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.810[fin=0.99,mean=0.54]) + 0.10×fmt(1.000) | pred='28' gold='16' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+2026-04-26 03:44:26,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×1.00(exact) + 0.40×proc(0.522[fin=0.43,mean=0.67]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:44:26,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.53(prox=0.53) + 0.40×proc(0.352[fin=0.09,mean=0.75]) + 0.10×fmt(1.000) | pred='8.8' gold='16' | step_acc=80% lccp=60% (chain=6/10 ok_count=8) n_steps=10
+2026-04-26 03:44:26,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 65%|######5 | 13/20 [02:29<00:57, 8.26s/q, loss=-0.0012, mean_r=0.773, skip=5]
Iter 3 GRPO groups: 70%|####### | 14/20 [02:29<00:53, 8.92s/q, loss=-0.0012, mean_r=0.773, skip=5]2026-04-26 03:44:32,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,813 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,895 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:33,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.689 = 0.50×0.60(prox=0.60) + 0.40×proc(0.724[fin=0.95,mean=0.39]) + 0.10×fmt(1.000) | pred='90' gold='135' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 3 GRPO groups: 70%|####### | 14/20 [02:36<00:53, 8.92s/q, loss=0.0025, mean_r=0.969, skip=5]
Iter 3 GRPO groups: 75%|#######5 | 15/20 [02:36<00:40, 8.19s/q, loss=0.0025, mean_r=0.969, skip=5]2026-04-26 03:44:39,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:39,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:39,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:40,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 75%|#######5 | 15/20 [02:41<00:40, 8.19s/q, loss=0var, mean_r=0.992, skip=6]
Iter 3 GRPO groups: 80%|######## | 16/20 [02:41<00:29, 7.42s/q, loss=0var, mean_r=0.992, skip=6]2026-04-26 03:44:44,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:44,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,703 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 80%|######## | 16/20 [02:46<00:29, 7.42s/q, loss=0var, mean_r=0.997, skip=7]
Iter 3 GRPO groups: 85%|########5 | 17/20 [02:46<00:19, 6.60s/q, loss=0var, mean_r=0.997, skip=7]2026-04-26 03:44:47,629 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:44:47,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 85%|########5 | 17/20 [02:49<00:19, 6.60s/q, loss=0var, mean_r=0.996, skip=8]
Iter 3 GRPO groups: 90%|######### | 18/20 [02:49<00:11, 5.69s/q, loss=0var, mean_r=0.996, skip=8]2026-04-26 03:44:57,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.85(prox=0.85) + 0.40×proc(0.252[fin=0.34,mean=0.12]) + 0.10×fmt(1.000) | pred='675' gold='671' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:44:57,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='671' gold='671' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:57,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.479 = 0.50×0.45(prox=0.45) + 0.40×proc(0.384[fin=0.44,mean=0.30]) + 0.10×fmt(1.000) | pred='261' gold='671' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 03:44:57,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.43(prox=0.43) + 0.40×proc(0.638[fin=0.79,mean=0.40]) + 0.10×fmt(0.650) | pred='235' gold='671' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:44:57,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.632 = 0.50×0.43(prox=0.43) + 0.40×proc(0.788[fin=0.94,mean=0.56]) + 0.10×fmt(1.000) | pred='233' gold='671' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 03:44:57,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.613[fin=0.44,mean=0.88]) + 0.10×fmt(1.000) | pred='55' gold='671' | step_acc=86% lccp=86% (chain=6/7 ok_count=6) n_steps=7
+2026-04-26 03:44:57,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.607 = 0.50×0.43(prox=0.43) + 0.40×proc(0.729[fin=0.91,mean=0.46]) + 0.10×fmt(1.000) | pred='229' gold='671' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 03:44:57,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.469 = 0.50×0.35(prox=0.35) + 0.40×proc(0.388[fin=0.33,mean=0.47]) + 0.10×fmt(1.000) | pred='55' gold='671' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:44:57,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.512 = 0.50×0.59(prox=0.59) + 0.40×proc(0.288[fin=0.37,mean=0.17]) + 0.10×fmt(1.000) | pred='901' gold='671' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:44:57,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.805[fin=0.90,mean=0.67]) + 0.10×fmt(1.000) | pred='671' gold='671' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 3 GRPO groups: 90%|######### | 18/20 [03:00<00:11, 5.69s/q, loss=0.0002, mean_r=0.631, skip=8]
Iter 3 GRPO groups: 95%|#########5| 19/20 [03:00<00:07, 7.24s/q, loss=0.0002, mean_r=0.631, skip=8]2026-04-26 03:45:06,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:45:06,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:06,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.435 = 0.50×0.21(prox=0.21) + 0.40×proc(0.509[fin=0.62,mean=0.33]) + 0.10×fmt(1.000) | pred='100' gold='35' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 03:45:06,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 95%|#########5| 19/20 [03:09<00:07, 7.24s/q, loss=0.0011, mean_r=0.935, skip=8]
Iter 3 GRPO groups: 100%|##########| 20/20 [03:09<00:00, 7.77s/q, loss=0.0011, mean_r=0.935, skip=8]
Iter 3 GRPO groups: 100%|##########| 20/20 [03:09<00:00, 9.49s/q, loss=0.0011, mean_r=0.935, skip=8]
+2026-04-26 03:45:08,223 INFO __main__ - Iter 3 | loss=0.0004 | reward mean=0.896 std=0.171 | gt_match=70.7% | grounded_acc=95.5% | step_acc=87.7% | lccp=76.5% | batch_acc=95.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=2.19e-06 | 189.8s
+2026-04-26 03:45:08,223 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=95.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:45:08,224 INFO __main__ - ======================================================================
+2026-04-26 03:45:08,224 INFO __main__ - GRPO ITERATION 4/60
+2026-04-26 03:45:08,224 INFO __main__ - ======================================================================
+2026-04-26 03:45:08,240 INFO __main__ - LR this iteration: 2.19e-06 | T=0.780 | MATH ratio=30%
+
Iter 4 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:45:12,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.772 = 0.50×0.67(prox=0.67) + 0.40×proc(0.848[fin=0.98,mean=0.65]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:45:12,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=0.97,mean=0.89]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.793 = 0.50×0.67(prox=0.67) + 0.40×proc(0.899[fin=0.95,mean=0.82]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:45:12,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.600 = 0.50×0.67(prox=0.67) + 0.40×proc(0.416[fin=0.31,mean=0.58]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:45:12,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.704 = 0.50×0.67(prox=0.67) + 0.40×proc(0.677[fin=0.68,mean=0.67]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:45:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×1.00(exact) + 0.40×proc(0.794[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:12,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:13,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.692 = 0.50×0.67(prox=0.67) + 0.40×proc(0.648[fin=0.63,mean=0.67]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 4 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0.0004, mean_r=0.845, skip=0]
Iter 4 GRPO groups: 5%|5 | 1/20 [00:06<01:58, 6.22s/q, loss=0.0004, mean_r=0.845, skip=0]2026-04-26 03:45:26,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.32(prox=0.32) + 0.40×proc(0.401[fin=0.25,mean=0.63]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:26,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.32(prox=0.32) + 0.40×proc(0.707[fin=0.85,mean=0.50]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=44% lccp=11% (chain=1/9 ok_count=4) n_steps=9
+2026-04-26 03:45:27,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.779[fin=0.83,mean=0.70]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:45:27,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.226 = 0.50×0.00(prox=0.00) + 0.40×proc(0.295[fin=0.05,mean=0.66]) + 0.10×fmt(0.700) | pred='' gold='24' | step_acc=62% lccp=25% (chain=2/8 ok_count=5) n_steps=8
+2026-04-26 03:45:27,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.32(prox=0.32) + 0.40×proc(0.464[fin=0.40,mean=0.56]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 03:45:27,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.892 = 0.50×1.00(exact) + 0.40×proc(0.730[fin=0.77,mean=0.66]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:45:27,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.876[fin=0.91,mean=0.83]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:45:27,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.791[fin=0.91,mean=0.62]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:45:27,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.761[fin=0.86,mean=0.62]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 03:45:27,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.46(prox=0.46) + 0.40×proc(0.404[fin=0.34,mean=0.50]) + 0.10×fmt(1.000) | pred='10' gold='24' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+
Iter 4 GRPO groups: 5%|5 | 1/20 [00:20<01:58, 6.22s/q, loss=-0.0004, mean_r=0.575, skip=0]
Iter 4 GRPO groups: 10%|# | 2/20 [00:20<03:21, 11.18s/q, loss=-0.0004, mean_r=0.575, skip=0]2026-04-26 03:45:39,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.08(prox=0.08) + 0.40×proc(0.884[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='-10' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:45:39,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.12(prox=0.12) + 0.40×proc(0.849[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='-5' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:45:40,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.363 = 0.50×0.00(prox=0.00) + 0.40×proc(0.658[fin=0.83,mean=0.40]) + 0.10×fmt(1.000) | pred='9/2' gold='2' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:45:40,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.12(prox=0.12) + 0.40×proc(0.754[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='-5' gold='2' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 03:45:40,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.09(prox=0.09) + 0.40×proc(0.351[fin=0.47,mean=0.17]) + 0.10×fmt(1.000) | pred='12.5' gold='2' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:45:40,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:40,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.502 = 0.50×0.08(prox=0.08) + 0.40×proc(0.834[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='-10' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:45:40,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:40,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:40,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.486 = 0.50×0.07(prox=0.07) + 0.40×proc(0.880[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='-12' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 4 GRPO groups: 10%|# | 2/20 [00:33<03:21, 11.18s/q, loss=0.0022, mean_r=0.568, skip=0]
Iter 4 GRPO groups: 15%|#5 | 3/20 [00:33<03:23, 11.99s/q, loss=0.0022, mean_r=0.568, skip=0]2026-04-26 03:45:45,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 15%|#5 | 3/20 [00:37<03:23, 11.99s/q, loss=0var, mean_r=0.994, skip=1]
Iter 4 GRPO groups: 20%|## | 4/20 [00:37<02:19, 8.73s/q, loss=0var, mean_r=0.994, skip=1]2026-04-26 03:45:49,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:49,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:49,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:49,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:50,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 20%|## | 4/20 [00:41<02:19, 8.73s/q, loss=0var, mean_r=0.994, skip=2]
Iter 4 GRPO groups: 25%|##5 | 5/20 [00:41<01:46, 7.12s/q, loss=0var, mean_r=0.994, skip=2]2026-04-26 03:45:54,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.736[fin=0.88,mean=0.51]) + 0.10×fmt(1.000) | pred='11' gold='6' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:54,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 25%|##5 | 5/20 [00:48<01:46, 7.12s/q, loss=0.0007, mean_r=0.954, skip=2]
Iter 4 GRPO groups: 30%|### | 6/20 [00:48<01:35, 6.82s/q, loss=0.0007, mean_r=0.954, skip=2]2026-04-26 03:46:29,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.453 = 0.50×0.00(prox=0.00) + 0.40×proc(0.689[fin=0.62,mean=0.79]) + 0.10×fmt(0.700) | pred='' gold='4' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:46:30,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.67(prox=0.67) + 0.40×proc(0.482[fin=0.32,mean=0.73]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=70% lccp=60% (chain=6/10 ok_count=7) n_steps=10
+2026-04-26 03:46:30,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.67(prox=0.67) + 0.40×proc(0.757[fin=0.86,mean=0.61]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 03:46:30,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.872[fin=0.95,mean=0.76]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:30,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.67(prox=0.67) + 0.40×proc(0.843[fin=0.95,mean=0.68]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=67% lccp=0% (chain=0/9 ok_count=6) n_steps=9
+2026-04-26 03:46:30,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.67(prox=0.67) + 0.40×proc(0.285[fin=0.10,mean=0.57]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:46:30,554 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.67(prox=0.67) + 0.40×proc(0.288[fin=0.12,mean=0.55]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 03:46:30,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.806 = 0.50×0.67(prox=0.67) + 0.40×proc(0.931[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=88% lccp=6% (chain=1/16 ok_count=14) n_steps=16
+2026-04-26 03:46:30,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.533 = 0.50×0.67(prox=0.67) + 0.40×proc(0.248[fin=0.10,mean=0.47]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 4 GRPO groups: 30%|### | 6/20 [01:23<01:35, 6.82s/q, loss=0.0007, mean_r=0.663, skip=2]
Iter 4 GRPO groups: 35%|###5 | 7/20 [01:23<03:32, 16.32s/q, loss=0.0007, mean_r=0.663, skip=2]2026-04-26 03:46:35,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 35%|###5 | 7/20 [01:27<03:32, 16.32s/q, loss=0var, mean_r=0.998, skip=3]
Iter 4 GRPO groups: 40%|#### | 8/20 [01:27<02:27, 12.30s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 03:46:42,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:46:42,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.821[fin=0.93,mean=0.66]) + 0.10×fmt(1.000) | pred='24' gold='49' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 03:46:42,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.83(prox=0.83) + 0.40×proc(0.900[fin=0.98,mean=0.78]) + 0.10×fmt(1.000) | pred='54' gold='49' | step_acc=80% lccp=70% (chain=7/10 ok_count=8) n_steps=10
+2026-04-26 03:46:42,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:42,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:42,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.466 = 0.50×0.39(prox=0.39) + 0.40×proc(0.304[fin=0.20,mean=0.47]) + 0.10×fmt(1.000) | pred='10.5' gold='49' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:46:42,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 4 GRPO groups: 40%|#### | 8/20 [01:36<02:27, 12.30s/q, loss=0.0013, mean_r=0.888, skip=3]
Iter 4 GRPO groups: 45%|####5 | 9/20 [01:36<02:02, 11.11s/q, loss=0.0013, mean_r=0.888, skip=3]2026-04-26 03:46:47,097 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,252 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×1.00(exact) + 0.40×proc(0.728[fin=0.95,mean=0.39]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:46:47,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×1.00(exact) + 0.40×proc(0.613[fin=0.82,mean=0.30]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 4 GRPO groups: 45%|####5 | 9/20 [01:40<02:02, 11.11s/q, loss=0.0005, mean_r=0.972, skip=3]
Iter 4 GRPO groups: 50%|##### | 10/20 [01:40<01:31, 9.18s/q, loss=0.0005, mean_r=0.972, skip=3]2026-04-26 03:46:51,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,869 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 50%|##### | 10/20 [01:43<01:31, 9.18s/q, loss=0var, mean_r=0.997, skip=4]
Iter 4 GRPO groups: 55%|#####5 | 11/20 [01:43<01:04, 7.18s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 03:46:55,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:46:55,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:55,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:46:55,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:55,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:46:55,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:46:55,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:46:56,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:46:56,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:46:56,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 55%|#####5 | 11/20 [01:47<01:04, 7.18s/q, loss=0var, mean_r=0.985, skip=5]
Iter 4 GRPO groups: 60%|###### | 12/20 [01:47<00:50, 6.30s/q, loss=0var, mean_r=0.985, skip=5]2026-04-26 03:47:00,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:47:00,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:01,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:01,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:01,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 4 GRPO groups: 60%|###### | 12/20 [01:52<00:50, 6.30s/q, loss=0var, mean_r=0.993, skip=6]
Iter 4 GRPO groups: 65%|######5 | 13/20 [01:52<00:41, 5.92s/q, loss=0var, mean_r=0.993, skip=6]2026-04-26 03:47:07,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.537 = 0.50×0.50(prox=0.50) + 0.40×proc(0.468[fin=0.28,mean=0.76]) + 0.10×fmt(1.000) | pred='13' gold='26' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 03:47:07,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.22(prox=0.22) + 0.40×proc(0.516[fin=0.43,mean=0.65]) + 0.10×fmt(1.000) | pred='72' gold='26' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 03:47:07,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:07,946 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:08,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.612 = 0.50×0.50(prox=0.50) + 0.40×proc(0.656[fin=0.49,mean=0.91]) + 0.10×fmt(1.000) | pred='13' gold='26' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 03:47:08,115 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.628[fin=0.65,mean=0.59]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 03:47:08,201 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.474[fin=0.43,mean=0.54]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:47:08,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:08,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.57(prox=0.57) + 0.40×proc(0.945[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='16' gold='26' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:47:08,467 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.889[fin=0.93,mean=0.83]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 4 GRPO groups: 65%|######5 | 13/20 [02:01<00:41, 5.92s/q, loss=0.0008, mean_r=0.704, skip=6]
Iter 4 GRPO groups: 70%|####### | 14/20 [02:01<00:40, 6.76s/q, loss=0.0008, mean_r=0.704, skip=6]2026-04-26 03:47:12,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:12,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:12,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 70%|####### | 14/20 [02:04<00:40, 6.76s/q, loss=0var, mean_r=0.998, skip=7]
Iter 4 GRPO groups: 75%|#######5 | 15/20 [02:04<00:28, 5.60s/q, loss=0var, mean_r=0.998, skip=7]2026-04-26 03:47:46,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.499 = 0.50×0.35(prox=0.35) + 0.40×proc(0.653[fin=0.77,mean=0.48]) + 0.10×fmt(0.650) | pred='120' gold='2220' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:47:46,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.253 = 0.50×0.36(prox=0.36) + 0.40×proc(0.026[fin=0.01,mean=0.05]) + 0.10×fmt(0.650) | pred='210' gold='2220' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:47:46,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.094 = 0.50×0.01(prox=0.01) + 0.40×proc(0.065[fin=0.07,mean=0.05]) + 0.10×fmt(0.650) | pred='200200' gold='2220' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:47:46,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.37(prox=0.37) + 0.40×proc(0.276[fin=0.09,mean=0.55]) + 0.10×fmt(1.000) | pred='300' gold='2220' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:47:46,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.541 = 0.50×0.35(prox=0.35) + 0.40×proc(0.445[fin=0.30,mean=0.66]) + 0.10×fmt(1.000) | pred='120' gold='2220' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:47:46,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.37(prox=0.37) + 0.40×proc(0.304[fin=0.15,mean=0.53]) + 0.10×fmt(1.000) | pred='300' gold='2220' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:47:46,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×1.00(exact) + 0.40×proc(0.105[fin=0.11,mean=0.09]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:47:46,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:47:47,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 75%|#######5 | 15/20 [02:40<00:28, 5.60s/q, loss=0.0011, mean_r=0.548, skip=7]
Iter 4 GRPO groups: 80%|######## | 16/20 [02:40<00:58, 14.62s/q, loss=0.0011, mean_r=0.548, skip=7]2026-04-26 03:47:53,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:53,774 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.291 = 0.50×0.14(prox=0.14) + 0.40×proc(0.149[fin=0.01,mean=0.36]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:47:53,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.50(prox=0.50) + 0.40×proc(0.155[fin=0.06,mean=0.29]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:47:53,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.432 = 0.50×0.50(prox=0.50) + 0.40×proc(0.205[fin=0.07,mean=0.41]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:47:54,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.425 = 0.50×0.33(prox=0.33) + 0.40×proc(0.208[fin=0.02,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:47:54,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.50(prox=0.50) + 0.40×proc(0.249[fin=0.20,mean=0.32]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 03:47:54,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.50(prox=0.50) + 0.40×proc(0.206[fin=0.14,mean=0.31]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 03:47:54,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:54,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=0.99,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:54,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×1.00(exact) + 0.40×proc(0.529[fin=0.52,mean=0.55]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 4 GRPO groups: 80%|######## | 16/20 [02:47<00:58, 14.62s/q, loss=0.0012, mean_r=0.625, skip=7]
Iter 4 GRPO groups: 85%|########5 | 17/20 [02:47<00:37, 12.46s/q, loss=0.0012, mean_r=0.625, skip=7]2026-04-26 03:47:59,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:47:59,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.643 = 0.50×0.70(prox=0.70) + 0.40×proc(0.485[fin=0.49,mean=0.48]) + 0.10×fmt(1.000) | pred='51' gold='65' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:47:59,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:59,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 85%|########5 | 17/20 [02:53<00:37, 12.46s/q, loss=0.0019, mean_r=0.959, skip=7]
Iter 4 GRPO groups: 90%|######### | 18/20 [02:53<00:20, 10.42s/q, loss=0.0019, mean_r=0.959, skip=7]2026-04-26 03:48:04,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:04,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:04,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 90%|######### | 18/20 [02:57<00:20, 10.42s/q, loss=0var, mean_r=0.997, skip=8]
Iter 4 GRPO groups: 95%|#########5| 19/20 [02:57<00:08, 8.50s/q, loss=0var, mean_r=0.997, skip=8]2026-04-26 03:48:09,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:09,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:10,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:10,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 95%|#########5| 19/20 [03:02<00:08, 8.50s/q, loss=0var, mean_r=1.000, skip=9]
Iter 4 GRPO groups: 100%|##########| 20/20 [03:02<00:00, 7.39s/q, loss=0var, mean_r=1.000, skip=9]
Iter 4 GRPO groups: 100%|##########| 20/20 [03:02<00:00, 9.10s/q, loss=0var, mean_r=1.000, skip=9]
+2026-04-26 03:48:10,349 INFO __main__ - Iter 4 | loss=0.0009 | reward mean=0.865 std=0.219 | gt_match=73.2% | grounded_acc=89.4% | step_acc=85.9% | lccp=76.5% | batch_acc=89.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=11 skipped=9(0var=9) | lr=2.75e-06 | 182.1s
+2026-04-26 03:48:10,349 WARNING __main__ - STARVATION: 45% of groups skipped (zero variance). grounded_acc=89.4% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:48:10,350 INFO __main__ - ======================================================================
+2026-04-26 03:48:10,350 INFO __main__ - GRPO ITERATION 5/60
+2026-04-26 03:48:10,350 INFO __main__ - ======================================================================
+2026-04-26 03:48:10,367 INFO __main__ - LR this iteration: 2.75e-06 | T=0.773 | MATH ratio=30%
+
Iter 5 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:48:16,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.815[fin=0.99,mean=0.56]) + 0.10×fmt(1.000) | pred='4' gold='35' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:48:17,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:48:17,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.261 = 0.50×0.00(prox=0.00) + 0.40×proc(0.310[fin=0.29,mean=0.34]) + 0.10×fmt(1.000) | pred='350000' gold='35' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:48:17,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.387 = 0.50×0.01(prox=0.01) + 0.40×proc(0.618[fin=0.72,mean=0.47]) + 0.10×fmt(1.000) | pred='3500' gold='35' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:48:17,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 0%| | 0/20 [00:08, ?q/s, loss=-0.0006, mean_r=0.809, skip=0]
Iter 5 GRPO groups: 5%|5 | 1/20 [00:08<02:49, 8.90s/q, loss=-0.0006, mean_r=0.809, skip=0]2026-04-26 03:48:28,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.60(prox=0.60) + 0.40×proc(0.407[fin=0.54,mean=0.21]) + 0.10×fmt(1.000) | pred='54' gold='81' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:48:28,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:48:28,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:29,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:29,153 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:29,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.06(prox=0.06) + 0.40×proc(0.613[fin=0.64,mean=0.57]) + 0.10×fmt(1.000) | pred='729' gold='81' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 03:48:29,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.14(prox=0.14) + 0.40×proc(0.709[fin=0.86,mean=0.48]) + 0.10×fmt(1.000) | pred='324' gold='81' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 5 GRPO groups: 5%|5 | 1/20 [00:20<02:49, 8.90s/q, loss=-0.0022, mean_r=0.844, skip=0]
Iter 5 GRPO groups: 10%|# | 2/20 [00:20<03:07, 10.43s/q, loss=-0.0022, mean_r=0.844, skip=0]2026-04-26 03:48:34,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:34,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.85(prox=0.85) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='52' gold='48' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:48:35,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.85(prox=0.85) + 0.40×proc(0.746[fin=0.74,mean=0.76]) + 0.10×fmt(1.000) | pred='44' gold='48' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.840[fin=0.87,mean=0.79]) + 0.10×fmt(1.000) | pred='44' gold='48' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 10%|# | 2/20 [00:26<03:07, 10.43s/q, loss=0.0001, mean_r=0.954, skip=0]
Iter 5 GRPO groups: 15%|#5 | 3/20 [00:26<02:25, 8.54s/q, loss=0.0001, mean_r=0.954, skip=0]2026-04-26 03:48:41,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,674 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.85(prox=0.85) + 0.40×proc(0.713[fin=0.90,mean=0.43]) + 0.10×fmt(1.000) | pred='20' gold='21' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:48:41,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:42,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:42,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:42,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:48:42,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 5 GRPO groups: 15%|#5 | 3/20 [00:33<02:25, 8.54s/q, loss=-0.0013, mean_r=0.981, skip=0]
Iter 5 GRPO groups: 20%|## | 4/20 [00:33<02:04, 7.78s/q, loss=-0.0013, mean_r=0.981, skip=0]2026-04-26 03:48:47,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:47,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:48,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 5 GRPO groups: 20%|## | 4/20 [00:38<02:04, 7.78s/q, loss=0var, mean_r=0.998, skip=1]
Iter 5 GRPO groups: 25%|##5 | 5/20 [00:38<01:40, 6.68s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:48:52,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:52,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.688 = 0.50×0.81(prox=0.81) + 0.40×proc(0.462[fin=0.33,mean=0.67]) + 0.10×fmt(1.000) | pred='22' gold='25' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:48:52,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.788[fin=0.80,mean=0.77]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:52,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:53,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 25%|##5 | 5/20 [00:44<01:40, 6.68s/q, loss=0.0011, mean_r=0.924, skip=1]
Iter 5 GRPO groups: 30%|### | 6/20 [00:44<01:30, 6.47s/q, loss=0.0011, mean_r=0.924, skip=1]2026-04-26 03:49:00,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:49:01,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 5 GRPO groups: 30%|### | 6/20 [00:51<01:30, 6.47s/q, loss=0var, mean_r=0.998, skip=2]
Iter 5 GRPO groups: 35%|###5 | 7/20 [00:51<01:27, 6.72s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 03:49:05,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.41(prox=0.41) + 0.40×proc(0.420[fin=0.38,mean=0.48]) + 0.10×fmt(1.000) | pred='7' gold='25' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:49:05,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=0.94,mean=0.93]) + 0.10×fmt(1.000) | pred='23' gold='25' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:49:05,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.277 = 0.50×0.11(prox=0.11) + 0.40×proc(0.308[fin=0.40,mean=0.17]) + 0.10×fmt(1.000) | pred='128' gold='25' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:49:05,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.282 = 0.50×0.20(prox=0.20) + 0.40×proc(0.201[fin=0.11,mean=0.34]) + 0.10×fmt(1.000) | pred='74' gold='25' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:49:06,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.862 = 0.50×0.85(prox=0.85) + 0.40×proc(0.843[fin=0.89,mean=0.77]) + 0.10×fmt(1.000) | pred='23' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:49:06,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.41(prox=0.41) + 0.40×proc(0.466[fin=0.56,mean=0.33]) + 0.10×fmt(1.000) | pred='7' gold='25' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:49:06,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.71(prox=0.71) + 0.40×proc(0.837[fin=0.88,mean=0.77]) + 0.10×fmt(1.000) | pred='30' gold='25' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:49:06,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.319 = 0.50×0.06(prox=0.06) + 0.40×proc(0.476[fin=0.62,mean=0.25]) + 0.10×fmt(1.000) | pred='233' gold='25' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 03:49:06,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.856 = 0.50×0.71(prox=0.71) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:06,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.58(prox=0.58) + 0.40×proc(0.833[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='16' gold='25' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+
Iter 5 GRPO groups: 35%|###5 | 7/20 [00:57<01:27, 6.72s/q, loss=0.0037, mean_r=0.598, skip=2]
Iter 5 GRPO groups: 40%|#### | 8/20 [00:57<01:18, 6.54s/q, loss=0.0037, mean_r=0.598, skip=2]2026-04-26 03:49:17,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.85(prox=0.85) + 0.40×proc(0.442[fin=0.24,mean=0.75]) + 0.10×fmt(1.000) | pred='50' gold='49' | step_acc=75% lccp=75% (chain=6/8 ok_count=6) n_steps=8
+2026-04-26 03:49:17,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:17,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:18,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:18,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:18,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.720 = 0.50×0.64(prox=0.64) + 0.40×proc(0.756[fin=0.81,mean=0.68]) + 0.10×fmt(1.000) | pred='63' gold='49' | step_acc=78% lccp=0% (chain=0/9 ok_count=7) n_steps=9
+
Iter 5 GRPO groups: 40%|#### | 8/20 [01:09<01:18, 6.54s/q, loss=-0.0000, mean_r=0.940, skip=2]
Iter 5 GRPO groups: 45%|####5 | 9/20 [01:09<01:30, 8.18s/q, loss=-0.0000, mean_r=0.940, skip=2]2026-04-26 03:49:25,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.950[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:25,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:25,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:26,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.803[fin=0.88,mean=0.70]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:49:26,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:49:26,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.584 = 0.50×0.73(prox=0.73) + 0.40×proc(0.294[fin=0.40,mean=0.14]) + 0.10×fmt(1.000) | pred='9' gold='11' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:49:26,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:26,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.48(prox=0.48) + 0.40×proc(0.806[fin=1.00,mean=0.52]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:49:26,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:49:26,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.48(prox=0.48) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+
Iter 5 GRPO groups: 45%|####5 | 9/20 [01:17<01:30, 8.18s/q, loss=-0.0017, mean_r=0.796, skip=2]
Iter 5 GRPO groups: 50%|##### | 10/20 [01:17<01:22, 8.23s/q, loss=-0.0017, mean_r=0.796, skip=2]2026-04-26 03:49:42,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:42,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:42,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='9' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 03:49:42,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.905[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='27' gold='9' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:49:42,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.896[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='27' gold='9' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 5 GRPO groups: 50%|##### | 10/20 [01:33<01:22, 8.23s/q, loss=-0.0011, mean_r=0.763, skip=2]
Iter 5 GRPO groups: 55%|#####5 | 11/20 [01:33<01:36, 10.70s/q, loss=-0.0011, mean_r=0.763, skip=2]2026-04-26 03:49:48,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,212 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 5 GRPO groups: 55%|#####5 | 11/20 [01:38<01:36, 10.70s/q, loss=0var, mean_r=0.993, skip=3]
Iter 5 GRPO groups: 60%|###### | 12/20 [01:38<01:10, 8.83s/q, loss=0var, mean_r=0.993, skip=3]2026-04-26 03:50:02,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×1.00(exact) + 0.40×proc(0.721[fin=0.75,mean=0.68]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:50:02,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.33(prox=0.33) + 0.40×proc(0.340[fin=0.26,mean=0.46]) + 0.10×fmt(1.000) | pred='1' gold='0' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:50:02,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.00(prox=0.00) + 0.40×proc(0.421[fin=0.24,mean=0.69]) + 0.10×fmt(0.700) | pred='' gold='0' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 03:50:02,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:02,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.391 = 0.50×0.00(prox=0.00) + 0.40×proc(0.728[fin=0.83,mean=0.58]) + 0.10×fmt(0.700) | pred='' gold='0' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:50:02,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:50:02,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:02,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:02,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:50:02,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.00(prox=0.00) + 0.40×proc(0.882[fin=0.98,mean=0.73]) + 0.10×fmt(1.000) | pred='$1 - \\sqrt{3}$' gold='0' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+
Iter 5 GRPO groups: 60%|###### | 12/20 [01:54<01:10, 8.83s/q, loss=-0.0012, mean_r=0.752, skip=3]
Iter 5 GRPO groups: 65%|######5 | 13/20 [01:54<01:16, 10.87s/q, loss=-0.0012, mean_r=0.752, skip=3]2026-04-26 03:50:12,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,361 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=0.99,mean=0.81]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:50:12,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.60(prox=0.60) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='360' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,956 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 5 GRPO groups: 65%|######5 | 13/20 [02:04<01:16, 10.87s/q, loss=0.0008, mean_r=0.974, skip=3]
Iter 5 GRPO groups: 70%|####### | 14/20 [02:04<01:03, 10.62s/q, loss=0.0008, mean_r=0.974, skip=3]2026-04-26 03:50:19,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.78(prox=0.78) + 0.40×proc(0.765[fin=0.89,mean=0.58]) + 0.10×fmt(1.000) | pred='16' gold='14' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:50:19,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:19,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.665 = 0.50×0.41(prox=0.41) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='4' gold='14' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:50:19,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.58(prox=0.58) + 0.40×proc(0.387[fin=0.35,mean=0.44]) + 0.10×fmt(1.000) | pred='19' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:50:19,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.848 = 0.50×1.00(exact) + 0.40×proc(0.620[fin=0.67,mean=0.55]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:50:19,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:19,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.523 = 0.50×0.44(prox=0.44) + 0.40×proc(0.510[fin=0.66,mean=0.28]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:50:19,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:19,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:19,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.508 = 0.50×0.44(prox=0.44) + 0.40×proc(0.348[fin=0.35,mean=0.34]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 5 GRPO groups: 70%|####### | 14/20 [02:10<01:03, 10.62s/q, loss=0.0019, mean_r=0.788, skip=3]
Iter 5 GRPO groups: 75%|#######5 | 15/20 [02:10<00:47, 9.47s/q, loss=0.0019, mean_r=0.788, skip=3]2026-04-26 03:50:24,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:50:24,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:50:24,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:24,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:24,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 5 GRPO groups: 75%|#######5 | 15/20 [02:14<00:47, 9.47s/q, loss=0var, mean_r=0.989, skip=4]
Iter 5 GRPO groups: 80%|######## | 16/20 [02:14<00:31, 7.84s/q, loss=0var, mean_r=0.989, skip=4]2026-04-26 03:50:32,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:32,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:50:32,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.310 = 0.50×0.17(prox=0.17) + 0.40×proc(0.271[fin=0.18,mean=0.40]) + 0.10×fmt(1.000) | pred='7' gold='2' | step_acc=38% lccp=12% (chain=1/8 ok_count=3) n_steps=8
+2026-04-26 03:50:33,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 03:50:33,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:50:33,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+
Iter 5 GRPO groups: 80%|######## | 16/20 [02:24<00:31, 7.84s/q, loss=0.0012, mean_r=0.929, skip=4]
Iter 5 GRPO groups: 85%|########5 | 17/20 [02:24<00:25, 8.39s/q, loss=0.0012, mean_r=0.929, skip=4]2026-04-26 03:51:08,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.82(prox=0.82) + 0.40×proc(0.849[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='576' gold='648' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:51:08,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.873[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='864' gold='648' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:51:08,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,645 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,727 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 85%|########5 | 17/20 [02:59<00:25, 8.39s/q, loss=0.0003, mean_r=0.951, skip=4]
Iter 5 GRPO groups: 90%|######### | 18/20 [02:59<00:32, 16.43s/q, loss=0.0003, mean_r=0.951, skip=4]2026-04-26 03:51:18,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:51:18,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:51:18,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.491[fin=0.40,mean=0.63]) + 0.10×fmt(1.000) | pred='146' gold='76' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 03:51:19,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:51:19,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.813[fin=0.88,mean=0.71]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:51:19,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.85(prox=0.85) + 0.40×proc(0.724[fin=0.70,mean=0.76]) + 0.10×fmt(1.000) | pred='74' gold='76' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:51:19,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×0.85(prox=0.85) + 0.40×proc(0.766[fin=0.75,mean=0.80]) + 0.10×fmt(1.000) | pred='74' gold='76' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:51:19,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 03:51:19,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.833 = 0.50×0.85(prox=0.85) + 0.40×proc(0.771[fin=0.88,mean=0.62]) + 0.10×fmt(1.000) | pred='82' gold='76' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 03:51:19,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.79(prox=0.79) + 0.40×proc(0.856[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='86' gold='76' | step_acc=67% lccp=33% (chain=3/9 ok_count=6) n_steps=9
+
Iter 5 GRPO groups: 90%|######### | 18/20 [03:10<00:32, 16.43s/q, loss=-0.0003, mean_r=0.874, skip=4]
Iter 5 GRPO groups: 95%|#########5| 19/20 [03:10<00:14, 14.75s/q, loss=-0.0003, mean_r=0.874, skip=4]2026-04-26 03:51:29,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.690 = 0.50×0.50(prox=0.50) + 0.40×proc(0.849[fin=0.93,mean=0.72]) + 0.10×fmt(1.000) | pred='90' gold='180' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:51:29,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.164 = 0.50×0.02(prox=0.02) + 0.40×proc(0.137[fin=0.09,mean=0.20]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:51:30,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.316 = 0.50×0.34(prox=0.34) + 0.40×proc(0.117[fin=0.07,mean=0.18]) + 0.10×fmt(1.000) | pred='4' gold='180' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:51:30,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.188 = 0.50×0.02(prox=0.02) + 0.40×proc(0.195[fin=0.14,mean=0.27]) + 0.10×fmt(1.000) | pred='4320' gold='180' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:51:30,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.362 = 0.50×0.04(prox=0.04) + 0.40×proc(0.327[fin=0.16,mean=0.57]) + 0.10×fmt(1.000) | pred='2520' gold='180' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 03:51:30,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.232 = 0.50×0.02(prox=0.02) + 0.40×proc(0.308[fin=0.29,mean=0.34]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:51:30,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.179 = 0.50×0.02(prox=0.02) + 0.40×proc(0.174[fin=0.14,mean=0.23]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:51:30,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.307 = 0.50×0.04(prox=0.04) + 0.40×proc(0.471[fin=0.36,mean=0.64]) + 0.10×fmt(1.000) | pred='2520' gold='180' | step_acc=67% lccp=0% (chain=0/9 ok_count=6) n_steps=9
+2026-04-26 03:51:30,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.086 = 0.50×0.02(prox=0.02) + 0.40×proc(0.029[fin=0.03,mean=0.03]) + 0.10×fmt(0.650) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:51:30,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.60(prox=0.60) + 0.40×proc(0.073[fin=0.05,mean=0.10]) + 0.10×fmt(0.650) | pred='120' gold='180' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+
Iter 5 GRPO groups: 95%|#########5| 19/20 [03:21<00:14, 14.75s/q, loss=0.0005, mean_r=0.292, skip=4]
Iter 5 GRPO groups: 100%|##########| 20/20 [03:21<00:00, 13.63s/q, loss=0.0005, mean_r=0.292, skip=4]
Iter 5 GRPO groups: 100%|##########| 20/20 [03:21<00:00, 10.08s/q, loss=0.0005, mean_r=0.292, skip=4]
+2026-04-26 03:51:32,030 INFO __main__ - Iter 5 | loss=0.0001 | reward mean=0.857 std=0.239 | gt_match=69.3% | grounded_acc=88.4% | step_acc=83.5% | lccp=72.5% | batch_acc=88.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=16 skipped=4(0var=4) | lr=3.31e-06 | 201.7s
+2026-04-26 03:51:32,030 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:32, 2.23s/q, correct=1/1, lccp=100.0%, score=0.998, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:23, 3.81s/q, correct=2/2, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:59, 3.26s/q, correct=3/3, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:11<06:57, 2.86s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:48, 2.40s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:14, 3.43s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:08, 3.42s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:18, 3.09s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:21, 3.13s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<09:46, 4.19s/q, correct=8/10, lccp=85.3%, score=0.932, step_acc=92.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<08:45, 3.78s/q, correct=9/11, lccp=86.7%, score=0.938, step_acc=93.3%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:30, 3.26s/q, correct=10/12, lccp=87.8%, score=0.943, step_acc=93.9%]
GSM8K eval: 9%|8 | 13/150 [00:42<07:03, 3.09s/q, correct=11/13, lccp=88.7%, score=0.943, step_acc=94.4%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:57, 3.51s/q, correct=12/14, lccp=89.5%, score=0.947, step_acc=94.8%]
GSM8K eval: 10%|# | 15/150 [00:49<07:15, 3.22s/q, correct=13/15, lccp=90.2%, score=0.951, step_acc=95.1%]
GSM8K eval: 11%|# | 16/150 [00:51<06:42, 3.01s/q, correct=13/16, lccp=90.8%, score=0.926, step_acc=95.4%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:16, 3.28s/q, correct=14/17, lccp=91.4%, score=0.930, step_acc=95.7%]
GSM8K eval: 12%|#2 | 18/150 [00:59<07:44, 3.52s/q, correct=14/18, lccp=87.2%, score=0.907, step_acc=92.2%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:05, 3.25s/q, correct=15/19, lccp=87.9%, score=0.912, step_acc=92.6%]
GSM8K eval: 13%|#3 | 20/150 [01:05<07:17, 3.37s/q, correct=16/20, lccp=88.5%, score=0.916, step_acc=93.0%]
GSM8K eval: 14%|#4 | 21/150 [01:08<06:41, 3.12s/q, correct=17/21, lccp=89.0%, score=0.920, step_acc=93.3%]
GSM8K eval: 15%|#4 | 22/150 [01:11<06:21, 2.98s/q, correct=18/22, lccp=86.5%, score=0.915, step_acc=92.1%]
GSM8K eval: 15%|#5 | 23/150 [01:15<06:59, 3.30s/q, correct=19/23, lccp=87.1%, score=0.919, step_acc=92.5%]
GSM8K eval: 16%|#6 | 24/150 [01:17<06:27, 3.08s/q, correct=19/24, lccp=84.5%, score=0.902, step_acc=89.7%]
GSM8K eval: 17%|#6 | 25/150 [01:20<06:10, 2.97s/q, correct=19/25, lccp=82.1%, score=0.898, step_acc=89.1%]
GSM8K eval: 17%|#7 | 26/150 [01:24<06:58, 3.37s/q, correct=20/26, lccp=82.8%, score=0.902, step_acc=89.5%]
GSM8K eval: 18%|#8 | 27/150 [01:27<06:31, 3.19s/q, correct=20/27, lccp=83.5%, score=0.897, step_acc=89.9%]
GSM8K eval: 19%|#8 | 28/150 [01:29<05:51, 2.88s/q, correct=21/28, lccp=84.0%, score=0.900, step_acc=90.2%]
GSM8K eval: 19%|#9 | 29/150 [01:32<05:42, 2.83s/q, correct=22/29, lccp=84.6%, score=0.904, step_acc=90.6%]
GSM8K eval: 20%|## | 30/150 [01:36<06:12, 3.11s/q, correct=23/30, lccp=85.1%, score=0.907, step_acc=90.9%]
GSM8K eval: 21%|## | 31/150 [01:38<05:50, 2.95s/q, correct=24/31, lccp=85.6%, score=0.910, step_acc=91.2%]
GSM8K eval: 21%|##1 | 32/150 [01:40<05:03, 2.57s/q, correct=25/32, lccp=86.0%, score=0.912, step_acc=91.5%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:07, 2.63s/q, correct=26/33, lccp=86.5%, score=0.914, step_acc=91.7%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:41, 2.43s/q, correct=27/34, lccp=86.9%, score=0.917, step_acc=92.0%]
GSM8K eval: 23%|##3 | 35/150 [01:47<04:43, 2.47s/q, correct=28/35, lccp=87.2%, score=0.919, step_acc=92.2%]
GSM8K eval: 24%|##4 | 36/150 [01:51<05:15, 2.77s/q, correct=29/36, lccp=87.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:45, 2.53s/q, correct=30/37, lccp=87.9%, score=0.923, step_acc=92.6%]
GSM8K eval: 25%|##5 | 38/150 [01:56<04:57, 2.66s/q, correct=31/38, lccp=88.2%, score=0.925, step_acc=92.8%]
GSM8K eval: 26%|##6 | 39/150 [02:00<06:03, 3.28s/q, correct=32/39, lccp=88.5%, score=0.926, step_acc=93.0%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:36, 4.15s/q, correct=33/40, lccp=88.8%, score=0.928, step_acc=93.2%]
GSM8K eval: 27%|##7 | 41/150 [02:10<06:55, 3.81s/q, correct=33/41, lccp=89.1%, score=0.928, step_acc=93.3%]
GSM8K eval: 28%|##8 | 42/150 [02:14<07:07, 3.96s/q, correct=34/42, lccp=87.9%, score=0.929, step_acc=93.0%]
GSM8K eval: 29%|##8 | 43/150 [02:16<05:56, 3.33s/q, correct=35/43, lccp=88.2%, score=0.930, step_acc=93.2%]
GSM8K eval: 29%|##9 | 44/150 [02:22<07:25, 4.20s/q, correct=36/44, lccp=88.5%, score=0.932, step_acc=93.3%]
GSM8K eval: 30%|### | 45/150 [02:25<06:44, 3.85s/q, correct=37/45, lccp=88.7%, score=0.933, step_acc=93.5%]
GSM8K eval: 31%|### | 46/150 [02:30<07:10, 4.14s/q, correct=37/46, lccp=86.8%, score=0.928, step_acc=93.4%]
GSM8K eval: 31%|###1 | 47/150 [02:33<06:30, 3.79s/q, correct=38/47, lccp=87.1%, score=0.930, step_acc=93.5%]
GSM8K eval: 32%|###2 | 48/150 [02:35<05:24, 3.18s/q, correct=39/48, lccp=87.4%, score=0.931, step_acc=93.7%]
GSM8K eval: 33%|###2 | 49/150 [02:38<05:33, 3.30s/q, correct=40/49, lccp=86.3%, score=0.932, step_acc=93.4%]
GSM8K eval: 33%|###3 | 50/150 [02:41<05:25, 3.26s/q, correct=40/50, lccp=85.5%, score=0.923, step_acc=92.6%]
GSM8K eval: 34%|###4 | 51/150 [02:43<04:27, 2.70s/q, correct=41/51, lccp=85.8%, score=0.925, step_acc=92.7%]
GSM8K eval: 35%|###4 | 52/150 [02:47<05:19, 3.26s/q, correct=41/52, lccp=84.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 35%|###5 | 53/150 [02:52<05:57, 3.69s/q, correct=41/53, lccp=83.7%, score=0.917, step_acc=91.9%]
GSM8K eval: 36%|###6 | 54/150 [02:54<05:15, 3.28s/q, correct=42/54, lccp=84.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 37%|###6 | 55/150 [02:58<05:18, 3.36s/q, correct=43/55, lccp=84.3%, score=0.920, step_acc=92.2%]
GSM8K eval: 37%|###7 | 56/150 [03:01<05:21, 3.42s/q, correct=44/56, lccp=84.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 38%|###8 | 57/150 [03:04<04:46, 3.09s/q, correct=45/57, lccp=84.9%, score=0.922, step_acc=92.5%]
GSM8K eval: 39%|###8 | 58/150 [03:08<05:10, 3.37s/q, correct=46/58, lccp=85.1%, score=0.924, step_acc=92.6%]
GSM8K eval: 39%|###9 | 59/150 [03:12<05:17, 3.49s/q, correct=46/59, lccp=83.7%, score=0.917, step_acc=92.1%]
GSM8K eval: 40%|#### | 60/150 [03:16<05:52, 3.92s/q, correct=47/60, lccp=83.9%, score=0.918, step_acc=92.2%]
GSM8K eval: 41%|#### | 61/150 [03:20<05:27, 3.68s/q, correct=48/61, lccp=84.2%, score=0.920, step_acc=92.3%]
GSM8K eval: 41%|####1 | 62/150 [03:23<05:07, 3.50s/q, correct=49/62, lccp=84.5%, score=0.921, step_acc=92.5%]
GSM8K eval: 42%|####2 | 63/150 [03:26<04:58, 3.43s/q, correct=49/63, lccp=84.2%, score=0.915, step_acc=92.0%]
GSM8K eval: 43%|####2 | 64/150 [03:29<04:39, 3.25s/q, correct=50/64, lccp=84.4%, score=0.916, step_acc=92.2%]
GSM8K eval: 43%|####3 | 65/150 [03:31<04:24, 3.11s/q, correct=51/65, lccp=84.7%, score=0.917, step_acc=92.3%]
GSM8K eval: 44%|####4 | 66/150 [03:33<03:38, 2.61s/q, correct=52/66, lccp=84.9%, score=0.919, step_acc=92.4%]
GSM8K eval: 45%|####4 | 67/150 [03:35<03:27, 2.50s/q, correct=53/67, lccp=85.1%, score=0.920, step_acc=92.5%]
GSM8K eval: 45%|####5 | 68/150 [03:38<03:29, 2.55s/q, correct=54/68, lccp=85.3%, score=0.921, step_acc=92.6%]
GSM8K eval: 46%|####6 | 69/150 [03:39<03:01, 2.24s/q, correct=55/69, lccp=85.6%, score=0.922, step_acc=92.7%]
GSM8K eval: 47%|####6 | 70/150 [03:42<03:15, 2.44s/q, correct=56/70, lccp=84.3%, score=0.923, step_acc=92.6%]
GSM8K eval: 47%|####7 | 71/150 [03:45<03:28, 2.64s/q, correct=57/71, lccp=83.1%, score=0.924, step_acc=92.4%]
GSM8K eval: 48%|####8 | 72/150 [03:47<02:57, 2.27s/q, correct=58/72, lccp=83.4%, score=0.925, step_acc=92.5%]
GSM8K eval: 49%|####8 | 73/150 [03:48<02:40, 2.09s/q, correct=59/73, lccp=83.6%, score=0.926, step_acc=92.6%]
GSM8K eval: 49%|####9 | 74/150 [03:52<03:12, 2.53s/q, correct=60/74, lccp=83.8%, score=0.927, step_acc=92.7%]
GSM8K eval: 50%|##### | 75/150 [03:54<02:51, 2.29s/q, correct=61/75, lccp=84.0%, score=0.928, step_acc=92.8%]
GSM8K eval: 51%|##### | 76/150 [04:00<04:26, 3.60s/q, correct=61/76, lccp=84.1%, score=0.923, step_acc=92.7%]
GSM8K eval: 51%|#####1 | 77/150 [04:04<04:29, 3.69s/q, correct=62/77, lccp=84.3%, score=0.924, step_acc=92.8%]
GSM8K eval: 52%|#####2 | 78/150 [04:07<03:58, 3.31s/q, correct=63/78, lccp=84.5%, score=0.924, step_acc=92.9%]
GSM8K eval: 53%|#####2 | 79/150 [04:10<03:48, 3.22s/q, correct=63/79, lccp=83.6%, score=0.919, step_acc=92.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:13<03:38, 3.13s/q, correct=64/80, lccp=83.8%, score=0.920, step_acc=92.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:15<03:19, 2.89s/q, correct=65/81, lccp=84.0%, score=0.921, step_acc=92.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:18<03:16, 2.89s/q, correct=66/82, lccp=84.2%, score=0.922, step_acc=92.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:21<03:10, 2.84s/q, correct=67/83, lccp=84.4%, score=0.923, step_acc=92.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:23<03:03, 2.78s/q, correct=68/84, lccp=84.6%, score=0.923, step_acc=92.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:27<03:18, 3.06s/q, correct=69/85, lccp=84.8%, score=0.924, step_acc=92.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:30<03:22, 3.16s/q, correct=70/86, lccp=85.0%, score=0.925, step_acc=92.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:36<04:02, 3.85s/q, correct=71/87, lccp=85.1%, score=0.926, step_acc=92.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:38<03:21, 3.25s/q, correct=72/88, lccp=85.3%, score=0.927, step_acc=93.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:40<03:08, 3.09s/q, correct=73/89, lccp=85.5%, score=0.928, step_acc=93.0%]
GSM8K eval: 60%|###### | 90/150 [04:43<02:52, 2.88s/q, correct=74/90, lccp=85.6%, score=0.928, step_acc=93.1%]
GSM8K eval: 61%|###### | 91/150 [04:47<03:14, 3.30s/q, correct=75/91, lccp=85.8%, score=0.929, step_acc=93.2%]
GSM8K eval: 61%|######1 | 92/150 [04:50<03:07, 3.23s/q, correct=76/92, lccp=86.0%, score=0.930, step_acc=93.3%]
GSM8K eval: 62%|######2 | 93/150 [04:57<04:06, 4.33s/q, correct=77/93, lccp=86.1%, score=0.930, step_acc=93.3%]
GSM8K eval: 63%|######2 | 94/150 [04:58<03:15, 3.48s/q, correct=77/94, lccp=85.2%, score=0.925, step_acc=92.3%]
GSM8K eval: 63%|######3 | 95/150 [05:04<03:42, 4.05s/q, correct=78/95, lccp=84.3%, score=0.925, step_acc=91.9%]
GSM8K eval: 64%|######4 | 96/150 [05:07<03:23, 3.77s/q, correct=78/96, lccp=83.8%, score=0.921, step_acc=91.3%]
GSM8K eval: 65%|######4 | 97/150 [05:10<03:01, 3.42s/q, correct=78/97, lccp=83.2%, score=0.919, step_acc=90.9%]
GSM8K eval: 65%|######5 | 98/150 [05:14<03:08, 3.62s/q, correct=78/98, lccp=82.7%, score=0.915, step_acc=90.7%]
GSM8K eval: 66%|######6 | 99/150 [05:16<02:44, 3.22s/q, correct=79/99, lccp=82.9%, score=0.916, step_acc=90.8%]
GSM8K eval: 67%|######6 | 100/150 [05:18<02:20, 2.81s/q, correct=80/100, lccp=82.1%, score=0.916, step_acc=90.5%]
GSM8K eval: 67%|######7 | 101/150 [05:21<02:19, 2.84s/q, correct=80/101, lccp=81.8%, score=0.913, step_acc=90.4%]
GSM8K eval: 68%|######8 | 102/150 [05:22<01:56, 2.43s/q, correct=81/102, lccp=81.9%, score=0.913, step_acc=90.5%]
GSM8K eval: 69%|######8 | 103/150 [05:24<01:48, 2.30s/q, correct=82/103, lccp=82.1%, score=0.914, step_acc=90.5%]
GSM8K eval: 69%|######9 | 104/150 [05:29<02:18, 3.00s/q, correct=83/104, lccp=82.3%, score=0.915, step_acc=90.6%]
GSM8K eval: 70%|####### | 105/150 [05:31<02:08, 2.86s/q, correct=84/105, lccp=82.5%, score=0.916, step_acc=90.7%]
GSM8K eval: 71%|####### | 106/150 [05:33<01:47, 2.45s/q, correct=85/106, lccp=82.6%, score=0.917, step_acc=90.8%]
GSM8K eval: 71%|#######1 | 107/150 [05:34<01:32, 2.16s/q, correct=86/107, lccp=82.8%, score=0.917, step_acc=90.9%]
GSM8K eval: 72%|#######2 | 108/150 [05:37<01:36, 2.30s/q, correct=87/108, lccp=83.0%, score=0.918, step_acc=91.0%]
GSM8K eval: 73%|#######2 | 109/150 [05:42<02:06, 3.08s/q, correct=87/109, lccp=82.5%, score=0.917, step_acc=90.9%]
GSM8K eval: 73%|#######3 | 110/150 [05:44<01:51, 2.80s/q, correct=88/110, lccp=82.7%, score=0.917, step_acc=91.0%]
GSM8K eval: 74%|#######4 | 111/150 [05:46<01:35, 2.46s/q, correct=89/111, lccp=82.8%, score=0.918, step_acc=91.1%]
GSM8K eval: 75%|#######4 | 112/150 [05:51<02:03, 3.25s/q, correct=89/112, lccp=83.0%, score=0.918, step_acc=91.2%]
GSM8K eval: 75%|#######5 | 113/150 [05:53<01:44, 2.81s/q, correct=90/113, lccp=83.1%, score=0.918, step_acc=91.2%]
GSM8K eval: 76%|#######6 | 114/150 [05:56<01:46, 2.96s/q, correct=90/114, lccp=82.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 77%|#######6 | 115/150 [05:58<01:37, 2.77s/q, correct=91/115, lccp=82.9%, score=0.915, step_acc=90.9%]
GSM8K eval: 77%|#######7 | 116/150 [06:01<01:34, 2.76s/q, correct=92/116, lccp=83.0%, score=0.916, step_acc=90.9%]
GSM8K eval: 78%|#######8 | 117/150 [06:07<01:59, 3.63s/q, correct=93/117, lccp=83.2%, score=0.917, step_acc=91.0%]
GSM8K eval: 79%|#######8 | 118/150 [06:11<02:02, 3.84s/q, correct=93/118, lccp=82.5%, score=0.914, step_acc=91.0%]
GSM8K eval: 79%|#######9 | 119/150 [06:14<01:55, 3.72s/q, correct=93/119, lccp=82.6%, score=0.913, step_acc=91.0%]
GSM8K eval: 80%|######## | 120/150 [06:17<01:42, 3.42s/q, correct=94/120, lccp=82.8%, score=0.914, step_acc=91.1%]
GSM8K eval: 81%|######## | 121/150 [06:20<01:36, 3.32s/q, correct=95/121, lccp=82.9%, score=0.914, step_acc=91.2%]
GSM8K eval: 81%|########1 | 122/150 [06:23<01:31, 3.26s/q, correct=96/122, lccp=83.0%, score=0.915, step_acc=91.3%]
GSM8K eval: 82%|########2 | 123/150 [06:27<01:28, 3.29s/q, correct=96/123, lccp=82.7%, score=0.915, step_acc=91.2%]
GSM8K eval: 83%|########2 | 124/150 [06:29<01:17, 2.99s/q, correct=97/124, lccp=82.8%, score=0.916, step_acc=91.2%]
GSM8K eval: 83%|########3 | 125/150 [06:31<01:07, 2.71s/q, correct=98/125, lccp=83.0%, score=0.916, step_acc=91.3%]
GSM8K eval: 84%|########4 | 126/150 [06:34<01:05, 2.72s/q, correct=99/126, lccp=83.1%, score=0.917, step_acc=91.4%]
GSM8K eval: 85%|########4 | 127/150 [06:38<01:14, 3.26s/q, correct=100/127, lccp=83.2%, score=0.917, step_acc=91.4%]
GSM8K eval: 85%|########5 | 128/150 [06:41<01:09, 3.18s/q, correct=101/128, lccp=83.4%, score=0.918, step_acc=91.5%]
GSM8K eval: 86%|########6 | 129/150 [06:45<01:09, 3.31s/q, correct=102/129, lccp=83.5%, score=0.919, step_acc=91.6%]
GSM8K eval: 87%|########6 | 130/150 [06:47<00:57, 2.87s/q, correct=103/130, lccp=83.6%, score=0.919, step_acc=91.6%]
GSM8K eval: 87%|########7 | 131/150 [06:51<01:04, 3.41s/q, correct=104/131, lccp=83.8%, score=0.920, step_acc=91.7%]
GSM8K eval: 88%|########8 | 132/150 [06:53<00:51, 2.88s/q, correct=105/132, lccp=83.9%, score=0.920, step_acc=91.8%]
GSM8K eval: 89%|########8 | 133/150 [06:56<00:48, 2.87s/q, correct=106/133, lccp=84.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 89%|########9 | 134/150 [07:00<00:53, 3.33s/q, correct=107/134, lccp=84.1%, score=0.922, step_acc=91.9%]
GSM8K eval: 90%|######### | 135/150 [07:03<00:47, 3.19s/q, correct=108/135, lccp=84.2%, score=0.922, step_acc=91.9%]
GSM8K eval: 91%|######### | 136/150 [07:08<00:50, 3.59s/q, correct=108/136, lccp=83.9%, score=0.921, step_acc=91.8%]
GSM8K eval: 91%|#########1| 137/150 [07:14<00:59, 4.55s/q, correct=109/137, lccp=84.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 92%|#########2| 138/150 [07:18<00:52, 4.38s/q, correct=110/138, lccp=84.1%, score=0.922, step_acc=91.9%]
GSM8K eval: 93%|#########2| 139/150 [07:22<00:45, 4.10s/q, correct=111/139, lccp=84.2%, score=0.923, step_acc=91.9%]
GSM8K eval: 93%|#########3| 140/150 [07:26<00:41, 4.17s/q, correct=111/140, lccp=84.1%, score=0.919, step_acc=91.8%]
GSM8K eval: 94%|#########3| 141/150 [07:30<00:36, 4.07s/q, correct=112/141, lccp=84.2%, score=0.919, step_acc=91.8%]
GSM8K eval: 95%|#########4| 142/150 [07:33<00:30, 3.85s/q, correct=113/142, lccp=84.3%, score=0.920, step_acc=91.9%]
GSM8K eval: 95%|#########5| 143/150 [07:36<00:23, 3.38s/q, correct=114/143, lccp=84.4%, score=0.921, step_acc=91.9%]
GSM8K eval: 96%|#########6| 144/150 [07:38<00:18, 3.08s/q, correct=115/144, lccp=84.5%, score=0.921, step_acc=92.0%]
GSM8K eval: 97%|#########6| 145/150 [07:43<00:18, 3.72s/q, correct=115/145, lccp=84.0%, score=0.919, step_acc=91.8%]
GSM8K eval: 97%|#########7| 146/150 [07:46<00:13, 3.49s/q, correct=116/146, lccp=84.2%, score=0.919, step_acc=91.9%]
GSM8K eval: 98%|#########8| 147/150 [07:50<00:10, 3.59s/q, correct=117/147, lccp=84.3%, score=0.920, step_acc=92.0%]
GSM8K eval: 99%|#########8| 148/150 [07:54<00:07, 3.62s/q, correct=118/148, lccp=84.4%, score=0.920, step_acc=92.0%]
GSM8K eval: 99%|#########9| 149/150 [07:57<00:03, 3.57s/q, correct=119/149, lccp=84.5%, score=0.921, step_acc=92.1%]
GSM8K eval: 100%|##########| 150/150 [08:02<00:00, 3.96s/q, correct=119/150, lccp=84.3%, score=0.919, step_acc=91.9%]
GSM8K eval: 100%|##########| 150/150 [08:02<00:00, 3.22s/q, correct=119/150, lccp=84.3%, score=0.919, step_acc=91.9%]
+2026-04-26 03:59:34,576 INFO __main__ - Training Score [iter 5]: 0.9192 (best=0.9162) | n=150
+2026-04-26 03:59:34,576 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(0.998)
+2026-04-26 03:59:34,576 INFO __main__ - Process score : prm_mean=0.903 prm_final=0.930 → weighted=0.920
+2026-04-26 03:59:34,577 INFO __main__ - Step accuracy : 91.8% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 03:59:34,577 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 03:59:34,577 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 03:59:36,811 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9192 > 0.9162)
+2026-04-26 03:59:39,019 INFO __main__ - ======================================================================
+2026-04-26 03:59:39,019 INFO __main__ - GRPO ITERATION 6/60
+2026-04-26 03:59:39,020 INFO __main__ - ======================================================================
+2026-04-26 03:59:39,040 INFO __main__ - LR this iteration: 3.31e-06 | T=0.766 | MATH ratio=30%
+
Iter 6 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:59:45,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 6 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0var, mean_r=0.998, skip=1]
Iter 6 GRPO groups: 5%|5 | 1/20 [00:06<02:10, 6.88s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:59:49,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 5%|5 | 1/20 [00:10<02:10, 6.88s/q, loss=0var, mean_r=0.997, skip=2]
Iter 6 GRPO groups: 10%|# | 2/20 [00:10<01:32, 5.12s/q, loss=0var, mean_r=0.997, skip=2]2026-04-26 04:00:01,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:00:01,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.897[fin=0.96,mean=0.79]) + 0.10×fmt(1.000) | pred='26' gold='25' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:00:01,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:01,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:01,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:00:01,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 04:00:02,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.85(prox=0.85) + 0.40×proc(0.567[fin=0.42,mean=0.79]) + 0.10×fmt(1.000) | pred='26' gold='25' | step_acc=75% lccp=75% (chain=6/8 ok_count=6) n_steps=8
+
Iter 6 GRPO groups: 10%|# | 2/20 [00:24<01:32, 5.12s/q, loss=0.0001, mean_r=0.954, skip=2]
Iter 6 GRPO groups: 15%|#5 | 3/20 [00:24<02:34, 9.10s/q, loss=0.0001, mean_r=0.954, skip=2]2026-04-26 04:00:07,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:07,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='$240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=0.97,mean=0.80]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:00:08,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.674 = 0.50×0.45(prox=0.45) + 0.40×proc(0.867[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='384000' gold='240000' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:00:08,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,445 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,527 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.45(prox=0.45) + 0.40×proc(0.784[fin=0.97,mean=0.51]) + 0.10×fmt(1.000) | pred='384000' gold='240000' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:00:08,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 15%|#5 | 3/20 [00:31<02:34, 9.10s/q, loss=-0.0007, mean_r=0.925, skip=2]
Iter 6 GRPO groups: 20%|## | 4/20 [00:31<02:09, 8.09s/q, loss=-0.0007, mean_r=0.925, skip=2]2026-04-26 04:00:18,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:18,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:00:18,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:18,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:00:18,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.938[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:00:19,156 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.877[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:19,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.832[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='0' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.508 = 0.50×0.05(prox=0.05) + 0.40×proc(0.857[fin=0.99,mean=0.65]) + 0.10×fmt(1.000) | pred='115' gold='10' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+
Iter 6 GRPO groups: 20%|## | 4/20 [00:41<02:09, 8.09s/q, loss=0.0007, mean_r=0.681, skip=2]
Iter 6 GRPO groups: 25%|##5 | 5/20 [00:41<02:15, 9.03s/q, loss=0.0007, mean_r=0.681, skip=2]2026-04-26 04:00:24,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 25%|##5 | 5/20 [00:46<02:15, 9.03s/q, loss=0var, mean_r=0.999, skip=3]
Iter 6 GRPO groups: 30%|### | 6/20 [00:46<01:46, 7.61s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:00:59,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.381 = 0.50×0.00(prox=0.00) + 0.40×proc(0.609[fin=0.71,mean=0.45]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:00:59,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.646[fin=0.69,mean=0.58]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:00:59,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.50(prox=0.50) + 0.40×proc(0.306[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='60' gold='119' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:00:59,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.037 = 0.50×0.00(prox=0.00) + 0.40×proc(0.006[fin=0.01,mean=0.01]) + 0.10×fmt(0.350) | pred='' gold='119' | step_acc=0% lccp=0% (chain=0/1 ok_count=0) n_steps=1
+2026-04-26 04:00:59,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.347 = 0.50×0.00(prox=0.00) + 0.40×proc(0.706[fin=0.88,mean=0.45]) + 0.10×fmt(0.650) | pred='C' gold='119' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:00:59,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.161 = 0.50×0.00(prox=0.00) + 0.40×proc(0.153[fin=0.04,mean=0.32]) + 0.10×fmt(0.700) | pred='' gold='119' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:01:00,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=100% lccp=100% (chain=16/16 ok_count=16) n_steps=16
+2026-04-26 04:01:00,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.00(prox=0.00) + 0.40×proc(0.628[fin=0.58,mean=0.70]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:01:00,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.475 = 0.50×0.00(prox=0.00) + 0.40×proc(0.953[fin=1.00,mean=0.89]) + 0.10×fmt(0.700) | pred='' gold='119' | step_acc=92% lccp=16% (chain=4/25 ok_count=23) n_steps=25
+
Iter 6 GRPO groups: 30%|### | 6/20 [01:22<01:46, 7.61s/q, loss=-0.0006, mean_r=0.360, skip=3]
Iter 6 GRPO groups: 35%|###5 | 7/20 [01:22<03:39, 16.92s/q, loss=-0.0006, mean_r=0.360, skip=3]2026-04-26 04:01:24,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.485[fin=0.44,mean=0.56]) + 0.10×fmt(1.000) | pred='3' gold='8' | step_acc=62% lccp=25% (chain=2/8 ok_count=5) n_steps=8
+2026-04-26 04:01:24,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(0.700) | pred='' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:01:24,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.330 = 0.50×0.00(prox=0.00) + 0.40×proc(0.525[fin=0.47,mean=0.61]) + 0.10×fmt(0.700) | pred='' gold='8' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:01:25,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.560[fin=0.64,mean=0.43]) + 0.10×fmt(1.000) | pred='2.40824' gold='8' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:01:25,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.754 = 0.50×0.57(prox=0.57) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='11' gold='8' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:01:25,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:01:25,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.671 = 0.50×0.80(prox=0.80) + 0.40×proc(0.428[fin=0.55,mean=0.24]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:01:25,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.856 = 0.50×0.80(prox=0.80) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:01:25,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:01:25,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.591 = 0.50×0.67(prox=0.67) + 0.40×proc(0.393[fin=0.53,mean=0.19]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 6 GRPO groups: 35%|###5 | 7/20 [01:48<03:39, 16.92s/q, loss=0.0011, mean_r=0.681, skip=3]
Iter 6 GRPO groups: 40%|#### | 8/20 [01:48<03:55, 19.59s/q, loss=0.0011, mean_r=0.681, skip=3]2026-04-26 04:01:35,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:35,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:01:35,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.314 = 0.50×0.34(prox=0.34) + 0.40×proc(0.112[fin=0.07,mean=0.18]) + 0.10×fmt(1.000) | pred='1' gold='50' | step_acc=12% lccp=0% (chain=0/8 ok_count=1) n_steps=8
+2026-04-26 04:01:36,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.431 = 0.50×0.57(prox=0.57) + 0.40×proc(0.117[fin=0.04,mean=0.24]) + 0.10×fmt(1.000) | pred='31' gold='50' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:01:36,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:36,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.295 = 0.50×0.02(prox=0.02) + 0.40×proc(0.270[fin=0.10,mean=0.52]) + 0.10×fmt(1.000) | pred='1100' gold='50' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:01:36,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.895 = 0.50×1.00(exact) + 0.40×proc(0.738[fin=0.85,mean=0.57]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:01:36,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:01:36,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:01:36,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 6 GRPO groups: 40%|#### | 8/20 [01:58<03:55, 19.59s/q, loss=0.0008, mean_r=0.793, skip=3]
Iter 6 GRPO groups: 45%|####5 | 9/20 [01:58<03:05, 16.85s/q, loss=0.0008, mean_r=0.793, skip=3]2026-04-26 04:01:44,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.452 = 0.50×0.27(prox=0.27) + 0.40×proc(0.258[fin=0.01,mean=0.63]) + 0.10×fmt(1.000) | pred='315' gold='135' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:01:44,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,354 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.53(prox=0.53) + 0.40×proc(0.761[fin=0.82,mean=0.68]) + 0.10×fmt(1.000) | pred='195' gold='135' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:01:44,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,613 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.27(prox=0.27) + 0.40×proc(0.284[fin=0.04,mean=0.66]) + 0.10×fmt(1.000) | pred='315' gold='135' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:01:44,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 45%|####5 | 9/20 [02:07<03:05, 16.85s/q, loss=0.0002, mean_r=0.857, skip=3]
Iter 6 GRPO groups: 50%|##### | 10/20 [02:07<02:22, 14.25s/q, loss=0.0002, mean_r=0.857, skip=3]2026-04-26 04:01:54,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.487 = 0.50×0.36(prox=0.36) + 0.40×proc(0.270[fin=0.02,mean=0.64]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:01:54,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.846[fin=0.93,mean=0.72]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:01:54,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:01:54,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:01:54,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.515 = 0.50×0.36(prox=0.36) + 0.40×proc(0.374[fin=0.22,mean=0.61]) + 0.10×fmt(1.000) | pred='5.5' gold='50' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:01:54,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.36(prox=0.36) + 0.40×proc(0.241[fin=0.02,mean=0.57]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:01:54,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.659 = 0.50×0.85(prox=0.85) + 0.40×proc(0.334[fin=0.03,mean=0.80]) + 0.10×fmt(1.000) | pred='49.5' gold='50' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 04:01:54,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.925[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='5.75' gold='50' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:01:55,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×0.85(prox=0.85) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='51' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:01:55,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 6 GRPO groups: 50%|##### | 10/20 [02:17<02:22, 14.25s/q, loss=-0.0004, mean_r=0.752, skip=3]
Iter 6 GRPO groups: 55%|#####5 | 11/20 [02:17<01:57, 13.01s/q, loss=-0.0004, mean_r=0.752, skip=3]2026-04-26 04:01:59,602 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.44(prox=0.44) + 0.40×proc(0.868[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='65' gold='40' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:01:59,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.666 = 0.50×0.44(prox=0.44) + 0.40×proc(0.858[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='65' gold='40' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:01:59,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:59,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:59,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,171 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 55%|#####5 | 11/20 [02:22<01:57, 13.01s/q, loss=-0.0002, mean_r=0.927, skip=3]
Iter 6 GRPO groups: 60%|###### | 12/20 [02:22<01:25, 10.64s/q, loss=-0.0002, mean_r=0.927, skip=3]2026-04-26 04:02:05,789 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:05,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:05,956 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.953[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 60%|###### | 12/20 [02:27<01:25, 10.64s/q, loss=0var, mean_r=0.991, skip=4]
Iter 6 GRPO groups: 65%|######5 | 13/20 [02:27<01:01, 8.85s/q, loss=0var, mean_r=0.991, skip=4]2026-04-26 04:02:11,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.758 = 0.50×0.58(prox=0.58) + 0.40×proc(0.916[fin=0.98,mean=0.82]) + 0.10×fmt(1.000) | pred='38' gold='28' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:02:11,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:02:12,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 65%|######5 | 13/20 [02:34<01:01, 8.85s/q, loss=0.0004, mean_r=0.971, skip=4]
Iter 6 GRPO groups: 70%|####### | 14/20 [02:34<00:49, 8.29s/q, loss=0.0004, mean_r=0.971, skip=4]2026-04-26 04:02:16,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,290 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 70%|####### | 14/20 [02:38<00:49, 8.29s/q, loss=0var, mean_r=0.999, skip=5]
Iter 6 GRPO groups: 75%|#######5 | 15/20 [02:38<00:35, 7.05s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:02:23,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:23,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:23,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:02:23,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:23,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:23,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:02:24,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:24,141 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:02:24,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:02:24,308 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 6 GRPO groups: 75%|#######5 | 15/20 [02:45<00:35, 7.05s/q, loss=0var, mean_r=0.998, skip=6]
Iter 6 GRPO groups: 80%|######## | 16/20 [02:45<00:27, 6.92s/q, loss=0var, mean_r=0.998, skip=6]2026-04-26 04:02:30,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:30,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 80%|######## | 16/20 [02:52<00:27, 6.92s/q, loss=0var, mean_r=1.000, skip=7]
Iter 6 GRPO groups: 85%|########5 | 17/20 [02:52<00:20, 6.97s/q, loss=0var, mean_r=1.000, skip=7]2026-04-26 04:02:34,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:02:34,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:35,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.57(prox=0.57) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='88' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 85%|########5 | 17/20 [02:57<00:20, 6.97s/q, loss=-0.0011, mean_r=0.968, skip=7]
Iter 6 GRPO groups: 90%|######### | 18/20 [02:57<00:12, 6.41s/q, loss=-0.0011, mean_r=0.968, skip=7]2026-04-26 04:02:39,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,665 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 90%|######### | 18/20 [03:00<00:12, 6.41s/q, loss=0var, mean_r=0.999, skip=8]
Iter 6 GRPO groups: 95%|#########5| 19/20 [03:00<00:05, 5.51s/q, loss=0var, mean_r=0.999, skip=8]2026-04-26 04:02:50,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:02:50,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.521 = 0.50×0.71(prox=0.71) + 0.40×proc(0.159[fin=0.10,mean=0.25]) + 0.10×fmt(1.000) | pred='6' gold='5' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+2026-04-26 04:02:50,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.56(prox=0.56) + 0.40×proc(0.152[fin=0.19,mean=0.10]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:02:50,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.678 = 0.50×0.71(prox=0.71) + 0.40×proc(0.551[fin=0.73,mean=0.28]) + 0.10×fmt(1.000) | pred='6' gold='5' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:02:50,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.882 = 0.50×1.00(exact) + 0.40×proc(0.705[fin=0.75,mean=0.64]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:02:50,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:50,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.45(prox=0.45) + 0.40×proc(0.407[fin=0.24,mean=0.66]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=64% lccp=9% (chain=1/11 ok_count=7) n_steps=11
+2026-04-26 04:02:50,756 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.511 = 0.50×0.56(prox=0.56) + 0.40×proc(0.332[fin=0.36,mean=0.30]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:02:50,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:50,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.339 = 0.50×0.38(prox=0.38) + 0.40×proc(0.118[fin=0.06,mean=0.20]) + 0.10×fmt(1.000) | pred='1' gold='5' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+
Iter 6 GRPO groups: 95%|#########5| 19/20 [03:13<00:05, 5.51s/q, loss=-0.0010, mean_r=0.684, skip=8]
Iter 6 GRPO groups: 100%|##########| 20/20 [03:13<00:00, 7.59s/q, loss=-0.0010, mean_r=0.684, skip=8]
Iter 6 GRPO groups: 100%|##########| 20/20 [03:13<00:00, 9.67s/q, loss=-0.0010, mean_r=0.684, skip=8]
+2026-04-26 04:02:52,370 INFO __main__ - Iter 6 | loss=-0.0001 | reward mean=0.879 std=0.215 | gt_match=74.9% | grounded_acc=91.0% | step_acc=88.5% | lccp=80.6% | batch_acc=91.0% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=3.88e-06 | 193.4s
+2026-04-26 04:02:52,370 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=91.0% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:02:52,371 INFO __main__ - ======================================================================
+2026-04-26 04:02:52,371 INFO __main__ - GRPO ITERATION 7/60
+2026-04-26 04:02:52,371 INFO __main__ - ======================================================================
+2026-04-26 04:02:52,392 INFO __main__ - LR this iteration: 3.88e-06 | T=0.759 | MATH ratio=30%
+
Iter 7 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:02:57,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.567 = 0.50×0.43(prox=0.43) + 0.40×proc(0.631[fin=0.79,mean=0.39]) + 0.10×fmt(1.000) | pred='12' gold='36' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:02:58,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.643 = 0.50×0.50(prox=0.50) + 0.40×proc(0.734[fin=0.91,mean=0.47]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:02:58,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.305 = 0.50×0.00(prox=0.00) + 0.40×proc(0.400[fin=0.25,mean=0.62]) + 0.10×fmt(0.700) | pred='' gold='36' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+2026-04-26 04:02:58,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.651 = 0.50×0.50(prox=0.50) + 0.40×proc(0.752[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:02:58,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.50(prox=0.50) + 0.40×proc(0.362[fin=0.43,mean=0.26]) + 0.10×fmt(0.650) | pred='18' gold='36' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:02:58,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.664 = 0.50×0.50(prox=0.50) + 0.40×proc(0.786[fin=0.89,mean=0.62]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:02:58,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.50(prox=0.50) + 0.40×proc(0.614[fin=0.66,mean=0.54]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:02:58,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×1.00(exact) + 0.40×proc(0.553[fin=0.64,mean=0.43]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:02:58,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.657 = 0.50×0.50(prox=0.50) + 0.40×proc(0.766[fin=0.94,mean=0.51]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:02:58,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.50(prox=0.50) + 0.40×proc(0.813[fin=0.95,mean=0.61]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 7 GRPO groups: 0%| | 0/20 [00:07, ?q/s, loss=0.0016, mean_r=0.604, skip=0]
Iter 7 GRPO groups: 5%|5 | 1/20 [00:07<02:29, 7.85s/q, loss=0.0016, mean_r=0.604, skip=0]2026-04-26 04:03:04,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:03:04,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.64(prox=0.64) + 0.40×proc(0.848[fin=0.95,mean=0.70]) + 0.10×fmt(1.000) | pred='10.71' gold='15' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 7 GRPO groups: 5%|5 | 1/20 [00:13<02:29, 7.85s/q, loss=0.0010, mean_r=0.970, skip=0]
Iter 7 GRPO groups: 10%|# | 2/20 [00:13<02:01, 6.74s/q, loss=0.0010, mean_r=0.970, skip=0]2026-04-26 04:03:09,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:03:09,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:10,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:10,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:10,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:10,301 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:10,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 7 GRPO groups: 10%|# | 2/20 [00:17<02:01, 6.74s/q, loss=0var, mean_r=0.993, skip=1]
Iter 7 GRPO groups: 15%|#5 | 3/20 [00:17<01:34, 5.57s/q, loss=0var, mean_r=0.993, skip=1]2026-04-26 04:03:16,274 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:17,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 15%|#5 | 3/20 [00:24<01:34, 5.57s/q, loss=0var, mean_r=0.999, skip=2]
Iter 7 GRPO groups: 20%|## | 4/20 [00:24<01:35, 5.99s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:03:26,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.925[fin=0.98,mean=0.83]) + 0.10×fmt(1.000) | pred='0' gold='2000' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:03:26,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:03:26,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.421 = 0.50×0.22(prox=0.22) + 0.40×proc(0.319[fin=0.15,mean=0.57]) + 0.10×fmt(1.000) | pred='-1640' gold='2000' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:03:27,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=0.93,mean=0.81]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:03:27,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.22(prox=0.22) + 0.40×proc(0.376[fin=0.25,mean=0.57]) + 0.10×fmt(1.000) | pred='-1640' gold='2000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:03:27,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.436 = 0.50×0.00(prox=0.00) + 0.40×proc(0.700[fin=0.73,mean=0.65]) + 0.10×fmt(1.000) | pred='Thomas needs $1640 more to buy the car.' gold='2000' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:03:27,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='180' gold='2000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:27,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.07(prox=0.07) + 0.40×proc(0.508[fin=0.43,mean=0.62]) + 0.10×fmt(1.000) | pred='14640' gold='2000' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 04:03:27,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=89% lccp=44% (chain=4/9 ok_count=8) n_steps=9
+2026-04-26 04:03:27,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.23(prox=0.23) + 0.40×proc(0.797[fin=0.85,mean=0.72]) + 0.10×fmt(1.000) | pred='5280' gold='2000' | step_acc=78% lccp=67% (chain=6/9 ok_count=7) n_steps=9
+
Iter 7 GRPO groups: 20%|## | 4/20 [00:36<01:35, 5.99s/q, loss=-0.0001, mean_r=0.631, skip=2]
Iter 7 GRPO groups: 25%|##5 | 5/20 [00:36<02:02, 8.15s/q, loss=-0.0001, mean_r=0.631, skip=2]2026-04-26 04:03:35,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:35,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:35,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:03:35,866 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:03:35,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:36,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.182 = 0.50×0.06(prox=0.06) + 0.40×proc(0.216[fin=0.19,mean=0.26]) + 0.10×fmt(0.650) | pred='113' gold='13' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:03:36,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:03:36,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:03:36,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:36,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 7 GRPO groups: 25%|##5 | 5/20 [00:45<02:02, 8.15s/q, loss=0.0042, mean_r=0.910, skip=2]
Iter 7 GRPO groups: 30%|### | 6/20 [00:45<01:57, 8.37s/q, loss=0.0042, mean_r=0.910, skip=2]2026-04-26 04:04:11,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.385 = 0.50×0.33(prox=0.33) + 0.40×proc(0.297[fin=0.40,mean=0.14]) + 0.10×fmt(1.000) | pred='0' gold='13535' | step_acc=0% lccp=0% (chain=0/7 ok_count=0) n_steps=7
+2026-04-26 04:04:11,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.457 = 0.50×0.01(prox=0.01) + 0.40×proc(0.844[fin=0.92,mean=0.73]) + 0.10×fmt(1.000) | pred='629663' gold='13535' | step_acc=73% lccp=9% (chain=1/11 ok_count=8) n_steps=11
+2026-04-26 04:04:11,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.600 = 0.50×0.36(prox=0.36) + 0.40×proc(0.801[fin=0.91,mean=0.64]) + 0.10×fmt(1.000) | pred='1471' gold='13535' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:04:11,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.35(prox=0.35) + 0.40×proc(0.379[fin=0.12,mean=0.77]) + 0.10×fmt(1.000) | pred='960' gold='13535' | step_acc=75% lccp=31% (chain=5/16 ok_count=12) n_steps=16
+2026-04-26 04:04:11,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.196 = 0.50×0.00(prox=0.00) + 0.40×proc(0.267[fin=0.07,mean=0.57]) + 0.10×fmt(0.700) | pred='' gold='13535' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:04:11,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.573 = 0.50×0.35(prox=0.35) + 0.40×proc(0.748[fin=0.84,mean=0.62]) + 0.10×fmt(1.000) | pred='828' gold='13535' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 04:04:11,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.529 = 0.50×0.58(prox=0.58) + 0.40×proc(0.346[fin=0.44,mean=0.20]) + 0.10×fmt(1.000) | pred='8639' gold='13535' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 04:04:12,037 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.726[fin=0.70,mean=0.76]) + 0.10×fmt(1.000) | pred='20617' gold='13535' | step_acc=80% lccp=7% (chain=1/15 ok_count=12) n_steps=15
+2026-04-26 04:04:12,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.359 = 0.50×0.08(prox=0.08) + 0.40×proc(0.542[fin=0.68,mean=0.33]) + 0.10×fmt(1.000) | pred='86445' gold='13535' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 7 GRPO groups: 30%|### | 6/20 [01:21<01:57, 8.37s/q, loss=0.0016, mean_r=0.458, skip=2]
Iter 7 GRPO groups: 35%|###5 | 7/20 [01:21<03:44, 17.30s/q, loss=0.0016, mean_r=0.458, skip=2]2026-04-26 04:04:18,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.49(prox=0.49) + 0.40×proc(0.299[fin=0.14,mean=0.54]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:04:18,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.53(prox=0.53) + 0.40×proc(0.780[fin=0.82,mean=0.71]) + 0.10×fmt(1.000) | pred='192' gold='348' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:04:18,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:19,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.497 = 0.50×0.49(prox=0.49) + 0.40×proc(0.192[fin=0.03,mean=0.43]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:04:19,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.715 = 0.50×0.49(prox=0.49) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:04:19,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:04:19,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 7 GRPO groups: 35%|###5 | 7/20 [01:28<03:44, 17.30s/q, loss=0.0011, mean_r=0.837, skip=2]
Iter 7 GRPO groups: 40%|#### | 8/20 [01:28<02:49, 14.09s/q, loss=0.0011, mean_r=0.837, skip=2]2026-04-26 04:04:26,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:04:27,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 40%|#### | 8/20 [01:35<02:49, 14.09s/q, loss=0var, mean_r=0.994, skip=3]
Iter 7 GRPO groups: 45%|####5 | 9/20 [01:35<02:09, 11.80s/q, loss=0var, mean_r=0.994, skip=3]2026-04-26 04:05:00,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.09(prox=0.09) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='0' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:05:00,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.169 = 0.50×0.00(prox=0.00) + 0.40×proc(0.173[fin=0.16,mean=0.19]) + 0.10×fmt(1.000) | pred='$x^3 + 3x^2 + 3x + 4$' gold='0' | step_acc=14% lccp=0% (chain=0/7 ok_count=1) n_steps=7
+2026-04-26 04:05:00,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.09(prox=0.09) + 0.40×proc(0.853[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='5' gold='0' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:05:01,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:01,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.252 = 0.50×0.00(prox=0.00) + 0.40×proc(0.286[fin=0.16,mean=0.47]) + 0.10×fmt(1.000) | pred='$x^{44} + x^{33} + x^{22} + x^{11} + 1$' gold='0' | step_acc=38% lccp=25% (chain=2/8 ok_count=3) n_steps=8
+2026-04-26 04:05:01,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×1.00(exact) + 0.40×proc(0.797[fin=0.94,mean=0.58]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:05:01,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.546 = 0.50×0.33(prox=0.33) + 0.40×proc(0.699[fin=0.92,mean=0.37]) + 0.10×fmt(1.000) | pred='1' gold='0' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:05:01,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=80% lccp=40% (chain=4/10 ok_count=8) n_steps=10
+
Iter 7 GRPO groups: 45%|####5 | 9/20 [02:10<02:09, 11.80s/q, loss=0.0004, mean_r=0.615, skip=3]
Iter 7 GRPO groups: 50%|##### | 10/20 [02:10<03:10, 19.03s/q, loss=0.0004, mean_r=0.615, skip=3]2026-04-26 04:05:06,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:07,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:07,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,285 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:05:07,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:07,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 7 GRPO groups: 50%|##### | 10/20 [02:15<03:10, 19.03s/q, loss=0var, mean_r=0.996, skip=4]
Iter 7 GRPO groups: 55%|#####5 | 11/20 [02:15<02:12, 14.74s/q, loss=0var, mean_r=0.996, skip=4]2026-04-26 04:05:18,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.62(prox=0.62) + 0.40×proc(0.820[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='8.4' gold='12' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:05:18,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.710 = 0.50×0.62(prox=0.62) + 0.40×proc(0.744[fin=0.95,mean=0.43]) + 0.10×fmt(1.000) | pred='8.4' gold='12' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:05:18,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.60(prox=0.60) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='12' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:05:18,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:05:18,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.55(prox=0.55) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='17' gold='12' | step_acc=78% lccp=0% (chain=0/9 ok_count=7) n_steps=9
+2026-04-26 04:05:18,693 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 55%|#####5 | 11/20 [02:27<02:12, 14.74s/q, loss=0.0018, mean_r=0.893, skip=4]
Iter 7 GRPO groups: 60%|###### | 12/20 [02:27<01:52, 14.09s/q, loss=0.0018, mean_r=0.893, skip=4]2026-04-26 04:05:27,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:27,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:05:27,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:05:28,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:28,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:28,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=0.99,mean=0.95]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 7 GRPO groups: 60%|###### | 12/20 [02:36<01:52, 14.09s/q, loss=0var, mean_r=0.971, skip=5]
Iter 7 GRPO groups: 65%|######5 | 13/20 [02:36<01:26, 12.32s/q, loss=0var, mean_r=0.971, skip=5]2026-04-26 04:05:33,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.787 = 0.50×0.60(prox=0.60) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:33,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.60(prox=0.60) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:34,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.75(prox=0.75) + 0.40×proc(0.881[fin=0.93,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='18' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:05:34,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.778 = 0.50×0.60(prox=0.60) + 0.40×proc(0.945[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:34,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.60(prox=0.60) + 0.40×proc(0.840[fin=0.90,mean=0.75]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:05:34,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.478 = 0.50×0.43(prox=0.43) + 0.40×proc(0.410[fin=0.38,mean=0.46]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:05:34,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.753 = 0.50×0.60(prox=0.60) + 0.40×proc(0.882[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:05:34,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.462 = 0.50×0.04(prox=0.04) + 0.40×proc(0.856[fin=0.99,mean=0.66]) + 0.10×fmt(1.000) | pred='240' gold='18' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:05:34,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.354 = 0.50×0.07(prox=0.07) + 0.40×proc(0.550[fin=0.62,mean=0.45]) + 0.10×fmt(1.000) | pred='142' gold='18' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:05:34,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.60(prox=0.60) + 0.40×proc(0.859[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+
Iter 7 GRPO groups: 65%|######5 | 13/20 [02:43<01:26, 12.32s/q, loss=-0.0013, mean_r=0.670, skip=5]
Iter 7 GRPO groups: 70%|####### | 14/20 [02:43<01:05, 10.89s/q, loss=-0.0013, mean_r=0.670, skip=5]2026-04-26 04:05:45,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×1.00(exact) + 0.40×proc(0.650[fin=0.79,mean=0.45]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:05:45,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.910[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:05:46,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:46,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.781[fin=0.90,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='350' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:05:46,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.747 = 0.50×0.76(prox=0.76) + 0.40×proc(0.670[fin=0.68,mean=0.65]) + 0.10×fmt(1.000) | pred='406' gold='350' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:05:46,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.76(prox=0.76) + 0.40×proc(0.899[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='406' gold='350' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 04:05:46,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.734[fin=0.91,mean=0.46]) + 0.10×fmt(1.000) | pred='736' gold='350' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:05:46,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.83(prox=0.83) + 0.40×proc(0.781[fin=0.91,mean=0.58]) + 0.10×fmt(1.000) | pred='385' gold='350' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 04:05:46,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:46,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.38(prox=0.38) + 0.40×proc(0.758[fin=0.83,mean=0.65]) + 0.10×fmt(1.000) | pred='60' gold='350' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+
Iter 7 GRPO groups: 70%|####### | 14/20 [02:55<01:05, 10.89s/q, loss=0.0002, mean_r=0.792, skip=5]
Iter 7 GRPO groups: 75%|#######5 | 15/20 [02:55<00:56, 11.22s/q, loss=0.0002, mean_r=0.792, skip=5]2026-04-26 04:05:53,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.48(prox=0.48) + 0.40×proc(0.391[fin=0.49,mean=0.25]) + 0.10×fmt(1.000) | pred='1536' gold='1000' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 04:05:53,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.97,mean=0.79]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:53,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.531 = 0.50×0.38(prox=0.38) + 0.40×proc(0.597[fin=0.70,mean=0.44]) + 0.10×fmt(1.000) | pred='200' gold='1000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:05:53,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.476 = 0.50×0.71(prox=0.71) + 0.40×proc(0.048[fin=0.03,mean=0.08]) + 0.10×fmt(1.000) | pred='800' gold='1000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:05:54,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:54,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.38(prox=0.38) + 0.40×proc(0.374[fin=0.45,mean=0.26]) + 0.10×fmt(1.000) | pred='200' gold='1000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:05:54,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.45(prox=0.45) + 0.40×proc(0.783[fin=0.92,mean=0.58]) + 0.10×fmt(1.000) | pred='400' gold='1000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 7 GRPO groups: 75%|#######5 | 15/20 [03:03<00:56, 11.22s/q, loss=0.0024, mean_r=0.754, skip=5]
Iter 7 GRPO groups: 80%|######## | 16/20 [03:03<00:40, 10.20s/q, loss=0.0024, mean_r=0.754, skip=5]2026-04-26 04:06:01,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:01,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:06:01,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:01,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:02,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=86% lccp=43% (chain=3/7 ok_count=6) n_steps=7
+2026-04-26 04:06:02,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:02,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.81(prox=0.81) + 0.40×proc(0.713[fin=0.93,mean=0.39]) + 0.10×fmt(1.000) | pred='46' gold='52' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:06:02,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:06:02,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:02,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 80%|######## | 16/20 [03:11<00:40, 10.20s/q, loss=0.0026, mean_r=0.964, skip=5]
Iter 7 GRPO groups: 85%|########5 | 17/20 [03:11<00:28, 9.53s/q, loss=0.0026, mean_r=0.964, skip=5]2026-04-26 04:06:08,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:06:08,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.554 = 0.50×0.60(prox=0.60) + 0.40×proc(0.386[fin=0.30,mean=0.52]) + 0.10×fmt(1.000) | pred='30' gold='45' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:06:08,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.82(prox=0.82) + 0.40×proc(0.607[fin=0.71,mean=0.45]) + 0.10×fmt(1.000) | pred='40' gold='45' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:06:08,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.82(prox=0.82) + 0.40×proc(0.909[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='40' gold='45' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:09,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:09,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:09,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.786 = 0.50×0.69(prox=0.69) + 0.40×proc(0.849[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='35' gold='45' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:06:09,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.565 = 0.50×0.45(prox=0.45) + 0.40×proc(0.595[fin=0.79,mean=0.31]) + 0.10×fmt(1.000) | pred='18' gold='45' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:06:09,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.320 = 0.50×0.23(prox=0.23) + 0.40×proc(0.167[fin=0.08,mean=0.30]) + 0.10×fmt(1.000) | pred='-30' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:06:09,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.741[fin=0.94,mean=0.44]) + 0.10×fmt(1.000) | pred='15' gold='45' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 7 GRPO groups: 85%|########5 | 17/20 [03:18<00:28, 9.53s/q, loss=-0.0003, mean_r=0.731, skip=5]
Iter 7 GRPO groups: 90%|######### | 18/20 [03:18<00:17, 8.76s/q, loss=-0.0003, mean_r=0.731, skip=5]2026-04-26 04:06:14,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.33(prox=0.33) + 0.40×proc(0.593[fin=0.78,mean=0.31]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:06:14,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:14,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,369 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.33(prox=0.33) + 0.40×proc(0.479[fin=0.64,mean=0.24]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:06:14,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 7 GRPO groups: 90%|######### | 18/20 [03:23<00:17, 8.76s/q, loss=-0.0002, mean_r=0.895, skip=5]
Iter 7 GRPO groups: 95%|#########5| 19/20 [03:23<00:07, 7.73s/q, loss=-0.0002, mean_r=0.895, skip=5]2026-04-26 04:06:19,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:19,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 7 GRPO groups: 95%|#########5| 19/20 [03:28<00:07, 7.73s/q, loss=0var, mean_r=0.997, skip=6]
Iter 7 GRPO groups: 100%|##########| 20/20 [03:28<00:00, 6.72s/q, loss=0var, mean_r=0.997, skip=6]
Iter 7 GRPO groups: 100%|##########| 20/20 [03:28<00:00, 10.41s/q, loss=0var, mean_r=0.997, skip=6]
+2026-04-26 04:06:20,595 INFO __main__ - Iter 7 | loss=0.0011 | reward mean=0.838 std=0.223 | gt_match=64.0% | grounded_acc=88.3% | step_acc=81.3% | lccp=65.8% | batch_acc=88.3% | phase=GROUNDED_ONLY sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.44e-06 | 208.2s
+2026-04-26 04:06:20,596 INFO __main__ - ======================================================================
+2026-04-26 04:06:20,596 INFO __main__ - GRPO ITERATION 8/60
+2026-04-26 04:06:20,596 INFO __main__ - ======================================================================
+2026-04-26 04:06:20,614 INFO __main__ - LR this iteration: 4.44e-06 | T=0.753 | MATH ratio=30%
+
Iter 8 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:06:30,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,218 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.472 = 0.50×0.00(prox=0.00) + 0.40×proc(0.930[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='4*6**0.5' gold='24' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:06:30,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:30,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.00(prox=0.00) + 0.40×proc(0.835[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='$4\\sqrt{15}$' gold='24' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 04:06:30,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 8 GRPO groups: 0%| | 0/20 [00:11, ?q/s, loss=-0.0017, mean_r=0.895, skip=0]
Iter 8 GRPO groups: 5%|5 | 1/20 [00:11<03:41, 11.64s/q, loss=-0.0017, mean_r=0.895, skip=0]2026-04-26 04:06:35,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 5%|5 | 1/20 [00:16<03:41, 11.64s/q, loss=0var, mean_r=0.999, skip=1]
Iter 8 GRPO groups: 10%|# | 2/20 [00:16<02:13, 7.39s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:06:40,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.467 = 0.50×0.33(prox=0.33) + 0.40×proc(0.313[fin=0.15,mean=0.56]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.550[fin=0.46,mean=0.68]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.465 = 0.50×0.33(prox=0.33) + 0.40×proc(0.308[fin=0.11,mean=0.61]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:40,960 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.99,mean=0.76]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:06:41,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.873[fin=0.92,mean=0.80]) + 0.10×fmt(1.000) | pred='900' gold='1350' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:06:41,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:41,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.33(prox=0.33) + 0.40×proc(0.305[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:41,298 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.33(prox=0.33) + 0.40×proc(0.304[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:41,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 10%|# | 2/20 [00:22<02:13, 7.39s/q, loss=-0.0000, mean_r=0.710, skip=1]
Iter 8 GRPO groups: 15%|#5 | 3/20 [00:22<01:55, 6.82s/q, loss=-0.0000, mean_r=0.710, skip=1]2026-04-26 04:06:46,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:46,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:46,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 15%|#5 | 3/20 [00:26<01:55, 6.82s/q, loss=0var, mean_r=0.999, skip=2]
Iter 8 GRPO groups: 20%|## | 4/20 [00:26<01:32, 5.77s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:06:51,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:51,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.652 = 0.50×0.41(prox=0.41) + 0.40×proc(0.864[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='2' gold='7' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:06:51,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.835[fin=1.00,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:06:51,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.850[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:06:51,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:51,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.882[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:06:51,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 20%|## | 4/20 [00:32<01:32, 5.77s/q, loss=0.0002, mean_r=0.945, skip=2]
Iter 8 GRPO groups: 25%|##5 | 5/20 [00:32<01:29, 5.98s/q, loss=0.0002, mean_r=0.945, skip=2]2026-04-26 04:07:07,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:07:07,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.74(prox=0.74) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='81.25' gold='69' | step_acc=88% lccp=50% (chain=4/8 ok_count=7) n_steps=8
+2026-04-26 04:07:07,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×0.85(prox=0.85) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='63' gold='69' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:07:07,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:07,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.902 = 0.50×0.85(prox=0.85) + 0.40×proc(0.944[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='75' gold='69' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 04:07:07,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:08,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.745 = 0.50×0.52(prox=0.52) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='37' gold='69' | step_acc=91% lccp=55% (chain=6/11 ok_count=10) n_steps=11
+2026-04-26 04:07:08,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.580[fin=0.46,mean=0.76]) + 0.10×fmt(0.700) | pred='' gold='69' | step_acc=67% lccp=50% (chain=6/12 ok_count=8) n_steps=12
+2026-04-26 04:07:08,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.854[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='13' gold='69' | step_acc=73% lccp=18% (chain=2/11 ok_count=8) n_steps=11
+2026-04-26 04:07:08,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 8 GRPO groups: 25%|##5 | 5/20 [00:49<01:29, 5.98s/q, loss=-0.0013, mean_r=0.830, skip=2]
Iter 8 GRPO groups: 30%|### | 6/20 [00:49<02:13, 9.53s/q, loss=-0.0013, mean_r=0.830, skip=2]2026-04-26 04:07:12,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 30%|### | 6/20 [00:52<02:13, 9.53s/q, loss=0var, mean_r=0.997, skip=3]
Iter 8 GRPO groups: 35%|###5 | 7/20 [00:52<01:37, 7.48s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 04:07:16,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:16,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,274 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 35%|###5 | 7/20 [00:56<01:37, 7.48s/q, loss=0var, mean_r=1.000, skip=4]
Iter 8 GRPO groups: 40%|#### | 8/20 [00:56<01:18, 6.56s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 04:07:24,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.50(prox=0.50) + 0.40×proc(0.469[fin=0.61,mean=0.26]) + 0.10×fmt(1.000) | pred='70' gold='140' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:07:24,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:07:24,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.906 = 0.50×1.00(exact) + 0.40×proc(0.852[fin=0.99,mean=0.65]) + 0.10×fmt(0.650) | pred='140' gold='140' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:07:24,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:24,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:24,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:24,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:24,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.37(prox=0.37) + 0.40×proc(0.874[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='260' gold='140' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:24,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.38(prox=0.38) + 0.40×proc(0.872[fin=0.97,mean=0.72]) + 0.10×fmt(1.000) | pred='28' gold='140' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:24,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 40%|#### | 8/20 [01:05<01:18, 6.56s/q, loss=0.0008, mean_r=0.870, skip=4]
Iter 8 GRPO groups: 45%|####5 | 9/20 [01:05<01:19, 7.26s/q, loss=0.0008, mean_r=0.870, skip=4]2026-04-26 04:07:32,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.786 = 0.50×0.70(prox=0.70) + 0.40×proc(0.835[fin=0.89,mean=0.75]) + 0.10×fmt(1.000) | pred='23' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.876 = 0.50×0.83(prox=0.83) + 0.40×proc(0.907[fin=0.95,mean=0.84]) + 0.10×fmt(1.000) | pred='21' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:32,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.868 = 0.50×0.83(prox=0.83) + 0.40×proc(0.889[fin=0.95,mean=0.80]) + 0.10×fmt(1.000) | pred='21' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.85(prox=0.85) + 0.40×proc(0.605[fin=0.70,mean=0.47]) + 0.10×fmt(1.000) | pred='20' gold='19' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:07:32,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 8 GRPO groups: 45%|####5 | 9/20 [01:13<01:19, 7.26s/q, loss=0.0010, mean_r=0.925, skip=4]
Iter 8 GRPO groups: 50%|##### | 10/20 [01:13<01:14, 7.48s/q, loss=0.0010, mean_r=0.925, skip=4]2026-04-26 04:07:37,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:07:37,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:38,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:38,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 50%|##### | 10/20 [01:17<01:14, 7.48s/q, loss=0var, mean_r=0.999, skip=5]
Iter 8 GRPO groups: 55%|#####5 | 11/20 [01:17<00:56, 6.33s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:07:41,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.364 = 0.50×0.40(prox=0.40) + 0.40×proc(0.161[fin=0.19,mean=0.11]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:41,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.462 = 0.50×0.40(prox=0.40) + 0.40×proc(0.404[fin=0.50,mean=0.26]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:41,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:42,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.40(prox=0.40) + 0.40×proc(0.183[fin=0.21,mean=0.14]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:42,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.354 = 0.50×0.40(prox=0.40) + 0.40×proc(0.136[fin=0.17,mean=0.09]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:42,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 55%|#####5 | 11/20 [01:23<00:56, 6.33s/q, loss=-0.0002, mean_r=0.751, skip=5]
Iter 8 GRPO groups: 60%|###### | 12/20 [01:23<00:49, 6.18s/q, loss=-0.0002, mean_r=0.751, skip=5]2026-04-26 04:08:15,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.622 = 0.50×0.34(prox=0.34) + 0.40×proc(0.876[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='19787' gold='10100' | step_acc=80% lccp=0% (chain=0/10 ok_count=8) n_steps=10
+2026-04-26 04:08:15,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.886[fin=0.92,mean=0.83]) + 0.10×fmt(1.000) | pred='2525' gold='10100' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:08:15,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.49(prox=0.49) + 0.40×proc(0.602[fin=0.62,mean=0.58]) + 0.10×fmt(1.000) | pred='4899' gold='10100' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:08:15,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.85(prox=0.85) + 0.40×proc(0.651[fin=0.75,mean=0.51]) + 0.10×fmt(1.000) | pred='9414' gold='10100' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 04:08:15,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.816 = 0.50×0.85(prox=0.85) + 0.40×proc(0.727[fin=0.80,mean=0.61]) + 0.10×fmt(1.000) | pred='10088' gold='10100' | step_acc=67% lccp=22% (chain=2/9 ok_count=6) n_steps=9
+2026-04-26 04:08:15,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.817[fin=0.93,mean=0.64]) + 0.10×fmt(1.000) | pred='100' gold='10100' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:08:15,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.443 = 0.50×0.39(prox=0.39) + 0.40×proc(0.365[fin=0.29,mean=0.48]) + 0.10×fmt(1.000) | pred='2304' gold='10100' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:08:15,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.830[fin=0.94,mean=0.66]) + 0.10×fmt(1.000) | pred='1608' gold='10100' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:08:15,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×0.50(prox=0.50) + 0.40×proc(0.870[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='5049' gold='10100' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:08:16,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.762[fin=0.85,mean=0.63]) + 0.10×fmt(1.000) | pred='200' gold='10100' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+
Iter 8 GRPO groups: 60%|###### | 12/20 [01:56<00:49, 6.18s/q, loss=0.0000, mean_r=0.615, skip=5]
Iter 8 GRPO groups: 65%|######5 | 13/20 [01:56<01:41, 14.49s/q, loss=0.0000, mean_r=0.615, skip=5]2026-04-26 04:08:21,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.729 = 0.50×0.62(prox=0.62) + 0.40×proc(0.792[fin=0.91,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='100' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:08:21,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:21,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:22,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:22,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:22,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 65%|######5 | 13/20 [02:02<01:41, 14.49s/q, loss=-0.0012, mean_r=0.972, skip=5]
Iter 8 GRPO groups: 70%|####### | 14/20 [02:02<01:11, 11.95s/q, loss=-0.0012, mean_r=0.972, skip=5]2026-04-26 04:08:33,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.902[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=83% lccp=50% (chain=6/12 ok_count=10) n_steps=12
+2026-04-26 04:08:33,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.50(prox=0.50) + 0.40×proc(0.335[fin=0.29,mean=0.40]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=43% lccp=29% (chain=2/7 ok_count=3) n_steps=7
+2026-04-26 04:08:33,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.261 = 0.50×0.08(prox=0.08) + 0.40×proc(0.260[fin=0.13,mean=0.46]) + 0.10×fmt(1.000) | pred='14' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:08:33,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.884[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 04:08:33,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.868[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='6' gold='2' | step_acc=67% lccp=33% (chain=4/12 ok_count=8) n_steps=12
+2026-04-26 04:08:34,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.09(prox=0.09) + 0.40×proc(0.815[fin=0.95,mean=0.61]) + 0.10×fmt(1.000) | pred='12' gold='2' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 04:08:34,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.878[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='5' gold='2' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:08:34,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:34,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.809[fin=0.90,mean=0.68]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:08:34,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.17(prox=0.17) + 0.40×proc(0.693[fin=0.79,mean=0.55]) + 0.10×fmt(1.000) | pred='7' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+
Iter 8 GRPO groups: 70%|####### | 14/20 [02:15<01:11, 11.95s/q, loss=-0.0008, mean_r=0.628, skip=5]
Iter 8 GRPO groups: 75%|#######5 | 15/20 [02:15<01:00, 12.01s/q, loss=-0.0008, mean_r=0.628, skip=5]2026-04-26 04:08:39,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:40,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:40,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 75%|#######5 | 15/20 [02:19<01:00, 12.01s/q, loss=0var, mean_r=0.997, skip=6]
Iter 8 GRPO groups: 80%|######## | 16/20 [02:19<00:38, 9.71s/q, loss=0var, mean_r=0.997, skip=6]2026-04-26 04:08:46,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:47,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:47,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 8 GRPO groups: 80%|######## | 16/20 [02:26<00:38, 9.71s/q, loss=0var, mean_r=0.999, skip=7]
Iter 8 GRPO groups: 85%|########5 | 17/20 [02:26<00:26, 8.90s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 04:08:52,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.594 = 0.50×0.35(prox=0.35) + 0.40×proc(0.801[fin=0.99,mean=0.51]) + 0.10×fmt(1.000) | pred='66' gold='1056' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:08:52,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.802[fin=0.81,mean=0.79]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:08:52,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:08:52,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:08:52,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:08:52,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.36(prox=0.36) + 0.40×proc(0.311[fin=0.30,mean=0.33]) + 0.10×fmt(1.000) | pred='111.9965' gold='1056' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 04:08:52,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.27(prox=0.27) + 0.40×proc(0.345[fin=0.12,mean=0.68]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.497 = 0.50×0.27(prox=0.27) + 0.40×proc(0.402[fin=0.21,mean=0.69]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.27(prox=0.27) + 0.40×proc(0.281[fin=0.04,mean=0.64]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.27(prox=0.27) + 0.40×proc(0.342[fin=0.13,mean=0.66]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+
Iter 8 GRPO groups: 85%|########5 | 17/20 [02:34<00:26, 8.90s/q, loss=0.0008, mean_r=0.640, skip=7]
Iter 8 GRPO groups: 90%|######### | 18/20 [02:34<00:16, 8.48s/q, loss=0.0008, mean_r=0.640, skip=7]2026-04-26 04:09:01,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=0.98,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.942[fin=0.98,mean=0.88]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=0.96,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.71(prox=0.71) + 0.40×proc(0.772[fin=0.88,mean=0.62]) + 0.10×fmt(1.000) | pred='24' gold='20' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:09:02,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.431 = 0.50×0.44(prox=0.44) + 0.40×proc(0.272[fin=0.22,mean=0.36]) + 0.10×fmt(1.000) | pred='32.5' gold='20' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:02,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:09:02,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.885[fin=0.91,mean=0.85]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 90%|######### | 18/20 [02:43<00:16, 8.48s/q, loss=-0.0012, mean_r=0.899, skip=7]
Iter 8 GRPO groups: 95%|#########5| 19/20 [02:43<00:08, 8.64s/q, loss=-0.0012, mean_r=0.899, skip=7]2026-04-26 04:09:08,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×0.82(prox=0.82) + 0.40×proc(0.883[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.876 = 0.50×0.82(prox=0.82) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.82(prox=0.82) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.82(prox=0.82) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×0.82(prox=0.82) + 0.40×proc(0.833[fin=0.93,mean=0.69]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.850 = 0.50×0.82(prox=0.82) + 0.40×proc(0.845[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.887 = 0.50×0.82(prox=0.82) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='280' gold='280' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:09:09,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.82(prox=0.82) + 0.40×proc(0.908[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.45(prox=0.45) + 0.40×proc(0.225[fin=0.08,mean=0.44]) + 0.10×fmt(1.000) | pred='107.5' gold='280' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 8 GRPO groups: 95%|#########5| 19/20 [02:50<00:08, 8.64s/q, loss=0.0002, mean_r=0.843, skip=7]
Iter 8 GRPO groups: 100%|##########| 20/20 [02:50<00:00, 8.30s/q, loss=0.0002, mean_r=0.843, skip=7]
Iter 8 GRPO groups: 100%|##########| 20/20 [02:50<00:00, 8.53s/q, loss=0.0002, mean_r=0.843, skip=7]
+2026-04-26 04:09:11,192 INFO __main__ - Iter 8 | loss=-0.0003 | reward mean=0.876 std=0.200 | gt_match=69.0% | grounded_acc=89.5% | step_acc=86.5% | lccp=74.8% | batch_acc=89.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=13 skipped=7(0var=7) | lr=5.00e-06 | 170.6s
+2026-04-26 04:09:11,192 WARNING __main__ - STARVATION: 35% of groups skipped (zero variance). grounded_acc=89.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:09:11,193 INFO __main__ - ======================================================================
+2026-04-26 04:09:11,193 INFO __main__ - GRPO ITERATION 9/60
+2026-04-26 04:09:11,193 INFO __main__ - ======================================================================
+2026-04-26 04:09:11,213 INFO __main__ - LR this iteration: 5.00e-06 | T=0.746 | MATH ratio=30%
+
Iter 9 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:09:19,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.47(prox=0.47) + 0.40×proc(0.296[fin=0.17,mean=0.48]) + 0.10×fmt(1.000) | pred='6' gold='14' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:09:19,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.54(prox=0.54) + 0.40×proc(0.199[fin=0.16,mean=0.26]) + 0.10×fmt(1.000) | pred='8' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:19,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=0.97,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:09:19,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:19,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:09:19,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:09:19,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:19,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.512 = 0.50×0.37(prox=0.37) + 0.40×proc(0.569[fin=0.75,mean=0.30]) + 0.10×fmt(1.000) | pred='2' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:19,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:09:19,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 0%| | 0/20 [00:10, ?q/s, loss=-0.0001, mean_r=0.839, skip=0]
Iter 9 GRPO groups: 5%|5 | 1/20 [00:10<03:13, 10.17s/q, loss=-0.0001, mean_r=0.839, skip=0]2026-04-26 04:09:55,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.606 = 0.50×0.85(prox=0.85) + 0.40×proc(0.202[fin=0.01,mean=0.49]) + 0.10×fmt(1.000) | pred='989' gold='990' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:09:55,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:55,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:55,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:09:55,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.290 = 0.50×0.35(prox=0.35) + 0.40×proc(0.031[fin=0.01,mean=0.06]) + 0.10×fmt(1.000) | pred='90' gold='990' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:09:55,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:55,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(0.650) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:09:55,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:55,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.509 = 0.50×0.85(prox=0.85) + 0.40×proc(0.048[fin=0.06,mean=0.03]) + 0.10×fmt(0.650) | pred='991' gold='990' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+
Iter 9 GRPO groups: 5%|5 | 1/20 [00:45<03:13, 10.17s/q, loss=-0.0033, mean_r=0.805, skip=0]
Iter 9 GRPO groups: 10%|# | 2/20 [00:45<07:33, 25.19s/q, loss=-0.0033, mean_r=0.805, skip=0]2026-04-26 04:10:04,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.77(prox=0.77) + 0.40×proc(0.814[fin=0.91,mean=0.67]) + 0.10×fmt(1.000) | pred='85' gold='100' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:10:04,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='200' gold='100' | step_acc=75% lccp=12% (chain=1/8 ok_count=6) n_steps=8
+2026-04-26 04:10:04,784 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=88% lccp=25% (chain=2/8 ok_count=7) n_steps=8
+2026-04-26 04:10:04,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.648 = 0.50×0.64(prox=0.64) + 0.40×proc(0.575[fin=0.55,mean=0.60]) + 0.10×fmt(1.000) | pred='71.43' gold='100' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:10:04,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.453 = 0.50×0.33(prox=0.33) + 0.40×proc(0.467[fin=0.57,mean=0.32]) + 0.10×fmt(1.000) | pred='0' gold='100' | step_acc=38% lccp=0% (chain=0/8 ok_count=3) n_steps=8
+2026-04-26 04:10:05,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.548 = 0.50×0.54(prox=0.54) + 0.40×proc(0.448[fin=0.46,mean=0.43]) + 0.10×fmt(1.000) | pred='142.85714' gold='100' | step_acc=25% lccp=25% (chain=2/8 ok_count=2) n_steps=8
+2026-04-26 04:10:05,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.684 = 0.50×0.54(prox=0.54) + 0.40×proc(0.787[fin=0.98,mean=0.50]) + 0.10×fmt(1.000) | pred='143' gold='100' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:10:05,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:05,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.300 = 0.50×0.00(prox=0.00) + 0.40×proc(0.437[fin=0.52,mean=0.31]) + 0.10×fmt(1.000) | pred='83 1/3' gold='100' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:10:05,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.754 = 0.50×0.82(prox=0.82) + 0.40×proc(0.614[fin=0.66,mean=0.54]) + 0.10×fmt(1.000) | pred='111.12' gold='100' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+
Iter 9 GRPO groups: 10%|# | 2/20 [00:55<07:33, 25.19s/q, loss=0.0017, mean_r=0.672, skip=0]
Iter 9 GRPO groups: 15%|#5 | 3/20 [00:55<05:08, 18.13s/q, loss=0.0017, mean_r=0.672, skip=0]2026-04-26 04:10:11,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 9 GRPO groups: 15%|#5 | 3/20 [01:01<05:08, 18.13s/q, loss=0var, mean_r=0.999, skip=1]
Iter 9 GRPO groups: 20%|## | 4/20 [01:01<03:29, 13.12s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:10:15,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,844 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 20%|## | 4/20 [01:04<03:29, 13.12s/q, loss=0var, mean_r=0.996, skip=2]
Iter 9 GRPO groups: 25%|##5 | 5/20 [01:04<02:25, 9.68s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 04:10:22,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.832[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='31' gold='88' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:10:22,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 25%|##5 | 5/20 [01:13<02:25, 9.68s/q, loss=-0.0025, mean_r=0.950, skip=2]
Iter 9 GRPO groups: 30%|### | 6/20 [01:13<02:10, 9.31s/q, loss=-0.0025, mean_r=0.950, skip=2]2026-04-26 04:10:49,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:49,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=0.99,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.60(prox=0.60) + 0.40×proc(0.962[fin=0.94,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='12' | step_acc=100% lccp=100% (chain=24/24 ok_count=24) n_steps=24
+2026-04-26 04:10:50,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.401 = 0.50×0.33(prox=0.33) + 0.40×proc(0.211[fin=0.12,mean=0.35]) + 0.10×fmt(1.000) | pred='24' gold='12' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:10:50,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 30%|### | 6/20 [01:40<02:10, 9.31s/q, loss=0.0001, mean_r=0.913, skip=2]
Iter 9 GRPO groups: 35%|###5 | 7/20 [01:40<03:17, 15.22s/q, loss=0.0001, mean_r=0.913, skip=2]2026-04-26 04:10:59,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:59,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.651 = 0.50×0.50(prox=0.50) + 0.40×proc(0.752[fin=0.87,mean=0.58]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:10:59,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:59,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.50(prox=0.50) + 0.40×proc(0.731[fin=0.70,mean=0.78]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:10:59,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:00,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:00,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 35%|###5 | 7/20 [01:50<03:17, 15.22s/q, loss=0.0008, mean_r=0.929, skip=2]
Iter 9 GRPO groups: 40%|#### | 8/20 [01:50<02:42, 13.57s/q, loss=0.0008, mean_r=0.929, skip=2]2026-04-26 04:11:07,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.928 = 0.50×1.00(exact) + 0.40×proc(0.820[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:11:07,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:08,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:08,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.542[fin=0.60,mean=0.46]) + 0.10×fmt(1.000) | pred='132' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 9 GRPO groups: 40%|#### | 8/20 [01:58<02:42, 13.57s/q, loss=-0.0014, mean_r=0.945, skip=2]
Iter 9 GRPO groups: 45%|####5 | 9/20 [01:58<02:09, 11.75s/q, loss=-0.0014, mean_r=0.945, skip=2]2026-04-26 04:11:13,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.535 = 0.50×0.08(prox=0.08) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='420' gold='60' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:11:13,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:13,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 45%|####5 | 9/20 [02:03<02:09, 11.75s/q, loss=-0.0010, mean_r=0.950, skip=2]
Iter 9 GRPO groups: 50%|##### | 10/20 [02:03<01:38, 9.85s/q, loss=-0.0010, mean_r=0.950, skip=2]2026-04-26 04:11:19,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.811[fin=0.98,mean=0.56]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:11:19,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:19,899 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:19,981 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:20,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:20,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:20,233 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:20,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.614 = 0.50×1.00(exact) + 0.40×proc(0.035[fin=0.01,mean=0.07]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:11:20,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:20,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 50%|##### | 10/20 [02:10<01:38, 9.85s/q, loss=0.0009, mean_r=0.952, skip=2]
Iter 9 GRPO groups: 55%|#####5 | 11/20 [02:10<01:20, 8.90s/q, loss=0.0009, mean_r=0.952, skip=2]2026-04-26 04:11:28,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.619 = 0.50×0.85(prox=0.85) + 0.40×proc(0.234[fin=0.05,mean=0.51]) + 0.10×fmt(1.000) | pred='132' gold='136' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:11:28,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=0.96,mean=0.82]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:28,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 9 GRPO groups: 55%|#####5 | 11/20 [02:19<01:20, 8.90s/q, loss=0.0001, mean_r=0.958, skip=2]
Iter 9 GRPO groups: 60%|###### | 12/20 [02:19<01:10, 8.83s/q, loss=0.0001, mean_r=0.958, skip=2]2026-04-26 04:11:34,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:11:34,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 60%|###### | 12/20 [02:23<01:10, 8.83s/q, loss=0var, mean_r=0.995, skip=3]
Iter 9 GRPO groups: 65%|######5 | 13/20 [02:23<00:51, 7.43s/q, loss=0var, mean_r=0.995, skip=3]2026-04-26 04:11:38,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.428 = 0.50×0.43(prox=0.43) + 0.40×proc(0.160[fin=0.06,mean=0.31]) + 0.10×fmt(1.000) | pred='14' gold='42' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:11:39,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 65%|######5 | 13/20 [02:29<00:51, 7.43s/q, loss=0.0032, mean_r=0.939, skip=3]
Iter 9 GRPO groups: 70%|####### | 14/20 [02:29<00:42, 7.08s/q, loss=0.0032, mean_r=0.939, skip=3]2026-04-26 04:11:44,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 70%|####### | 14/20 [02:33<00:42, 7.08s/q, loss=0var, mean_r=0.995, skip=4]
Iter 9 GRPO groups: 75%|#######5 | 15/20 [02:33<00:30, 6.05s/q, loss=0var, mean_r=0.995, skip=4]2026-04-26 04:11:51,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:51,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:51,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:51,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:51,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:52,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:52,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.06(prox=0.06) + 0.40×proc(0.901[fin=0.96,mean=0.81]) + 0.10×fmt(1.000) | pred='8' gold='0' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:11:52,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:52,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:52,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 75%|#######5 | 15/20 [02:42<00:30, 6.05s/q, loss=0.0023, mean_r=0.951, skip=4]
Iter 9 GRPO groups: 80%|######## | 16/20 [02:42<00:27, 6.94s/q, loss=0.0023, mean_r=0.951, skip=4]2026-04-26 04:12:26,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×1.00(exact) + 0.40×proc(0.496[fin=0.37,mean=0.68]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:12:26,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.10(prox=0.10) + 0.40×proc(0.945[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='198' gold='36' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='108' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:12:27,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.895 = 0.50×1.00(exact) + 0.40×proc(0.737[fin=0.70,mean=0.79]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.20(prox=0.20) + 0.40×proc(0.804[fin=0.86,mean=0.72]) + 0.10×fmt(1.000) | pred='108' gold='36' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:12:27,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.631 = 0.50×0.85(prox=0.85) + 0.40×proc(0.265[fin=0.13,mean=0.47]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:12:27,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×0.85(prox=0.85) + 0.40×proc(0.928[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.85(prox=0.85) + 0.40×proc(0.578[fin=0.55,mean=0.61]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:12:27,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.870 = 0.50×1.00(exact) + 0.40×proc(0.674[fin=0.59,mean=0.79]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 80%|######## | 16/20 [03:17<00:27, 6.94s/q, loss=-0.0005, mean_r=0.719, skip=4]
Iter 9 GRPO groups: 85%|########5 | 17/20 [03:17<00:46, 15.43s/q, loss=-0.0005, mean_r=0.719, skip=4]2026-04-26 04:12:32,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:32,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=0.99,mean=0.81]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:12:33,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:12:33,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:33,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 9 GRPO groups: 85%|########5 | 17/20 [03:21<00:46, 15.43s/q, loss=0var, mean_r=0.986, skip=5]
Iter 9 GRPO groups: 90%|######### | 18/20 [03:21<00:24, 12.07s/q, loss=0var, mean_r=0.986, skip=5]2026-04-26 04:12:41,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.60(prox=0.60) + 0.40×proc(0.305[fin=0.25,mean=0.39]) + 0.10×fmt(1.000) | pred='200000' gold='150000' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:12:41,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:12:41,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.349 = 0.50×0.23(prox=0.23) + 0.40×proc(0.239[fin=0.19,mean=0.32]) + 0.10×fmt(1.000) | pred='400000' gold='150000' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:12:41,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.874[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='150000' gold='150000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:12:41,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:12:41,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.820[fin=0.95,mean=0.63]) + 0.10×fmt(1.000) | pred='200000' gold='150000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:41,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:12:41,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='150000' gold='150000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:42,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.60(prox=0.60) + 0.40×proc(0.856[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='100000' gold='150000' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:12:42,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+
Iter 9 GRPO groups: 90%|######### | 18/20 [03:32<00:24, 12.07s/q, loss=0.0005, mean_r=0.648, skip=5]
Iter 9 GRPO groups: 95%|#########5| 19/20 [03:32<00:11, 11.56s/q, loss=0.0005, mean_r=0.648, skip=5]2026-04-26 04:12:50,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:50,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:50,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:50,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:50,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.720 = 0.50×0.54(prox=0.54) + 0.40×proc(0.875[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='188' gold='328' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:12:50,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.735[fin=0.75,mean=0.72]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:12:50,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:50,922 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:12:51,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:12:51,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=0.99,mean=0.96]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 9 GRPO groups: 95%|#########5| 19/20 [03:41<00:11, 11.56s/q, loss=-0.0000, mean_r=0.958, skip=5]
Iter 9 GRPO groups: 100%|##########| 20/20 [03:41<00:00, 10.78s/q, loss=-0.0000, mean_r=0.958, skip=5]
Iter 9 GRPO groups: 100%|##########| 20/20 [03:41<00:00, 11.07s/q, loss=-0.0000, mean_r=0.958, skip=5]
+2026-04-26 04:12:52,544 INFO __main__ - Iter 9 | loss=0.0001 | reward mean=0.907 std=0.177 | gt_match=80.3% | grounded_acc=96.5% | step_acc=89.4% | lccp=81.8% | batch_acc=96.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=15 skipped=5(0var=5) | lr=5.00e-06 | 221.4s
+2026-04-26 04:12:52,545 INFO __main__ - ======================================================================
+2026-04-26 04:12:52,545 INFO __main__ - GRPO ITERATION 10/60
+2026-04-26 04:12:52,545 INFO __main__ - ======================================================================
+2026-04-26 04:12:52,566 INFO __main__ - LR this iteration: 5.00e-06 | T=0.739 | MATH ratio=30%
+
Iter 10 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:12:56,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:56,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:56,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.97,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:56,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 10 GRPO groups: 0%| | 0/20 [00:04, ?q/s, loss=0var, mean_r=0.972, skip=1]
Iter 10 GRPO groups: 5%|5 | 1/20 [00:04<01:22, 4.35s/q, loss=0var, mean_r=0.972, skip=1]2026-04-26 04:13:01,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:01,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:01,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:01,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:13:02,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 5%|5 | 1/20 [00:09<01:22, 4.35s/q, loss=0var, mean_r=0.996, skip=2]
Iter 10 GRPO groups: 10%|# | 2/20 [00:09<01:26, 4.79s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 04:13:35,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.50(prox=0.50) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:13:35,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.529 = 0.50×0.44(prox=0.44) + 0.40×proc(0.525[fin=0.68,mean=0.30]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.78(prox=0.78) + 0.40×proc(0.669[fin=0.86,mean=0.39]) + 0.10×fmt(1.000) | pred='16' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.839[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:13:35,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.78(prox=0.78) + 0.40×proc(0.697[fin=0.90,mean=0.39]) + 0.10×fmt(1.000) | pred='12' gold='14' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:13:35,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.50(prox=0.50) + 0.40×proc(0.711[fin=0.84,mean=0.51]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:13:35,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:13:35,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.655 = 0.50×0.50(prox=0.50) + 0.40×proc(0.762[fin=0.96,mean=0.46]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 10 GRPO groups: 10%|# | 2/20 [00:44<01:26, 4.79s/q, loss=0.0015, mean_r=0.775, skip=2]
Iter 10 GRPO groups: 15%|#5 | 3/20 [00:44<05:17, 18.70s/q, loss=0.0015, mean_r=0.775, skip=2]2026-04-26 04:13:41,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:42,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:42,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.901[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 15%|#5 | 3/20 [00:49<05:17, 18.70s/q, loss=0var, mean_r=0.992, skip=3]
Iter 10 GRPO groups: 20%|## | 4/20 [00:49<03:33, 13.35s/q, loss=0var, mean_r=0.992, skip=3]2026-04-26 04:13:45,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 20%|## | 4/20 [00:54<03:33, 13.35s/q, loss=0var, mean_r=0.999, skip=4]
Iter 10 GRPO groups: 25%|##5 | 5/20 [00:54<02:31, 10.08s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 04:13:52,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:52,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:52,756 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:13:52,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:52,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×1.00(exact) + 0.40×proc(0.710[fin=0.86,mean=0.48]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:13:53,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.395 = 0.50×0.14(prox=0.14) + 0.40×proc(0.277[fin=0.00,mean=0.69]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:13:53,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:53,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.657 = 0.50×0.50(prox=0.50) + 0.40×proc(0.769[fin=0.89,mean=0.59]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:13:53,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:53,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.926 = 0.50×1.00(exact) + 0.40×proc(0.815[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 10 GRPO groups: 25%|##5 | 5/20 [01:02<02:31, 10.08s/q, loss=0.0002, mean_r=0.882, skip=4]
Iter 10 GRPO groups: 30%|### | 6/20 [01:02<02:11, 9.40s/q, loss=0.0002, mean_r=0.882, skip=4]2026-04-26 04:13:58,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:58,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:58,949 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 30%|### | 6/20 [01:06<02:11, 9.40s/q, loss=0var, mean_r=0.998, skip=5]
Iter 10 GRPO groups: 35%|###5 | 7/20 [01:06<01:42, 7.88s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:14:02,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.71(prox=0.71) + 0.40×proc(0.845[fin=0.96,mean=0.66]) + 0.10×fmt(1.000) | pred='8' gold='10' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:14:02,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.808 = 0.50×0.71(prox=0.71) + 0.40×proc(0.878[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='8' gold='10' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 10 GRPO groups: 35%|###5 | 7/20 [01:11<01:42, 7.88s/q, loss=0.0022, mean_r=0.955, skip=5]
Iter 10 GRPO groups: 40%|#### | 8/20 [01:11<01:22, 6.87s/q, loss=0.0022, mean_r=0.955, skip=5]2026-04-26 04:14:19,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=0.98,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.637 = 0.50×0.67(prox=0.67) + 0.40×proc(0.510[fin=0.43,mean=0.63]) + 0.10×fmt(1.000) | pred='7.5' gold='10' | step_acc=67% lccp=8% (chain=1/12 ok_count=8) n_steps=12
+2026-04-26 04:14:19,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.859 = 0.50×1.00(exact) + 0.40×proc(0.648[fin=0.60,mean=0.72]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:14:19,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 10 GRPO groups: 40%|#### | 8/20 [01:28<01:22, 6.87s/q, loss=-0.0007, mean_r=0.903, skip=5]
Iter 10 GRPO groups: 45%|####5 | 9/20 [01:28<01:50, 10.05s/q, loss=-0.0007, mean_r=0.903, skip=5]2026-04-26 04:14:25,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,689 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 45%|####5 | 9/20 [01:33<01:50, 10.05s/q, loss=0var, mean_r=1.000, skip=6]
Iter 10 GRPO groups: 50%|##### | 10/20 [01:33<01:23, 8.34s/q, loss=0var, mean_r=1.000, skip=6]2026-04-26 04:14:29,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:14:30,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:30,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:14:30,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:14:30,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:30,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 10 GRPO groups: 50%|##### | 10/20 [01:39<01:23, 8.34s/q, loss=0.0028, mean_r=0.984, skip=6]
Iter 10 GRPO groups: 55%|#####5 | 11/20 [01:39<01:08, 7.62s/q, loss=0.0028, mean_r=0.984, skip=6]2026-04-26 04:14:36,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.929 = 0.50×1.00(exact) + 0.40×proc(0.823[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 10 GRPO groups: 55%|#####5 | 11/20 [01:46<01:08, 7.62s/q, loss=-0.0017, mean_r=0.992, skip=6]
Iter 10 GRPO groups: 60%|###### | 12/20 [01:46<00:59, 7.46s/q, loss=-0.0017, mean_r=0.992, skip=6]2026-04-26 04:14:51,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.80,mean=0.72]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=75% lccp=62% (chain=5/8 ok_count=6) n_steps=8
+2026-04-26 04:14:52,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.902 = 0.50×1.00(exact) + 0.40×proc(0.756[fin=0.72,mean=0.81]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:14:52,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.737 = 0.50×0.50(prox=0.50) + 0.40×proc(0.969[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='175' gold='350' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+2026-04-26 04:14:52,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.627 = 0.50×0.85(prox=0.85) + 0.40×proc(0.254[fin=0.08,mean=0.52]) + 0.10×fmt(1.000) | pred='380' gold='350' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:14:52,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.804 = 0.50×0.80(prox=0.80) + 0.40×proc(0.766[fin=0.73,mean=0.81]) + 0.10×fmt(1.000) | pred='305' gold='350' | step_acc=90% lccp=80% (chain=8/10 ok_count=9) n_steps=10
+2026-04-26 04:14:52,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×0.85(prox=0.85) + 0.40×proc(0.772[fin=0.83,mean=0.68]) + 0.10×fmt(1.000) | pred='380' gold='350' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:14:52,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.53(prox=0.53) + 0.40×proc(0.513[fin=0.46,mean=0.60]) + 0.10×fmt(1.000) | pred='197.5' gold='350' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:14:52,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.50(prox=0.50) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='175' gold='350' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:14:52,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.606 = 0.50×0.80(prox=0.80) + 0.40×proc(0.270[fin=0.01,mean=0.65]) + 0.10×fmt(1.000) | pred='395' gold='350' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:14:52,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.609 = 0.50×0.80(prox=0.80) + 0.40×proc(0.279[fin=0.07,mean=0.60]) + 0.10×fmt(1.000) | pred='395' gold='350' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 60%|###### | 12/20 [02:01<00:59, 7.46s/q, loss=0.0005, mean_r=0.734, skip=6]
Iter 10 GRPO groups: 65%|######5 | 13/20 [02:01<01:09, 9.87s/q, loss=0.0005, mean_r=0.734, skip=6]2026-04-26 04:15:01,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×0.85(prox=0.85) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='54.5' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:01,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.865[fin=0.90,mean=0.81]) + 0.10×fmt(1.000) | pred='49.5' gold='50' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:15:01,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:15:01,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.904 = 0.50×0.85(prox=0.85) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='48.5' gold='50' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:15:01,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.470 = 0.50×0.36(prox=0.36) + 0.40×proc(0.254[fin=0.02,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:15:01,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.897 = 0.50×0.85(prox=0.85) + 0.40×proc(0.929[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:15:01,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:02,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:02,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.832 = 0.50×0.85(prox=0.85) + 0.40×proc(0.767[fin=0.84,mean=0.66]) + 0.10×fmt(1.000) | pred='46.2' gold='50' | step_acc=62% lccp=50% (chain=4/8 ok_count=5) n_steps=8
+2026-04-26 04:15:02,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='105.5' gold='50' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+
Iter 10 GRPO groups: 65%|######5 | 13/20 [02:11<01:09, 9.87s/q, loss=0.0015, mean_r=0.842, skip=6]
Iter 10 GRPO groups: 70%|####### | 14/20 [02:11<00:58, 9.73s/q, loss=0.0015, mean_r=0.842, skip=6]2026-04-26 04:15:09,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:09,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.85(prox=0.85) + 0.40×proc(0.578[fin=0.63,mean=0.50]) + 0.10×fmt(1.000) | pred='39' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:15:09,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:09,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.75(prox=0.75) + 0.40×proc(0.293[fin=0.08,mean=0.61]) + 0.10×fmt(1.000) | pred='30' gold='36' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:15:09,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:15:09,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:09,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.75(prox=0.75) + 0.40×proc(0.754[fin=0.83,mean=0.64]) + 0.10×fmt(1.000) | pred='30' gold='36' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 70%|####### | 14/20 [02:18<00:58, 9.73s/q, loss=0.0010, mean_r=0.903, skip=6]
Iter 10 GRPO groups: 75%|#######5 | 15/20 [02:18<00:45, 9.06s/q, loss=0.0010, mean_r=0.903, skip=6]2026-04-26 04:15:17,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.727 = 0.50×0.60(prox=0.60) + 0.40×proc(0.817[fin=0.90,mean=0.69]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.60(prox=0.60) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.680 = 0.50×0.60(prox=0.60) + 0.40×proc(0.701[fin=0.83,mean=0.51]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:15:17,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=1.00,mean=0.53]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:15:17,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.667 = 0.50×0.60(prox=0.60) + 0.40×proc(0.667[fin=0.79,mean=0.49]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:15:17,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.60(prox=0.60) + 0.40×proc(0.676[fin=0.69,mean=0.66]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.52(prox=0.52) + 0.40×proc(0.571[fin=0.77,mean=0.28]) + 0.10×fmt(1.000) | pred='32' gold='60' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:15:18,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:18,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.727 = 0.50×0.64(prox=0.64) + 0.40×proc(0.769[fin=0.92,mean=0.54]) + 0.10×fmt(1.000) | pred='43' gold='60' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:15:18,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 75%|#######5 | 15/20 [02:27<00:45, 9.06s/q, loss=0.0004, mean_r=0.773, skip=6]
Iter 10 GRPO groups: 80%|######## | 16/20 [02:27<00:35, 8.88s/q, loss=0.0004, mean_r=0.773, skip=6]2026-04-26 04:15:30,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.819[fin=0.94,mean=0.64]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:15:30,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.704 = 0.50×0.55(prox=0.55) + 0.40×proc(0.822[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='14.81' gold='25' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:15:30,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.881 = 0.50×0.85(prox=0.85) + 0.40×proc(0.891[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='23.53' gold='25' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:15:30,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:30,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.872[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:15:31,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25%' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:31,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.759 = 0.50×0.60(prox=0.60) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='33.3' gold='25' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 80%|######## | 16/20 [02:39<00:35, 8.88s/q, loss=-0.0011, mean_r=0.835, skip=6]
Iter 10 GRPO groups: 85%|########5 | 17/20 [02:39<00:30, 10.09s/q, loss=-0.0011, mean_r=0.835, skip=6]2026-04-26 04:15:41,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.485 = 0.50×0.20(prox=0.20) + 0.40×proc(0.525[fin=0.49,mean=0.58]) + 0.10×fmt(1.000) | pred='648' gold='216' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:15:41,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='216' gold='216' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:15:41,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.37(prox=0.37) + 0.40×proc(0.322[fin=0.13,mean=0.62]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 04:15:41,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.450 = 0.50×0.37(prox=0.37) + 0.40×proc(0.330[fin=0.22,mean=0.50]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:15:41,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.39(prox=0.39) + 0.40×proc(0.435[fin=0.34,mean=0.58]) + 0.10×fmt(1.000) | pred='48' gold='216' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:15:41,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.406 = 0.50×0.35(prox=0.35) + 0.40×proc(0.207[fin=0.04,mean=0.46]) + 0.10×fmt(1.000) | pred='12' gold='216' | step_acc=33% lccp=33% (chain=2/6 ok_count=2) n_steps=6
+2026-04-26 04:15:42,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.506 = 0.50×0.37(prox=0.37) + 0.40×proc(0.405[fin=0.32,mean=0.54]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=38% lccp=38% (chain=3/8 ok_count=3) n_steps=8
+2026-04-26 04:15:42,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.455 = 0.50×0.39(prox=0.39) + 0.40×proc(0.237[fin=0.11,mean=0.43]) + 0.10×fmt(1.000) | pred='48' gold='216' | step_acc=43% lccp=43% (chain=3/7 ok_count=3) n_steps=7
+2026-04-26 04:15:42,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.489 = 0.50×0.20(prox=0.20) + 0.40×proc(0.670[fin=0.73,mean=0.58]) + 0.10×fmt(1.000) | pred='648' gold='216' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:15:42,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.880[fin=0.93,mean=0.81]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+
Iter 10 GRPO groups: 85%|########5 | 17/20 [02:51<00:30, 10.09s/q, loss=0.0008, mean_r=0.532, skip=6]
Iter 10 GRPO groups: 90%|######### | 18/20 [02:51<00:20, 10.40s/q, loss=0.0008, mean_r=0.532, skip=6]2026-04-26 04:15:49,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.50(prox=0.50) + 0.40×proc(0.465[fin=0.39,mean=0.57]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:15:50,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 90%|######### | 18/20 [02:59<00:20, 10.40s/q, loss=-0.0009, mean_r=0.953, skip=6]
Iter 10 GRPO groups: 95%|#########5| 19/20 [02:59<00:09, 9.73s/q, loss=-0.0009, mean_r=0.953, skip=6]2026-04-26 04:15:59,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.716 = 0.50×0.85(prox=0.85) + 0.40×proc(0.477[fin=0.41,mean=0.57]) + 0.10×fmt(1.000) | pred='55' gold='59' | step_acc=43% lccp=43% (chain=3/7 ok_count=3) n_steps=7
+2026-04-26 04:15:59,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.490[fin=0.36,mean=0.68]) + 0.10×fmt(1.000) | pred='124' gold='59' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:15:59,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.516 = 0.50×0.36(prox=0.36) + 0.40×proc(0.588[fin=0.68,mean=0.46]) + 0.10×fmt(1.000) | pred='111' gold='59' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 04:15:59,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.25(prox=0.25) + 0.40×proc(0.583[fin=0.72,mean=0.38]) + 0.10×fmt(1.000) | pred='146.36' gold='59' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 04:15:59,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.63(prox=0.63) + 0.40×proc(0.829[fin=0.96,mean=0.63]) + 0.10×fmt(1.000) | pred='76' gold='59' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 04:15:59,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.576 = 0.50×0.21(prox=0.21) + 0.40×proc(0.929[fin=0.97,mean=0.87]) + 0.10×fmt(1.000) | pred='171' gold='59' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:15:59,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.36(prox=0.36) + 0.40×proc(0.791[fin=0.93,mean=0.58]) + 0.10×fmt(1.000) | pred='6.5' gold='59' | step_acc=75% lccp=0% (chain=0/8 ok_count=6) n_steps=8
+2026-04-26 04:15:59,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.562 = 0.50×0.46(prox=0.46) + 0.40×proc(0.577[fin=0.77,mean=0.29]) + 0.10×fmt(1.000) | pred='93.3' gold='59' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 04:15:59,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.584 = 0.50×0.38(prox=0.38) + 0.40×proc(0.733[fin=0.85,mean=0.56]) + 0.10×fmt(1.000) | pred='11' gold='59' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:16:00,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.719[fin=0.82,mean=0.57]) + 0.10×fmt(1.000) | pred='14' gold='59' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+
Iter 10 GRPO groups: 95%|#########5| 19/20 [03:08<00:09, 9.73s/q, loss=-0.0005, mean_r=0.586, skip=6]
Iter 10 GRPO groups: 100%|##########| 20/20 [03:08<00:00, 9.71s/q, loss=-0.0005, mean_r=0.586, skip=6]
Iter 10 GRPO groups: 100%|##########| 20/20 [03:08<00:00, 9.45s/q, loss=-0.0005, mean_r=0.586, skip=6]
+2026-04-26 04:16:01,527 INFO __main__ - Iter 10 | loss=0.0004 | reward mean=0.881 std=0.176 | gt_match=68.3% | grounded_acc=95.5% | step_acc=84.7% | lccp=72.2% | batch_acc=95.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.98e-06 | 189.0s
+2026-04-26 04:16:01,528 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:25, 2.18s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:21, 3.79s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:57, 3.25s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:11<06:55, 2.84s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:49, 2.41s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:18<08:13, 3.42s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:09, 3.43s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:22, 3.12s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:27, 3.17s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:42, 3.73s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:04, 3.48s/q, correct=8/11, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:38<07:05, 3.08s/q, correct=9/12, lccp=84.4%, score=0.924, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:50, 3.00s/q, correct=10/13, lccp=85.6%, score=0.925, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:45<07:41, 3.39s/q, correct=11/14, lccp=86.7%, score=0.930, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:47<07:03, 3.13s/q, correct=12/15, lccp=87.6%, score=0.935, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:50<06:33, 2.94s/q, correct=12/16, lccp=88.3%, score=0.911, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:54<07:11, 3.24s/q, correct=13/17, lccp=89.0%, score=0.916, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<08:48, 4.01s/q, correct=13/18, lccp=84.8%, score=0.904, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:49, 3.59s/q, correct=14/19, lccp=85.6%, score=0.909, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:06<07:52, 3.63s/q, correct=15/20, lccp=86.3%, score=0.914, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:09<07:08, 3.32s/q, correct=16/21, lccp=86.9%, score=0.918, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:14<08:20, 3.91s/q, correct=17/22, lccp=84.9%, score=0.919, step_acc=91.5%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:57, 3.76s/q, correct=17/23, lccp=82.3%, score=0.902, step_acc=88.6%]
GSM8K eval: 16%|#6 | 24/150 [01:20<07:08, 3.40s/q, correct=17/24, lccp=79.9%, score=0.886, step_acc=86.0%]
GSM8K eval: 17%|#6 | 25/150 [01:23<06:41, 3.21s/q, correct=17/25, lccp=77.7%, score=0.882, step_acc=85.6%]
GSM8K eval: 17%|#7 | 26/150 [01:27<07:19, 3.55s/q, correct=18/26, lccp=78.6%, score=0.887, step_acc=86.1%]
GSM8K eval: 18%|#8 | 27/150 [01:30<06:49, 3.33s/q, correct=18/27, lccp=79.4%, score=0.882, step_acc=86.6%]
GSM8K eval: 19%|#8 | 28/150 [01:32<06:03, 2.98s/q, correct=19/28, lccp=80.1%, score=0.887, step_acc=87.1%]
GSM8K eval: 19%|#9 | 29/150 [01:35<05:53, 2.92s/q, correct=20/29, lccp=80.8%, score=0.890, step_acc=87.5%]
GSM8K eval: 20%|## | 30/150 [01:38<06:22, 3.19s/q, correct=21/30, lccp=81.5%, score=0.894, step_acc=88.0%]
GSM8K eval: 21%|## | 31/150 [01:41<05:56, 3.00s/q, correct=22/31, lccp=82.1%, score=0.897, step_acc=88.3%]
GSM8K eval: 21%|##1 | 32/150 [01:43<05:08, 2.61s/q, correct=23/32, lccp=82.6%, score=0.899, step_acc=88.7%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:11, 2.66s/q, correct=24/33, lccp=83.1%, score=0.903, step_acc=89.1%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:45, 2.46s/q, correct=25/34, lccp=83.6%, score=0.905, step_acc=89.4%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:45, 2.49s/q, correct=26/35, lccp=84.1%, score=0.908, step_acc=89.7%]
GSM8K eval: 24%|##4 | 36/150 [01:53<05:16, 2.77s/q, correct=27/36, lccp=84.5%, score=0.911, step_acc=90.0%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:47, 2.54s/q, correct=28/37, lccp=85.0%, score=0.912, step_acc=90.2%]
GSM8K eval: 25%|##5 | 38/150 [01:59<05:00, 2.69s/q, correct=29/38, lccp=85.4%, score=0.914, step_acc=90.5%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:14, 3.37s/q, correct=30/39, lccp=85.7%, score=0.916, step_acc=90.7%]
GSM8K eval: 27%|##6 | 40/150 [02:10<07:45, 4.23s/q, correct=31/40, lccp=86.1%, score=0.919, step_acc=91.0%]
GSM8K eval: 27%|##7 | 41/150 [02:13<07:01, 3.87s/q, correct=31/41, lccp=86.4%, score=0.918, step_acc=91.2%]
GSM8K eval: 28%|##8 | 42/150 [02:18<07:40, 4.27s/q, correct=32/42, lccp=85.2%, score=0.920, step_acc=91.0%]
GSM8K eval: 29%|##8 | 43/150 [02:20<06:20, 3.56s/q, correct=33/43, lccp=85.5%, score=0.922, step_acc=91.2%]
GSM8K eval: 29%|##9 | 44/150 [02:26<07:47, 4.41s/q, correct=34/44, lccp=85.8%, score=0.923, step_acc=91.4%]
GSM8K eval: 30%|### | 45/150 [02:29<07:01, 4.01s/q, correct=35/45, lccp=86.2%, score=0.925, step_acc=91.6%]
GSM8K eval: 31%|### | 46/150 [02:34<07:21, 4.25s/q, correct=35/46, lccp=84.3%, score=0.920, step_acc=91.5%]
GSM8K eval: 31%|###1 | 47/150 [02:37<06:38, 3.87s/q, correct=36/47, lccp=84.6%, score=0.922, step_acc=91.7%]
GSM8K eval: 32%|###2 | 48/150 [02:39<05:30, 3.24s/q, correct=37/48, lccp=84.9%, score=0.923, step_acc=91.9%]
GSM8K eval: 33%|###2 | 49/150 [02:46<07:11, 4.27s/q, correct=37/49, lccp=83.8%, score=0.910, step_acc=90.6%]
GSM8K eval: 33%|###3 | 50/150 [02:49<06:32, 3.93s/q, correct=37/50, lccp=83.1%, score=0.901, step_acc=89.8%]
GSM8K eval: 34%|###4 | 51/150 [02:50<05:14, 3.18s/q, correct=38/51, lccp=83.4%, score=0.903, step_acc=90.0%]
GSM8K eval: 35%|###4 | 52/150 [02:54<05:41, 3.48s/q, correct=38/52, lccp=81.8%, score=0.903, step_acc=89.9%]
GSM8K eval: 35%|###5 | 53/150 [03:00<06:39, 4.12s/q, correct=39/53, lccp=82.2%, score=0.905, step_acc=90.1%]
GSM8K eval: 36%|###6 | 54/150 [03:02<05:44, 3.58s/q, correct=40/54, lccp=82.5%, score=0.907, step_acc=90.2%]
GSM8K eval: 37%|###6 | 55/150 [03:06<05:39, 3.57s/q, correct=41/55, lccp=82.8%, score=0.908, step_acc=90.4%]
GSM8K eval: 37%|###7 | 56/150 [03:09<05:35, 3.57s/q, correct=42/56, lccp=83.1%, score=0.910, step_acc=90.6%]
GSM8K eval: 38%|###8 | 57/150 [03:12<04:55, 3.18s/q, correct=43/57, lccp=83.4%, score=0.911, step_acc=90.8%]
GSM8K eval: 39%|###8 | 58/150 [03:16<05:16, 3.44s/q, correct=44/58, lccp=83.7%, score=0.913, step_acc=90.9%]
GSM8K eval: 39%|###9 | 59/150 [03:19<05:17, 3.49s/q, correct=44/59, lccp=82.3%, score=0.904, step_acc=89.7%]
GSM8K eval: 40%|#### | 60/150 [03:24<05:52, 3.91s/q, correct=45/60, lccp=82.6%, score=0.906, step_acc=89.9%]
GSM8K eval: 41%|#### | 61/150 [03:27<05:28, 3.69s/q, correct=46/61, lccp=82.9%, score=0.908, step_acc=90.0%]
GSM8K eval: 41%|####1 | 62/150 [03:30<05:09, 3.51s/q, correct=47/62, lccp=83.2%, score=0.909, step_acc=90.2%]
GSM8K eval: 42%|####2 | 63/150 [03:34<05:00, 3.46s/q, correct=47/63, lccp=82.9%, score=0.903, step_acc=89.8%]
GSM8K eval: 43%|####2 | 64/150 [03:37<04:40, 3.27s/q, correct=48/64, lccp=83.2%, score=0.905, step_acc=90.0%]
GSM8K eval: 43%|####3 | 65/150 [03:39<04:24, 3.11s/q, correct=49/65, lccp=83.4%, score=0.906, step_acc=90.1%]
GSM8K eval: 44%|####4 | 66/150 [03:41<03:39, 2.62s/q, correct=50/66, lccp=83.7%, score=0.907, step_acc=90.3%]
GSM8K eval: 45%|####4 | 67/150 [03:43<03:29, 2.52s/q, correct=51/67, lccp=83.9%, score=0.909, step_acc=90.4%]
GSM8K eval: 45%|####5 | 68/150 [03:46<03:30, 2.56s/q, correct=52/68, lccp=84.2%, score=0.910, step_acc=90.6%]
GSM8K eval: 46%|####6 | 69/150 [03:47<03:02, 2.25s/q, correct=53/69, lccp=84.4%, score=0.911, step_acc=90.7%]
GSM8K eval: 47%|####6 | 70/150 [03:50<03:18, 2.48s/q, correct=54/70, lccp=83.2%, score=0.912, step_acc=90.6%]
GSM8K eval: 47%|####7 | 71/150 [03:53<03:31, 2.68s/q, correct=55/71, lccp=82.0%, score=0.913, step_acc=90.4%]
GSM8K eval: 48%|####8 | 72/150 [03:55<03:01, 2.33s/q, correct=56/72, lccp=82.3%, score=0.914, step_acc=90.6%]
GSM8K eval: 49%|####8 | 73/150 [03:57<02:44, 2.13s/q, correct=57/73, lccp=82.5%, score=0.915, step_acc=90.7%]
GSM8K eval: 49%|####9 | 74/150 [04:00<03:13, 2.55s/q, correct=58/74, lccp=82.7%, score=0.917, step_acc=90.8%]
GSM8K eval: 50%|##### | 75/150 [04:02<02:52, 2.31s/q, correct=59/75, lccp=83.0%, score=0.918, step_acc=90.9%]
GSM8K eval: 51%|##### | 76/150 [04:08<04:25, 3.59s/q, correct=59/76, lccp=83.0%, score=0.913, step_acc=90.9%]
GSM8K eval: 51%|#####1 | 77/150 [04:12<04:29, 3.70s/q, correct=60/77, lccp=83.2%, score=0.914, step_acc=91.0%]
GSM8K eval: 52%|#####2 | 78/150 [04:15<03:58, 3.31s/q, correct=61/78, lccp=83.5%, score=0.915, step_acc=91.1%]
GSM8K eval: 53%|#####2 | 79/150 [04:18<03:49, 3.23s/q, correct=61/79, lccp=83.0%, score=0.910, step_acc=90.8%]
GSM8K eval: 53%|#####3 | 80/150 [04:21<03:41, 3.16s/q, correct=62/80, lccp=83.2%, score=0.911, step_acc=90.9%]
GSM8K eval: 54%|#####4 | 81/150 [04:23<03:22, 2.94s/q, correct=63/81, lccp=83.5%, score=0.912, step_acc=91.0%]
GSM8K eval: 55%|#####4 | 82/150 [04:26<03:20, 2.94s/q, correct=64/82, lccp=83.7%, score=0.913, step_acc=91.1%]
GSM8K eval: 55%|#####5 | 83/150 [04:29<03:14, 2.91s/q, correct=65/83, lccp=83.9%, score=0.914, step_acc=91.3%]
GSM8K eval: 56%|#####6 | 84/150 [04:32<03:07, 2.84s/q, correct=66/84, lccp=84.0%, score=0.915, step_acc=91.4%]
GSM8K eval: 57%|#####6 | 85/150 [04:36<03:23, 3.14s/q, correct=67/85, lccp=84.2%, score=0.916, step_acc=91.5%]
GSM8K eval: 57%|#####7 | 86/150 [04:39<03:27, 3.24s/q, correct=68/86, lccp=84.4%, score=0.917, step_acc=91.6%]
GSM8K eval: 58%|#####8 | 87/150 [04:45<04:08, 3.95s/q, correct=69/87, lccp=84.6%, score=0.918, step_acc=91.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:47<03:25, 3.32s/q, correct=70/88, lccp=84.8%, score=0.919, step_acc=91.7%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:11, 3.13s/q, correct=71/89, lccp=84.9%, score=0.920, step_acc=91.8%]
GSM8K eval: 60%|###### | 90/150 [04:52<02:54, 2.91s/q, correct=72/90, lccp=85.1%, score=0.921, step_acc=91.9%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:17, 3.35s/q, correct=73/91, lccp=85.3%, score=0.922, step_acc=92.0%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:09, 3.27s/q, correct=74/92, lccp=85.4%, score=0.922, step_acc=92.1%]
GSM8K eval: 62%|######2 | 93/150 [05:07<04:17, 4.52s/q, correct=75/93, lccp=85.6%, score=0.923, step_acc=92.2%]
GSM8K eval: 63%|######2 | 94/150 [05:38<11:51, 12.71s/q, correct=75/94, lccp=84.7%, score=0.916, step_acc=92.2%]
GSM8K eval: 63%|######3 | 95/150 [05:43<09:27, 10.31s/q, correct=76/95, lccp=83.8%, score=0.916, step_acc=91.6%]
GSM8K eval: 64%|######4 | 96/150 [05:48<07:48, 8.67s/q, correct=77/96, lccp=84.0%, score=0.917, step_acc=91.7%]
GSM8K eval: 65%|######4 | 97/150 [05:51<06:03, 6.86s/q, correct=77/97, lccp=83.3%, score=0.916, step_acc=91.3%]
GSM8K eval: 65%|######5 | 98/150 [05:55<05:14, 6.04s/q, correct=77/98, lccp=82.9%, score=0.912, step_acc=91.1%]
GSM8K eval: 66%|######6 | 99/150 [05:57<04:11, 4.94s/q, correct=78/99, lccp=83.1%, score=0.913, step_acc=91.2%]
GSM8K eval: 67%|######6 | 100/150 [05:59<03:21, 4.02s/q, correct=79/100, lccp=82.3%, score=0.913, step_acc=90.9%]
GSM8K eval: 67%|######7 | 101/150 [06:02<03:01, 3.70s/q, correct=79/101, lccp=82.0%, score=0.909, step_acc=90.8%]
GSM8K eval: 68%|######8 | 102/150 [06:03<02:25, 3.03s/q, correct=80/102, lccp=82.1%, score=0.910, step_acc=90.8%]
GSM8K eval: 69%|######8 | 103/150 [06:05<02:07, 2.72s/q, correct=81/103, lccp=82.3%, score=0.911, step_acc=90.9%]
GSM8K eval: 69%|######9 | 104/150 [06:10<02:31, 3.30s/q, correct=82/104, lccp=82.5%, score=0.912, step_acc=91.0%]
GSM8K eval: 70%|####### | 105/150 [06:12<02:18, 3.07s/q, correct=83/105, lccp=82.6%, score=0.913, step_acc=91.1%]
GSM8K eval: 71%|####### | 106/150 [06:14<01:54, 2.59s/q, correct=84/106, lccp=82.8%, score=0.913, step_acc=91.2%]
GSM8K eval: 71%|#######1 | 107/150 [06:15<01:37, 2.26s/q, correct=85/107, lccp=83.0%, score=0.914, step_acc=91.3%]
GSM8K eval: 72%|#######2 | 108/150 [06:18<01:40, 2.39s/q, correct=86/108, lccp=83.1%, score=0.915, step_acc=91.3%]
GSM8K eval: 73%|#######2 | 109/150 [06:23<02:10, 3.17s/q, correct=86/109, lccp=82.7%, score=0.914, step_acc=91.3%]
GSM8K eval: 73%|#######3 | 110/150 [06:25<01:55, 2.88s/q, correct=87/110, lccp=82.8%, score=0.914, step_acc=91.4%]
GSM8K eval: 74%|#######4 | 111/150 [06:27<01:38, 2.52s/q, correct=88/111, lccp=83.0%, score=0.915, step_acc=91.4%]
GSM8K eval: 75%|#######4 | 112/150 [06:32<02:06, 3.32s/q, correct=88/112, lccp=83.1%, score=0.915, step_acc=91.5%]
GSM8K eval: 75%|#######5 | 113/150 [06:34<01:45, 2.85s/q, correct=89/113, lccp=83.3%, score=0.915, step_acc=91.6%]
GSM8K eval: 76%|#######6 | 114/150 [06:39<02:08, 3.58s/q, correct=90/114, lccp=82.8%, score=0.916, step_acc=91.5%]
GSM8K eval: 77%|#######6 | 115/150 [06:42<01:57, 3.35s/q, correct=91/115, lccp=83.0%, score=0.917, step_acc=91.6%]
GSM8K eval: 77%|#######7 | 116/150 [06:45<01:48, 3.19s/q, correct=92/116, lccp=83.1%, score=0.917, step_acc=91.7%]
GSM8K eval: 78%|#######8 | 117/150 [06:51<02:10, 3.96s/q, correct=93/117, lccp=83.2%, score=0.918, step_acc=91.8%]
GSM8K eval: 79%|#######8 | 118/150 [06:55<02:11, 4.10s/q, correct=93/118, lccp=82.5%, score=0.916, step_acc=91.7%]
GSM8K eval: 79%|#######9 | 119/150 [06:59<02:01, 3.92s/q, correct=93/119, lccp=82.7%, score=0.914, step_acc=91.7%]
GSM8K eval: 80%|######## | 120/150 [07:01<01:47, 3.59s/q, correct=94/120, lccp=82.8%, score=0.915, step_acc=91.8%]
GSM8K eval: 81%|######## | 121/150 [07:04<01:38, 3.41s/q, correct=95/121, lccp=83.0%, score=0.916, step_acc=91.9%]
GSM8K eval: 81%|########1 | 122/150 [07:07<01:32, 3.30s/q, correct=96/122, lccp=83.1%, score=0.916, step_acc=92.0%]
GSM8K eval: 82%|########2 | 123/150 [07:11<01:28, 3.29s/q, correct=96/123, lccp=82.8%, score=0.916, step_acc=91.9%]
GSM8K eval: 83%|########2 | 124/150 [07:13<01:17, 2.96s/q, correct=97/124, lccp=82.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 83%|########3 | 125/150 [07:15<01:07, 2.68s/q, correct=98/125, lccp=83.0%, score=0.917, step_acc=92.0%]
GSM8K eval: 84%|########4 | 126/150 [07:18<01:04, 2.70s/q, correct=99/126, lccp=83.2%, score=0.918, step_acc=92.0%]
GSM8K eval: 85%|########4 | 127/150 [07:22<01:14, 3.22s/q, correct=100/127, lccp=83.3%, score=0.919, step_acc=92.1%]
GSM8K eval: 85%|########5 | 128/150 [07:25<01:08, 3.13s/q, correct=101/128, lccp=83.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 86%|########6 | 129/150 [07:29<01:08, 3.26s/q, correct=102/129, lccp=83.6%, score=0.920, step_acc=92.2%]
GSM8K eval: 87%|########6 | 130/150 [07:30<00:56, 2.84s/q, correct=103/130, lccp=83.7%, score=0.920, step_acc=92.3%]
GSM8K eval: 87%|########7 | 131/150 [07:35<01:03, 3.37s/q, correct=104/131, lccp=83.8%, score=0.921, step_acc=92.4%]
GSM8K eval: 88%|########8 | 132/150 [07:37<00:51, 2.84s/q, correct=105/132, lccp=83.9%, score=0.922, step_acc=92.4%]
GSM8K eval: 89%|########8 | 133/150 [07:39<00:48, 2.84s/q, correct=106/133, lccp=84.1%, score=0.922, step_acc=92.5%]
GSM8K eval: 89%|########9 | 134/150 [07:44<00:52, 3.29s/q, correct=107/134, lccp=84.2%, score=0.923, step_acc=92.5%]
GSM8K eval: 90%|######### | 135/150 [07:47<00:47, 3.15s/q, correct=108/135, lccp=84.3%, score=0.923, step_acc=92.6%]
GSM8K eval: 91%|######### | 136/150 [07:51<00:49, 3.51s/q, correct=108/136, lccp=83.9%, score=0.922, step_acc=92.4%]
GSM8K eval: 91%|#########1| 137/150 [07:58<00:57, 4.45s/q, correct=109/137, lccp=84.0%, score=0.923, step_acc=92.4%]
GSM8K eval: 92%|#########2| 138/150 [08:02<00:51, 4.28s/q, correct=110/138, lccp=84.2%, score=0.923, step_acc=92.5%]
GSM8K eval: 93%|#########2| 139/150 [08:05<00:43, 4.00s/q, correct=111/139, lccp=84.3%, score=0.924, step_acc=92.6%]
GSM8K eval: 93%|#########3| 140/150 [08:09<00:40, 4.04s/q, correct=111/140, lccp=84.1%, score=0.920, step_acc=92.4%]
GSM8K eval: 94%|#########3| 141/150 [08:13<00:35, 3.95s/q, correct=112/141, lccp=84.3%, score=0.921, step_acc=92.4%]
GSM8K eval: 95%|#########4| 142/150 [08:16<00:30, 3.76s/q, correct=113/142, lccp=84.4%, score=0.922, step_acc=92.5%]
GSM8K eval: 95%|#########5| 143/150 [08:18<00:23, 3.29s/q, correct=114/143, lccp=84.5%, score=0.922, step_acc=92.5%]
GSM8K eval: 96%|#########6| 144/150 [08:21<00:17, 3.00s/q, correct=115/144, lccp=84.6%, score=0.923, step_acc=92.6%]
GSM8K eval: 97%|#########6| 145/150 [08:24<00:15, 3.04s/q, correct=115/145, lccp=84.0%, score=0.919, step_acc=92.1%]
GSM8K eval: 97%|#########7| 146/150 [08:27<00:12, 3.01s/q, correct=116/146, lccp=84.1%, score=0.920, step_acc=92.1%]
GSM8K eval: 98%|#########8| 147/150 [08:30<00:09, 3.22s/q, correct=117/147, lccp=84.2%, score=0.920, step_acc=92.2%]
GSM8K eval: 99%|#########8| 148/150 [08:34<00:06, 3.40s/q, correct=118/148, lccp=84.3%, score=0.921, step_acc=92.2%]
GSM8K eval: 99%|#########9| 149/150 [08:38<00:03, 3.44s/q, correct=119/149, lccp=84.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 100%|##########| 150/150 [08:43<00:00, 3.86s/q, correct=119/150, lccp=84.3%, score=0.920, step_acc=92.1%]
GSM8K eval: 100%|##########| 150/150 [08:43<00:00, 3.49s/q, correct=119/150, lccp=84.3%, score=0.920, step_acc=92.1%]
+2026-04-26 04:24:44,562 INFO __main__ - Training Score [iter 10]: 0.9199 (best=0.9192) | n=150
+2026-04-26 04:24:44,563 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(0.998)
+2026-04-26 04:24:44,563 INFO __main__ - Process score : prm_mean=0.907 prm_final=0.941 → weighted=0.927
+2026-04-26 04:24:44,563 INFO __main__ - Step accuracy : 92.0% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 04:24:44,563 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 04:24:44,563 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 04:24:47,497 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9199 > 0.9192)
+2026-04-26 04:24:49,713 INFO __main__ - ======================================================================
+2026-04-26 04:24:49,713 INFO __main__ - GRPO ITERATION 11/60
+2026-04-26 04:24:49,713 INFO __main__ - ======================================================================
+2026-04-26 04:24:49,734 INFO __main__ - LR this iteration: 4.98e-06 | T=0.732 | MATH ratio=30%
+
Iter 11 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:24:54,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:54,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:54,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:54,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:55,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:55,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:55,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:55,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:55,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:55,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 0%| | 0/20 [00:05, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 11 GRPO groups: 5%|5 | 1/20 [00:05<01:48, 5.69s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:24:58,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:58,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:58,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.34(prox=0.34) + 0.40×proc(0.277[fin=0.10,mean=0.54]) + 0.10×fmt(1.000) | pred='1.05' gold='21' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:24:59,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:59,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.926 = 0.50×1.00(exact) + 0.40×proc(0.815[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:24:59,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 5%|5 | 1/20 [00:11<01:48, 5.69s/q, loss=-0.0009, mean_r=0.935, skip=1]
Iter 11 GRPO groups: 10%|# | 2/20 [00:11<01:42, 5.71s/q, loss=-0.0009, mean_r=0.935, skip=1]2026-04-26 04:25:06,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:07,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 10%|# | 2/20 [00:17<01:42, 5.71s/q, loss=0var, mean_r=0.999, skip=2]
Iter 11 GRPO groups: 15%|#5 | 3/20 [00:17<01:40, 5.89s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:25:16,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:16,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:25:16,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.85(prox=0.85) + 0.40×proc(0.666[fin=0.61,mean=0.75]) + 0.10×fmt(1.000) | pred='444' gold='445' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:17,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.97,mean=0.73]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:25:17,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:17,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:25:17,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:17,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:25:17,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:17,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 11 GRPO groups: 15%|#5 | 3/20 [00:29<01:40, 5.89s/q, loss=0.0002, mean_r=0.968, skip=2]
Iter 11 GRPO groups: 20%|## | 4/20 [00:29<02:12, 8.27s/q, loss=0.0002, mean_r=0.968, skip=2]2026-04-26 04:25:27,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.620 = 0.50×0.33(prox=0.33) + 0.40×proc(0.884[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='0' gold='10000' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:25:27,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:27,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='$10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:27,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:27,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:25:28,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,238 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 11 GRPO groups: 20%|## | 4/20 [00:40<02:12, 8.27s/q, loss=-0.0001, mean_r=0.958, skip=2]
Iter 11 GRPO groups: 25%|##5 | 5/20 [00:40<02:17, 9.15s/q, loss=-0.0001, mean_r=0.958, skip=2]2026-04-26 04:25:38,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$32.00' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.08(prox=0.08) + 0.40×proc(0.792[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='212.8' gold='32' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:25:38,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:39,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 25%|##5 | 5/20 [00:50<02:17, 9.15s/q, loss=-0.0008, mean_r=0.945, skip=2]
Iter 11 GRPO groups: 30%|### | 6/20 [00:50<02:15, 9.65s/q, loss=-0.0008, mean_r=0.945, skip=2]2026-04-26 04:25:44,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:44,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,505 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 30%|### | 6/20 [00:55<02:15, 9.65s/q, loss=0var, mean_r=0.993, skip=3]
Iter 11 GRPO groups: 35%|###5 | 7/20 [00:55<01:46, 8.18s/q, loss=0var, mean_r=0.993, skip=3]2026-04-26 04:25:48,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.787 = 0.50×0.85(prox=0.85) + 0.40×proc(0.655[fin=0.83,mean=0.39]) + 0.10×fmt(1.000) | pred='-6.67' gold='-7' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:25:49,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.396 = 0.50×0.00(prox=0.00) + 0.40×proc(0.741[fin=0.92,mean=0.47]) + 0.10×fmt(1.000) | pred='-6 2/3' gold='-7' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:25:49,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 11 GRPO groups: 35%|###5 | 7/20 [01:01<01:46, 8.18s/q, loss=-0.0023, mean_r=0.911, skip=3]
Iter 11 GRPO groups: 40%|#### | 8/20 [01:01<01:26, 7.24s/q, loss=-0.0023, mean_r=0.911, skip=3]2026-04-26 04:25:53,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 40%|#### | 8/20 [01:04<01:26, 7.24s/q, loss=0var, mean_r=0.998, skip=4]
Iter 11 GRPO groups: 45%|####5 | 9/20 [01:04<01:04, 5.90s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 04:26:01,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:01,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:26:01,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,155 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:26:02,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 11 GRPO groups: 45%|####5 | 9/20 [01:12<01:04, 5.90s/q, loss=0var, mean_r=0.998, skip=5]
Iter 11 GRPO groups: 50%|##### | 10/20 [01:12<01:07, 6.78s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:26:07,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,317 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:07,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=0.96,mean=0.94]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:07,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:07,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:07,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:07,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:07,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 50%|##### | 10/20 [01:18<01:07, 6.78s/q, loss=0var, mean_r=0.991, skip=6]
Iter 11 GRPO groups: 55%|#####5 | 11/20 [01:18<00:57, 6.35s/q, loss=0var, mean_r=0.991, skip=6]2026-04-26 04:26:12,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 55%|#####5 | 11/20 [01:23<00:57, 6.35s/q, loss=0var, mean_r=0.999, skip=7]
Iter 11 GRPO groups: 60%|###### | 12/20 [01:23<00:48, 6.03s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 04:26:20,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:20,614 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=0.97,mean=0.92]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:20,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.93,mean=0.96]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:21,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.687 = 0.50×0.43(prox=0.43) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='18' gold='54' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:26:21,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=0.98,mean=0.91]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:26:21,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:21,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 60%|###### | 12/20 [01:33<00:48, 6.03s/q, loss=-0.0003, mean_r=0.957, skip=7]
Iter 11 GRPO groups: 65%|######5 | 13/20 [01:33<00:49, 7.09s/q, loss=-0.0003, mean_r=0.957, skip=7]2026-04-26 04:26:27,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:27,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:27,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:27,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:28,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:28,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 65%|######5 | 13/20 [01:38<00:49, 7.09s/q, loss=0var, mean_r=0.995, skip=8]
Iter 11 GRPO groups: 70%|####### | 14/20 [01:38<00:39, 6.55s/q, loss=0var, mean_r=0.995, skip=8]2026-04-26 04:26:35,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.518 = 0.50×0.22(prox=0.22) + 0.40×proc(0.586[fin=0.57,mean=0.60]) + 0.10×fmt(1.000) | pred='16.75' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.71(prox=0.71) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:35,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:35,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:35,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.807 = 0.50×0.71(prox=0.71) + 0.40×proc(0.886[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.802 = 0.50×0.71(prox=0.71) + 0.40×proc(0.872[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.850[fin=0.94,mean=0.72]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:26:35,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.808 = 0.50×0.71(prox=0.71) + 0.40×proc(0.889[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.50(prox=0.50) + 0.40×proc(0.443[fin=0.38,mean=0.53]) + 0.10×fmt(1.000) | pred='9' gold='6' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 11 GRPO groups: 70%|####### | 14/20 [01:47<00:39, 6.55s/q, loss=-0.0002, mean_r=0.819, skip=8]
Iter 11 GRPO groups: 75%|#######5 | 15/20 [01:47<00:36, 7.35s/q, loss=-0.0002, mean_r=0.819, skip=8]2026-04-26 04:26:42,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:42,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:42,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:26:42,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:42,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:42,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 11 GRPO groups: 75%|#######5 | 15/20 [01:53<00:36, 7.35s/q, loss=0var, mean_r=1.000, skip=9]
Iter 11 GRPO groups: 80%|######## | 16/20 [01:53<00:27, 6.81s/q, loss=0var, mean_r=1.000, skip=9]2026-04-26 04:26:48,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.877[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:26:48,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 80%|######## | 16/20 [01:59<00:27, 6.81s/q, loss=0var, mean_r=0.991, skip=10]
Iter 11 GRPO groups: 85%|########5 | 17/20 [01:59<00:19, 6.56s/q, loss=0var, mean_r=0.991, skip=10]2026-04-26 04:26:54,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:54,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:54,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:54,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:55,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:55,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 85%|########5 | 17/20 [02:05<00:19, 6.56s/q, loss=0var, mean_r=0.992, skip=11]
Iter 11 GRPO groups: 90%|######### | 18/20 [02:05<00:12, 6.48s/q, loss=0var, mean_r=0.992, skip=11]2026-04-26 04:27:00,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:00,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 90%|######### | 18/20 [02:11<00:12, 6.48s/q, loss=0var, mean_r=0.996, skip=12]
Iter 11 GRPO groups: 95%|#########5| 19/20 [02:11<00:06, 6.28s/q, loss=0var, mean_r=0.996, skip=12]2026-04-26 04:27:09,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:27:09,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.784[fin=0.92,mean=0.58]) + 0.10×fmt(1.000) | pred='81' gold='486' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:27:09,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:09,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:10,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:10,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:10,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 11 GRPO groups: 95%|#########5| 19/20 [02:21<00:06, 6.28s/q, loss=-0.0002, mean_r=0.953, skip=12]
Iter 11 GRPO groups: 100%|##########| 20/20 [02:21<00:00, 7.61s/q, loss=-0.0002, mean_r=0.953, skip=12]
Iter 11 GRPO groups: 100%|##########| 20/20 [02:21<00:00, 7.10s/q, loss=-0.0002, mean_r=0.953, skip=12]
+2026-04-26 04:27:11,680 INFO __main__ - Iter 11 | loss=-0.0006 | reward mean=0.970 std=0.098 | gt_match=93.0% | grounded_acc=98.5% | step_acc=96.6% | lccp=92.2% | batch_acc=98.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=8 skipped=12(0var=12) | lr=4.96e-06 | 142.0s
+2026-04-26 04:27:11,680 WARNING __main__ - STARVATION: 60% of groups skipped (zero variance). grounded_acc=98.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:27:11,681 INFO __main__ - ======================================================================
+2026-04-26 04:27:11,681 INFO __main__ - GRPO ITERATION 12/60
+2026-04-26 04:27:11,681 INFO __main__ - ======================================================================
+2026-04-26 04:27:11,701 INFO __main__ - LR this iteration: 4.96e-06 | T=0.725 | MATH ratio=30%
+
Iter 12 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:27:14,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 0%| | 0/20 [00:03, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 12 GRPO groups: 5%|5 | 1/20 [00:03<01:15, 3.99s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:27:21,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,460 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,543 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,627 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:27:21,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:27:21,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=0.94,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:22,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:22,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 5%|5 | 1/20 [00:10<01:15, 3.99s/q, loss=0var, mean_r=0.983, skip=2]
Iter 12 GRPO groups: 10%|# | 2/20 [00:10<01:37, 5.43s/q, loss=0var, mean_r=0.983, skip=2]2026-04-26 04:27:27,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.614 = 0.50×0.44(prox=0.44) + 0.40×proc(0.737[fin=0.92,mean=0.47]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:27:27,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.932[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:27,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.918[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:27:27,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.952[fin=0.98,mean=0.90]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:27,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.876[fin=0.96,mean=0.74]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:27:28,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.823[fin=0.97,mean=0.60]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:27:28,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.896[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:27:28,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:28,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.940[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:28,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×0.85(prox=0.85) + 0.40×proc(0.836[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='3120000' gold='2880000' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 12 GRPO groups: 10%|# | 2/20 [00:18<01:37, 5.43s/q, loss=0.0017, mean_r=0.587, skip=2]
Iter 12 GRPO groups: 15%|#5 | 3/20 [00:18<01:49, 6.47s/q, loss=0.0017, mean_r=0.587, skip=2]2026-04-26 04:27:37,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.515 = 0.50×0.02(prox=0.02) + 0.40×proc(0.851[fin=0.89,mean=0.79]) + 0.10×fmt(1.000) | pred='900' gold='30' | step_acc=89% lccp=44% (chain=4/9 ok_count=8) n_steps=9
+2026-04-26 04:27:37,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 04:27:37,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:27:37,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:38,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.448 = 0.50×0.02(prox=0.02) + 0.40×proc(0.795[fin=0.89,mean=0.65]) + 0.10×fmt(1.000) | pred='900' gold='30' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 04:27:38,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:38,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:38,252 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:38,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:38,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.440 = 0.50×0.20(prox=0.20) + 0.40×proc(0.411[fin=0.29,mean=0.59]) + 0.10×fmt(1.000) | pred='90' gold='30' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 12 GRPO groups: 15%|#5 | 3/20 [00:28<01:49, 6.47s/q, loss=-0.0004, mean_r=0.837, skip=2]
Iter 12 GRPO groups: 20%|## | 4/20 [00:28<02:05, 7.87s/q, loss=-0.0004, mean_r=0.837, skip=2]2026-04-26 04:27:48,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.85(prox=0.85) + 0.40×proc(0.516[fin=0.44,mean=0.63]) + 0.10×fmt(1.000) | pred='11' gold='12' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:27:48,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:48,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:48,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.90,mean=0.86]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:48,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.94,mean=0.84]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:27:48,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:48,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.75(prox=0.75) + 0.40×proc(0.790[fin=0.87,mean=0.67]) + 0.10×fmt(1.000) | pred='10' gold='12' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:27:49,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:49,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:49,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+
Iter 12 GRPO groups: 20%|## | 4/20 [00:38<02:05, 7.87s/q, loss=0.0006, mean_r=0.937, skip=2]
Iter 12 GRPO groups: 25%|##5 | 5/20 [00:38<02:13, 8.92s/q, loss=0.0006, mean_r=0.937, skip=2]2026-04-26 04:27:53,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 25%|##5 | 5/20 [00:42<02:13, 8.92s/q, loss=0var, mean_r=1.000, skip=3]
Iter 12 GRPO groups: 30%|### | 6/20 [00:42<01:40, 7.15s/q, loss=0var, mean_r=1.000, skip=3]2026-04-26 04:28:05,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.00(prox=0.00) + 0.40×proc(0.909[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='$1175' gold='1100' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 04:28:05,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.490 = 0.50×0.00(prox=0.00) + 0.40×proc(0.834[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='$2400' gold='1100' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:28:05,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.00(prox=0.00) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='$900' gold='1100' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:28:05,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1100' gold='1100' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:28:05,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='1175' gold='1100' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:28:05,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.76(prox=0.76) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='1275' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.814 = 0.50×0.76(prox=0.76) + 0.40×proc(0.836[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='1275' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.13(prox=0.13) + 0.40×proc(0.529[fin=0.46,mean=0.64]) + 0.10×fmt(1.000) | pred='4700' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1100' gold='1100' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:28:05,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.334 = 0.50×0.00(prox=0.00) + 0.40×proc(0.397[fin=0.21,mean=0.67]) + 0.10×fmt(1.000) | pred='$5500' gold='1100' | step_acc=62% lccp=50% (chain=4/8 ok_count=5) n_steps=8
+
Iter 12 GRPO groups: 30%|### | 6/20 [00:55<01:40, 7.15s/q, loss=-0.0003, mean_r=0.688, skip=3]
Iter 12 GRPO groups: 35%|###5 | 7/20 [00:55<01:57, 9.07s/q, loss=-0.0003, mean_r=0.688, skip=3]2026-04-26 04:28:10,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.542 = 0.50×0.60(prox=0.60) + 0.40×proc(0.354[fin=0.44,mean=0.23]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:28:10,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×0.43(prox=0.43) + 0.40×proc(0.826[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:10,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:28:10,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.18(prox=0.18) + 0.40×proc(0.635[fin=0.80,mean=0.39]) + 0.10×fmt(1.000) | pred='60' gold='18' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:28:10,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.47(prox=0.47) + 0.40×proc(0.391[fin=0.52,mean=0.21]) + 0.10×fmt(1.000) | pred='8' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.426 = 0.50×0.27(prox=0.27) + 0.40×proc(0.348[fin=0.29,mean=0.44]) + 0.10×fmt(1.000) | pred='42' gold='18' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.505 = 0.50×0.33(prox=0.33) + 0.40×proc(0.596[fin=0.79,mean=0.31]) + 0.10×fmt(1.000) | pred='0' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.60(prox=0.60) + 0.40×proc(0.148[fin=0.10,mean=0.21]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:28:11,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.60(prox=0.60) + 0.40×proc(0.875[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:28:11,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.14(prox=0.14) + 0.40×proc(0.401[fin=0.42,mean=0.37]) + 0.10×fmt(1.000) | pred='72' gold='18' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+
Iter 12 GRPO groups: 35%|###5 | 7/20 [01:01<01:57, 9.07s/q, loss=0.0018, mean_r=0.557, skip=3]
Iter 12 GRPO groups: 40%|#### | 8/20 [01:01<01:34, 7.91s/q, loss=0.0018, mean_r=0.557, skip=3]2026-04-26 04:28:32,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.683 = 0.50×0.60(prox=0.60) + 0.40×proc(0.707[fin=0.84,mean=0.51]) + 0.10×fmt(1.000) | pred='5.2' gold='7.8000' | step_acc=50% lccp=0% (chain=0/12 ok_count=6) n_steps=12
+2026-04-26 04:28:32,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(0.650) | pred='7.8' gold='7.8000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:28:32,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.316 = 0.50×0.24(prox=0.24) + 0.40×proc(0.143[fin=0.11,mean=0.19]) + 0.10×fmt(1.000) | pred='20.0' gold='7.8000' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:28:32,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.22(prox=0.22) + 0.40×proc(0.800[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='21.7' gold='7.8000' | step_acc=75% lccp=25% (chain=2/8 ok_count=6) n_steps=8
+2026-04-26 04:28:32,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.721 = 0.50×0.60(prox=0.60) + 0.40×proc(0.803[fin=0.98,mean=0.53]) + 0.10×fmt(1.000) | pred='10.4' gold='7.8000' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:28:33,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='7.8' gold='7.8000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:28:33,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='7.8' gold='7.8000' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:28:33,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.43(prox=0.43) + 0.40×proc(0.184[fin=0.11,mean=0.29]) + 0.10×fmt(1.000) | pred='13.0' gold='7.8000' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:28:33,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.43(prox=0.43) + 0.40×proc(0.525[fin=0.68,mean=0.29]) + 0.10×fmt(1.000) | pred='13.0' gold='7.8000' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:33,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.330 = 0.50×0.27(prox=0.27) + 0.40×proc(0.241[fin=0.32,mean=0.13]) + 0.10×fmt(1.000) | pred='18.5' gold='7.8000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+
Iter 12 GRPO groups: 40%|#### | 8/20 [01:23<01:34, 7.91s/q, loss=-0.0002, mean_r=0.646, skip=3]
Iter 12 GRPO groups: 45%|####5 | 9/20 [01:23<02:15, 12.36s/q, loss=-0.0002, mean_r=0.646, skip=3]2026-04-26 04:28:37,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:38,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:38,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 45%|####5 | 9/20 [01:26<02:15, 12.36s/q, loss=0var, mean_r=0.997, skip=4]
Iter 12 GRPO groups: 50%|##### | 10/20 [01:26<01:35, 9.53s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 04:28:53,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.899[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:28:53,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.835[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=0% (chain=0/10 ok_count=5) n_steps=10
+2026-04-26 04:28:53,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.304 = 0.50×0.00(prox=0.00) + 0.40×proc(0.463[fin=0.38,mean=0.59]) + 0.10×fmt(1.000) | pred='3/2' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:28:53,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.869 = 0.50×1.00(exact) + 0.40×proc(0.672[fin=0.78,mean=0.51]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:28:53,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:28:53,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:28:53,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:28:53,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.943 = 0.50×1.00(exact) + 0.40×proc(0.858[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:28:54,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:28:54,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 12 GRPO groups: 50%|##### | 10/20 [01:44<01:35, 9.53s/q, loss=-0.0003, mean_r=0.899, skip=4]
Iter 12 GRPO groups: 55%|#####5 | 11/20 [01:44<01:47, 11.99s/q, loss=-0.0003, mean_r=0.899, skip=4]2026-04-26 04:28:58,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:28:58,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 55%|#####5 | 11/20 [01:47<01:47, 11.99s/q, loss=0var, mean_r=0.996, skip=5]
Iter 12 GRPO groups: 60%|###### | 12/20 [01:47<01:14, 9.30s/q, loss=0var, mean_r=0.996, skip=5]2026-04-26 04:29:08,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,729 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.931 = 0.50×1.00(exact) + 0.40×proc(0.828[fin=0.91,mean=0.70]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:29:08,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:09,082 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 12 GRPO groups: 60%|###### | 12/20 [01:58<01:14, 9.30s/q, loss=-0.0017, mean_r=0.947, skip=5]
Iter 12 GRPO groups: 65%|######5 | 13/20 [01:58<01:10, 10.02s/q, loss=-0.0017, mean_r=0.947, skip=5]2026-04-26 04:29:16,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.20(prox=0.20) + 0.40×proc(0.357[fin=0.16,mean=0.66]) + 0.10×fmt(1.000) | pred='-9' gold='9' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:29:16,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:16,290 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.60(prox=0.60) + 0.40×proc(0.738[fin=0.82,mean=0.62]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.60(prox=0.60) + 0.40×proc(0.773[fin=0.89,mean=0.60]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.60(prox=0.60) + 0.40×proc(0.792[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.734[fin=0.87,mean=0.53]) + 0.10×fmt(1.000) | pred='19.5' gold='9' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 04:29:16,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:29:16,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.860[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='3' gold='9' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:29:16,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:16,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+
Iter 12 GRPO groups: 65%|######5 | 13/20 [02:06<01:10, 10.02s/q, loss=-0.0007, mean_r=0.761, skip=5]
Iter 12 GRPO groups: 70%|####### | 14/20 [02:06<00:56, 9.34s/q, loss=-0.0007, mean_r=0.761, skip=5]2026-04-26 04:29:22,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.307 = 0.50×0.33(prox=0.33) + 0.40×proc(0.101[fin=0.10,mean=0.10]) + 0.10×fmt(1.000) | pred='0' gold='90' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:29:22,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:22,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.273 = 0.50×0.36(prox=0.36) + 0.40×proc(0.069[fin=0.07,mean=0.06]) + 0.10×fmt(0.650) | pred='10' gold='90' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:29:22,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:29:22,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:23,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 12 GRPO groups: 70%|####### | 14/20 [02:12<00:56, 9.34s/q, loss=0.0038, mean_r=0.853, skip=5]
Iter 12 GRPO groups: 75%|#######5 | 15/20 [02:12<00:41, 8.38s/q, loss=0.0038, mean_r=0.853, skip=5]2026-04-26 04:29:42,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.37(prox=0.37) + 0.40×proc(0.189[fin=0.02,mean=0.44]) + 0.10×fmt(1.000) | pred='9' gold='69' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:29:42,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.638 = 0.50×0.83(prox=0.83) + 0.40×proc(0.307[fin=0.17,mean=0.51]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:29:42,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.83(prox=0.83) + 0.40×proc(0.550[fin=0.62,mean=0.44]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:29:42,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:42,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:29:43,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:29:43,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:29:43,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.83(prox=0.83) + 0.40×proc(0.600[fin=0.59,mean=0.61]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:29:43,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:43,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.825 = 0.50×0.85(prox=0.85) + 0.40×proc(0.750[fin=0.85,mean=0.60]) + 0.10×fmt(1.000) | pred='68' gold='69' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 12 GRPO groups: 75%|#######5 | 15/20 [02:33<00:41, 8.38s/q, loss=-0.0016, mean_r=0.830, skip=5]
Iter 12 GRPO groups: 80%|######## | 16/20 [02:33<00:47, 11.99s/q, loss=-0.0016, mean_r=0.830, skip=5]2026-04-26 04:29:47,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 80%|######## | 16/20 [02:36<00:47, 11.99s/q, loss=0var, mean_r=0.994, skip=6]
Iter 12 GRPO groups: 85%|########5 | 17/20 [02:36<00:28, 9.42s/q, loss=0var, mean_r=0.994, skip=6]2026-04-26 04:29:52,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:29:52,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:52,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:52,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.719 = 0.50×0.64(prox=0.64) + 0.40×proc(0.747[fin=0.92,mean=0.48]) + 0.10×fmt(1.000) | pred='64' gold='50' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:29:53,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:53,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:29:53,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 85%|########5 | 17/20 [02:43<00:28, 9.42s/q, loss=-0.0008, mean_r=0.956, skip=6]
Iter 12 GRPO groups: 90%|######### | 18/20 [02:43<00:17, 8.52s/q, loss=-0.0008, mean_r=0.956, skip=6]2026-04-26 04:30:00,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.85(prox=0.85) + 0.40×proc(0.872[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='108' gold='112' | step_acc=75% lccp=12% (chain=1/8 ok_count=6) n_steps=8
+2026-04-26 04:30:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×1.00(exact) + 0.40×proc(0.661[fin=0.82,mean=0.42]) + 0.10×fmt(1.000) | pred='112' gold='112' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:30:01,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.52(prox=0.52) + 0.40×proc(0.197[fin=0.12,mean=0.32]) + 0.10×fmt(1.000) | pred='60' gold='112' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:30:01,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.790 = 0.50×0.61(prox=0.61) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='76' gold='112' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:30:01,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.681 = 0.50×0.49(prox=0.49) + 0.40×proc(0.839[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='54' gold='112' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:30:01,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.842[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='102' gold='112' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:30:01,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.47(prox=0.47) + 0.40×proc(0.196[fin=0.06,mean=0.40]) + 0.10×fmt(1.000) | pred='48' gold='112' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:30:01,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.730[fin=0.70,mean=0.77]) + 0.10×fmt(1.000) | pred='48' gold='112' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:30:01,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.87,mean=0.55]) + 0.10×fmt(1.000) | pred='108' gold='112' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:30:01,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.797 = 0.50×0.78(prox=0.78) + 0.40×proc(0.771[fin=0.98,mean=0.45]) + 0.10×fmt(1.000) | pred='96' gold='112' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+
Iter 12 GRPO groups: 90%|######### | 18/20 [02:51<00:17, 8.52s/q, loss=0.0001, mean_r=0.709, skip=6]
Iter 12 GRPO groups: 95%|#########5| 19/20 [02:51<00:08, 8.49s/q, loss=0.0001, mean_r=0.709, skip=6]2026-04-26 04:30:07,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.912[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='192' gold='96' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:30:07,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.779 = 0.50×0.75(prox=0.75) + 0.40×proc(0.759[fin=0.81,mean=0.68]) + 0.10×fmt(1.000) | pred='80' gold='96' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:30:07,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.55(prox=0.55) + 0.40×proc(0.842[fin=0.97,mean=0.65]) + 0.10×fmt(1.000) | pred='136' gold='96' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:30:07,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:30:07,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.50(prox=0.50) + 0.40×proc(0.709[fin=0.77,mean=0.61]) + 0.10×fmt(1.000) | pred='48' gold='96' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:30:07,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:30:07,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.594 = 0.50×0.50(prox=0.50) + 0.40×proc(0.611[fin=0.75,mean=0.41]) + 0.10×fmt(1.000) | pred='48' gold='96' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:30:08,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.75(prox=0.75) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='112' gold='96' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:30:08,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:30:08,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 95%|#########5| 19/20 [02:57<00:08, 8.49s/q, loss=-0.0008, mean_r=0.809, skip=6]
Iter 12 GRPO groups: 100%|##########| 20/20 [02:57<00:00, 7.89s/q, loss=-0.0008, mean_r=0.809, skip=6]
Iter 12 GRPO groups: 100%|##########| 20/20 [02:57<00:00, 8.90s/q, loss=-0.0008, mean_r=0.809, skip=6]
+2026-04-26 04:30:09,636 INFO __main__ - PHASE → SELFPLAY_RAMP at iter 12 (gt_match=0.65 grounded_acc=0.90 step_acc=0.82) — shadow extraction active; chain scoring deferred until calibration passes (corr≥0.70, success_rate≥0.80)
+2026-04-26 04:30:09,636 INFO __main__ - Iter 12 | loss=0.0001 | reward mean=0.849 std=0.213 | gt_match=65.0% | grounded_acc=90.0% | step_acc=82.1% | lccp=68.7% | batch_acc=90.0% | phase=SELFPLAY_RAMP sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.93e-06 | 178.0s
+2026-04-26 04:30:09,637 INFO __main__ - ======================================================================
+2026-04-26 04:30:09,638 INFO __main__ - GRPO ITERATION 13/60
+2026-04-26 04:30:09,638 INFO __main__ - ======================================================================
+2026-04-26 04:30:09,659 INFO __main__ - LR this iteration: 4.93e-06 | T=0.719 | MATH ratio=30%
+
Iter 13 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:30:16,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:16,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 04:30:30,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.60(prox=0.60) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='96' gold='144' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+
Iter 13 GRPO groups: 0%| | 0/20 [00:46, ?q/s, loss=0.0004, mean_r=0.974, skip=0]
Iter 13 GRPO groups: 5%|5 | 1/20 [00:46<14:40, 46.34s/q, loss=0.0004, mean_r=0.974, skip=0]2026-04-26 04:31:02,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.60(prox=0.60) + 0.40×proc(0.827[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:31:03,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.928[fin=0.95,mean=0.90]) + 0.10×fmt(1.000) | pred='50' gold='25' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:31:03,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:03,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.953[fin=0.98,mean=0.91]) + 0.10×fmt(1.000) | pred='50' gold='25' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:31:17,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.701 = 0.50×0.57(prox=0.57) + 0.40×proc(0.797[fin=0.94,mean=0.59]) + 0.10×fmt(1.000) | pred='15.3846' gold='25' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:31:17,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.863 = 0.50×0.82(prox=0.82) + 0.40×proc(0.884[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='22.22' gold='25' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:31:17,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:31:17,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.734 = 0.50×0.60(prox=0.60) + 0.40×proc(0.834[fin=0.98,mean=0.61]) + 0.10×fmt(1.000) | pred='16.67' gold='25' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:31:25,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.819[fin=0.87,mean=0.74]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=78% lccp=22% (chain=2/9 ok_count=7) n_steps=9
+2026-04-26 04:31:26,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 13 GRPO groups: 5%|5 | 1/20 [01:17<14:40, 46.34s/q, loss=-0.0017, mean_r=0.783, skip=0]
Iter 13 GRPO groups: 10%|# | 2/20 [01:17<11:17, 37.62s/q, loss=-0.0017, mean_r=0.783, skip=0]2026-04-26 04:31:30,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:30,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:31:35,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:31:38,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 10%|# | 2/20 [01:33<11:17, 37.62s/q, loss=0var, mean_r=0.998, skip=1]
Iter 13 GRPO groups: 15%|#5 | 3/20 [01:33<07:50, 27.67s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 04:31:49,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,318 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:14,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:14,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 15%|#5 | 3/20 [02:05<07:50, 27.67s/q, loss=0var, mean_r=1.000, skip=2]
Iter 13 GRPO groups: 20%|## | 4/20 [02:05<07:46, 29.15s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 04:32:21,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.37(prox=0.37) + 0.40×proc(0.258[fin=0.03,mean=0.60]) + 0.10×fmt(1.000) | pred='300' gold='2100' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:32:21,115 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.39(prox=0.39) + 0.40×proc(0.432[fin=0.45,mean=0.40]) + 0.10×fmt(1.000) | pred='455' gold='2100' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:32:35,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,524 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 13 GRPO groups: 20%|## | 4/20 [02:37<07:46, 29.15s/q, loss=-0.0001, mean_r=0.896, skip=2]
Iter 13 GRPO groups: 25%|##5 | 5/20 [02:37<07:36, 30.41s/q, loss=-0.0001, mean_r=0.896, skip=2]2026-04-26 04:32:51,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:51,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×1.00(exact) + 0.40×proc(0.747[fin=0.93,mean=0.48]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:32:51,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:51,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.802 = 0.50×1.00(exact) + 0.40×proc(0.506[fin=0.68,mean=0.25]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:32:54,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:32:54,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:32:54,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:32:54,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:59,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:59,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 25%|##5 | 5/20 [02:51<07:36, 30.41s/q, loss=0.0022, mean_r=0.956, skip=2]
Iter 13 GRPO groups: 30%|### | 6/20 [02:51<05:45, 24.69s/q, loss=0.0022, mean_r=0.956, skip=2]2026-04-26 04:33:03,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:03,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:07,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:08,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.897 = 0.50×1.00(exact) + 0.40×proc(0.829[fin=0.97,mean=0.62]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:33:08,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:08,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:13,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+
Iter 13 GRPO groups: 30%|### | 6/20 [03:08<05:45, 24.69s/q, loss=0.0062, mean_r=0.973, skip=2]
Iter 13 GRPO groups: 35%|###5 | 7/20 [03:08<04:47, 22.14s/q, loss=0.0062, mean_r=0.973, skip=2]2026-04-26 04:33:22,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.353 = 0.50×0.11(prox=0.11) + 0.40×proc(0.279[fin=0.08,mean=0.58]) + 0.10×fmt(1.000) | pred='-21' gold='7' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:33:22,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:33:22,167 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='3' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:22,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:32,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:32,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.794[fin=0.79,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:33:32,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:33:32,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:33:41,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:33:41,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 35%|###5 | 7/20 [03:32<04:47, 22.14s/q, loss=-0.0011, mean_r=0.878, skip=2]
Iter 13 GRPO groups: 40%|#### | 8/20 [03:32<04:35, 22.97s/q, loss=-0.0011, mean_r=0.878, skip=2]2026-04-26 04:33:48,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:33:48,452 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.201 = 0.50×0.17(prox=0.17) + 0.40×proc(0.041[fin=0.05,mean=0.03]) + 0.10×fmt(1.000) | pred='1000' gold='290' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:33:53,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.45(prox=0.45) + 0.40×proc(0.281[fin=0.10,mean=0.55]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:33:53,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.78(prox=0.78) + 0.40×proc(0.670[fin=0.84,mean=0.41]) + 0.10×fmt(1.000) | pred='250' gold='290' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:33:53,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.880 = 0.50×0.85(prox=0.85) + 0.40×proc(0.887[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='310' gold='290' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:33:53,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:34:04,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.405 = 0.50×0.45(prox=0.45) + 0.40×proc(0.205[fin=0.02,mean=0.48]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:34:04,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.854 = 0.50×1.00(exact) + 0.40×proc(0.635[fin=0.82,mean=0.36]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:34:04,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.45(prox=0.45) + 0.40×proc(0.291[fin=0.12,mean=0.54]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:34:04,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.45(prox=0.45) + 0.40×proc(0.378[fin=0.24,mean=0.58]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 13 GRPO groups: 40%|#### | 8/20 [04:03<04:35, 22.97s/q, loss=0.0010, mean_r=0.662, skip=2]
Iter 13 GRPO groups: 45%|####5 | 9/20 [04:03<04:37, 25.26s/q, loss=0.0010, mean_r=0.662, skip=2]2026-04-26 04:34:18,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:34:19,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.674 = 0.50×0.50(prox=0.50) + 0.40×proc(0.811[fin=0.91,mean=0.66]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:34:19,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:19,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:24,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.298 = 0.50×0.09(prox=0.09) + 0.40×proc(0.236[fin=0.09,mean=0.46]) + 0.10×fmt(1.000) | pred='12.5' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:34:33,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:33,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.744[fin=0.91,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 13 GRPO groups: 45%|####5 | 9/20 [04:25<04:37, 25.26s/q, loss=-0.0004, mean_r=0.845, skip=2]
Iter 13 GRPO groups: 50%|##### | 10/20 [04:25<04:03, 24.36s/q, loss=-0.0004, mean_r=0.845, skip=2]2026-04-26 04:34:39,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:39,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,889 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:45,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:56,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:56,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:56,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:56,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 50%|##### | 10/20 [04:54<04:03, 24.36s/q, loss=0var, mean_r=0.999, skip=3]
Iter 13 GRPO groups: 55%|#####5 | 11/20 [04:54<03:51, 25.73s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:35:11,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.788[fin=0.85,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:35:22,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.892 = 0.50×1.00(exact) + 0.40×proc(0.731[fin=0.86,mean=0.54]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:35:22,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:22,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:22,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:33,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.78,mean=0.75]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:35:33,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 13 GRPO groups: 55%|#####5 | 11/20 [05:24<03:51, 25.73s/q, loss=-0.0002, mean_r=0.968, skip=3]
Iter 13 GRPO groups: 60%|###### | 12/20 [05:24<03:37, 27.18s/q, loss=-0.0002, mean_r=0.968, skip=3]2026-04-26 04:35:41,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:41,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.40(prox=0.40) + 0.40×proc(0.278[fin=0.27,mean=0.29]) + 0.10×fmt(1.000) | pred='4' gold='16' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:35:50,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:35:50,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:50,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:50,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:55,770 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 60%|###### | 12/20 [05:51<03:37, 27.18s/q, loss=-0.0029, mean_r=0.936, skip=3]
Iter 13 GRPO groups: 65%|######5 | 13/20 [05:51<03:09, 27.10s/q, loss=-0.0029, mean_r=0.936, skip=3]2026-04-26 04:36:05,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,716 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:12,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:20,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:20,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 65%|######5 | 13/20 [06:11<03:09, 27.10s/q, loss=0var, mean_r=0.999, skip=4]
Iter 13 GRPO groups: 70%|####### | 14/20 [06:11<02:28, 24.78s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 04:36:27,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:27,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:52,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.34(prox=0.34) + 0.40×proc(0.269[fin=0.06,mean=0.58]) + 0.10×fmt(1.000) | pred='1080' gold='43200' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:36:53,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.50(prox=0.50) + 0.40×proc(0.799[fin=0.99,mean=0.51]) + 0.10×fmt(1.000) | pred='21600' gold='43200' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:36:53,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:53,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 70%|####### | 14/20 [06:56<02:28, 24.78s/q, loss=0.0011, mean_r=0.903, skip=4]
Iter 13 GRPO groups: 75%|#######5 | 15/20 [06:56<02:34, 30.96s/q, loss=0.0011, mean_r=0.903, skip=4]2026-04-26 04:37:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:37:17,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:37:17,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.52(prox=0.52) + 0.40×proc(0.933[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='55' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:37:17,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.489 = 0.50×0.26(prox=0.26) + 0.40×proc(0.652[fin=0.66,mean=0.64]) + 0.10×fmt(1.000) | pred='135' gold='55' | step_acc=62% lccp=0% (chain=0/8 ok_count=5) n_steps=8
+2026-04-26 04:37:32,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:37:32,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.835[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='17.5' gold='55' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:37:32,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:37:32,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.817 = 0.50×0.65(prox=0.65) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='40' gold='55' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:37:39,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.624 = 0.50×0.38(prox=0.38) + 0.40×proc(0.836[fin=0.96,mean=0.65]) + 0.10×fmt(1.000) | pred='100' gold='55' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:37:39,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 75%|#######5 | 15/20 [07:31<02:34, 30.96s/q, loss=-0.0005, mean_r=0.817, skip=4]
Iter 13 GRPO groups: 80%|######## | 16/20 [07:31<02:08, 32.06s/q, loss=-0.0005, mean_r=0.817, skip=4]2026-04-26 04:37:46,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.72(prox=0.72) + 0.40×proc(0.769[fin=0.93,mean=0.52]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:37:46,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.72(prox=0.72) + 0.40×proc(0.823[fin=0.98,mean=0.59]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:37:53,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:37:53,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×1.00(exact) + 0.40×proc(0.710[fin=0.88,mean=0.46]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:37:53,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×1.00(exact) + 0.40×proc(0.799[fin=0.98,mean=0.53]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:37:53,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:38:02,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.451 = 0.50×0.50(prox=0.50) + 0.40×proc(0.246[fin=0.05,mean=0.55]) + 0.10×fmt(1.000) | pred='186' gold='366' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:38:03,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.726 = 0.50×0.72(prox=0.72) + 0.40×proc(0.667[fin=0.80,mean=0.46]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:38:03,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.72(prox=0.72) + 0.40×proc(0.834[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:38:03,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 80%|######## | 16/20 [08:03<02:08, 32.06s/q, loss=0.0003, mean_r=0.828, skip=4]
Iter 13 GRPO groups: 85%|########5 | 17/20 [08:03<01:36, 32.20s/q, loss=0.0003, mean_r=0.828, skip=4]2026-04-26 04:38:16,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:30,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:30,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 85%|########5 | 17/20 [08:21<01:36, 32.20s/q, loss=0var, mean_r=0.999, skip=5]
Iter 13 GRPO groups: 90%|######### | 18/20 [08:21<00:55, 27.74s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:38:35,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:35,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:45,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 90%|######### | 18/20 [08:41<00:55, 27.74s/q, loss=0var, mean_r=0.998, skip=6]
Iter 13 GRPO groups: 95%|#########5| 19/20 [08:41<00:25, 25.46s/q, loss=0var, mean_r=0.998, skip=6]2026-04-26 04:39:04,295 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.60(prox=0.60) + 0.40×proc(0.408[fin=0.38,mean=0.45]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:39:04,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.60(prox=0.60) + 0.40×proc(0.828[fin=0.89,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:39:04,498 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.347 = 0.50×0.00(prox=0.00) + 0.40×proc(0.646[fin=0.76,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='18' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:39:04,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.60(prox=0.60) + 0.40×proc(0.654[fin=0.72,mean=0.55]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:39:13,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.496 = 0.50×0.60(prox=0.60) + 0.40×proc(0.241[fin=0.07,mean=0.50]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:39:14,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:39:14,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.309 = 0.50×0.33(prox=0.33) + 0.40×proc(0.107[fin=0.06,mean=0.18]) + 0.10×fmt(1.000) | pred='0' gold='18' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:39:14,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.666 = 0.50×0.60(prox=0.60) + 0.40×proc(0.666[fin=0.86,mean=0.38]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:39:24,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.343 = 0.50×0.43(prox=0.43) + 0.40×proc(0.072[fin=0.04,mean=0.11]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:39:24,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.565 = 0.50×0.60(prox=0.60) + 0.40×proc(0.412[fin=0.39,mean=0.45]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 13 GRPO groups: 95%|#########5| 19/20 [09:16<00:25, 25.46s/q, loss=-0.0005, mean_r=0.564, skip=6]
Iter 13 GRPO groups: 100%|##########| 20/20 [09:16<00:00, 28.30s/q, loss=-0.0005, mean_r=0.564, skip=6]
Iter 13 GRPO groups: 100%|##########| 20/20 [09:16<00:00, 27.81s/q, loss=-0.0005, mean_r=0.564, skip=6]
+2026-04-26 04:39:26,156 INFO __main__ - Iter 13 | loss=0.0003 | reward mean=0.899 std=0.186 | gt_match=78.0% | grounded_acc=93.0% | step_acc=87.1% | lccp=78.9% | batch_acc=93.0% | phase=SELFPLAY_RAMP sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.90e-06 | 556.2s
+2026-04-26 04:39:26,157 INFO __main__ - ======================================================================
+2026-04-26 04:39:26,157 INFO __main__ - GRPO ITERATION 14/60
+2026-04-26 04:39:26,157 INFO __main__ - ======================================================================
+2026-04-26 04:39:26,178 INFO __main__ - LR this iteration: 4.90e-06 | T=0.712 | MATH ratio=30%
+
Iter 14 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:39:30,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.902[fin=0.95,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='4' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:39:30,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:36,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:36,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.903[fin=0.95,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='4' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:39:37,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:37,097 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,317 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.507 = 0.50×0.09(prox=0.09) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='24' gold='4' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:39:45,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 0%| | 0/20 [00:27, ?q/s, loss=0.0000, mean_r=0.860, skip=0]
Iter 14 GRPO groups: 5%|5 | 1/20 [00:27<08:33, 27.02s/q, loss=0.0000, mean_r=0.860, skip=0]2026-04-26 04:39:57,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.864[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='104' gold='108' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:39:57,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.210 = 0.50×0.02(prox=0.02) + 0.40×proc(0.247[fin=0.31,mean=0.15]) + 0.10×fmt(1.000) | pred='2592' gold='108' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:39:57,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:57,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.311 = 0.50×0.01(prox=0.01) + 0.40×proc(0.516[fin=0.66,mean=0.31]) + 0.10×fmt(1.000) | pred='6480' gold='108' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:40:05,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:40:05,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:40:06,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.953[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:40:06,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.193 = 0.50×0.02(prox=0.02) + 0.40×proc(0.207[fin=0.14,mean=0.30]) + 0.10×fmt(1.000) | pred='2592' gold='108' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:40:13,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.13(prox=0.13) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='464' gold='108' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:40:13,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 5%|5 | 1/20 [00:48<08:33, 27.02s/q, loss=-0.0000, mean_r=0.706, skip=0]
Iter 14 GRPO groups: 10%|# | 2/20 [00:48<07:08, 23.78s/q, loss=-0.0000, mean_r=0.706, skip=0]2026-04-26 04:40:20,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:20,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:40:25,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×0.85(prox=0.85) + 0.40×proc(0.619[fin=0.75,mean=0.42]) + 0.10×fmt(0.650) | pred='65' gold='64' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:40:25,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.707 = 0.50×0.85(prox=0.85) + 0.40×proc(0.455[fin=0.57,mean=0.29]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:40:25,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.633 = 0.50×0.52(prox=0.52) + 0.40×proc(0.774[fin=0.92,mean=0.56]) + 0.10×fmt(0.650) | pred='34' gold='64' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:40:25,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.87,mean=0.56]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:40:35,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:40:35,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×1.00(exact) + 0.40×proc(0.503[fin=0.50,mean=0.51]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:40:35,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.85(prox=0.85) + 0.40×proc(0.567[fin=0.53,mean=0.63]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:40:35,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.622 = 0.50×0.85(prox=0.85) + 0.40×proc(0.331[fin=0.29,mean=0.40]) + 0.10×fmt(0.650) | pred='65' gold='64' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+
Iter 14 GRPO groups: 10%|# | 2/20 [01:14<07:08, 23.78s/q, loss=0.0051, mean_r=0.800, skip=0]
Iter 14 GRPO groups: 15%|#5 | 3/20 [01:14<07:00, 24.72s/q, loss=0.0051, mean_r=0.800, skip=0]2026-04-26 04:40:45,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:09,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:09,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 14 GRPO groups: 15%|#5 | 3/20 [01:43<07:00, 24.72s/q, loss=0var, mean_r=0.997, skip=1]
Iter 14 GRPO groups: 20%|## | 4/20 [01:43<07:04, 26.51s/q, loss=0var, mean_r=0.997, skip=1]2026-04-26 04:41:15,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:15,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:19,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:19,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.694[fin=0.84,mean=0.48]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:41:20,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.603 = 0.50×0.33(prox=0.33) + 0.40×proc(0.841[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:41:20,135 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:25,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:25,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:25,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:25,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 20%|## | 4/20 [02:07<07:04, 26.51s/q, loss=0var, mean_r=0.555, skip=2]
Iter 14 GRPO groups: 25%|##5 | 5/20 [02:07<06:22, 25.52s/q, loss=0var, mean_r=0.555, skip=2]2026-04-26 04:41:44,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:44,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.678 = 0.50×0.50(prox=0.50) + 0.40×proc(0.820[fin=0.97,mean=0.59]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=62% lccp=0% (chain=0/8 ok_count=5) n_steps=8
+2026-04-26 04:41:44,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:44,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:41:55,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:42:11,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:42:11,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 14 GRPO groups: 25%|##5 | 5/20 [02:46<06:22, 25.52s/q, loss=-0.0002, mean_r=0.966, skip=2]
Iter 14 GRPO groups: 30%|### | 6/20 [02:46<07:01, 30.12s/q, loss=-0.0002, mean_r=0.966, skip=2]2026-04-26 04:42:16,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.726 = 0.50×0.50(prox=0.50) + 0.40×proc(0.940[fin=0.97,mean=0.90]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:16,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.96,mean=0.76]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:42:23,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.50(prox=0.50) + 0.40×proc(0.945[fin=0.94,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:23,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:23,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.50(prox=0.50) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:23,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,729 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.835 = 0.50×0.73(prox=0.73) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='6.5' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:42:30,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 30%|### | 6/20 [03:11<07:01, 30.12s/q, loss=0.0027, mean_r=0.870, skip=2]
Iter 14 GRPO groups: 35%|###5 | 7/20 [03:11<06:10, 28.47s/q, loss=0.0027, mean_r=0.870, skip=2]2026-04-26 04:42:42,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:42,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:42,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.830 = 0.50×0.75(prox=0.75) + 0.40×proc(0.888[fin=0.97,mean=0.77]) + 0.10×fmt(1.000) | pred='42' gold='36' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:42:42,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:59,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.653 = 0.50×0.85(prox=0.85) + 0.40×proc(0.320[fin=0.17,mean=0.55]) + 0.10×fmt(1.000) | pred='39' gold='36' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:42:59,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 14 GRPO groups: 35%|###5 | 7/20 [03:34<06:10, 28.47s/q, loss=-0.0005, mean_r=0.947, skip=2]
Iter 14 GRPO groups: 40%|#### | 8/20 [03:34<05:22, 26.83s/q, loss=-0.0005, mean_r=0.947, skip=2]2026-04-26 04:43:04,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:04,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=0.96,mean=0.88]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.617 = 0.50×0.67(prox=0.67) + 0.40×proc(0.460[fin=0.44,mean=0.49]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:43:16,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 40%|#### | 8/20 [03:58<05:22, 26.83s/q, loss=-0.0002, mean_r=0.957, skip=2]
Iter 14 GRPO groups: 45%|####5 | 9/20 [03:58<04:44, 25.87s/q, loss=-0.0002, mean_r=0.957, skip=2]2026-04-26 04:43:30,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.908[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='96' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:43:30,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.19(prox=0.19) + 0.40×proc(0.728[fin=0.90,mean=0.47]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:43:31,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.19(prox=0.19) + 0.40×proc(0.809[fin=0.96,mean=0.58]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:43:31,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='96' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:43:39,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.349 = 0.50×0.18(prox=0.18) + 0.40×proc(0.299[fin=0.25,mean=0.37]) + 0.10×fmt(1.000) | pred='102.67' gold='32' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:43:39,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.19(prox=0.19) + 0.40×proc(0.730[fin=0.88,mean=0.50]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:43:39,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:43:40,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.18(prox=0.18) + 0.40×proc(0.800[fin=0.96,mean=0.56]) + 0.10×fmt(1.000) | pred='106' gold='32' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:43:49,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:43:49,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.764 = 0.50×0.80(prox=0.80) + 0.40×proc(0.660[fin=0.67,mean=0.64]) + 0.10×fmt(1.000) | pred='36' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 14 GRPO groups: 45%|####5 | 9/20 [04:25<04:44, 25.87s/q, loss=0.0000, mean_r=0.633, skip=2]
Iter 14 GRPO groups: 50%|##### | 10/20 [04:25<04:20, 26.09s/q, loss=0.0000, mean_r=0.633, skip=2]2026-04-26 04:43:54,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:54,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 50%|##### | 10/20 [04:49<04:20, 26.09s/q, loss=0var, mean_r=0.999, skip=3]
Iter 14 GRPO groups: 55%|#####5 | 11/20 [04:49<03:48, 25.43s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:44:15,308 INFO src.rl.curriculum_manager - Topic probabilities (rollout 0): [('basic_arithmetic', '0.042'), ('single_step_word_problems', '0.042'), ('fractions', '0.042'), ('percentages', '0.042'), ('ratios', '0.042')]
+2026-04-26 04:44:22,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.958 + mod=+0.080, cap=1.00) | Q=0.90 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,600 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,009 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.984 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=5
+2026-04-26 04:44:24,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,423 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.990 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 04:44:24,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,822 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:28,488 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.773 = clip(base=0.693 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:28,686 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.636 novelty=0.70 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:28,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.646 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 04:44:29,673 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.646 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,868 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:30,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.932 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 04:44:30,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 14 GRPO groups: 55%|#####5 | 11/20 [05:05<03:48, 25.43s/q, loss=0.0024, mean_r=0.905, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 60%|###### | 12/20 [05:05<03:02, 22.78s/q, loss=0.0024, mean_r=0.905, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:44:48,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.904[fin=0.94,mean=0.85]) + 0.10×fmt(1.000) | pred='9' gold='5' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:44:48,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.691 = 0.50×0.45(prox=0.45) + 0.40×proc(0.909[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='5' | step_acc=88% lccp=0% (chain=0/8 ok_count=7) n_steps=8
+2026-04-26 04:44:48,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.26(prox=0.26) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:48,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:44:59,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.880[fin=0.97,mean=0.75]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:44:59,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.98,mean=0.87]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:45:00,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.889[fin=0.97,mean=0.77]) + 0.10×fmt(1.000) | pred='15' gold='5' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:45:00,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.640[fin=0.64,mean=0.64]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:45:05,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.756[fin=0.80,mean=0.69]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:45:05,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.856[fin=0.91,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 14 GRPO groups: 60%|###### | 12/20 [05:41<03:02, 22.78s/q, loss=-0.0009, mean_r=0.691, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 65%|######5 | 13/20 [05:41<03:05, 26.56s/q, loss=-0.0009, mean_r=0.691, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:45:14,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:14,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:30,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×1.00(exact) + 0.40×proc(0.585[fin=0.41,mean=0.84]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 04:45:30,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:30,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:45:30,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:45:40,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:45:40,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:45:40,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:40,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 14 GRPO groups: 65%|######5 | 13/20 [06:30<03:05, 26.56s/q, loss=0.0012, mean_r=0.982, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 70%|####### | 14/20 [06:30<03:20, 33.48s/q, loss=0.0012, mean_r=0.982, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:46:06,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:06,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.34(prox=0.34) + 0.40×proc(0.126[fin=0.13,mean=0.12]) + 0.10×fmt(0.650) | pred='20' gold='1024' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:46:06,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.255 = 0.50×0.00(prox=0.00) + 0.40×proc(0.293[fin=0.24,mean=0.38]) + 0.10×fmt(1.000) | pred='3628800' gold='1024' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:46:06,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:12,602 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:12,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:12,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.268 = 0.50×0.34(prox=0.34) + 0.40×proc(0.086[fin=0.09,mean=0.08]) + 0.10×fmt(0.650) | pred='20' gold='1024' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:46:12,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:46:24,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:25,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 14 GRPO groups: 70%|####### | 14/20 [07:00<03:20, 33.48s/q, loss=-0.0004, mean_r=0.756, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 75%|#######5 | 15/20 [07:00<02:41, 32.36s/q, loss=-0.0004, mean_r=0.756, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:46:33,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:33,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:46:43,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:43,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:43,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:43,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:47,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×1.00(exact) + 0.40×proc(0.807[fin=0.94,mean=0.61]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:46:48,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×1.00(exact) + 0.40×proc(0.845[fin=0.97,mean=0.65]) + 0.10×fmt(0.650) | pred='3' gold='3' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:46:48,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:48,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 14 GRPO groups: 75%|#######5 | 15/20 [07:39<02:41, 32.36s/q, loss=0.0042, mean_r=0.971, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 80%|######## | 16/20 [07:39<02:17, 34.40s/q, loss=0.0042, mean_r=0.971, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:47:09,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:47:09,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:47:09,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.789 = 0.50×0.70(prox=0.70) + 0.40×proc(0.847[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='25.5' gold='21' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 04:47:09,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.26(prox=0.26) + 0.40×proc(0.469[fin=0.47,mean=0.47]) + 0.10×fmt(1.000) | pred='51' gold='21' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:47:20,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.26(prox=0.26) + 0.40×proc(0.776[fin=0.89,mean=0.60]) + 0.10×fmt(1.000) | pred='51' gold='21' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:47:20,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:47:20,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:47:21,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.813[fin=0.98,mean=0.56]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:47:25,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:47:25,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 80%|######## | 16/20 [08:01<02:17, 34.40s/q, loss=-0.0018, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 85%|########5 | 17/20 [08:01<01:31, 30.61s/q, loss=-0.0018, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:47:32,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.859[fin=0.99,mean=0.66]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:47:32,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.513 = 0.50×0.18(prox=0.18) + 0.40×proc(0.813[fin=1.00,mean=0.53]) + 0.10×fmt(1.000) | pred='20' gold='6' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:47:38,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:38,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:47:38,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.531 = 0.50×0.14(prox=0.14) + 0.40×proc(0.813[fin=0.95,mean=0.60]) + 0.10×fmt(1.000) | pred='25' gold='6' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:47:38,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:47:55,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.520 = 0.50×0.18(prox=0.18) + 0.40×proc(0.829[fin=0.98,mean=0.60]) + 0.10×fmt(1.000) | pred='20' gold='6' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:47:55,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:55,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:55,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 85%|########5 | 17/20 [08:35<01:31, 30.61s/q, loss=-0.0023, mean_r=0.848, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 90%|######### | 18/20 [08:35<01:03, 31.69s/q, loss=-0.0023, mean_r=0.848, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:48:07,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:48:08,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.320 = 0.50×0.00(prox=0.00) + 0.40×proc(0.363[fin=0.25,mean=0.53]) + 0.10×fmt(1.000) | pred='No solution' gold='4' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:48:08,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:48:08,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:48:16,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.95,mean=0.81]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:48:17,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:48:17,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:48:17,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:48:22,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.687 = 0.50×0.67(prox=0.67) + 0.40×proc(0.721[fin=0.86,mean=0.51]) + 0.10×fmt(0.650) | pred='5' gold='4' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:48:22,330 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.671 = 0.50×0.67(prox=0.67) + 0.40×proc(0.682[fin=0.80,mean=0.51]) + 0.10×fmt(0.650) | pred='5' gold='4' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+
Iter 14 GRPO groups: 90%|######### | 18/20 [08:57<01:03, 31.69s/q, loss=-0.0004, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 95%|#########5| 19/20 [08:57<00:28, 28.82s/q, loss=-0.0004, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:48:29,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:29,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.33(prox=0.33) + 0.40×proc(0.375[fin=0.27,mean=0.53]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:48:36,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.50(prox=0.50) + 0.40×proc(0.483[fin=0.44,mean=0.55]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 14 GRPO groups: 95%|#########5| 19/20 [09:28<00:28, 28.82s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 100%|##########| 20/20 [09:28<00:00, 29.40s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 100%|##########| 20/20 [09:28<00:00, 28.42s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
+2026-04-26 04:48:54,559 INFO __main__ - Iter 14 | loss=0.0005 | reward mean=0.856 std=0.208 | gt_match=67.4% | grounded_acc=94.7% | step_acc=85.8% | lccp=74.8% | batch_acc=95.2% | phase=SELFPLAY_RAMP sp_ratio=4% | groups=18 skipped=3(0var=3) | lr=4.85e-06 | 568.4s
+2026-04-26 04:48:54,559 INFO __main__ - Question generation: 1/1 valid (100%) | q_reward=0.763 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.89 clarity=1.00 novelty=0.43 solvability=1.00
+2026-04-26 04:48:54,560 INFO __main__ - ======================================================================
+2026-04-26 04:48:54,561 INFO __main__ - GRPO ITERATION 15/60
+2026-04-26 04:48:54,561 INFO __main__ - ======================================================================
+2026-04-26 04:48:54,581 INFO __main__ - LR this iteration: 4.85e-06 | T=0.705 | MATH ratio=30%
+
Iter 15 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:48:57,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:57,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:48:57,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:48:58,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,915 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:03,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:08,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:08,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 15 GRPO groups: 0%| | 0/20 [00:13, ?q/s, loss=0var, mean_r=0.982, skip=1]
Iter 15 GRPO groups: 5%|5 | 1/20 [00:13<04:18, 13.60s/q, loss=0var, mean_r=0.982, skip=1]2026-04-26 04:49:14,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.854 = 0.50×0.78(prox=0.78) + 0.40×proc(0.912[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='96' gold='84' | step_acc=88% lccp=38% (chain=3/8 ok_count=7) n_steps=8
+2026-04-26 04:49:14,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.408 = 0.50×0.43(prox=0.43) + 0.40×proc(0.234[fin=0.03,mean=0.54]) + 0.10×fmt(1.000) | pred='28' gold='84' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:49:28,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.78(prox=0.78) + 0.40×proc(0.471[fin=0.39,mean=0.60]) + 0.10×fmt(1.000) | pred='72' gold='84' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:49:28,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.49(prox=0.49) + 0.40×proc(0.460[fin=0.43,mean=0.51]) + 0.10×fmt(1.000) | pred='40' gold='84' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:49:28,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.447 = 0.50×0.46(prox=0.46) + 0.40×proc(0.298[fin=0.10,mean=0.59]) + 0.10×fmt(1.000) | pred='34' gold='84' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:49:28,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×0.85(prox=0.85) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='80' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:49:37,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.482 = 0.50×0.45(prox=0.45) + 0.40×proc(0.235[fin=0.01,mean=0.56]) + 0.10×fmt(1.000) | pred='32' gold='84' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:49:37,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:49:37,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.900 = 0.50×0.85(prox=0.85) + 0.40×proc(0.937[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='80' gold='84' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:49:37,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.732[fin=0.70,mean=0.78]) + 0.10×fmt(1.000) | pred='36' gold='84' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+
Iter 15 GRPO groups: 5%|5 | 1/20 [00:57<04:18, 13.60s/q, loss=0.0000, mean_r=0.676, skip=1]
Iter 15 GRPO groups: 10%|# | 2/20 [00:57<09:26, 31.50s/q, loss=0.0000, mean_r=0.676, skip=1]2026-04-26 04:49:56,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:56,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:56,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:56,443 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:50:08,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:50:08,527 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 10%|# | 2/20 [01:13<09:26, 31.50s/q, loss=0var, mean_r=0.998, skip=2]
Iter 15 GRPO groups: 15%|#5 | 3/20 [01:13<06:57, 24.57s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 04:50:11,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:11,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.613 = 0.50×0.50(prox=0.50) + 0.40×proc(0.657[fin=0.71,mean=0.58]) + 0.10×fmt(1.000) | pred='35' gold='70' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:50:22,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 15%|#5 | 3/20 [01:33<06:57, 24.57s/q, loss=-0.0025, mean_r=0.960, skip=2]
Iter 15 GRPO groups: 20%|## | 4/20 [01:33<06:03, 22.69s/q, loss=-0.0025, mean_r=0.960, skip=2]2026-04-26 04:50:28,353 INFO src.rl.curriculum_manager - Topic probabilities (rollout 20): [('basic_arithmetic', '0.043'), ('single_step_word_problems', '0.043'), ('fractions', '0.043'), ('percentages', '0.043'), ('ratios', '0.043')]
+2026-04-26 04:50:33,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.79 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:33,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:33,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:35,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:35,223 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:40,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:40,950 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,512 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,699 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,251 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 15 GRPO groups: 20%|## | 4/20 [01:49<06:03, 22.69s/q, loss=-0.0002, mean_r=0.968, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 25%|##5 | 5/20 [01:49<05:02, 20.18s/q, loss=-0.0002, mean_r=0.968, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:50:48,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.425 = 0.50×0.15(prox=0.15) + 0.40×proc(0.621[fin=0.68,mean=0.54]) + 0.10×fmt(1.000) | pred='30' gold='8' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:50:49,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:50:49,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.404 = 0.50×0.50(prox=0.50) + 0.40×proc(0.134[fin=0.14,mean=0.13]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:50:49,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:51:07,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.93,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:51:07,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:51:07,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:51:07,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:51:20,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.50(prox=0.50) + 0.40×proc(0.229[fin=0.06,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:51:20,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 15 GRPO groups: 25%|##5 | 5/20 [02:27<05:02, 20.18s/q, loss=0.0002, mean_r=0.821, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 30%|### | 6/20 [02:27<06:08, 26.33s/q, loss=0.0002, mean_r=0.821, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:51:25,554 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:25,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.707 = 0.50×0.50(prox=0.50) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='24' gold='48' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:51:32,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:51:32,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.50(prox=0.50) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='24' gold='48' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:51:32,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:32,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:51:40,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 30%|### | 6/20 [02:53<06:08, 26.33s/q, loss=-0.0012, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 35%|###5 | 7/20 [02:53<05:40, 26.16s/q, loss=-0.0012, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:51:52,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:52:06,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:52:06,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 35%|###5 | 7/20 [03:11<05:40, 26.16s/q, loss=0var, mean_r=0.999, skip=3]
Iter 15 GRPO groups: 40%|#### | 8/20 [03:11<04:43, 23.65s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:52:40,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.585 = 0.50×0.33(prox=0.33) + 0.40×proc(0.795[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='348' gold='174' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:52:40,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.306 = 0.50×0.00(prox=0.00) + 0.40×proc(0.514[fin=0.54,mean=0.47]) + 0.10×fmt(1.000) | pred='1953/2' gold='174' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:52:51,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.00(prox=0.00) + 0.40×proc(0.862[fin=1.00,mean=0.66]) + 0.10×fmt(0.700) | pred='' gold='174' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:52:51,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:52:51,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.648 = 0.50×0.50(prox=0.50) + 0.40×proc(0.744[fin=0.92,mean=0.48]) + 0.10×fmt(1.000) | pred='87' gold='174' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:52:51,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.907 = 0.50×1.00(exact) + 0.40×proc(0.767[fin=0.96,mean=0.47]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:53:07,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.20(prox=0.20) + 0.40×proc(0.677[fin=0.73,mean=0.59]) + 0.10×fmt(1.000) | pred='522' gold='174' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:53:07,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:53:07,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.47(prox=0.47) + 0.40×proc(0.628[fin=0.62,mean=0.63]) + 0.10×fmt(1.000) | pred='77.3' gold='174' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 15 GRPO groups: 40%|#### | 8/20 [04:14<04:43, 23.65s/q, loss=-0.0008, mean_r=0.652, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 45%|####5 | 9/20 [04:14<06:35, 35.94s/q, loss=-0.0008, mean_r=0.652, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:53:14,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:22,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:22,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:22,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:22,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.98,mean=0.74]) + 0.10×fmt(1.000) | pred='-24' gold='-16' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:53:32,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.660 = 0.50×0.50(prox=0.50) + 0.40×proc(0.774[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='-24' gold='-16' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:53:32,150 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:32,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:32,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:41,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 15 GRPO groups: 45%|####5 | 9/20 [04:48<06:35, 35.94s/q, loss=0.0010, mean_r=0.935, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 50%|##### | 10/20 [04:48<05:51, 35.12s/q, loss=0.0010, mean_r=0.935, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:53:47,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:47,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.66(prox=0.66) + 0.40×proc(0.910[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='49' gold='39' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:53:47,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:59,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:53:59,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:59,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:59,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:54:07,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:54:07,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:54:07,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.545 = 0.50×0.71(prox=0.71) + 0.40×proc(0.227[fin=0.18,mean=0.29]) + 0.10×fmt(1.000) | pred='47' gold='39' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 15 GRPO groups: 50%|##### | 10/20 [05:14<05:51, 35.12s/q, loss=-0.0003, mean_r=0.933, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 55%|#####5 | 11/20 [05:14<04:52, 32.50s/q, loss=-0.0003, mean_r=0.933, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:54:16,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:27,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:54:27,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:54:27,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:54:27,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:54:42,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:54:42,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:42,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:42,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:54:57,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 15 GRPO groups: 55%|#####5 | 11/20 [06:03<04:52, 32.50s/q, loss=0var, mean_r=0.991, skip=4]
Iter 15 GRPO groups: 60%|###### | 12/20 [06:03<04:58, 37.37s/q, loss=0var, mean_r=0.991, skip=4]2026-04-26 04:55:05,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:06,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:06,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:11,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:55:11,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:11,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:11,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:15,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:16,020 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:16,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 60%|###### | 12/20 [06:21<04:58, 37.37s/q, loss=0var, mean_r=0.998, skip=5]
Iter 15 GRPO groups: 65%|######5 | 13/20 [06:21<03:41, 31.61s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:55:18,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:23,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:33,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 65%|######5 | 13/20 [06:38<03:41, 31.61s/q, loss=0var, mean_r=0.999, skip=6]
Iter 15 GRPO groups: 70%|####### | 14/20 [06:38<02:43, 27.21s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 04:55:36,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:36,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.568 = 0.50×0.36(prox=0.36) + 0.40×proc(0.724[fin=0.95,mean=0.38]) + 0.10×fmt(1.000) | pred='1' gold='10' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:55:36,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:43,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:50,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:50,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:50,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 70%|####### | 14/20 [06:57<02:43, 27.21s/q, loss=0.0029, mean_r=0.956, q_acc=100%, q_rew=0.722, skip=6]
Iter 15 GRPO groups: 75%|#######5 | 15/20 [06:57<02:03, 24.62s/q, loss=0.0029, mean_r=0.956, q_acc=100%, q_rew=0.722, skip=6]2026-04-26 04:55:55,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.384 = 0.50×0.20(prox=0.20) + 0.40×proc(0.366[fin=0.34,mean=0.40]) + 0.10×fmt(1.000) | pred='60' gold='20' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:56:02,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:08,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:56:08,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:56:08,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:09,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:11,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 75%|#######5 | 15/20 [07:18<02:03, 24.62s/q, loss=0.0013, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=6]
Iter 15 GRPO groups: 80%|######## | 16/20 [07:18<01:34, 23.65s/q, loss=0.0013, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=6]2026-04-26 04:56:18,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:18,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:18,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 15 GRPO groups: 80%|######## | 16/20 [07:47<01:34, 23.65s/q, loss=0var, mean_r=0.998, skip=7]
Iter 15 GRPO groups: 85%|########5 | 17/20 [07:47<01:15, 25.31s/q, loss=0var, mean_r=0.998, skip=7]2026-04-26 04:56:45,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:02,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 85%|########5 | 17/20 [08:07<01:15, 25.31s/q, loss=0var, mean_r=0.998, skip=8]
Iter 15 GRPO groups: 90%|######### | 18/20 [08:07<00:47, 23.67s/q, loss=0var, mean_r=0.998, skip=8]2026-04-26 04:57:08,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:08,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:08,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:22,809 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:22,896 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:22,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 90%|######### | 18/20 [08:28<00:47, 23.67s/q, loss=0var, mean_r=0.998, skip=9]
Iter 15 GRPO groups: 95%|#########5| 19/20 [08:28<00:22, 22.81s/q, loss=0var, mean_r=0.998, skip=9]2026-04-26 04:57:29,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:57:38,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:38,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:38,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.212 = 0.50×0.08(prox=0.08) + 0.40×proc(0.178[fin=0.09,mean=0.31]) + 0.10×fmt(1.000) | pred='53' gold='8' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:57:38,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:47,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:47,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.797[fin=0.97,mean=0.53]) + 0.10×fmt(1.000) | pred='13' gold='8' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:57:48,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:48,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.467 = 0.50×0.10(prox=0.10) + 0.40×proc(0.727[fin=0.91,mean=0.45]) + 0.10×fmt(1.000) | pred='43' gold='8' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:58:02,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.384 = 0.50×0.14(prox=0.14) + 0.40×proc(0.387[fin=0.34,mean=0.45]) + 0.10×fmt(1.000) | pred='-17' gold='8' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 15 GRPO groups: 95%|#########5| 19/20 [09:10<00:22, 22.81s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
Iter 15 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 28.48s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
Iter 15 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 27.51s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
+2026-04-26 04:58:04,705 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=4/10000)
+2026-04-26 04:58:04,705 INFO __main__ - Iter 15 | loss=0.0000 | reward mean=0.928 std=0.167 | gt_match=83.6% | grounded_acc=93.1% | step_acc=91.7% | lccp=83.7% | batch_acc=93.8% | phase=SELFPLAY_RAMP sp_ratio=7% | groups=12 skipped=9(0var=9) | lr=4.80e-06 | 550.1s
+2026-04-26 04:58:04,706 WARNING __main__ - STARVATION: 43% of groups skipped (zero variance). grounded_acc=93.1% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:58:04,706 INFO __main__ - Question generation: 1/1 valid (100%) | q_reward=0.722 | q_acc=100.0% (>0.5 quality) | topic=0.35 diff=0.95 clarity=1.00 novelty=0.46 solvability=1.00
+2026-04-26 04:58:04,706 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:36, 2.26s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:34, 3.88s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<08:09, 3.33s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<07:07, 2.93s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:59, 2.48s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:29, 3.54s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:23<08:23, 3.52s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:25<07:34, 3.20s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:38, 3.25s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:52, 3.81s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:13, 3.55s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:09, 3.11s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:44, 2.95s/q, correct=10/13, lccp=85.6%, score=0.926, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:49, 3.45s/q, correct=11/14, lccp=86.7%, score=0.932, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:48<07:11, 3.19s/q, correct=12/15, lccp=87.6%, score=0.936, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:51<06:39, 2.98s/q, correct=12/16, lccp=88.3%, score=0.912, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:15, 3.28s/q, correct=13/17, lccp=89.0%, score=0.917, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:01<08:52, 4.03s/q, correct=13/18, lccp=84.8%, score=0.906, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:52, 3.61s/q, correct=14/19, lccp=85.6%, score=0.911, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:07<08:00, 3.69s/q, correct=15/20, lccp=86.3%, score=0.915, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:10<07:14, 3.37s/q, correct=16/21, lccp=86.9%, score=0.919, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:12<06:45, 3.17s/q, correct=17/22, lccp=84.5%, score=0.914, step_acc=92.0%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:20, 3.47s/q, correct=18/23, lccp=85.2%, score=0.918, step_acc=92.3%]
GSM8K eval: 16%|#6 | 24/150 [01:19<06:42, 3.20s/q, correct=18/24, lccp=82.7%, score=0.901, step_acc=89.5%]
GSM8K eval: 17%|#6 | 25/150 [01:22<06:21, 3.06s/q, correct=18/25, lccp=80.4%, score=0.897, step_acc=88.9%]
GSM8K eval: 17%|#7 | 26/150 [01:26<07:08, 3.46s/q, correct=19/26, lccp=81.1%, score=0.901, step_acc=89.4%]
GSM8K eval: 18%|#8 | 27/150 [01:29<06:42, 3.27s/q, correct=19/27, lccp=81.8%, score=0.896, step_acc=89.8%]
GSM8K eval: 19%|#8 | 28/150 [01:31<06:00, 2.96s/q, correct=20/28, lccp=82.5%, score=0.900, step_acc=90.1%]
GSM8K eval: 19%|#9 | 29/150 [01:34<05:50, 2.90s/q, correct=21/29, lccp=83.1%, score=0.903, step_acc=90.5%]
GSM8K eval: 20%|## | 30/150 [01:38<06:21, 3.18s/q, correct=22/30, lccp=83.6%, score=0.906, step_acc=90.8%]
GSM8K eval: 21%|## | 31/150 [01:40<05:57, 3.00s/q, correct=23/31, lccp=84.2%, score=0.909, step_acc=91.1%]
GSM8K eval: 21%|##1 | 32/150 [01:42<05:10, 2.63s/q, correct=24/32, lccp=84.7%, score=0.911, step_acc=91.4%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:12, 2.67s/q, correct=25/33, lccp=85.1%, score=0.914, step_acc=91.6%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:48, 2.49s/q, correct=26/34, lccp=85.6%, score=0.916, step_acc=91.9%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:49, 2.52s/q, correct=27/35, lccp=86.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 24%|##4 | 36/150 [01:53<05:17, 2.79s/q, correct=28/36, lccp=86.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:48, 2.55s/q, correct=29/37, lccp=86.7%, score=0.922, step_acc=92.5%]
GSM8K eval: 25%|##5 | 38/150 [01:58<05:03, 2.71s/q, correct=30/38, lccp=87.1%, score=0.924, step_acc=92.7%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:10, 3.33s/q, correct=31/39, lccp=87.4%, score=0.926, step_acc=92.9%]
GSM8K eval: 27%|##6 | 40/150 [02:09<07:37, 4.16s/q, correct=32/40, lccp=87.7%, score=0.928, step_acc=93.1%]
GSM8K eval: 27%|##7 | 41/150 [02:12<06:53, 3.79s/q, correct=32/41, lccp=88.0%, score=0.927, step_acc=93.3%]
GSM8K eval: 28%|##8 | 42/150 [02:17<07:36, 4.23s/q, correct=33/42, lccp=86.7%, score=0.928, step_acc=93.0%]
GSM8K eval: 29%|##8 | 43/150 [02:19<06:29, 3.64s/q, correct=34/43, lccp=87.0%, score=0.930, step_acc=93.2%]
GSM8K eval: 29%|##9 | 44/150 [02:26<07:44, 4.38s/q, correct=35/44, lccp=87.3%, score=0.931, step_acc=93.3%]
GSM8K eval: 30%|### | 45/150 [02:29<07:01, 4.02s/q, correct=36/45, lccp=87.6%, score=0.933, step_acc=93.5%]
GSM8K eval: 31%|### | 46/150 [02:34<07:23, 4.27s/q, correct=36/46, lccp=85.7%, score=0.928, step_acc=93.4%]
GSM8K eval: 31%|###1 | 47/150 [02:37<06:44, 3.92s/q, correct=37/47, lccp=86.0%, score=0.929, step_acc=93.5%]
GSM8K eval: 32%|###2 | 48/150 [02:39<05:35, 3.29s/q, correct=38/48, lccp=86.3%, score=0.931, step_acc=93.7%]
GSM8K eval: 33%|###2 | 49/150 [02:42<05:41, 3.38s/q, correct=39/49, lccp=85.2%, score=0.932, step_acc=93.4%]
GSM8K eval: 33%|###3 | 50/150 [02:45<05:31, 3.32s/q, correct=39/50, lccp=84.5%, score=0.923, step_acc=92.6%]
GSM8K eval: 34%|###4 | 51/150 [02:47<04:32, 2.75s/q, correct=40/51, lccp=84.8%, score=0.925, step_acc=92.7%]
GSM8K eval: 35%|###4 | 52/150 [02:51<05:15, 3.22s/q, correct=40/52, lccp=83.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 35%|###5 | 53/150 [02:56<05:55, 3.67s/q, correct=40/53, lccp=82.8%, score=0.916, step_acc=91.9%]
GSM8K eval: 36%|###6 | 54/150 [02:59<05:41, 3.55s/q, correct=41/54, lccp=83.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 37%|###6 | 55/150 [03:03<05:37, 3.55s/q, correct=42/55, lccp=83.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 37%|###7 | 56/150 [03:06<05:38, 3.60s/q, correct=43/56, lccp=83.7%, score=0.920, step_acc=92.4%]
GSM8K eval: 38%|###8 | 57/150 [03:09<04:58, 3.21s/q, correct=44/57, lccp=84.0%, score=0.922, step_acc=92.5%]
GSM8K eval: 39%|###8 | 58/150 [03:13<05:20, 3.49s/q, correct=45/58, lccp=84.2%, score=0.923, step_acc=92.6%]
GSM8K eval: 39%|###9 | 59/150 [03:17<05:26, 3.59s/q, correct=45/59, lccp=82.8%, score=0.916, step_acc=92.1%]
GSM8K eval: 40%|#### | 60/150 [03:22<06:00, 4.00s/q, correct=46/60, lccp=83.1%, score=0.918, step_acc=92.2%]
GSM8K eval: 41%|#### | 61/150 [03:25<05:35, 3.77s/q, correct=47/61, lccp=83.4%, score=0.919, step_acc=92.3%]
GSM8K eval: 41%|####1 | 62/150 [03:28<05:14, 3.57s/q, correct=48/62, lccp=83.6%, score=0.920, step_acc=92.5%]
GSM8K eval: 42%|####2 | 63/150 [03:31<05:06, 3.52s/q, correct=48/63, lccp=83.4%, score=0.914, step_acc=92.0%]
GSM8K eval: 43%|####2 | 64/150 [03:34<04:44, 3.30s/q, correct=49/64, lccp=83.6%, score=0.916, step_acc=92.2%]
GSM8K eval: 43%|####3 | 65/150 [03:37<04:27, 3.15s/q, correct=50/65, lccp=83.9%, score=0.917, step_acc=92.3%]
GSM8K eval: 44%|####4 | 66/150 [03:39<03:54, 2.79s/q, correct=51/66, lccp=84.1%, score=0.918, step_acc=92.4%]
GSM8K eval: 45%|####4 | 67/150 [03:41<03:38, 2.64s/q, correct=52/67, lccp=84.4%, score=0.919, step_acc=92.5%]
GSM8K eval: 45%|####5 | 68/150 [03:44<03:37, 2.65s/q, correct=53/68, lccp=84.6%, score=0.921, step_acc=92.6%]
GSM8K eval: 46%|####6 | 69/150 [03:45<03:07, 2.32s/q, correct=54/69, lccp=84.8%, score=0.922, step_acc=92.7%]
GSM8K eval: 47%|####6 | 70/150 [03:48<03:20, 2.51s/q, correct=55/70, lccp=83.6%, score=0.922, step_acc=92.6%]
GSM8K eval: 47%|####7 | 71/150 [03:51<03:32, 2.69s/q, correct=56/71, lccp=82.4%, score=0.923, step_acc=92.4%]
GSM8K eval: 48%|####8 | 72/150 [03:53<03:01, 2.32s/q, correct=57/72, lccp=82.7%, score=0.924, step_acc=92.5%]
GSM8K eval: 49%|####8 | 73/150 [03:55<02:44, 2.13s/q, correct=58/73, lccp=82.9%, score=0.925, step_acc=92.6%]
GSM8K eval: 49%|####9 | 74/150 [03:58<03:14, 2.56s/q, correct=59/74, lccp=83.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 50%|##### | 75/150 [04:00<02:52, 2.30s/q, correct=60/75, lccp=83.4%, score=0.927, step_acc=92.8%]
GSM8K eval: 51%|##### | 76/150 [04:06<04:25, 3.58s/q, correct=60/76, lccp=83.4%, score=0.922, step_acc=92.7%]
GSM8K eval: 51%|#####1 | 77/150 [04:10<04:29, 3.69s/q, correct=61/77, lccp=83.6%, score=0.923, step_acc=92.8%]
GSM8K eval: 52%|#####2 | 78/150 [04:13<03:58, 3.32s/q, correct=62/78, lccp=83.8%, score=0.924, step_acc=92.9%]
GSM8K eval: 53%|#####2 | 79/150 [04:16<03:50, 3.25s/q, correct=62/79, lccp=83.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:19<03:44, 3.21s/q, correct=63/80, lccp=83.2%, score=0.919, step_acc=92.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:21<03:24, 2.96s/q, correct=64/81, lccp=83.4%, score=0.920, step_acc=92.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:24<03:23, 2.99s/q, correct=65/82, lccp=83.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:27<03:17, 2.95s/q, correct=66/83, lccp=83.8%, score=0.922, step_acc=92.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:30<03:08, 2.85s/q, correct=67/84, lccp=84.0%, score=0.923, step_acc=92.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:34<03:23, 3.13s/q, correct=68/85, lccp=84.2%, score=0.924, step_acc=92.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:37<03:26, 3.22s/q, correct=69/86, lccp=84.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:43<04:07, 3.93s/q, correct=70/87, lccp=84.6%, score=0.926, step_acc=92.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:45<03:25, 3.32s/q, correct=71/88, lccp=84.7%, score=0.926, step_acc=93.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:47<03:11, 3.15s/q, correct=72/89, lccp=84.9%, score=0.927, step_acc=93.0%]
GSM8K eval: 60%|###### | 90/150 [04:50<02:55, 2.92s/q, correct=73/90, lccp=85.1%, score=0.928, step_acc=93.1%]
GSM8K eval: 61%|###### | 91/150 [04:54<03:17, 3.35s/q, correct=74/91, lccp=85.2%, score=0.929, step_acc=93.2%]
GSM8K eval: 61%|######1 | 92/150 [04:57<03:10, 3.28s/q, correct=75/92, lccp=85.4%, score=0.929, step_acc=93.3%]
GSM8K eval: 62%|######2 | 93/150 [05:05<04:21, 4.58s/q, correct=76/93, lccp=85.6%, score=0.930, step_acc=93.3%]
GSM8K eval: 63%|######2 | 94/150 [05:08<03:45, 4.03s/q, correct=77/94, lccp=84.6%, score=0.930, step_acc=92.7%]
GSM8K eval: 63%|######3 | 95/150 [05:12<03:52, 4.22s/q, correct=77/95, lccp=83.8%, score=0.927, step_acc=91.9%]
GSM8K eval: 64%|######4 | 96/150 [05:17<04:01, 4.47s/q, correct=78/96, lccp=83.9%, score=0.928, step_acc=92.0%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:28, 3.94s/q, correct=78/97, lccp=83.6%, score=0.926, step_acc=91.8%]
GSM8K eval: 65%|######5 | 98/150 [05:24<03:30, 4.04s/q, correct=78/98, lccp=83.2%, score=0.922, step_acc=91.6%]
GSM8K eval: 66%|######6 | 99/150 [05:27<03:01, 3.56s/q, correct=79/99, lccp=83.3%, score=0.923, step_acc=91.7%]
GSM8K eval: 67%|######6 | 100/150 [05:29<02:33, 3.07s/q, correct=80/100, lccp=82.5%, score=0.923, step_acc=91.5%]
GSM8K eval: 67%|######7 | 101/150 [05:33<02:43, 3.33s/q, correct=80/101, lccp=82.7%, score=0.919, step_acc=91.5%]
GSM8K eval: 68%|######8 | 102/150 [05:34<02:13, 2.79s/q, correct=81/102, lccp=82.8%, score=0.920, step_acc=91.6%]
GSM8K eval: 69%|######8 | 103/150 [05:36<02:01, 2.59s/q, correct=82/103, lccp=83.0%, score=0.921, step_acc=91.7%]
GSM8K eval: 69%|######9 | 104/150 [05:41<02:29, 3.26s/q, correct=83/104, lccp=83.2%, score=0.921, step_acc=91.8%]
GSM8K eval: 70%|####### | 105/150 [05:44<02:17, 3.06s/q, correct=84/105, lccp=83.3%, score=0.922, step_acc=91.9%]
GSM8K eval: 71%|####### | 106/150 [05:45<01:54, 2.61s/q, correct=85/106, lccp=83.5%, score=0.923, step_acc=91.9%]
GSM8K eval: 71%|#######1 | 107/150 [05:47<01:38, 2.30s/q, correct=86/107, lccp=83.6%, score=0.923, step_acc=92.0%]
GSM8K eval: 72%|#######2 | 108/150 [05:49<01:41, 2.43s/q, correct=87/108, lccp=83.8%, score=0.924, step_acc=92.1%]
GSM8K eval: 73%|#######2 | 109/150 [05:55<02:12, 3.22s/q, correct=87/109, lccp=83.3%, score=0.923, step_acc=92.0%]
GSM8K eval: 73%|#######3 | 110/150 [05:57<01:58, 2.97s/q, correct=88/110, lccp=82.8%, score=0.923, step_acc=91.9%]
GSM8K eval: 74%|#######4 | 111/150 [05:59<01:41, 2.60s/q, correct=89/111, lccp=83.0%, score=0.924, step_acc=91.9%]
GSM8K eval: 75%|#######4 | 112/150 [06:04<02:10, 3.42s/q, correct=89/112, lccp=83.1%, score=0.923, step_acc=92.0%]
GSM8K eval: 75%|#######5 | 113/150 [06:06<01:48, 2.94s/q, correct=90/113, lccp=83.3%, score=0.924, step_acc=92.1%]
GSM8K eval: 76%|#######6 | 114/150 [06:11<02:10, 3.63s/q, correct=91/114, lccp=82.8%, score=0.924, step_acc=92.0%]
GSM8K eval: 77%|#######6 | 115/150 [06:14<01:59, 3.42s/q, correct=92/115, lccp=82.9%, score=0.925, step_acc=92.1%]
GSM8K eval: 77%|#######7 | 116/150 [06:17<01:52, 3.30s/q, correct=93/116, lccp=83.1%, score=0.925, step_acc=92.2%]
GSM8K eval: 78%|#######8 | 117/150 [06:23<02:16, 4.13s/q, correct=94/117, lccp=83.2%, score=0.926, step_acc=92.2%]
GSM8K eval: 79%|#######8 | 118/150 [06:28<02:16, 4.27s/q, correct=94/118, lccp=82.5%, score=0.924, step_acc=92.1%]
GSM8K eval: 79%|#######9 | 119/150 [06:31<02:06, 4.09s/q, correct=94/119, lccp=82.7%, score=0.922, step_acc=92.2%]
GSM8K eval: 80%|######## | 120/150 [06:34<01:51, 3.72s/q, correct=95/120, lccp=82.8%, score=0.923, step_acc=92.3%]
GSM8K eval: 81%|######## | 121/150 [06:37<01:43, 3.56s/q, correct=96/121, lccp=82.9%, score=0.923, step_acc=92.3%]
GSM8K eval: 81%|########1 | 122/150 [06:40<01:35, 3.42s/q, correct=97/122, lccp=83.1%, score=0.924, step_acc=92.4%]
GSM8K eval: 82%|########2 | 123/150 [06:44<01:31, 3.40s/q, correct=97/123, lccp=82.7%, score=0.924, step_acc=92.3%]
GSM8K eval: 83%|########2 | 124/150 [06:46<01:19, 3.04s/q, correct=98/124, lccp=82.9%, score=0.924, step_acc=92.4%]
GSM8K eval: 83%|########3 | 125/150 [06:48<01:08, 2.75s/q, correct=99/125, lccp=83.0%, score=0.925, step_acc=92.4%]
GSM8K eval: 84%|########4 | 126/150 [06:51<01:06, 2.76s/q, correct=100/126, lccp=83.1%, score=0.926, step_acc=92.5%]
GSM8K eval: 85%|########4 | 127/150 [06:55<01:15, 3.27s/q, correct=101/127, lccp=83.3%, score=0.926, step_acc=92.5%]
GSM8K eval: 85%|########5 | 128/150 [06:58<01:10, 3.19s/q, correct=102/128, lccp=83.4%, score=0.927, step_acc=92.6%]
GSM8K eval: 86%|########6 | 129/150 [07:02<01:08, 3.26s/q, correct=103/129, lccp=83.5%, score=0.927, step_acc=92.7%]
GSM8K eval: 87%|########6 | 130/150 [07:04<00:56, 2.83s/q, correct=104/130, lccp=83.7%, score=0.928, step_acc=92.7%]
GSM8K eval: 87%|########7 | 131/150 [07:08<01:03, 3.37s/q, correct=105/131, lccp=83.8%, score=0.928, step_acc=92.8%]
GSM8K eval: 88%|########8 | 132/150 [07:10<00:51, 2.84s/q, correct=106/132, lccp=83.9%, score=0.929, step_acc=92.8%]
GSM8K eval: 89%|########8 | 133/150 [07:13<00:48, 2.84s/q, correct=107/133, lccp=84.0%, score=0.929, step_acc=92.9%]
GSM8K eval: 89%|########9 | 134/150 [07:17<00:53, 3.32s/q, correct=108/134, lccp=84.2%, score=0.930, step_acc=92.9%]
GSM8K eval: 90%|######### | 135/150 [07:20<00:47, 3.19s/q, correct=109/135, lccp=84.3%, score=0.930, step_acc=93.0%]
GSM8K eval: 91%|######### | 136/150 [07:24<00:50, 3.58s/q, correct=109/136, lccp=83.9%, score=0.929, step_acc=92.8%]
GSM8K eval: 91%|#########1| 137/150 [07:31<00:59, 4.60s/q, correct=110/137, lccp=84.0%, score=0.930, step_acc=92.8%]
GSM8K eval: 92%|#########2| 138/150 [07:35<00:53, 4.43s/q, correct=111/138, lccp=84.1%, score=0.930, step_acc=92.9%]
GSM8K eval: 93%|#########2| 139/150 [07:39<00:45, 4.14s/q, correct=112/139, lccp=84.2%, score=0.931, step_acc=93.0%]
GSM8K eval: 93%|#########3| 140/150 [07:43<00:42, 4.20s/q, correct=112/140, lccp=84.1%, score=0.927, step_acc=92.8%]
GSM8K eval: 94%|#########3| 141/150 [07:47<00:36, 4.10s/q, correct=113/141, lccp=84.2%, score=0.928, step_acc=92.8%]
GSM8K eval: 95%|#########4| 142/150 [07:52<00:33, 4.22s/q, correct=114/142, lccp=84.3%, score=0.928, step_acc=92.9%]
GSM8K eval: 95%|#########5| 143/150 [07:54<00:25, 3.64s/q, correct=115/143, lccp=84.5%, score=0.929, step_acc=92.9%]
GSM8K eval: 96%|#########6| 144/150 [07:56<00:19, 3.25s/q, correct=116/144, lccp=84.6%, score=0.929, step_acc=93.0%]
GSM8K eval: 97%|#########6| 145/150 [07:59<00:16, 3.21s/q, correct=116/145, lccp=84.0%, score=0.926, step_acc=92.4%]
GSM8K eval: 97%|#########7| 146/150 [08:02<00:12, 3.14s/q, correct=117/146, lccp=84.1%, score=0.926, step_acc=92.5%]
GSM8K eval: 98%|#########8| 147/150 [08:06<00:10, 3.34s/q, correct=118/147, lccp=84.2%, score=0.927, step_acc=92.5%]
GSM8K eval: 99%|#########8| 148/150 [08:10<00:06, 3.43s/q, correct=119/148, lccp=84.3%, score=0.927, step_acc=92.6%]
GSM8K eval: 99%|#########9| 149/150 [08:13<00:03, 3.47s/q, correct=120/149, lccp=84.4%, score=0.928, step_acc=92.6%]
GSM8K eval: 100%|##########| 150/150 [08:18<00:00, 3.88s/q, correct=120/150, lccp=84.2%, score=0.926, step_acc=92.4%]
GSM8K eval: 100%|##########| 150/150 [08:18<00:00, 3.32s/q, correct=120/150, lccp=84.2%, score=0.926, step_acc=92.4%]
+2026-04-26 05:06:23,420 INFO __main__ - Training Score [iter 15]: 0.9262 (best=0.9199) | n=150
+2026-04-26 05:06:23,420 INFO __main__ - Components : 0.50×correct(80.0%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 05:06:23,420 INFO __main__ - Process score : prm_mean=0.907 prm_final=0.940 → weighted=0.927
+2026-04-26 05:06:23,420 INFO __main__ - Step accuracy : 92.4% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 05:06:23,420 INFO __main__ - Chain integrity (LCCP): 84.2% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 05:06:23,420 INFO __main__ - (debug) final-answer accuracy: 80.0%
+2026-04-26 05:06:26,384 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9262 > 0.9199)
+2026-04-26 05:06:28,581 INFO __main__ - ======================================================================
+2026-04-26 05:06:28,582 INFO __main__ - GRPO ITERATION 16/60
+2026-04-26 05:06:28,582 INFO __main__ - ======================================================================
+2026-04-26 05:06:28,602 INFO __main__ - LR this iteration: 4.80e-06 | T=0.698 | MATH ratio=30%
+
Iter 16 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:06:33,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.918[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:33,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.426 = 0.50×0.33(prox=0.33) + 0.40×proc(0.211[fin=0.01,mean=0.51]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:06:33,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.427 = 0.50×0.33(prox=0.33) + 0.40×proc(0.214[fin=0.01,mean=0.51]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 05:06:42,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:42,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:43,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:43,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.427 = 0.50×0.33(prox=0.33) + 0.40×proc(0.214[fin=0.02,mean=0.50]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:06:51,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:51,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:06:51,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 0%| | 0/20 [00:24, ?q/s, loss=0.0015, mean_r=0.822, skip=0]
Iter 16 GRPO groups: 5%|5 | 1/20 [00:24<07:49, 24.74s/q, loss=0.0015, mean_r=0.822, skip=0]2026-04-26 05:06:57,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:03,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:03,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:04,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:04,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:13,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:13,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:14,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:14,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:20,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 5%|5 | 1/20 [00:52<07:49, 24.74s/q, loss=0var, mean_r=0.999, skip=1]
Iter 16 GRPO groups: 10%|# | 2/20 [00:52<07:54, 26.38s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:07:28,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:07:28,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.80(prox=0.80) + 0.40×proc(0.620[fin=0.64,mean=0.59]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:07:28,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:07:33,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:33,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.41(prox=0.41) + 0.40×proc(0.469[fin=0.56,mean=0.33]) + 0.10×fmt(1.000) | pred='36' gold='128' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:07:34,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.569 = 0.50×0.80(prox=0.80) + 0.40×proc(0.171[fin=0.09,mean=0.29]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:07:34,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.573 = 0.50×0.80(prox=0.80) + 0.40×proc(0.182[fin=0.23,mean=0.11]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+:1: SyntaxWarning: 'int' object is not callable; perhaps you missed a comma?
+2026-04-26 05:07:38,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:38,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.80(prox=0.80) + 0.40×proc(0.230[fin=0.15,mean=0.36]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:07:39,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 10%|# | 2/20 [01:11<07:54, 26.38s/q, loss=0.0003, mean_r=0.794, skip=1]
Iter 16 GRPO groups: 15%|#5 | 3/20 [01:11<06:36, 23.33s/q, loss=0.0003, mean_r=0.794, skip=1]2026-04-26 05:07:44,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:08:06,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 15%|#5 | 3/20 [01:37<06:36, 23.33s/q, loss=0var, mean_r=1.000, skip=2]
Iter 16 GRPO groups: 20%|## | 4/20 [01:37<06:27, 24.23s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 05:08:06,187 INFO src.rl.curriculum_manager - Topic probabilities (rollout 40): [('algebra', '0.264'), ('basic_arithmetic', '0.033'), ('single_step_word_problems', '0.033'), ('fractions', '0.033'), ('percentages', '0.033')]
+2026-04-26 05:08:10,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.978 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,227 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.987 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.980 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.985 = clip(base=0.905 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,387 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:16,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:16,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,120 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,312 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,705 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 16 GRPO groups: 20%|## | 4/20 [01:51<06:27, 24.23s/q, loss=0.0000, mean_r=0.985, q_acc=100%, q_rew=0.777, skip=2]
Iter 16 GRPO groups: 25%|##5 | 5/20 [01:51<05:07, 20.53s/q, loss=0.0000, mean_r=0.985, q_acc=100%, q_rew=0.777, skip=2]2026-04-26 05:08:27,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:28,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:28,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:51,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:51,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:08:51,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 16 GRPO groups: 25%|##5 | 5/20 [02:23<05:07, 20.53s/q, loss=0var, mean_r=0.997, skip=3]
Iter 16 GRPO groups: 30%|### | 6/20 [02:23<05:41, 24.36s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 05:08:55,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:12,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:12,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:13,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:13,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:22,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 30%|### | 6/20 [02:53<05:41, 24.36s/q, loss=0var, mean_r=0.998, skip=4]
Iter 16 GRPO groups: 35%|###5 | 7/20 [02:53<05:41, 26.27s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 05:09:56,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.623[fin=0.68,mean=0.54]) + 0.10×fmt(1.000) | pred='2' gold='6' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:09:56,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:09:56,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.616 = 0.50×0.71(prox=0.71) + 0.40×proc(0.397[fin=0.43,mean=0.35]) + 0.10×fmt(1.000) | pred='4.8' gold='6' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 05:10:10,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×0.60(prox=0.60) + 0.40×proc(0.795[fin=0.97,mean=0.54]) + 0.10×fmt(1.000) | pred='8' gold='6' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:10:10,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.00(prox=0.00) + 0.40×proc(0.614[fin=0.59,mean=0.64]) + 0.10×fmt(1.000) | pred='8/3' gold='6' | step_acc=73% lccp=18% (chain=2/11 ok_count=8) n_steps=11
+2026-04-26 05:10:10,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:10:10,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.33(prox=0.33) + 0.40×proc(0.394[fin=0.41,mean=0.36]) + 0.10×fmt(1.000) | pred='12' gold='6' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:10:20,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:10:20,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 35%|###5 | 7/20 [03:53<05:41, 26.27s/q, loss=-0.0006, mean_r=0.744, q_acc=100%, q_rew=0.777, skip=4]
Iter 16 GRPO groups: 40%|#### | 8/20 [03:53<07:24, 37.01s/q, loss=-0.0006, mean_r=0.744, q_acc=100%, q_rew=0.777, skip=4]2026-04-26 05:10:22,173 INFO src.rl.curriculum_manager - Topic probabilities (rollout 60): [('basic_arithmetic', '0.044'), ('single_step_word_problems', '0.044'), ('fractions', '0.044'), ('percentages', '0.044'), ('ratios', '0.044')]
+2026-04-26 05:10:33,950 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.983 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:10:34,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.691 novelty=0.72 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.73)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:10:34,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.943 novelty=0.72 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:34,548 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:10:34,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.842 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:10:34,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.883 = clip(base=0.803 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.806 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:10:35,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.972 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:10:35,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:35,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.987 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:35,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.888 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(0.62) | steps=8
+2026-04-26 05:10:47,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.743 = clip(base=0.663 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.592 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=10
+2026-04-26 05:10:47,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.747 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:10:47,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.757 = clip(base=0.677 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.595 novelty=0.73 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=7
+2026-04-26 05:10:48,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.616 novelty=0.73 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.50)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:10:48,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.647 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=5
+2026-04-26 05:10:48,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.590 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=6
+2026-04-26 05:10:48,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.748 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:10:49,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.872 = clip(base=0.792 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.760 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:10:49,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.587 novelty=0.73 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:10:49,450 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.593 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+
Iter 16 GRPO groups: 40%|#### | 8/20 [04:22<07:24, 37.01s/q, loss=-0.0001, mean_r=0.866, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 45%|####5 | 9/20 [04:22<06:19, 34.49s/q, loss=-0.0001, mean_r=0.866, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:10:54,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:10:54,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:03,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:04,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:04,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.37(prox=0.37) + 0.40×proc(0.473[fin=0.52,mean=0.41]) + 0.10×fmt(1.000) | pred='154' gold='84' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:11:04,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:12,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 45%|####5 | 9/20 [04:53<06:19, 34.49s/q, loss=-0.0002, mean_r=0.951, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 50%|##### | 10/20 [04:53<05:35, 33.55s/q, loss=-0.0002, mean_r=0.951, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:11:28,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:11:28,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:28,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:28,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:11:34,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:37,141 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:37,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 16 GRPO groups: 50%|##### | 10/20 [05:10<05:35, 33.55s/q, loss=0.0014, mean_r=0.971, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 55%|#####5 | 11/20 [05:10<04:13, 28.22s/q, loss=0.0014, mean_r=0.971, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:11:43,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.50(prox=0.50) + 0.40×proc(0.435[fin=0.45,mean=0.41]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:11:43,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:47,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:11:48,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:48,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:55,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 55%|#####5 | 11/20 [05:35<04:13, 28.22s/q, loss=-0.0002, mean_r=0.944, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 60%|###### | 12/20 [05:35<03:38, 27.32s/q, loss=-0.0002, mean_r=0.944, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:12:12,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:12,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:12:12,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:12,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.05(prox=0.05) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='154' gold='14' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 05:12:29,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:12:43,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:12:43,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 16 GRPO groups: 60%|###### | 12/20 [06:16<03:38, 27.32s/q, loss=0.0012, mean_r=0.954, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 65%|######5 | 13/20 [06:16<03:40, 31.44s/q, loss=0.0012, mean_r=0.954, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:12:49,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:12:49,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:13:00,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:13:00,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:13:00,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.29(prox=0.29) + 0.40×proc(0.754[fin=0.96,mean=0.44]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:00,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.546 = 0.50×0.29(prox=0.29) + 0.40×proc(0.758[fin=0.96,mean=0.45]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:05,960 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:13:06,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.29(prox=0.29) + 0.40×proc(0.753[fin=0.97,mean=0.43]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:06,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.552 = 0.50×0.29(prox=0.29) + 0.40×proc(0.773[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:06,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.551 = 0.50×0.29(prox=0.29) + 0.40×proc(0.770[fin=0.97,mean=0.47]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 16 GRPO groups: 65%|######5 | 13/20 [06:43<03:40, 31.44s/q, loss=0.0009, mean_r=0.772, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 70%|####### | 14/20 [06:43<03:01, 30.31s/q, loss=0.0009, mean_r=0.772, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:13:25,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.890 = 0.50×1.00(exact) + 0.40×proc(0.726[fin=0.68,mean=0.80]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=83% lccp=33% (chain=4/12 ok_count=10) n_steps=12
+2026-04-26 05:13:25,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.526 = 0.50×0.09(prox=0.09) + 0.40×proc(0.825[fin=0.97,mean=0.61]) + 0.10×fmt(1.000) | pred='-4' gold='1' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:13:25,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.101 = 0.50×0.00(prox=0.00) + 0.40×proc(0.077[fin=0.02,mean=0.16]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 05:13:25,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.499 = 0.50×0.33(prox=0.33) + 0.40×proc(0.505[fin=0.54,mean=0.45]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:13:36,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:13:36,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:13:36,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:13:36,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:13:58,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.405 = 0.50×0.00(prox=0.00) + 0.40×proc(0.687[fin=0.71,mean=0.65]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:13:58,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+
Iter 16 GRPO groups: 70%|####### | 14/20 [07:31<03:01, 30.31s/q, loss=0.0016, mean_r=0.742, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 75%|#######5 | 15/20 [07:31<02:57, 35.58s/q, loss=0.0016, mean_r=0.742, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:14:02,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:02,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 16 GRPO groups: 75%|#######5 | 15/20 [07:47<02:57, 35.58s/q, loss=0var, mean_r=0.999, skip=5]
Iter 16 GRPO groups: 80%|######## | 16/20 [07:47<01:58, 29.71s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 05:14:20,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:20,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.639 = 0.50×0.50(prox=0.50) + 0.40×proc(0.722[fin=0.79,mean=0.61]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:20,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.385 = 0.50×0.25(prox=0.25) + 0.40×proc(0.249[fin=0.06,mean=0.54]) + 0.10×fmt(1.000) | pred='-1' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:14:20,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×1.00(exact) + 0.40×proc(0.739[fin=0.82,mean=0.62]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:25,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.569 = 0.50×0.50(prox=0.50) + 0.40×proc(0.546[fin=0.57,mean=0.52]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:14:25,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.943 = 0.50×1.00(exact) + 0.40×proc(0.857[fin=0.97,mean=0.68]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:25,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.836[fin=0.91,mean=0.73]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:14:25,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:31,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:31,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.370 = 0.50×0.25(prox=0.25) + 0.40×proc(0.237[fin=0.08,mean=0.47]) + 0.10×fmt(1.000) | pred='5' gold='2' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 16 GRPO groups: 80%|######## | 16/20 [08:04<01:58, 29.71s/q, loss=-0.0014, mean_r=0.769, q_acc=100%, q_rew=0.788, skip=5]
Iter 16 GRPO groups: 85%|########5 | 17/20 [08:04<01:17, 25.82s/q, loss=-0.0014, mean_r=0.769, q_acc=100%, q_rew=0.788, skip=5]2026-04-26 05:14:38,352 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:38,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:48,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:59,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:59,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.916[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 05:15:00,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:15:00,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 85%|########5 | 17/20 [08:40<01:17, 25.82s/q, loss=0var, mean_r=0.994, skip=6]
Iter 16 GRPO groups: 90%|######### | 18/20 [08:40<00:57, 28.82s/q, loss=0var, mean_r=0.994, skip=6]2026-04-26 05:15:12,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:15:12,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:15:12,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:12,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.47(prox=0.47) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(0.650) | pred='4' gold='9' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:15:18,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:23,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:23,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 16 GRPO groups: 90%|######### | 18/20 [08:56<00:57, 28.82s/q, loss=0.0005, mean_r=0.952, q_acc=100%, q_rew=0.788, skip=6]
Iter 16 GRPO groups: 95%|#########5| 19/20 [08:56<00:24, 24.89s/q, loss=0.0005, mean_r=0.952, q_acc=100%, q_rew=0.788, skip=6]2026-04-26 05:15:30,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:30,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:15:41,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 95%|#########5| 19/20 [09:35<00:24, 24.89s/q, loss=0var, mean_r=0.999, skip=7]
Iter 16 GRPO groups: 100%|##########| 20/20 [09:35<00:00, 29.24s/q, loss=0var, mean_r=0.999, skip=7]
Iter 16 GRPO groups: 100%|##########| 20/20 [09:35<00:00, 28.77s/q, loss=0var, mean_r=0.999, skip=7]
+2026-04-26 05:16:04,112 INFO __main__ - Iter 16 | loss=0.0003 | reward mean=0.915 std=0.173 | gt_match=83.2% | grounded_acc=93.9% | step_acc=89.6% | lccp=84.4% | batch_acc=95.0% | phase=SELFPLAY_RAMP sp_ratio=11% | groups=15 skipped=7(0var=7) | lr=4.74e-06 | 575.5s
+2026-04-26 05:16:04,112 WARNING __main__ - STARVATION: 32% of groups skipped (zero variance). grounded_acc=93.9% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 05:16:04,112 INFO __main__ - Question generation: 2/2 valid (100%) | q_reward=0.788 | q_acc=100.0% (>0.5 quality) | topic=0.88 diff=0.58 clarity=1.00 novelty=0.45 solvability=0.96
+2026-04-26 05:16:04,114 INFO __main__ - ======================================================================
+2026-04-26 05:16:04,114 INFO __main__ - GRPO ITERATION 17/60
+2026-04-26 05:16:04,114 INFO __main__ - ======================================================================
+2026-04-26 05:16:04,135 INFO __main__ - LR this iteration: 4.74e-06 | T=0.692 | MATH ratio=30%
+
Iter 17 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:16:07,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:16:14,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:21,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:21,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 17 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 17 GRPO groups: 5%|5 | 1/20 [00:17<05:37, 17.76s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:16:27,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:27,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:36,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:36,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:36,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.351 = 0.50×0.08(prox=0.08) + 0.40×proc(0.457[fin=0.54,mean=0.34]) + 0.10×fmt(1.000) | pred='252' gold='36' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:16:36,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.207 = 0.50×0.00(prox=0.00) + 0.40×proc(0.175[fin=0.04,mean=0.38]) + 0.10×fmt(1.000) | pred='4/9' gold='36' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:16:44,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:16:44,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:44,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:44,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 5%|5 | 1/20 [00:51<05:37, 17.76s/q, loss=0.0001, mean_r=0.853, skip=1]
Iter 17 GRPO groups: 10%|# | 2/20 [00:51<08:12, 27.37s/q, loss=0.0001, mean_r=0.853, skip=1]2026-04-26 05:17:03,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.805[fin=0.97,mean=0.56]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:17:03,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.45(prox=0.45) + 0.40×proc(0.854[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='32' gold='20' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:17:03,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.693 = 0.50×1.00(exact) + 0.40×proc(0.232[fin=0.04,mean=0.53]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:17:03,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.825 = 0.50×0.85(prox=0.85) + 0.40×proc(0.751[fin=0.93,mean=0.48]) + 0.10×fmt(1.000) | pred='21' gold='20' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:17:12,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.787[fin=0.95,mean=0.55]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 05:17:12,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.705 = 0.50×0.77(prox=0.77) + 0.40×proc(0.551[fin=0.59,mean=0.50]) + 0.10×fmt(1.000) | pred='17' gold='20' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:17:12,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:17:12,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.71(prox=0.71) + 0.40×proc(0.548[fin=0.43,mean=0.73]) + 0.10×fmt(1.000) | pred='16' gold='20' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 05:17:20,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.794[fin=0.91,mean=0.63]) + 0.10×fmt(1.000) | pred='32' gold='20' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:17:20,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 17 GRPO groups: 10%|# | 2/20 [01:17<08:12, 27.37s/q, loss=-0.0004, mean_r=0.792, skip=1]
Iter 17 GRPO groups: 15%|#5 | 3/20 [01:17<07:34, 26.75s/q, loss=-0.0004, mean_r=0.792, skip=1]2026-04-26 05:17:21,999 INFO src.rl.curriculum_manager - Topic probabilities (rollout 80): [('basic_arithmetic', '0.045'), ('single_step_word_problems', '0.045'), ('fractions', '0.045'), ('percentages', '0.045'), ('ratios', '0.045')]
+2026-04-26 05:17:27,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.944 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.969 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:27,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.956 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,190 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.954 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,390 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:28,784 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.979 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,983 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.977 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:29,192 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.989 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:17:29,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.973 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:29,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.960 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,238 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.988 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:35,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,841 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,052 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.944 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,453 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,651 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 17 GRPO groups: 15%|#5 | 3/20 [01:32<07:34, 26.75s/q, loss=-0.0000, mean_r=1.000, q_acc=100%, q_rew=0.857, skip=2]
Iter 17 GRPO groups: 20%|## | 4/20 [01:32<05:54, 22.16s/q, loss=-0.0000, mean_r=1.000, q_acc=100%, q_rew=0.857, skip=2]2026-04-26 05:17:42,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:43,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:17:53,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 20%|## | 4/20 [02:11<05:54, 22.16s/q, loss=0var, mean_r=0.999, skip=3]
Iter 17 GRPO groups: 25%|##5 | 5/20 [02:11<07:02, 28.14s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 05:18:20,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:20,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.61(prox=0.61) + 0.40×proc(0.420[fin=0.52,mean=0.27]) + 0.10×fmt(0.650) | pred='1466' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:20,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:20,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.325 = 0.50×0.17(prox=0.17) + 0.40×proc(0.349[fin=0.36,mean=0.33]) + 0.10×fmt(1.000) | pred='7375' gold='2149' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 05:18:32,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.927 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=1.00,mean=0.77]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:32,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=1.00,mean=0.72]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:32,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.947 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:32,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:37,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:38,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 17 GRPO groups: 25%|##5 | 5/20 [02:35<07:02, 28.14s/q, loss=0.0005, mean_r=0.840, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 30%|### | 6/20 [02:35<06:12, 26.59s/q, loss=0.0005, mean_r=0.840, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:18:43,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:18:43,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.85(prox=0.85) + 0.40×proc(0.851[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='31' gold='29' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:19:01,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 30%|### | 6/20 [03:08<06:12, 26.59s/q, loss=-0.0006, mean_r=0.983, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 35%|###5 | 7/20 [03:08<06:12, 28.63s/q, loss=-0.0006, mean_r=0.983, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:19:17,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:17,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:18,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:18,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:27,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:27,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.865[fin=0.98,mean=0.69]) + 0.10×fmt(1.000) | pred='24' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:19:28,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:28,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:38,869 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:38,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 35%|###5 | 7/20 [03:36<06:12, 28.63s/q, loss=0.0007, mean_r=0.954, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 40%|#### | 8/20 [03:36<05:41, 28.46s/q, loss=0.0007, mean_r=0.954, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:20:13,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.14(prox=0.14) + 0.40×proc(0.828[fin=0.96,mean=0.63]) + 0.10×fmt(1.000) | pred='-2' gold='1' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:20:14,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.672[fin=0.57,mean=0.83]) + 0.10×fmt(1.000) | pred='2' gold='1' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:20:28,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.350 = 0.50×0.00(prox=0.00) + 0.40×proc(0.363[fin=0.05,mean=0.83]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=90% lccp=90% (chain=9/10 ok_count=9) n_steps=10
+2026-04-26 05:20:28,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.201 = 0.50×0.00(prox=0.00) + 0.40×proc(0.275[fin=0.15,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:20:28,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.33(prox=0.33) + 0.40×proc(0.224[fin=0.15,mean=0.33]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:20:28,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.248 = 0.50×0.00(prox=0.00) + 0.40×proc(0.445[fin=0.41,mean=0.50]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:20:39,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.342 = 0.50×0.00(prox=0.00) + 0.40×proc(0.359[fin=0.05,mean=0.82]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=86% lccp=86% (chain=6/7 ok_count=6) n_steps=7
+2026-04-26 05:20:40,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.826[fin=0.91,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 05:20:40,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 17 GRPO groups: 40%|#### | 8/20 [04:37<05:41, 28.46s/q, loss=0.0006, mean_r=0.506, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 45%|####5 | 9/20 [04:37<07:05, 38.69s/q, loss=0.0006, mean_r=0.506, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:20:41,584 INFO src.rl.curriculum_manager - Topic probabilities (rollout 100): [('statistics', '0.256'), ('basic_arithmetic', '0.034'), ('single_step_word_problems', '0.034'), ('fractions', '0.034'), ('percentages', '0.034')]
+2026-04-26 05:20:49,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.89 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,800 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:49,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.598 = clip(base=0.518 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.371 novelty=0.74 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=6
+2026-04-26 05:20:50,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,400 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,790 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:58,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:20:58,698 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.837 = clip(base=0.757 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.806 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:20:58,890 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,084 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,470 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:59,662 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,858 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:00,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:00,248 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 17 GRPO groups: 45%|####5 | 9/20 [04:57<07:05, 38.69s/q, loss=-0.0021, mean_r=0.955, q_acc=100%, q_rew=0.812, skip=3]
Iter 17 GRPO groups: 50%|##### | 10/20 [04:57<05:30, 33.01s/q, loss=-0.0021, mean_r=0.955, q_acc=100%, q_rew=0.812, skip=3]2026-04-26 05:21:05,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.676 = 0.50×0.73(prox=0.73) + 0.40×proc(0.523[fin=0.61,mean=0.40]) + 0.10×fmt(1.000) | pred='36' gold='44' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:21:12,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.654 = 0.50×0.73(prox=0.73) + 0.40×proc(0.469[fin=0.57,mean=0.31]) + 0.10×fmt(1.000) | pred='36' gold='44' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:21:19,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:26,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.27(prox=0.27) + 0.40×proc(0.750[fin=0.99,mean=0.39]) + 0.10×fmt(1.000) | pred='104' gold='44' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 17 GRPO groups: 50%|##### | 10/20 [05:23<05:30, 33.01s/q, loss=-0.0012, mean_r=0.884, q_acc=100%, q_rew=0.812, skip=3]
Iter 17 GRPO groups: 55%|#####5 | 11/20 [05:23<04:36, 30.77s/q, loss=-0.0012, mean_r=0.884, q_acc=100%, q_rew=0.812, skip=3]2026-04-26 05:21:27,565 INFO src.rl.curriculum_manager - Topic probabilities (rollout 120): [('statistics', '0.251'), ('basic_arithmetic', '0.035'), ('single_step_word_problems', '0.035'), ('fractions', '0.035'), ('percentages', '0.035')]
+2026-04-26 05:21:34,078 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,479 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,675 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:39,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.671 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:39,446 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.667 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.62)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:39,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:39,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.988 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.689 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:40,767 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.973 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.686 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.00) | steps=2
+
Iter 17 GRPO groups: 55%|#####5 | 11/20 [05:38<04:36, 30.77s/q, loss=-0.0001, mean_r=0.947, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 60%|###### | 12/20 [05:38<03:27, 25.98s/q, loss=-0.0001, mean_r=0.947, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:21:56,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.30(prox=0.30) + 0.40×proc(0.634[fin=0.67,mean=0.58]) + 0.10×fmt(1.000) | pred='-2324' gold='12834' | step_acc=56% lccp=22% (chain=2/9 ok_count=5) n_steps=9
+2026-04-26 05:21:56,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.901[fin=0.97,mean=0.79]) + 0.10×fmt(1.000) | pred='534' gold='12834' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 05:21:56,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.68(prox=0.68) + 0.40×proc(0.238[fin=0.20,mean=0.29]) + 0.10×fmt(1.000) | pred='9830' gold='12834' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 05:22:07,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.751[fin=0.81,mean=0.66]) + 0.10×fmt(1.000) | pred='-1950' gold='12834' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 05:22:07,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.04(prox=0.04) + 0.40×proc(0.716[fin=0.87,mean=0.49]) + 0.10×fmt(1.000) | pred='160910' gold='12834' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 05:22:08,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.443 = 0.50×0.29(prox=0.29) + 0.40×proc(0.443[fin=0.38,mean=0.55]) + 0.10×fmt(1.000) | pred='-3000' gold='12834' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:22:08,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.643[fin=0.62,mean=0.68]) + 0.10×fmt(1.000) | pred='534' gold='12834' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:22:23,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.745[fin=0.93,mean=0.47]) + 0.10×fmt(1.000) | pred='4763' gold='12834' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:22:23,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.30(prox=0.30) + 0.40×proc(0.487[fin=0.47,mean=0.52]) + 0.10×fmt(1.000) | pred='-1962' gold='12834' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:22:23,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.599 = 0.50×0.34(prox=0.34) + 0.40×proc(0.828[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='25500' gold='12834' | step_acc=50% lccp=0% (chain=0/8 ok_count=4) n_steps=8
+
Iter 17 GRPO groups: 60%|###### | 12/20 [06:20<03:27, 25.98s/q, loss=0.0001, mean_r=0.521, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 65%|######5 | 13/20 [06:20<03:36, 30.87s/q, loss=0.0001, mean_r=0.521, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:22:30,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.662 = 0.50×0.64(prox=0.64) + 0.40×proc(0.609[fin=0.59,mean=0.63]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 05:22:39,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.64(prox=0.64) + 0.40×proc(0.905[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:22:39,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.746 = 0.50×0.64(prox=0.64) + 0.40×proc(0.818[fin=0.91,mean=0.69]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:22:39,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:39,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.621 = 0.50×0.64(prox=0.64) + 0.40×proc(0.508[fin=0.50,mean=0.51]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:22:45,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:22:45,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:45,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:45,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:55,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 65%|######5 | 13/20 [06:53<03:36, 30.87s/q, loss=0.0006, mean_r=0.879, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 70%|####### | 14/20 [06:53<03:08, 31.35s/q, loss=0.0006, mean_r=0.879, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:23:03,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:03,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:03,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:13,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:13,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.72(prox=0.72) + 0.40×proc(0.823[fin=0.97,mean=0.60]) + 0.10×fmt(1.000) | pred='446.5' gold='551' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:23:13,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:13,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,020 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 17 GRPO groups: 70%|####### | 14/20 [07:22<03:08, 31.35s/q, loss=-0.0006, mean_r=0.978, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 75%|#######5 | 15/20 [07:22<02:33, 30.77s/q, loss=-0.0006, mean_r=0.978, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:23:32,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:23:40,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:40,402 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.913[fin=1.00,mean=0.79]) + 0.10×fmt(0.650) | pred='210' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:40,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.368 = 0.50×0.33(prox=0.33) + 0.40×proc(0.255[fin=0.16,mean=0.40]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:23:40,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.319 = 0.50×0.00(prox=0.00) + 0.40×proc(0.453[fin=0.49,mean=0.40]) + 0.10×fmt(1.000) | pred='6,537,021,8400' gold='105' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:23:47,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:23:47,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:47,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:47,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:23:53,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 17 GRPO groups: 75%|#######5 | 15/20 [07:51<02:33, 30.77s/q, loss=0.0003, mean_r=0.811, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 80%|######## | 16/20 [07:51<02:00, 30.19s/q, loss=0.0003, mean_r=0.811, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:24:02,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:02,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:24:02,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:12,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.43(prox=0.43) + 0.40×proc(0.431[fin=0.46,mean=0.38]) + 0.10×fmt(1.000) | pred='1' gold='3' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:24:12,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:12,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.880[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='12' gold='3' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:12,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:20,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:20,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:20,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 80%|######## | 16/20 [08:17<02:00, 30.19s/q, loss=0.0022, mean_r=0.904, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 85%|########5 | 17/20 [08:17<01:27, 29.13s/q, loss=0.0022, mean_r=0.904, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:24:28,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.64(prox=0.64) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:37,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.832[fin=0.99,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:24:37,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.54(prox=0.54) + 0.40×proc(0.925[fin=0.99,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:37,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.745 = 0.50×0.54(prox=0.54) + 0.40×proc(0.938[fin=0.98,mean=0.87]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:37,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.54(prox=0.54) + 0.40×proc(0.950[fin=0.98,mean=0.90]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:44,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:24:45,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.619 = 0.50×0.54(prox=0.54) + 0.40×proc(0.623[fin=0.66,mean=0.57]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:24:45,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.681 = 0.50×0.54(prox=0.54) + 0.40×proc(0.780[fin=0.79,mean=0.77]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:45,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.617 = 0.50×0.64(prox=0.64) + 0.40×proc(0.498[fin=0.50,mean=0.50]) + 0.10×fmt(1.000) | pred='5' gold='7' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:24:55,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 85%|########5 | 17/20 [08:52<01:27, 29.13s/q, loss=-0.0002, mean_r=0.790, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 90%|######### | 18/20 [08:52<01:01, 30.77s/q, loss=-0.0002, mean_r=0.790, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:25:30,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:30,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:30,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:37,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:37,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:25:37,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:25:37,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:25:47,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.440 = 0.50×0.33(prox=0.33) + 0.40×proc(0.339[fin=0.34,mean=0.34]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 17 GRPO groups: 90%|######### | 18/20 [09:44<01:01, 30.77s/q, loss=0.0001, mean_r=0.924, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 95%|#########5| 19/20 [09:44<00:37, 37.03s/q, loss=0.0001, mean_r=0.924, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:25:53,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:25:53,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:25:53,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.755 = 0.50×0.56(prox=0.56) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='300' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 17 GRPO groups: 95%|#########5| 19/20 [10:15<00:37, 37.03s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 100%|##########| 20/20 [10:15<00:00, 35.46s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 100%|##########| 20/20 [10:15<00:00, 30.80s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
+2026-04-26 05:26:20,134 INFO __main__ - Iter 17 | loss=-0.0001 | reward mean=0.888 std=0.195 | gt_match=70.1% | grounded_acc=91.6% | step_acc=85.6% | lccp=76.8% | batch_acc=93.8% | phase=SELFPLAY_RAMP sp_ratio=14% | groups=20 skipped=3(0var=3) | lr=4.68e-06 | 616.0s
+2026-04-26 05:26:20,134 INFO __main__ - Question generation: 3/3 valid (100%) | q_reward=0.798 | q_acc=100.0% (>0.5 quality) | topic=0.69 diff=0.89 clarity=1.00 novelty=0.46 solvability=1.00
+2026-04-26 05:26:20,136 INFO __main__ - ======================================================================
+2026-04-26 05:26:20,136 INFO __main__ - GRPO ITERATION 18/60
+2026-04-26 05:26:20,136 INFO __main__ - ======================================================================
+2026-04-26 05:26:20,157 INFO __main__ - LR this iteration: 4.68e-06 | T=0.685 | MATH ratio=30%
+
Iter 18 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:26:23,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:28,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:38,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 0%| | 0/20 [00:18, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 18 GRPO groups: 5%|5 | 1/20 [00:18<05:50, 18.43s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:26:43,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:26:43,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:43,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:55,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:55,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:26:55,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:55,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:02,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:02,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.861[fin=0.97,mean=0.69]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:27:02,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 5%|5 | 1/20 [00:42<05:50, 18.43s/q, loss=0var, mean_r=0.993, skip=2]
Iter 18 GRPO groups: 10%|# | 2/20 [00:42<06:31, 21.77s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 05:27:07,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:09,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:09,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.885[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:27:10,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.05(prox=0.05) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='340' gold='34' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:27:10,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:25,298 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:25,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:25,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:25,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:38,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 18 GRPO groups: 10%|# | 2/20 [01:19<06:31, 21.77s/q, loss=-0.0006, mean_r=0.944, skip=2]
Iter 18 GRPO groups: 15%|#5 | 3/20 [01:19<08:08, 28.73s/q, loss=-0.0006, mean_r=0.944, skip=2]2026-04-26 05:27:47,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:47,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.989[fin=0.99,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:47,813 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.414 = 0.50×0.36(prox=0.36) + 0.40×proc(0.427[fin=0.52,mean=0.28]) + 0.10×fmt(0.650) | pred='10000' gold='100002' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:27:54,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:54,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.822[fin=0.93,mean=0.66]) + 0.10×fmt(1.000) | pred='10100' gold='100002' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 05:27:54,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.989[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:54,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.973[fin=0.97,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.988[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,866 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.465 = 0.50×0.34(prox=0.34) + 0.40×proc(0.493[fin=0.57,mean=0.37]) + 0.10×fmt(1.000) | pred='1111' gold='100002' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 18 GRPO groups: 15%|#5 | 3/20 [01:45<08:08, 28.73s/q, loss=-0.0001, mean_r=0.528, skip=2]
Iter 18 GRPO groups: 20%|## | 4/20 [01:45<07:19, 27.48s/q, loss=-0.0001, mean_r=0.528, skip=2]2026-04-26 05:28:05,289 INFO src.rl.curriculum_manager - Topic probabilities (rollout 140): [('basic_arithmetic', '0.050'), ('fractions', '0.050'), ('percentages', '0.050'), ('ratios', '0.050'), ('money_problems', '0.050')]
+2026-04-26 05:28:10,605 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.952 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:10,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.989 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:28:11,397 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.984 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.984 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:12,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.993 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:28:12,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:18,337 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.985 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:18,540 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.751 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.76)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:28:18,737 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.835 = clip(base=0.755 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.728 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:28:18,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.993 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:28:19,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.717 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.69)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:28:19,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.628 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.53)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:28:19,530 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.938 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:28:19,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.943 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:19,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.637 novelty=0.70 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.51)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:28:20,117 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.957 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+
Iter 18 GRPO groups: 20%|## | 4/20 [02:01<07:19, 27.48s/q, loss=-0.0005, mean_r=0.948, q_acc=100%, q_rew=0.818, skip=3]
Iter 18 GRPO groups: 25%|##5 | 5/20 [02:01<05:52, 23.51s/q, loss=-0.0005, mean_r=0.948, q_acc=100%, q_rew=0.818, skip=3]2026-04-26 05:28:27,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.35(prox=0.35) + 0.40×proc(0.237[fin=0.11,mean=0.43]) + 0.10×fmt(1.000) | pred='468' gold='7020' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:28:37,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.737 = 0.50×0.60(prox=0.60) + 0.40×proc(0.844[fin=0.96,mean=0.67]) + 0.10×fmt(1.000) | pred='4680' gold='7020' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:28:37,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.765 = 0.50×0.60(prox=0.60) + 0.40×proc(0.912[fin=0.97,mean=0.82]) + 0.10×fmt(1.000) | pred='4680' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:37,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7020' gold='7020' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:37,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.56(prox=0.56) + 0.40×proc(0.930[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='4212' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7020' gold='7020' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:45,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.622[fin=0.61,mean=0.63]) + 0.10×fmt(1.000) | pred='14058' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.633 = 0.50×0.60(prox=0.60) + 0.40×proc(0.587[fin=0.56,mean=0.62]) + 0.10×fmt(1.000) | pred='4644' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.727[fin=0.76,mean=0.67]) + 0.10×fmt(1.000) | pred='14052' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:53,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.871[fin=0.95,mean=0.75]) + 0.10×fmt(1.000) | pred='14040' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 18 GRPO groups: 25%|##5 | 5/20 [02:34<05:52, 23.51s/q, loss=0.0007, mean_r=0.696, q_acc=100%, q_rew=0.818, skip=3]
Iter 18 GRPO groups: 30%|### | 6/20 [02:34<06:15, 26.83s/q, loss=0.0007, mean_r=0.696, q_acc=100%, q_rew=0.818, skip=3]2026-04-26 05:28:58,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:58,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:28:58,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:29:05,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:29:05,392 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:05,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:05,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 18 GRPO groups: 30%|### | 6/20 [02:53<06:15, 26.83s/q, loss=0var, mean_r=0.986, skip=4]
Iter 18 GRPO groups: 35%|###5 | 7/20 [02:53<05:14, 24.19s/q, loss=0var, mean_r=0.986, skip=4]2026-04-26 05:29:19,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.750[fin=0.85,mean=0.60]) + 0.10×fmt(1.000) | pred='490' gold='250' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:29:29,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.745[fin=0.85,mean=0.59]) + 0.10×fmt(1.000) | pred='490' gold='250' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:29:29,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:29,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:29,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:29:38,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×0.68(prox=0.68) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='310' gold='250' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:29:38,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:38,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:38,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.663 = 0.50×0.68(prox=0.68) + 0.40×proc(0.564[fin=0.58,mean=0.54]) + 0.10×fmt(1.000) | pred='310' gold='250' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 05:29:49,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 18 GRPO groups: 35%|###5 | 7/20 [03:31<05:14, 24.19s/q, loss=-0.0006, mean_r=0.855, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 40%|#### | 8/20 [03:31<05:41, 28.43s/q, loss=-0.0006, mean_r=0.855, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:30:25,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:30:25,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:30:25,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:30:37,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 05:30:37,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×0.80(prox=0.80) + 0.40×proc(0.771[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:30:37,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.939 = 0.50×1.00(exact) + 0.40×proc(0.847[fin=0.99,mean=0.63]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:30:37,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.848 = 0.50×0.80(prox=0.80) + 0.40×proc(0.869[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:30:42,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.599 = 0.50×0.67(prox=0.67) + 0.40×proc(0.414[fin=0.54,mean=0.23]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:30:42,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.80(prox=0.80) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 18 GRPO groups: 40%|#### | 8/20 [04:23<05:41, 28.43s/q, loss=0.0006, mean_r=0.892, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 45%|####5 | 9/20 [04:23<06:36, 36.05s/q, loss=0.0006, mean_r=0.892, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:30:50,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.85(prox=0.85) + 0.40×proc(0.380[fin=0.33,mean=0.46]) + 0.10×fmt(1.000) | pred='1.83' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:30:50,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.849[fin=0.91,mean=0.75]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.80(prox=0.80) + 0.40×proc(0.728[fin=0.80,mean=0.61]) + 0.10×fmt(1.000) | pred='2.25' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:31:00,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.793[fin=0.93,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:31:09,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.85(prox=0.85) + 0.40×proc(0.725[fin=0.82,mean=0.59]) + 0.10×fmt(1.000) | pred='1.83' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 05:31:09,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=0.99,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:10,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.813 = 0.50×0.80(prox=0.80) + 0.40×proc(0.783[fin=0.90,mean=0.60]) + 0.10×fmt(1.000) | pred='2.25' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:31:10,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 18 GRPO groups: 45%|####5 | 9/20 [04:59<06:36, 36.05s/q, loss=0.0001, mean_r=0.891, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 50%|##### | 10/20 [04:59<05:59, 35.97s/q, loss=0.0001, mean_r=0.891, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:31:24,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.268 = 0.50×0.00(prox=0.00) + 0.40×proc(0.420[fin=0.50,mean=0.30]) + 0.10×fmt(1.000) | pred='2/3' gold='8' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:31:24,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.417 = 0.50×0.29(prox=0.29) + 0.40×proc(0.374[fin=0.29,mean=0.49]) + 0.10×fmt(1.000) | pred='18' gold='8' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:31:24,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.104 = 0.50×0.00(prox=0.00) + 0.40×proc(0.097[fin=0.10,mean=0.10]) + 0.10×fmt(0.650) | pred='2/3' gold='8' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 05:31:24,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.629 = 0.50×0.40(prox=0.40) + 0.40×proc(0.822[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:29,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.590 = 0.50×0.40(prox=0.40) + 0.40×proc(0.725[fin=0.91,mean=0.44]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:29,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:29,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.29(prox=0.29) + 0.40×proc(0.678[fin=0.80,mean=0.50]) + 0.10×fmt(1.000) | pred='18' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:30,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:35,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:35,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.50(prox=0.50) + 0.40×proc(0.310[fin=0.21,mean=0.47]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 18 GRPO groups: 50%|##### | 10/20 [05:16<05:59, 35.97s/q, loss=0.0038, mean_r=0.595, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 55%|#####5 | 11/20 [05:16<04:31, 30.17s/q, loss=0.0038, mean_r=0.595, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:31:36,894 INFO src.rl.curriculum_manager - Topic probabilities (rollout 160): [('basic_arithmetic', '0.052'), ('fractions', '0.052'), ('percentages', '0.052'), ('money_problems', '0.052'), ('time_distance', '0.052')]
+2026-04-26 05:31:44,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,656 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.988 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,246 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,442 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:31:45,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:51,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.988 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,389 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:51,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.691 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:31:51,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.714 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:31:52,161 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:52,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:52,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.972 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:52,737 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.943 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+
Iter 18 GRPO groups: 55%|#####5 | 11/20 [05:34<04:31, 30.17s/q, loss=0.0005, mean_r=0.939, q_acc=100%, q_rew=0.762, skip=4]
Iter 18 GRPO groups: 60%|###### | 12/20 [05:34<03:30, 26.32s/q, loss=0.0005, mean_r=0.939, q_acc=100%, q_rew=0.762, skip=4]2026-04-26 05:31:57,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:57,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:32:07,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 60%|###### | 12/20 [05:52<03:30, 26.32s/q, loss=0var, mean_r=0.994, skip=5]
Iter 18 GRPO groups: 65%|######5 | 13/20 [05:52<02:46, 23.74s/q, loss=0var, mean_r=0.994, skip=5]2026-04-26 05:32:21,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×0.85(prox=0.85) + 0.40×proc(0.908[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='2.125' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:21,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×0.85(prox=0.85) + 0.40×proc(0.709[fin=0.69,mean=0.74]) + 0.10×fmt(1.000) | pred='2.06' gold='2' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 05:32:21,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=0.97,mean=0.89]) + 0.10×fmt(1.000) | pred='1.9375' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:32:22,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.884[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:32:35,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.67(prox=0.67) + 0.40×proc(0.858[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:32:35,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×0.85(prox=0.85) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='1.94' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:32:35,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:35,402 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.722 = 0.50×0.62(prox=0.62) + 0.40×proc(0.786[fin=0.96,mean=0.52]) + 0.10×fmt(1.000) | pred='1.375' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 05:32:50,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.847 = 0.50×0.85(prox=0.85) + 0.40×proc(0.805[fin=0.94,mean=0.61]) + 0.10×fmt(1.000) | pred='1.9375' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:50,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.67(prox=0.67) + 0.40×proc(0.791[fin=1.00,mean=0.48]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+
Iter 18 GRPO groups: 65%|######5 | 13/20 [06:31<02:46, 23.74s/q, loss=-0.0010, mean_r=0.851, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 70%|####### | 14/20 [06:31<02:51, 28.51s/q, loss=-0.0010, mean_r=0.851, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:32:59,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:32:59,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:08,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:33:08,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.50(prox=0.50) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:33:08,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.914 = 0.50×1.00(exact) + 0.40×proc(0.784[fin=0.88,mean=0.64]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:33:09,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:33:15,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.679 = 0.50×0.50(prox=0.50) + 0.40×proc(0.822[fin=0.83,mean=0.81]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:33:16,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.638 = 0.50×0.50(prox=0.50) + 0.40×proc(0.720[fin=0.74,mean=0.69]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:33:16,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.357 = 0.50×0.11(prox=0.11) + 0.40×proc(0.506[fin=0.64,mean=0.30]) + 0.10×fmt(1.000) | pred='40.5' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:33:16,233 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.80(prox=0.80) + 0.40×proc(0.737[fin=0.90,mean=0.50]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 18 GRPO groups: 70%|####### | 14/20 [07:10<02:51, 28.51s/q, loss=0.0004, mean_r=0.810, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 75%|#######5 | 15/20 [07:10<02:37, 31.55s/q, loss=0.0004, mean_r=0.810, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:33:36,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:36,629 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:36,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.783 = 0.50×0.60(prox=0.60) + 0.40×proc(0.956[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='26' gold='39' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 05:33:36,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.779 = 0.50×0.60(prox=0.60) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='26' gold='39' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 05:33:51,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:51,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:33:51,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:51,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.76(prox=0.76) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:33:58,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:58,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 18 GRPO groups: 75%|#######5 | 15/20 [07:39<02:37, 31.55s/q, loss=0.0012, mean_r=0.942, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 80%|######## | 16/20 [07:39<02:03, 30.99s/q, loss=0.0012, mean_r=0.942, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:34:00,029 INFO src.rl.curriculum_manager - Topic probabilities (rollout 180): [('basic_arithmetic', '0.054'), ('percentages', '0.054'), ('money_problems', '0.054'), ('time_distance', '0.054'), ('multi_step_reasoning', '0.054')]
+2026-04-26 05:34:06,357 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.978 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:06,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.982 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:06,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.180 novelty=0.73 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.12)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:06,917 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.956 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:07,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.979 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:07,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.502 = clip(base=0.422 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.253 novelty=0.73 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:07,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:34:07,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.981 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:07,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:34:08,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.436 = clip(base=0.356 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.140 novelty=0.73 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.09)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.944 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:13,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.834 = clip(base=0.754 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.709 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.972 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:13,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.705 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.828 = clip(base=0.748 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.700 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.711 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,360 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.675 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.676 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.672 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.720 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=3
+
Iter 18 GRPO groups: 80%|######## | 16/20 [07:56<02:03, 30.99s/q, loss=0.0005, mean_r=0.850, q_acc=100%, q_rew=0.782, skip=5]
Iter 18 GRPO groups: 85%|########5 | 17/20 [07:56<01:19, 26.66s/q, loss=0.0005, mean_r=0.850, q_acc=100%, q_rew=0.782, skip=5]2026-04-26 05:34:16,633 INFO src.rl.curriculum_manager - Topic probabilities (rollout 200): [('basic_arithmetic', '0.056'), ('money_problems', '0.056'), ('time_distance', '0.056'), ('multi_step_reasoning', '0.056'), ('mixed_operations', '0.056')]
+2026-04-26 05:34:20,864 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.674 novelty=0.65 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:34:21,046 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:21,237 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.800 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:34:21,425 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.891 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:21,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:21,803 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.623 = clip(base=0.543 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.532 novelty=0.65 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:21,990 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.718 = clip(base=0.638 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.660 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:34:22,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.934 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:22,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.934 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:22,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.713 = clip(base=0.633 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.650 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:34:26,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.968 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:26,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.941 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:26,918 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.976 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.960 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:27,299 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.980 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.895 = clip(base=0.815 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.950 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:27,872 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.980 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:28,060 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.987 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:28,250 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.977 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+
Iter 18 GRPO groups: 85%|########5 | 17/20 [08:09<01:19, 26.66s/q, loss=-0.0021, mean_r=0.858, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 90%|######### | 18/20 [08:09<00:45, 22.65s/q, loss=-0.0021, mean_r=0.858, q_acc=100%, q_rew=0.739, skip=5]2026-04-26 05:34:35,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:35,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:44,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:34:44,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:44,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:44,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:53,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:53,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:34:53,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.70(prox=0.70) + 0.40×proc(0.356[fin=0.49,mean=0.15]) + 0.10×fmt(1.000) | pred='255' gold='210' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:34:53,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 18 GRPO groups: 90%|######### | 18/20 [08:41<00:45, 22.65s/q, loss=0.0004, mean_r=0.953, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 95%|#########5| 19/20 [08:41<00:25, 25.29s/q, loss=0.0004, mean_r=0.953, q_acc=100%, q_rew=0.739, skip=5]2026-04-26 05:35:07,842 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.519 = 0.50×0.46(prox=0.46) + 0.40×proc(0.281[fin=0.12,mean=0.53]) + 0.10×fmt(1.000) | pred='79' gold='50' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:35:07,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.583 = 0.50×0.64(prox=0.64) + 0.40×proc(0.407[fin=0.36,mean=0.48]) + 0.10×fmt(1.000) | pred='36' gold='50' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:35:08,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.566 = 0.50×0.71(prox=0.71) + 0.40×proc(0.271[fin=0.05,mean=0.60]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:35:08,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.751 = 0.50×0.67(prox=0.67) + 0.40×proc(0.794[fin=0.89,mean=0.65]) + 0.10×fmt(1.000) | pred='62.5' gold='50' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:35:19,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×0.85(prox=0.85) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:35:19,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.595 = 0.50×0.53(prox=0.53) + 0.40×proc(0.572[fin=0.54,mean=0.63]) + 0.10×fmt(1.000) | pred='72' gold='50' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:35:20,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×0.83(prox=0.83) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='55' gold='50' | step_acc=86% lccp=43% (chain=3/7 ok_count=6) n_steps=7
+2026-04-26 05:35:20,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.568 = 0.50×0.71(prox=0.71) + 0.40×proc(0.278[fin=0.06,mean=0.61]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:35:29,135 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×0.85(prox=0.85) + 0.40×proc(0.764[fin=0.95,mean=0.49]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:35:29,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.560 = 0.50×0.71(prox=0.71) + 0.40×proc(0.258[fin=0.08,mean=0.52]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 18 GRPO groups: 95%|#########5| 19/20 [09:10<00:25, 25.29s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 26.50s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 27.53s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
+2026-04-26 05:35:30,710 INFO __main__ - Iter 18 | loss=0.0001 | reward mean=0.866 std=0.178 | gt_match=59.1% | grounded_acc=94.3% | step_acc=83.1% | lccp=69.2% | batch_acc=95.4% | phase=SELFPLAY_RAMP sp_ratio=18% | groups=19 skipped=5(0var=5) | lr=4.60e-06 | 550.6s
+2026-04-26 05:35:30,710 INFO __main__ - Question generation: 4/4 valid (100%) | q_reward=0.739 | q_acc=100.0% (>0.5 quality) | topic=0.64 diff=0.63 clarity=1.00 novelty=0.45 solvability=0.98
+2026-04-26 05:35:30,711 INFO __main__ - ======================================================================
+2026-04-26 05:35:30,712 INFO __main__ - GRPO ITERATION 19/60
+2026-04-26 05:35:30,712 INFO __main__ - ======================================================================
+2026-04-26 05:35:30,733 INFO __main__ - LR this iteration: 4.60e-06 | T=0.678 | MATH ratio=32%
+
Iter 19 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:35:34,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:35,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:35:50,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 0%| | 0/20 [00:27, ?q/s, loss=0var, mean_r=0.998, skip=1]
Iter 19 GRPO groups: 5%|5 | 1/20 [00:27<08:43, 27.53s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 05:36:03,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:03,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.825[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:03,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:04,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:36:13,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:13,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:13,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.71(prox=0.71) + 0.40×proc(0.694[fin=0.84,mean=0.48]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:13,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×0.71(prox=0.71) + 0.40×proc(0.860[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:23,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.725 = 0.50×0.71(prox=0.71) + 0.40×proc(0.671[fin=0.81,mean=0.46]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:36:23,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 19 GRPO groups: 5%|5 | 1/20 [00:54<08:43, 27.53s/q, loss=0.0011, mean_r=0.919, skip=1]
Iter 19 GRPO groups: 10%|# | 2/20 [00:54<08:11, 27.29s/q, loss=0.0011, mean_r=0.919, skip=1]2026-04-26 05:36:25,389 INFO src.rl.curriculum_manager - Topic probabilities (rollout 220): [('statistics', '0.233'), ('money_problems', '0.044'), ('time_distance', '0.044'), ('multi_step_reasoning', '0.044'), ('mixed_operations', '0.044')]
+2026-04-26 05:36:33,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.987 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.965 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,635 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:36:34,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.986 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:34,245 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.963 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:34,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:34,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.787 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:36:34,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.980 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:35,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:39,976 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,398 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.519 = clip(base=0.439 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.341 novelty=0.72 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.40)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:36:40,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,810 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,430 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,638 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 19 GRPO groups: 10%|# | 2/20 [01:12<08:11, 27.29s/q, loss=-0.0009, mean_r=0.931, q_acc=100%, q_rew=0.706, skip=1]
Iter 19 GRPO groups: 15%|#5 | 3/20 [01:12<06:32, 23.10s/q, loss=-0.0009, mean_r=0.931, q_acc=100%, q_rew=0.706, skip=1]2026-04-26 05:36:46,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:46,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:51,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:51,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:51,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:51,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:55,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.314 = 0.50×0.00(prox=0.00) + 0.40×proc(0.623[fin=0.77,mean=0.40]) + 0.10×fmt(0.650) | pred='$\\sqrt{\\pi}$' gold='2' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:36:55,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:55,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:55,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 15%|#5 | 3/20 [01:28<06:32, 23.10s/q, loss=-0.0018, mean_r=0.914, q_acc=100%, q_rew=0.706, skip=1]
Iter 19 GRPO groups: 20%|## | 4/20 [01:28<05:23, 20.22s/q, loss=-0.0018, mean_r=0.914, q_acc=100%, q_rew=0.706, skip=1]2026-04-26 05:37:02,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:03,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:03,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:03,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:07,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:08,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:08,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:08,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:14,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:14,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 20%|## | 4/20 [01:43<05:23, 20.22s/q, loss=0var, mean_r=0.999, skip=2]
Iter 19 GRPO groups: 25%|##5 | 5/20 [01:43<04:35, 18.38s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 05:37:14,419 INFO src.rl.curriculum_manager - Topic probabilities (rollout 240): [('money_problems', '0.063'), ('time_distance', '0.063'), ('multi_step_reasoning', '0.063'), ('mixed_operations', '0.063'), ('comparison_problems', '0.063')]
+2026-04-26 05:37:24,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.592 novelty=0.73 | sol=0.45*prm_final(0.44)+0.35*prm_mean(0.75)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:24,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.740 = clip(base=0.660 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.640 novelty=0.73 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:24,533 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.855 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:24,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.753 novelty=0.73 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:24,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.795 novelty=0.73 | sol=0.45*prm_final(0.67)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:25,191 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.669 = clip(base=0.589 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.524 novelty=0.73 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.72)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,417 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.747 = clip(base=0.667 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.654 novelty=0.73 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.584 novelty=0.73 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.75)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.848 = clip(base=0.768 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.818 novelty=0.73 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:26,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.698 = clip(base=0.618 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.563 novelty=0.73 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:32,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.704 = clip(base=0.624 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.540 novelty=0.75 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.55)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:32,991 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.643 = clip(base=0.563 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.478 novelty=0.75 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.51)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.205 novelty=0.75 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.33)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.696 novelty=0.75 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.953 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:37:33,811 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.742 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:34,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.968 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:37:34,214 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.715 novelty=0.75 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:34,414 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.661 = clip(base=0.581 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.512 novelty=0.75 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.61)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:37:34,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.733 novelty=0.75 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=3
+
Iter 19 GRPO groups: 25%|##5 | 5/20 [02:05<04:35, 18.38s/q, loss=0.0009, mean_r=0.759, q_acc=100%, q_rew=0.702, skip=2]
Iter 19 GRPO groups: 30%|### | 6/20 [02:05<04:33, 19.57s/q, loss=0.0009, mean_r=0.759, q_acc=100%, q_rew=0.702, skip=2]2026-04-26 05:37:39,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:39,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.435 = 0.50×0.50(prox=0.50) + 0.40×proc(0.213[fin=0.22,mean=0.21]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:37:43,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:43,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:43,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.734[fin=0.95,mean=0.41]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:43,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:50,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.50(prox=0.50) + 0.40×proc(0.269[fin=0.29,mean=0.23]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:37:50,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.98,mean=0.45]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:50,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.906 = 0.50×1.00(exact) + 0.40×proc(0.765[fin=0.96,mean=0.48]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:50,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 30%|### | 6/20 [02:28<04:33, 19.57s/q, loss=-0.0019, mean_r=0.860, q_acc=100%, q_rew=0.702, skip=2]
Iter 19 GRPO groups: 35%|###5 | 7/20 [02:28<04:28, 20.63s/q, loss=-0.0019, mean_r=0.860, q_acc=100%, q_rew=0.702, skip=2]2026-04-26 05:38:03,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:03,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:03,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:03,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:08,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:38:08,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:08,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:08,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:15,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:38:15,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 19 GRPO groups: 35%|###5 | 7/20 [02:44<04:28, 20.63s/q, loss=0var, mean_r=0.989, skip=3]
Iter 19 GRPO groups: 40%|#### | 8/20 [02:44<03:52, 19.34s/q, loss=0var, mean_r=0.989, skip=3]2026-04-26 05:38:20,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×0.85(prox=0.85) + 0.40×proc(0.799[fin=0.96,mean=0.55]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:38:20,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.85(prox=0.85) + 0.40×proc(0.558[fin=0.65,mean=0.42]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:27,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:38:28,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.803 = 0.50×0.85(prox=0.85) + 0.40×proc(0.694[fin=0.82,mean=0.50]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:28,120 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:28,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.841[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:38:38,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.85(prox=0.85) + 0.40×proc(0.760[fin=0.91,mean=0.53]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:38,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×0.85(prox=0.85) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:38:38,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.629 = 0.50×0.85(prox=0.85) + 0.40×proc(0.261[fin=0.27,mean=0.24]) + 0.10×fmt(1.000) | pred='5192' gold='5217' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:38:38,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 40%|#### | 8/20 [03:14<03:52, 19.34s/q, loss=-0.0002, mean_r=0.856, q_acc=100%, q_rew=0.702, skip=3]
Iter 19 GRPO groups: 45%|####5 | 9/20 [03:14<04:07, 22.51s/q, loss=-0.0002, mean_r=0.856, q_acc=100%, q_rew=0.702, skip=3]2026-04-26 05:38:51,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:38:51,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:38:51,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:51,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:05,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.50(prox=0.50) + 0.40×proc(0.305[fin=0.16,mean=0.52]) + 0.10×fmt(1.000) | pred='98' gold='195' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:39:05,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:05,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.93,mean=0.82]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 05:39:06,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.80(prox=0.80) + 0.40×proc(0.806[fin=0.99,mean=0.53]) + 0.10×fmt(1.000) | pred='171' gold='195' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:39:16,659 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:16,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 45%|####5 | 9/20 [03:47<04:07, 22.51s/q, loss=0.0012, mean_r=0.923, q_acc=100%, q_rew=0.702, skip=3]
Iter 19 GRPO groups: 50%|##### | 10/20 [03:47<04:17, 25.76s/q, loss=0.0012, mean_r=0.923, q_acc=100%, q_rew=0.702, skip=3]2026-04-26 05:39:21,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:21,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:39:31,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 19 GRPO groups: 50%|##### | 10/20 [04:15<04:17, 25.76s/q, loss=0var, mean_r=0.997, skip=4]
Iter 19 GRPO groups: 55%|#####5 | 11/20 [04:15<03:56, 26.31s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 05:39:51,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:51,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:51,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.511 = 0.50×0.45(prox=0.45) + 0.40×proc(0.384[fin=0.38,mean=0.39]) + 0.10×fmt(1.000) | pred='240' gold='600' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 05:39:51,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:40:05,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:40:05,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 05:40:05,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.37(prox=0.37) + 0.40×proc(0.645[fin=0.84,mean=0.36]) + 0.10×fmt(1.000) | pred='90' gold='600' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:40:05,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.589 = 0.50×0.37(prox=0.37) + 0.40×proc(0.766[fin=0.97,mean=0.46]) + 0.10×fmt(1.000) | pred='80' gold='600' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:40:13,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.819 = 0.50×0.75(prox=0.75) + 0.40×proc(0.860[fin=0.96,mean=0.71]) + 0.10×fmt(1.000) | pred='700' gold='600' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 05:40:13,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 19 GRPO groups: 55%|#####5 | 11/20 [04:44<03:56, 26.31s/q, loss=0.0010, mean_r=0.839, q_acc=100%, q_rew=0.702, skip=4]
Iter 19 GRPO groups: 60%|###### | 12/20 [04:44<03:37, 27.19s/q, loss=0.0010, mean_r=0.839, q_acc=100%, q_rew=0.702, skip=4]2026-04-26 05:40:14,961 INFO src.rl.curriculum_manager - Topic probabilities (rollout 260): [('money_problems', '0.067'), ('time_distance', '0.067'), ('multi_step_reasoning', '0.067'), ('mixed_operations', '0.067'), ('comparison_problems', '0.067')]
+2026-04-26 05:40:20,811 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.946 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,583 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,167 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,359 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,555 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:29,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.960 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:29,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.512 = clip(base=0.432 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.300 novelty=0.68 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.55)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:40:30,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,374 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.670 = clip(base=0.590 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.603 novelty=0.68 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.48)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:40:30,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.399 = clip(base=0.319 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.188 novelty=0.68 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.13)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:40:31,176 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:31,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:31,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+
Iter 19 GRPO groups: 60%|###### | 12/20 [05:02<03:37, 27.19s/q, loss=0.0000, mean_r=0.914, q_acc=100%, q_rew=0.716, skip=5]
Iter 19 GRPO groups: 65%|######5 | 13/20 [05:02<02:51, 24.47s/q, loss=0.0000, mean_r=0.914, q_acc=100%, q_rew=0.716, skip=5]2026-04-26 05:40:33,195 INFO src.rl.curriculum_manager - Topic probabilities (rollout 280): [('money_problems', '0.072'), ('time_distance', '0.072'), ('multi_step_reasoning', '0.072'), ('comparison_problems', '0.072'), ('optimization_problems', '0.072')]
+2026-04-26 05:40:42,725 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.976 + mod=+0.080, cap=1.00) | Q=0.94 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:42,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.973 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:43,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:43,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.985 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:43,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:43,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.849 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:40:43,909 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.986 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:44,105 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.990 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:44,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.979 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:44,491 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.813 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.79)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:40:50,249 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.970 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:50,446 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.983 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:50,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.992 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:50,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.911 novelty=0.71 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:40:51,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.478 novelty=0.71 | sol=0.45*prm_final(0.63)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:40:51,236 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.406 = clip(base=0.326 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.159 novelty=0.71 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:40:51,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.617 = clip(base=0.537 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.414 novelty=0.71 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.69)+0.20*lccp(0.80) | steps=5
+2026-04-26 05:40:51,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.990 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:51,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.829 = clip(base=0.749 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.781 novelty=0.71 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:52,021 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 19 GRPO groups: 65%|######5 | 13/20 [05:22<02:51, 24.47s/q, loss=0.0008, mean_r=0.901, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 70%|####### | 14/20 [05:22<02:19, 23.26s/q, loss=0.0008, mean_r=0.901, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:41:27,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.842[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='521' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:27,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.350 = 0.50×0.34(prox=0.34) + 0.40×proc(0.136[fin=0.10,mean=0.20]) + 0.10×fmt(1.000) | pred='156' gold='5067' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:41:39,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.470 = 0.50×0.35(prox=0.35) + 0.40×proc(0.306[fin=0.17,mean=0.51]) + 0.10×fmt(1.000) | pred='260' gold='5067' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:41:39,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.251 = 0.50×0.10(prox=0.10) + 0.40×proc(0.156[fin=0.08,mean=0.27]) + 0.10×fmt(1.000) | pred='-17240' gold='5067' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:41:39,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='506' gold='5067' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:41:39,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.868[fin=0.95,mean=0.74]) + 0.10×fmt(1.000) | pred='501' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:49,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.797[fin=0.90,mean=0.64]) + 0.10×fmt(1.000) | pred='410' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:49,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.305[fin=0.14,mean=0.56]) + 0.10×fmt(1.000) | pred='2407' gold='5067' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:41:50,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.39(prox=0.39) + 0.40×proc(0.655[fin=0.59,mean=0.75]) + 0.10×fmt(1.000) | pred='1106' gold='5067' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+
Iter 19 GRPO groups: 70%|####### | 14/20 [06:20<02:19, 23.26s/q, loss=0.0007, mean_r=0.486, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 75%|#######5 | 15/20 [06:20<02:48, 33.64s/q, loss=0.0007, mean_r=0.486, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:42:24,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.690[fin=0.75,mean=0.61]) + 0.10×fmt(1.000) | pred='11' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:42:33,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.589 = 0.50×0.60(prox=0.60) + 0.40×proc(0.472[fin=0.39,mean=0.60]) + 0.10×fmt(1.000) | pred='8' gold='6' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:42:33,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.700 = 0.50×0.75(prox=0.75) + 0.40×proc(0.563[fin=0.53,mean=0.62]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:42:33,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=0.95,mean=0.88]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:42:33,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.736[fin=0.64,mean=0.88]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=91% lccp=82% (chain=9/11 ok_count=10) n_steps=11
+2026-04-26 05:42:56,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=0.78,mean=0.85]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:42:56,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.75(prox=0.75) + 0.40×proc(0.710[fin=0.87,mean=0.47]) + 0.10×fmt(0.650) | pred='5' gold='6' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:42:56,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=0.98,mean=0.94]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+
Iter 19 GRPO groups: 75%|#######5 | 15/20 [07:26<02:48, 33.64s/q, loss=0.0024, mean_r=0.792, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 80%|######## | 16/20 [07:26<02:53, 43.44s/q, loss=0.0024, mean_r=0.792, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:42:59,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:09,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:09,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:10,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:10,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:14,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 80%|######## | 16/20 [07:43<02:53, 43.44s/q, loss=0var, mean_r=0.999, skip=6]
Iter 19 GRPO groups: 85%|########5 | 17/20 [07:43<01:46, 35.47s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 05:43:17,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:17,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:43:17,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:43:22,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:28,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:28,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:29,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 85%|########5 | 17/20 [07:58<01:46, 35.47s/q, loss=0var, mean_r=0.987, skip=7]
Iter 19 GRPO groups: 90%|######### | 18/20 [07:58<00:58, 29.20s/q, loss=0var, mean_r=0.987, skip=7]2026-04-26 05:43:30,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:47,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 90%|######### | 18/20 [08:16<00:58, 29.20s/q, loss=0var, mean_r=0.998, skip=8]
Iter 19 GRPO groups: 95%|#########5| 19/20 [08:16<00:25, 25.91s/q, loss=0var, mean_r=0.998, skip=8]2026-04-26 05:43:55,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.36(prox=0.36) + 0.40×proc(0.764[fin=1.00,mean=0.42]) + 0.10×fmt(1.000) | pred='7.5' gold='60' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:43:55,674 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:43:55,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.46(prox=0.46) + 0.40×proc(0.512[fin=0.67,mean=0.27]) + 0.10×fmt(1.000) | pred='25' gold='60' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:44:01,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:44:01,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.288 = 0.50×0.00(prox=0.00) + 0.40×proc(0.471[fin=0.42,mean=0.55]) + 0.10×fmt(1.000) | pred='233.33%' gold='60' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 05:44:01,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.664 = 0.50×0.53(prox=0.53) + 0.40×proc(0.749[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='33.33' gold='60' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:44:01,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:44:11,452 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.659 = 0.50×0.47(prox=0.47) + 0.40×proc(0.805[fin=0.97,mean=0.55]) + 0.10×fmt(1.000) | pred='26.67' gold='60' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:44:11,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:44:11,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 95%|#########5| 19/20 [08:42<00:25, 25.91s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
Iter 19 GRPO groups: 100%|##########| 20/20 [08:42<00:00, 25.88s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
Iter 19 GRPO groups: 100%|##########| 20/20 [08:42<00:00, 26.12s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
+2026-04-26 05:44:13,142 INFO __main__ - Iter 19 | loss=0.0002 | reward mean=0.891 std=0.173 | gt_match=76.4% | grounded_acc=94.9% | step_acc=85.1% | lccp=75.7% | batch_acc=95.4% | phase=SELFPLAY_RAMP sp_ratio=21% | groups=16 skipped=8(0var=8) | lr=4.52e-06 | 522.4s
+2026-04-26 05:44:13,143 WARNING __main__ - STARVATION: 33% of groups skipped (zero variance). grounded_acc=94.9% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 05:44:13,143 INFO __main__ - Question generation: 4/4 valid (100%) | q_reward=0.733 | q_acc=100.0% (>0.5 quality) | topic=0.48 diff=0.85 clarity=1.00 novelty=0.46 solvability=0.97
+2026-04-26 05:44:13,144 INFO __main__ - ======================================================================
+2026-04-26 05:44:13,144 INFO __main__ - GRPO ITERATION 20/60
+2026-04-26 05:44:13,144 INFO __main__ - ======================================================================
+2026-04-26 05:44:13,165 INFO __main__ - LR this iteration: 4.52e-06 | T=0.671 | MATH ratio=34%
+
Iter 20 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:44:25,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.496 = 0.50×0.00(prox=0.00) + 0.40×proc(0.830[fin=0.89,mean=0.74]) + 0.10×fmt(1.000) | pred='$95,000' gold='250000' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:44:41,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:44:41,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.863 = 0.50×1.00(exact) + 0.40×proc(0.657[fin=0.57,mean=0.79]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:44:41,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.962[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='25000' gold='250000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:44:41,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.901[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='5000' gold='250000' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 05:45:04,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.455 = 0.50×0.33(prox=0.33) + 0.40×proc(0.289[fin=0.11,mean=0.55]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:05,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:45:05,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.33(prox=0.33) + 0.40×proc(0.255[fin=0.07,mean=0.53]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:05,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.451 = 0.50×0.33(prox=0.33) + 0.40×proc(0.278[fin=0.12,mean=0.52]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:20,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.445 = 0.50×0.33(prox=0.33) + 0.40×proc(0.264[fin=0.09,mean=0.53]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+
Iter 20 GRPO groups: 0%| | 0/20 [01:09, ?q/s, loss=0.0002, mean_r=0.621, skip=0]
Iter 20 GRPO groups: 5%|5 | 1/20 [01:09<21:57, 69.33s/q, loss=0.0002, mean_r=0.621, skip=0]2026-04-26 05:45:25,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:25,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:25,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:34,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:35,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:35,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 5%|5 | 1/20 [01:21<21:57, 69.33s/q, loss=0var, mean_r=0.994, skip=1]
Iter 20 GRPO groups: 10%|# | 2/20 [01:21<10:47, 35.98s/q, loss=0var, mean_r=0.994, skip=1]2026-04-26 05:46:09,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:14,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:46:14,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:14,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.942[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:14,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 10%|# | 2/20 [02:07<10:47, 35.98s/q, loss=0var, mean_r=0.987, skip=2]
Iter 20 GRPO groups: 15%|#5 | 3/20 [02:07<11:28, 40.49s/q, loss=0var, mean_r=0.987, skip=2]2026-04-26 05:46:20,990 INFO src.rl.curriculum_manager - Topic probabilities (rollout 300): [('money_problems', '0.078'), ('time_distance', '0.078'), ('multi_step_reasoning', '0.078'), ('comparison_problems', '0.078'), ('optimization_problems', '0.078')]
+2026-04-26 05:46:26,362 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,582 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:28,184 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:32,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,262 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.988 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,865 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,671 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.988 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+
Iter 20 GRPO groups: 15%|#5 | 3/20 [02:22<11:28, 40.49s/q, loss=0.0004, mean_r=1.000, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 20%|## | 4/20 [02:22<08:05, 30.37s/q, loss=0.0004, mean_r=1.000, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:46:43,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:43,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:44,071 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:44,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:54,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:54,774 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.929[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:46:54,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:46:54,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.878 = 0.50×1.00(exact) + 0.40×proc(0.695[fin=0.84,mean=0.47]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:46:57,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.454 = 0.50×0.00(prox=0.00) + 0.40×proc(0.790[fin=0.87,mean=0.67]) + 0.10×fmt(1.000) | pred='41/2' gold='17' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:46:57,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 20%|## | 4/20 [02:46<08:05, 30.37s/q, loss=-0.0006, mean_r=0.930, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 25%|##5 | 5/20 [02:46<06:58, 27.91s/q, loss=-0.0006, mean_r=0.930, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:47:05,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:06,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:47:22,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:47:22,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:22,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.837[fin=0.88,mean=0.78]) + 0.10×fmt(1.000) | pred='215' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:22,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:47:31,614 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:47:31,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:31,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:47:31,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 25%|##5 | 5/20 [03:28<06:58, 27.91s/q, loss=0.0008, mean_r=0.953, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 30%|### | 6/20 [03:28<07:38, 32.76s/q, loss=0.0008, mean_r=0.953, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:47:41,562 INFO src.rl.curriculum_manager - Topic probabilities (rollout 320): [('probability', '0.111'), ('statistics', '0.111'), ('money_problems', '0.063'), ('time_distance', '0.063'), ('comparison_problems', '0.063')]
+2026-04-26 05:47:49,520 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.700 = clip(base=0.620 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.519 novelty=0.74 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.53)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:49,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.527 = clip(base=0.447 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.332 novelty=0.74 | sol=0.45*prm_final(0.26)+0.35*prm_mean(0.42)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:49,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.783 = clip(base=0.703 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.718 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:47:50,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.792 = clip(base=0.712 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.731 novelty=0.74 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:50,399 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.957 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:50,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.764 = clip(base=0.684 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.666 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.62)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:47:50,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.957 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.941 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,265 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.991 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:47:58,392 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:58,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:58,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,055 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:47:59,272 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,485 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.960 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:47:59,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:48:00,128 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.973 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:48:00,365 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.954 novelty=0.71 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+
Iter 20 GRPO groups: 30%|### | 6/20 [03:48<07:38, 32.76s/q, loss=0.0005, mean_r=0.889, q_acc=100%, q_rew=0.760, skip=3]
Iter 20 GRPO groups: 35%|###5 | 7/20 [03:48<06:13, 28.76s/q, loss=0.0005, mean_r=0.889, q_acc=100%, q_rew=0.760, skip=3]2026-04-26 05:48:05,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:19,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:19,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 35%|###5 | 7/20 [04:06<06:13, 28.76s/q, loss=0var, mean_r=0.998, skip=4]
Iter 20 GRPO groups: 40%|#### | 8/20 [04:06<05:03, 25.28s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 05:48:21,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:21,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,615 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 40%|#### | 8/20 [04:23<05:03, 25.28s/q, loss=0var, mean_r=0.997, skip=5]
Iter 20 GRPO groups: 45%|####5 | 9/20 [04:23<04:08, 22.61s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 05:48:43,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:49:06,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:49:06,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 20 GRPO groups: 45%|####5 | 9/20 [04:53<04:08, 22.61s/q, loss=0var, mean_r=0.999, skip=6]
Iter 20 GRPO groups: 50%|##### | 10/20 [04:53<04:07, 24.77s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 05:49:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.321 = 0.50×0.23(prox=0.23) + 0.40×proc(0.201[fin=0.13,mean=0.30]) + 0.10×fmt(1.000) | pred='64' gold='24' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:49:17,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.323 = 0.50×0.00(prox=0.00) + 0.40×proc(0.511[fin=0.53,mean=0.48]) + 0.10×fmt(1.000) | pred='64-32*sqrt(2)' gold='24' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 05:49:33,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.348 = 0.50×0.00(prox=0.00) + 0.40×proc(0.546[fin=0.51,mean=0.60]) + 0.10×fmt(1.000) | pred='64-16*sqrt(2)' gold='24' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:49:33,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.60(prox=0.60) + 0.40×proc(0.489[fin=0.46,mean=0.54]) + 0.10×fmt(1.000) | pred='32' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:49:33,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.416 = 0.50×0.28(prox=0.28) + 0.40×proc(0.252[fin=0.14,mean=0.42]) + 0.10×fmt(1.000) | pred='54.848' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:49:33,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.742 = 0.50×0.75(prox=0.75) + 0.40×proc(0.669[fin=0.82,mean=0.44]) + 0.10×fmt(1.000) | pred='28' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:49:41,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:49:41,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 05:49:41,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.571 = 0.50×0.43(prox=0.43) + 0.40×proc(0.641[fin=0.84,mean=0.34]) + 0.10×fmt(1.000) | pred='40' gold='24' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 05:49:41,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.495 = 0.50×0.60(prox=0.60) + 0.40×proc(0.239[fin=0.04,mean=0.53]) + 0.10×fmt(1.000) | pred='32' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 20 GRPO groups: 50%|##### | 10/20 [05:37<04:07, 24.77s/q, loss=-0.0004, mean_r=0.573, q_acc=100%, q_rew=0.760, skip=6]
Iter 20 GRPO groups: 55%|#####5 | 11/20 [05:37<04:35, 30.65s/q, loss=-0.0004, mean_r=0.573, q_acc=100%, q_rew=0.760, skip=6]2026-04-26 05:49:57,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:18,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:50:18,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:19,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:19,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:32,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:50:32,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 20 GRPO groups: 55%|#####5 | 11/20 [06:19<04:35, 30.65s/q, loss=0var, mean_r=0.999, skip=7]
Iter 20 GRPO groups: 60%|###### | 12/20 [06:19<04:32, 34.09s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 05:50:32,200 INFO src.rl.curriculum_manager - Topic probabilities (rollout 340): [('probability', '0.110'), ('statistics', '0.110'), ('money_problems', '0.069'), ('time_distance', '0.069'), ('comparison_problems', '0.069')]
+2026-04-26 05:50:44,778 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:50:44,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.986 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:45,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.960 novelty=0.74 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:45,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.918 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:50:45,585 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.589 = clip(base=0.509 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.347 novelty=0.74 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:50:45,792 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.877 = clip(base=0.797 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.839 novelty=0.74 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:46,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:50:46,208 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.753 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=6
+2026-04-26 05:50:46,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.905 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:46,602 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.564 = clip(base=0.484 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.354 novelty=0.74 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.45)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.864 = clip(base=0.784 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.758 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.845 = clip(base=0.765 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.758 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,816 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.849 = clip(base=0.769 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.763 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.75)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.854 = clip(base=0.774 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.759 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,290 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.866 = clip(base=0.786 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.777 novelty=0.79 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.80)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.756 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,704 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.377 novelty=0.79 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.45)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,904 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.853 = clip(base=0.773 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.756 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:53,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.708 novelty=0.79 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:53,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.858 = clip(base=0.778 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.778 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+
Iter 20 GRPO groups: 60%|###### | 12/20 [06:41<04:32, 34.09s/q, loss=-0.0001, mean_r=0.844, q_acc=100%, q_rew=0.763, skip=7]
Iter 20 GRPO groups: 65%|######5 | 13/20 [06:41<03:34, 30.68s/q, loss=-0.0001, mean_r=0.844, q_acc=100%, q_rew=0.763, skip=7]2026-04-26 05:50:55,048 INFO src.rl.curriculum_manager - Topic probabilities (rollout 360): [('money_problems', '0.094'), ('time_distance', '0.094'), ('comparison_problems', '0.094'), ('sets', '0.094'), ('combinatorics', '0.094')]
+2026-04-26 05:51:05,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:05,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:05,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.785 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.72)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:51:05,798 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:06,018 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.989 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:06,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.980 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:07,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.994 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:15,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.966 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.996 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:16,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.976 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:51:16,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.959 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:16,495 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.736 = clip(base=0.656 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.547 novelty=0.80 | sol=0.45*prm_final(0.35)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:51:16,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.946 novelty=0.80 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:51:16,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.353 novelty=0.80 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:51:17,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.979 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:51:17,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.652 novelty=0.80 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:51:17,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.774 novelty=0.80 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.76)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:51:17,657 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.847 = clip(base=0.767 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.729 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.00) | steps=4
+
Iter 20 GRPO groups: 65%|######5 | 13/20 [07:06<03:34, 30.68s/q, loss=-0.0016, mean_r=0.936, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 70%|####### | 14/20 [07:06<02:52, 28.76s/q, loss=-0.0016, mean_r=0.936, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:51:25,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:25,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:35,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:51:35,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:35,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:35,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.00(prox=0.00) + 0.40×proc(0.748[fin=0.85,mean=0.59]) + 0.10×fmt(1.000) | pred='2^119' gold='0' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:51:47,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:48,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:48,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:48,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 20 GRPO groups: 70%|####### | 14/20 [07:44<02:52, 28.76s/q, loss=-0.0009, mean_r=0.944, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 75%|#######5 | 15/20 [07:44<02:38, 31.76s/q, loss=-0.0009, mean_r=0.944, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:52:03,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:03,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.60(prox=0.60) + 0.40×proc(0.916[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='22400' gold='16800' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:52:04,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:04,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.926[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:19,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:19,844 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 75%|#######5 | 15/20 [08:08<02:38, 31.76s/q, loss=-0.0005, mean_r=0.972, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 80%|######## | 16/20 [08:08<01:56, 29.19s/q, loss=-0.0005, mean_r=0.972, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:52:26,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:26,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.85(prox=0.85) + 0.40×proc(0.749[fin=0.98,mean=0.40]) + 0.10×fmt(1.000) | pred='105' gold='115' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:52:34,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 80%|######## | 16/20 [08:39<01:56, 29.19s/q, loss=0.0003, mean_r=0.982, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 85%|########5 | 17/20 [08:39<01:29, 29.90s/q, loss=0.0003, mean_r=0.982, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:53:00,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:00,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:53:00,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:01,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:53:10,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:53:10,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:11,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.00(prox=0.00) + 0.40×proc(0.822[fin=0.84,mean=0.80]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 05:53:11,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:19,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:53:19,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 20 GRPO groups: 85%|########5 | 17/20 [09:08<01:29, 29.90s/q, loss=-0.0010, mean_r=0.890, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 90%|######### | 18/20 [09:08<00:58, 29.50s/q, loss=-0.0010, mean_r=0.890, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:53:21,405 INFO src.rl.curriculum_manager - Topic probabilities (rollout 380): [('money_problems', '0.094'), ('time_distance', '0.094'), ('comparison_problems', '0.094'), ('sets', '0.094'), ('combinatorics', '0.094')]
+2026-04-26 05:53:33,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.371 = clip(base=0.291 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.055 novelty=0.77 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.07)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:33,573 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.617 = clip(base=0.537 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.508 novelty=0.77 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.47)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:53:33,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:33,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.395 = clip(base=0.315 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.138 novelty=0.77 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.09)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:34,230 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.652 = clip(base=0.572 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.576 novelty=0.77 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:53:34,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.716 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.68)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:53:34,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.768 = clip(base=0.688 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.709 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.57)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:53:34,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.447 = clip(base=0.367 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.227 novelty=0.77 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.36)+0.20*lccp(0.00) | steps=5
+2026-04-26 05:53:35,141 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.740 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.76)+0.20*lccp(0.14) | steps=7
+2026-04-26 05:53:35,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.213 novelty=0.77 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.21)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:51,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.958 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.994 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 05:53:52,186 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.590 = clip(base=0.510 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.362 novelty=0.83 | sol=0.45*prm_final(0.39)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:53:52,421 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.939 novelty=0.83 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:52,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.533 = clip(base=0.533 + mod=-0.000, cap=1.00) | Q=0.77 sol=0.373 novelty=0.83 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.31)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:53:52,845 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.961 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:53:53,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.845 novelty=0.83 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.81)+0.20*lccp(0.70) | steps=10
+2026-04-26 05:53:53,289 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.609 = clip(base=0.529 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.382 novelty=0.83 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:53:53,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.992 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:53,715 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.994 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:53,934 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=11
+
Iter 20 GRPO groups: 90%|######### | 18/20 [09:42<00:58, 29.50s/q, loss=0.0039, mean_r=0.740, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 95%|#########5| 19/20 [09:42<00:30, 30.92s/q, loss=0.0039, mean_r=0.740, q_acc=100%, q_rew=0.770, skip=7]2026-04-26 05:54:01,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.59(prox=0.59) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:01,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:14,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:14,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.363 = 0.50×0.01(prox=0.01) + 0.40×proc(0.567[fin=0.64,mean=0.45]) + 0.10×fmt(1.000) | pred='1630' gold='40' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:54:14,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.772 = 0.50×0.59(prox=0.59) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 05:54:14,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='162' gold='40' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:54:22,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.782 = 0.50×0.59(prox=0.59) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:22,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:23,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:54:23,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 20 GRPO groups: 95%|#########5| 19/20 [10:19<00:30, 30.92s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 100%|##########| 20/20 [10:19<00:00, 32.86s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 100%|##########| 20/20 [10:19<00:00, 30.99s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
+2026-04-26 05:54:33,032 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=40/10000)
+2026-04-26 05:54:33,032 INFO __main__ - Iter 20 | loss=0.0002 | reward mean=0.896 std=0.178 | gt_match=79.9% | grounded_acc=90.6% | step_acc=92.3% | lccp=86.2% | batch_acc=92.8% | phase=SELFPLAY_RAMP sp_ratio=25% | groups=18 skipped=7(0var=7) | lr=4.43e-06 | 619.9s
+2026-04-26 05:54:33,032 INFO __main__ - Question generation: 5/5 valid (100%) | q_reward=0.770 | q_acc=100.0% (>0.5 quality) | topic=0.72 diff=0.70 clarity=1.00 novelty=0.47 solvability=0.94
+2026-04-26 05:54:33,032 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:36, 2.26s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:33, 3.88s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<08:06, 3.31s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<07:05, 2.91s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:56, 2.46s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:25, 3.51s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:19, 3.49s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:25<07:28, 3.16s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:27, 3.18s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:42, 3.73s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:03, 3.48s/q, correct=8/11, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:38<07:01, 3.06s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:39, 2.91s/q, correct=10/13, lccp=85.6%, score=0.927, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:45<07:45, 3.42s/q, correct=11/14, lccp=86.7%, score=0.932, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:48<07:07, 3.17s/q, correct=12/15, lccp=87.6%, score=0.937, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:50<06:38, 2.98s/q, correct=12/16, lccp=88.3%, score=0.913, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:54<07:19, 3.30s/q, correct=13/17, lccp=89.0%, score=0.918, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<08:59, 4.09s/q, correct=13/18, lccp=84.8%, score=0.906, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:57, 3.65s/q, correct=14/19, lccp=85.6%, score=0.911, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:07<07:59, 3.69s/q, correct=15/20, lccp=86.3%, score=0.915, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:09<07:16, 3.38s/q, correct=16/21, lccp=86.9%, score=0.919, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:15<08:25, 3.95s/q, correct=17/22, lccp=84.9%, score=0.921, step_acc=91.5%]
GSM8K eval: 15%|#5 | 23/150 [01:19<08:30, 4.02s/q, correct=18/23, lccp=85.6%, score=0.924, step_acc=91.9%]
GSM8K eval: 16%|#6 | 24/150 [01:21<07:32, 3.59s/q, correct=18/24, lccp=83.1%, score=0.907, step_acc=89.1%]
GSM8K eval: 17%|#6 | 25/150 [01:24<06:57, 3.34s/q, correct=18/25, lccp=80.7%, score=0.903, step_acc=88.6%]
GSM8K eval: 17%|#7 | 26/150 [01:29<07:32, 3.65s/q, correct=19/26, lccp=81.5%, score=0.907, step_acc=89.0%]
GSM8K eval: 18%|#8 | 27/150 [01:31<07:00, 3.42s/q, correct=19/27, lccp=82.2%, score=0.901, step_acc=89.4%]
GSM8K eval: 19%|#8 | 28/150 [01:34<06:11, 3.04s/q, correct=20/28, lccp=82.8%, score=0.905, step_acc=89.8%]
GSM8K eval: 19%|#9 | 29/150 [01:36<05:58, 2.96s/q, correct=21/29, lccp=83.4%, score=0.908, step_acc=90.1%]
GSM8K eval: 20%|## | 30/150 [01:40<06:26, 3.22s/q, correct=22/30, lccp=84.0%, score=0.911, step_acc=90.5%]
GSM8K eval: 21%|## | 31/150 [01:43<05:58, 3.01s/q, correct=23/31, lccp=84.5%, score=0.914, step_acc=90.8%]
GSM8K eval: 21%|##1 | 32/150 [01:44<05:09, 2.62s/q, correct=24/32, lccp=85.0%, score=0.916, step_acc=91.1%]
GSM8K eval: 22%|##2 | 33/150 [01:47<05:11, 2.66s/q, correct=25/33, lccp=85.4%, score=0.918, step_acc=91.3%]
GSM8K eval: 23%|##2 | 34/150 [01:49<04:46, 2.47s/q, correct=26/34, lccp=85.8%, score=0.921, step_acc=91.6%]
GSM8K eval: 23%|##3 | 35/150 [01:52<04:47, 2.50s/q, correct=27/35, lccp=86.2%, score=0.923, step_acc=91.8%]
GSM8K eval: 24%|##4 | 36/150 [01:55<05:17, 2.78s/q, correct=28/36, lccp=86.6%, score=0.925, step_acc=92.1%]
GSM8K eval: 25%|##4 | 37/150 [01:57<04:48, 2.55s/q, correct=29/37, lccp=87.0%, score=0.926, step_acc=92.3%]
GSM8K eval: 25%|##5 | 38/150 [02:00<05:01, 2.69s/q, correct=30/38, lccp=87.3%, score=0.928, step_acc=92.5%]
GSM8K eval: 26%|##6 | 39/150 [02:05<06:10, 3.34s/q, correct=31/39, lccp=87.7%, score=0.930, step_acc=92.7%]
GSM8K eval: 27%|##6 | 40/150 [02:11<07:42, 4.20s/q, correct=32/40, lccp=88.0%, score=0.931, step_acc=92.8%]
GSM8K eval: 27%|##7 | 41/150 [02:14<06:57, 3.83s/q, correct=32/41, lccp=88.3%, score=0.931, step_acc=93.0%]
GSM8K eval: 28%|##8 | 42/150 [02:19<07:38, 4.25s/q, correct=33/42, lccp=87.0%, score=0.932, step_acc=92.8%]
GSM8K eval: 29%|##8 | 43/150 [02:22<06:31, 3.66s/q, correct=34/43, lccp=87.3%, score=0.933, step_acc=93.0%]
GSM8K eval: 29%|##9 | 44/150 [02:28<07:54, 4.48s/q, correct=35/44, lccp=87.5%, score=0.935, step_acc=93.1%]
GSM8K eval: 30%|### | 45/150 [02:31<07:07, 4.07s/q, correct=36/45, lccp=87.8%, score=0.936, step_acc=93.3%]
GSM8K eval: 31%|### | 46/150 [02:36<07:27, 4.31s/q, correct=36/46, lccp=85.9%, score=0.931, step_acc=93.2%]
GSM8K eval: 31%|###1 | 47/150 [02:39<06:45, 3.93s/q, correct=37/47, lccp=86.2%, score=0.933, step_acc=93.3%]
GSM8K eval: 32%|###2 | 48/150 [02:41<05:35, 3.29s/q, correct=38/48, lccp=86.5%, score=0.934, step_acc=93.5%]
GSM8K eval: 33%|###2 | 49/150 [02:48<07:16, 4.32s/q, correct=38/49, lccp=85.3%, score=0.920, step_acc=92.1%]
GSM8K eval: 33%|###3 | 50/150 [02:51<06:38, 3.99s/q, correct=38/50, lccp=84.6%, score=0.912, step_acc=91.3%]
GSM8K eval: 34%|###4 | 51/150 [02:52<05:18, 3.22s/q, correct=39/51, lccp=84.9%, score=0.913, step_acc=91.5%]
GSM8K eval: 35%|###4 | 52/150 [02:57<05:46, 3.53s/q, correct=39/52, lccp=83.3%, score=0.913, step_acc=91.3%]
GSM8K eval: 35%|###5 | 53/150 [03:01<06:18, 3.90s/q, correct=39/53, lccp=82.8%, score=0.905, step_acc=90.7%]
GSM8K eval: 36%|###6 | 54/150 [03:04<05:29, 3.44s/q, correct=40/54, lccp=83.2%, score=0.907, step_acc=90.9%]
GSM8K eval: 37%|###6 | 55/150 [03:07<05:30, 3.48s/q, correct=41/55, lccp=83.5%, score=0.909, step_acc=91.1%]
GSM8K eval: 37%|###7 | 56/150 [03:11<05:29, 3.51s/q, correct=42/56, lccp=83.8%, score=0.910, step_acc=91.2%]
GSM8K eval: 38%|###8 | 57/150 [03:13<04:52, 3.15s/q, correct=43/57, lccp=84.0%, score=0.912, step_acc=91.4%]
GSM8K eval: 39%|###8 | 58/150 [03:17<05:16, 3.44s/q, correct=44/58, lccp=84.3%, score=0.913, step_acc=91.5%]
GSM8K eval: 39%|###9 | 59/150 [03:22<05:50, 3.86s/q, correct=44/59, lccp=82.9%, score=0.911, step_acc=91.1%]
GSM8K eval: 40%|#### | 60/150 [03:27<06:17, 4.19s/q, correct=45/60, lccp=83.2%, score=0.912, step_acc=91.2%]
GSM8K eval: 41%|#### | 61/150 [03:29<05:22, 3.63s/q, correct=46/61, lccp=83.5%, score=0.914, step_acc=91.4%]
GSM8K eval: 41%|####1 | 62/150 [03:33<05:06, 3.48s/q, correct=47/62, lccp=83.7%, score=0.915, step_acc=91.5%]
GSM8K eval: 42%|####2 | 63/150 [03:36<04:59, 3.44s/q, correct=47/63, lccp=83.4%, score=0.909, step_acc=91.1%]
GSM8K eval: 43%|####2 | 64/150 [03:39<04:39, 3.25s/q, correct=48/64, lccp=83.7%, score=0.910, step_acc=91.3%]
GSM8K eval: 43%|####3 | 65/150 [03:41<04:24, 3.11s/q, correct=49/65, lccp=84.0%, score=0.912, step_acc=91.4%]
GSM8K eval: 44%|####4 | 66/150 [03:43<03:52, 2.76s/q, correct=50/66, lccp=84.2%, score=0.913, step_acc=91.5%]
GSM8K eval: 45%|####4 | 67/150 [03:46<03:37, 2.62s/q, correct=51/67, lccp=84.4%, score=0.914, step_acc=91.7%]
GSM8K eval: 45%|####5 | 68/150 [03:48<03:35, 2.63s/q, correct=52/68, lccp=84.7%, score=0.916, step_acc=91.8%]
GSM8K eval: 46%|####6 | 69/150 [03:50<03:05, 2.29s/q, correct=53/69, lccp=84.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 47%|####6 | 70/150 [03:53<03:21, 2.51s/q, correct=54/70, lccp=83.7%, score=0.918, step_acc=91.7%]
GSM8K eval: 47%|####7 | 71/150 [03:56<03:33, 2.70s/q, correct=55/71, lccp=82.5%, score=0.918, step_acc=91.6%]
GSM8K eval: 48%|####8 | 72/150 [03:58<03:01, 2.33s/q, correct=56/72, lccp=82.7%, score=0.919, step_acc=91.7%]
GSM8K eval: 49%|####8 | 73/150 [03:59<02:45, 2.15s/q, correct=57/73, lccp=83.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 49%|####9 | 74/150 [04:03<03:15, 2.57s/q, correct=58/74, lccp=83.2%, score=0.922, step_acc=91.9%]
GSM8K eval: 50%|##### | 75/150 [04:05<02:53, 2.31s/q, correct=59/75, lccp=83.4%, score=0.923, step_acc=92.0%]
GSM8K eval: 51%|##### | 76/150 [04:11<04:26, 3.61s/q, correct=59/76, lccp=83.5%, score=0.918, step_acc=92.0%]
GSM8K eval: 51%|#####1 | 77/150 [04:15<04:30, 3.70s/q, correct=60/77, lccp=83.7%, score=0.919, step_acc=92.1%]
GSM8K eval: 52%|#####2 | 78/150 [04:18<03:59, 3.33s/q, correct=61/78, lccp=83.9%, score=0.920, step_acc=92.2%]
GSM8K eval: 53%|#####2 | 79/150 [04:20<03:47, 3.20s/q, correct=62/79, lccp=83.6%, score=0.919, step_acc=92.0%]
GSM8K eval: 53%|#####3 | 80/150 [04:23<03:40, 3.15s/q, correct=63/80, lccp=83.8%, score=0.920, step_acc=92.1%]
GSM8K eval: 54%|#####4 | 81/150 [04:26<03:22, 2.93s/q, correct=64/81, lccp=84.0%, score=0.920, step_acc=92.2%]
GSM8K eval: 55%|#####4 | 82/150 [04:29<03:19, 2.93s/q, correct=65/82, lccp=84.2%, score=0.921, step_acc=92.3%]
GSM8K eval: 55%|#####5 | 83/150 [04:32<03:13, 2.88s/q, correct=66/83, lccp=84.4%, score=0.922, step_acc=92.4%]
GSM8K eval: 56%|#####6 | 84/150 [04:34<03:04, 2.80s/q, correct=67/84, lccp=84.6%, score=0.923, step_acc=92.5%]
GSM8K eval: 57%|#####6 | 85/150 [04:38<03:22, 3.11s/q, correct=68/85, lccp=84.8%, score=0.924, step_acc=92.6%]
GSM8K eval: 57%|#####7 | 86/150 [04:42<03:26, 3.23s/q, correct=69/86, lccp=84.9%, score=0.925, step_acc=92.7%]
GSM8K eval: 58%|#####8 | 87/150 [04:47<04:08, 3.94s/q, correct=70/87, lccp=85.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:49<03:26, 3.33s/q, correct=71/88, lccp=85.3%, score=0.927, step_acc=92.8%]
GSM8K eval: 59%|#####9 | 89/150 [04:52<03:13, 3.17s/q, correct=72/89, lccp=85.4%, score=0.927, step_acc=92.9%]
GSM8K eval: 60%|###### | 90/150 [04:54<02:55, 2.93s/q, correct=73/90, lccp=85.6%, score=0.928, step_acc=93.0%]
GSM8K eval: 61%|###### | 91/150 [04:59<03:18, 3.36s/q, correct=74/91, lccp=85.8%, score=0.929, step_acc=93.1%]
GSM8K eval: 61%|######1 | 92/150 [05:02<03:11, 3.30s/q, correct=75/92, lccp=85.9%, score=0.930, step_acc=93.1%]
GSM8K eval: 62%|######2 | 93/150 [05:09<04:19, 4.55s/q, correct=76/93, lccp=86.1%, score=0.930, step_acc=93.2%]
GSM8K eval: 63%|######2 | 94/150 [05:12<03:44, 4.01s/q, correct=76/94, lccp=85.2%, score=0.927, step_acc=92.6%]
GSM8K eval: 63%|######3 | 95/150 [05:17<04:01, 4.39s/q, correct=77/95, lccp=84.3%, score=0.927, step_acc=92.1%]
GSM8K eval: 64%|######4 | 96/150 [05:20<03:37, 4.02s/q, correct=77/96, lccp=83.7%, score=0.922, step_acc=91.5%]
GSM8K eval: 65%|######4 | 97/150 [05:23<03:10, 3.60s/q, correct=77/97, lccp=83.4%, score=0.919, step_acc=91.3%]
GSM8K eval: 65%|######5 | 98/150 [05:27<03:17, 3.80s/q, correct=77/98, lccp=83.0%, score=0.916, step_acc=91.3%]
GSM8K eval: 66%|######6 | 99/150 [05:30<02:51, 3.36s/q, correct=78/99, lccp=83.1%, score=0.917, step_acc=91.4%]
GSM8K eval: 67%|######6 | 100/150 [05:31<02:25, 2.91s/q, correct=79/100, lccp=82.3%, score=0.917, step_acc=91.1%]
GSM8K eval: 67%|######7 | 101/150 [05:34<02:22, 2.92s/q, correct=79/101, lccp=82.0%, score=0.913, step_acc=91.0%]
GSM8K eval: 68%|######8 | 102/150 [05:36<01:59, 2.49s/q, correct=80/102, lccp=82.2%, score=0.914, step_acc=91.1%]
GSM8K eval: 69%|######8 | 103/150 [05:38<01:50, 2.36s/q, correct=81/103, lccp=82.3%, score=0.915, step_acc=91.1%]
GSM8K eval: 69%|######9 | 104/150 [05:43<02:20, 3.06s/q, correct=82/104, lccp=82.5%, score=0.915, step_acc=91.2%]
GSM8K eval: 70%|####### | 105/150 [05:45<02:10, 2.90s/q, correct=83/105, lccp=82.7%, score=0.916, step_acc=91.3%]
GSM8K eval: 71%|####### | 106/150 [05:47<01:49, 2.48s/q, correct=84/106, lccp=82.8%, score=0.917, step_acc=91.4%]
GSM8K eval: 71%|#######1 | 107/150 [05:48<01:33, 2.18s/q, correct=85/107, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 72%|#######2 | 108/150 [05:51<01:39, 2.37s/q, correct=86/108, lccp=83.2%, score=0.918, step_acc=91.6%]
GSM8K eval: 73%|#######2 | 109/150 [05:56<02:09, 3.15s/q, correct=86/109, lccp=82.7%, score=0.917, step_acc=91.5%]
GSM8K eval: 73%|#######3 | 110/150 [05:58<01:55, 2.90s/q, correct=87/110, lccp=82.2%, score=0.917, step_acc=91.3%]
GSM8K eval: 74%|#######4 | 111/150 [06:00<01:38, 2.53s/q, correct=88/111, lccp=82.3%, score=0.918, step_acc=91.4%]
GSM8K eval: 75%|#######4 | 112/150 [06:05<02:05, 3.30s/q, correct=88/112, lccp=82.5%, score=0.918, step_acc=91.5%]
GSM8K eval: 75%|#######5 | 113/150 [06:07<01:45, 2.84s/q, correct=89/113, lccp=82.6%, score=0.918, step_acc=91.6%]
GSM8K eval: 76%|#######6 | 114/150 [06:12<02:07, 3.53s/q, correct=90/114, lccp=82.2%, score=0.919, step_acc=91.5%]
GSM8K eval: 77%|#######6 | 115/150 [06:15<01:56, 3.33s/q, correct=91/115, lccp=82.3%, score=0.920, step_acc=91.6%]
GSM8K eval: 77%|#######7 | 116/150 [06:18<01:49, 3.21s/q, correct=92/116, lccp=82.5%, score=0.920, step_acc=91.7%]
GSM8K eval: 78%|#######8 | 117/150 [06:24<02:12, 4.01s/q, correct=93/117, lccp=82.6%, score=0.921, step_acc=91.7%]
GSM8K eval: 79%|#######8 | 118/150 [06:28<02:12, 4.13s/q, correct=93/118, lccp=81.9%, score=0.918, step_acc=91.7%]
GSM8K eval: 79%|#######9 | 119/150 [06:32<02:02, 3.96s/q, correct=93/119, lccp=82.1%, score=0.917, step_acc=91.7%]
GSM8K eval: 80%|######## | 120/150 [06:34<01:48, 3.60s/q, correct=94/120, lccp=82.2%, score=0.918, step_acc=91.8%]
GSM8K eval: 81%|######## | 121/150 [06:37<01:40, 3.45s/q, correct=95/121, lccp=82.4%, score=0.918, step_acc=91.9%]
GSM8K eval: 81%|########1 | 122/150 [06:40<01:32, 3.32s/q, correct=96/122, lccp=82.5%, score=0.919, step_acc=91.9%]
GSM8K eval: 82%|########2 | 123/150 [06:44<01:30, 3.35s/q, correct=97/123, lccp=82.7%, score=0.920, step_acc=92.0%]
GSM8K eval: 83%|########2 | 124/150 [06:46<01:18, 3.00s/q, correct=98/124, lccp=82.8%, score=0.920, step_acc=92.1%]
GSM8K eval: 83%|########3 | 125/150 [06:48<01:07, 2.71s/q, correct=99/125, lccp=82.9%, score=0.921, step_acc=92.1%]
GSM8K eval: 84%|########4 | 126/150 [06:51<01:05, 2.75s/q, correct=100/126, lccp=83.1%, score=0.921, step_acc=92.2%]
GSM8K eval: 85%|########4 | 127/150 [06:55<01:14, 3.25s/q, correct=101/127, lccp=83.2%, score=0.922, step_acc=92.2%]
GSM8K eval: 85%|########5 | 128/150 [06:58<01:09, 3.16s/q, correct=102/128, lccp=83.3%, score=0.923, step_acc=92.3%]
GSM8K eval: 86%|########6 | 129/150 [07:02<01:08, 3.28s/q, correct=103/129, lccp=83.5%, score=0.923, step_acc=92.4%]
GSM8K eval: 87%|########6 | 130/150 [07:04<00:56, 2.84s/q, correct=104/130, lccp=83.6%, score=0.924, step_acc=92.4%]
GSM8K eval: 87%|########7 | 131/150 [07:08<01:03, 3.34s/q, correct=105/131, lccp=83.7%, score=0.924, step_acc=92.5%]
GSM8K eval: 88%|########8 | 132/150 [07:10<00:50, 2.81s/q, correct=106/132, lccp=83.8%, score=0.925, step_acc=92.5%]
GSM8K eval: 89%|########8 | 133/150 [07:13<00:47, 2.82s/q, correct=107/133, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 89%|########9 | 134/150 [07:17<00:52, 3.28s/q, correct=108/134, lccp=84.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 90%|######### | 135/150 [07:20<00:48, 3.21s/q, correct=109/135, lccp=84.2%, score=0.927, step_acc=92.7%]
GSM8K eval: 91%|######### | 136/150 [07:24<00:49, 3.56s/q, correct=109/136, lccp=83.8%, score=0.926, step_acc=92.5%]
GSM8K eval: 91%|#########1| 137/150 [07:31<00:58, 4.51s/q, correct=110/137, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 92%|#########2| 138/150 [07:35<00:51, 4.32s/q, correct=111/138, lccp=84.1%, score=0.927, step_acc=92.6%]
GSM8K eval: 93%|#########2| 139/150 [07:38<00:44, 4.06s/q, correct=112/139, lccp=84.2%, score=0.927, step_acc=92.7%]
GSM8K eval: 93%|#########3| 140/150 [07:43<00:41, 4.12s/q, correct=112/140, lccp=84.1%, score=0.924, step_acc=92.5%]
GSM8K eval: 94%|#########3| 141/150 [07:47<00:36, 4.05s/q, correct=113/141, lccp=84.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 95%|#########4| 142/150 [07:51<00:32, 4.05s/q, correct=114/142, lccp=84.3%, score=0.925, step_acc=92.6%]
GSM8K eval: 95%|#########5| 143/150 [07:53<00:24, 3.53s/q, correct=115/143, lccp=84.4%, score=0.925, step_acc=92.6%]
GSM8K eval: 96%|#########6| 144/150 [07:55<00:19, 3.17s/q, correct=116/144, lccp=84.5%, score=0.926, step_acc=92.7%]
GSM8K eval: 97%|#########6| 145/150 [08:00<00:18, 3.79s/q, correct=116/145, lccp=84.0%, score=0.923, step_acc=92.6%]
GSM8K eval: 97%|#########7| 146/150 [08:03<00:14, 3.53s/q, correct=117/146, lccp=84.1%, score=0.923, step_acc=92.6%]
GSM8K eval: 98%|#########8| 147/150 [08:07<00:10, 3.57s/q, correct=118/147, lccp=84.2%, score=0.924, step_acc=92.7%]
GSM8K eval: 99%|#########8| 148/150 [08:11<00:07, 3.58s/q, correct=119/148, lccp=84.3%, score=0.924, step_acc=92.7%]
GSM8K eval: 99%|#########9| 149/150 [08:14<00:03, 3.53s/q, correct=120/149, lccp=84.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 100%|##########| 150/150 [08:19<00:00, 3.90s/q, correct=120/150, lccp=84.3%, score=0.923, step_acc=92.5%]
GSM8K eval: 100%|##########| 150/150 [08:19<00:00, 3.33s/q, correct=120/150, lccp=84.3%, score=0.923, step_acc=92.5%]
+2026-04-26 06:02:52,390 INFO __main__ - Training Score [iter 20]: 0.9234 (best=0.9262) | n=150
+2026-04-26 06:02:52,390 INFO __main__ - Components : 0.50×correct(80.0%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 06:02:52,391 INFO __main__ - Process score : prm_mean=0.906 prm_final=0.935 → weighted=0.923
+2026-04-26 06:02:52,391 INFO __main__ - Step accuracy : 92.5% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 06:02:52,391 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 06:02:52,391 INFO __main__ - (debug) final-answer accuracy: 80.0%
+2026-04-26 06:02:54,581 INFO __main__ - ======================================================================
+2026-04-26 06:02:54,581 INFO __main__ - GRPO ITERATION 21/60
+2026-04-26 06:02:54,581 INFO __main__ - ======================================================================
+2026-04-26 06:02:54,601 INFO __main__ - LR this iteration: 4.43e-06 | T=0.664 | MATH ratio=36%
+
Iter 21 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:02:58,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 06:03:04,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:04,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:05,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:05,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:11,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:11,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 21 GRPO groups: 5%|5 | 1/20 [00:17<05:28, 17.30s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 06:03:13,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:13,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:19,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:03:19,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.882[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:19,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:20,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.43(prox=0.43) + 0.40×proc(0.138[fin=0.04,mean=0.28]) + 0.10×fmt(1.000) | pred='400' gold='240' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 06:03:24,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,889 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 21 GRPO groups: 5%|5 | 1/20 [00:37<05:28, 17.30s/q, loss=-0.0007, mean_r=0.904, skip=1]
Iter 21 GRPO groups: 10%|# | 2/20 [00:37<05:38, 18.79s/q, loss=-0.0007, mean_r=0.904, skip=1]2026-04-26 06:03:41,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×0.70(prox=0.70) + 0.40×proc(0.882[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='82800' gold='106000' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 06:03:41,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.847[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='11600' gold='106000' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 06:03:41,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×0.85(prox=0.85) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='100000' gold='106000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:03:41,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:03:53,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.35(prox=0.35) + 0.40×proc(0.294[fin=0.02,mean=0.71]) + 0.10×fmt(1.000) | pred='8245' gold='106000' | step_acc=71% lccp=71% (chain=5/7 ok_count=5) n_steps=7
+2026-04-26 06:03:53,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='8245' gold='106000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:03:53,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.900[fin=0.95,mean=0.82]) + 0.10×fmt(1.000) | pred='11200' gold='106000' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 06:03:53,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:04:09,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:04:10,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 21 GRPO groups: 10%|# | 2/20 [01:17<05:38, 18.79s/q, loss=0.0000, mean_r=0.786, skip=1]
Iter 21 GRPO groups: 15%|#5 | 3/20 [01:17<08:03, 28.42s/q, loss=0.0000, mean_r=0.786, skip=1]2026-04-26 06:04:11,611 INFO src.rl.curriculum_manager - Topic probabilities (rollout 400): [('money_problems', '0.106'), ('time_distance', '0.106'), ('comparison_problems', '0.106'), ('sets', '0.106'), ('combinatorics', '0.106')]
+2026-04-26 06:04:22,995 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.997 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:04:23,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.603 = clip(base=0.523 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.308 novelty=0.85 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.62)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:04:23,378 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.784 novelty=0.85 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.84)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:04:23,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.452 novelty=0.85 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.75)+0.20*lccp(0.80) | steps=5
+2026-04-26 06:04:23,760 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:04:23,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.609 = clip(base=0.529 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.336 novelty=0.85 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.53)+0.20*lccp(0.67) | steps=3
+2026-04-26 06:04:24,135 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.992 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:04:24,322 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:04:24,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.787 novelty=0.85 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:04:24,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.509 novelty=0.85 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.86)+0.20*lccp(0.86) | steps=7
+2026-04-26 06:04:31,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.474 = clip(base=0.394 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.220 novelty=0.69 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:04:31,490 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.360 = clip(base=0.280 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.057 novelty=0.69 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:31,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.655 = clip(base=0.575 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.505 novelty=0.69 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.54)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:04:31,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.408 = clip(base=0.328 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.126 novelty=0.69 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.34)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:04:32,079 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.377 = clip(base=0.297 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.111 novelty=0.69 | sol=0.45*prm_final(0.16)+0.35*prm_mean(0.11)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:04:32,288 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.726 = clip(base=0.646 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.601 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:04:32,479 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.361 = clip(base=0.281 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.059 novelty=0.69 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:32,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.926 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:04:32,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.775 = clip(base=0.695 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.720 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:33,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.381 = clip(base=0.301 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.093 novelty=0.69 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.16)+0.20*lccp(0.00) | steps=3
+
Iter 21 GRPO groups: 15%|#5 | 3/20 [01:40<08:03, 28.42s/q, loss=0.0013, mean_r=0.584, q_acc=100%, q_rew=0.742, skip=1]
Iter 21 GRPO groups: 20%|## | 4/20 [01:40<07:01, 26.37s/q, loss=0.0013, mean_r=0.584, q_acc=100%, q_rew=0.742, skip=1]2026-04-26 06:04:39,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.430 = 0.50×0.33(prox=0.33) + 0.40×proc(0.409[fin=0.43,mean=0.38]) + 0.10×fmt(1.000) | pred='10' gold='5' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:04:39,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.56(prox=0.56) + 0.40×proc(0.216[fin=0.08,mean=0.42]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 06:04:48,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(0.650) | pred='5' gold='5' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:04:48,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.321 = 0.50×0.20(prox=0.20) + 0.40×proc(0.303[fin=0.29,mean=0.33]) + 0.10×fmt(1.000) | pred='15' gold='5' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:04:48,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:04:49,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:04:54,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:04:54,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.29(prox=0.29) + 0.40×proc(0.479[fin=0.56,mean=0.36]) + 0.10×fmt(1.000) | pred='11' gold='5' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:04:54,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.770 = 0.50×1.00(exact) + 0.40×proc(0.512[fin=0.60,mean=0.38]) + 0.10×fmt(0.650) | pred='5' gold='5' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:04:54,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.789[fin=0.91,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 21 GRPO groups: 20%|## | 4/20 [02:10<07:01, 26.37s/q, loss=-0.0026, mean_r=0.717, q_acc=100%, q_rew=0.742, skip=1]
Iter 21 GRPO groups: 25%|##5 | 5/20 [02:10<06:55, 27.73s/q, loss=-0.0026, mean_r=0.717, q_acc=100%, q_rew=0.742, skip=1]2026-04-26 06:05:07,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:07,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:08,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:08,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:15,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:21,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:21,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 25%|##5 | 5/20 [02:27<06:55, 27.73s/q, loss=0var, mean_r=0.993, skip=2]
Iter 21 GRPO groups: 30%|### | 6/20 [02:27<05:36, 24.01s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 06:05:25,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:25,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.40(prox=0.40) + 0.40×proc(0.080[fin=0.08,mean=0.07]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:29,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:29,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:29,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.507[fin=0.46,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:05:29,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.352 = 0.50×0.40(prox=0.40) + 0.40×proc(0.131[fin=0.15,mean=0.10]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:35,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:35,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.352 = 0.50×0.40(prox=0.40) + 0.40×proc(0.129[fin=0.15,mean=0.09]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:35,285 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:35,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 30%|### | 6/20 [02:47<05:36, 24.01s/q, loss=0.0004, mean_r=0.756, q_acc=100%, q_rew=0.742, skip=2]
Iter 21 GRPO groups: 35%|###5 | 7/20 [02:47<04:56, 22.82s/q, loss=0.0004, mean_r=0.756, q_acc=100%, q_rew=0.742, skip=2]2026-04-26 06:05:46,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,795 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:06:04,873 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:06:04,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 21 GRPO groups: 35%|###5 | 7/20 [03:10<04:56, 22.82s/q, loss=0var, mean_r=0.997, skip=3]
Iter 21 GRPO groups: 40%|#### | 8/20 [03:10<04:33, 22.82s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 06:06:04,960 INFO src.rl.curriculum_manager - Topic probabilities (rollout 420): [('money_problems', '0.106'), ('time_distance', '0.106'), ('comparison_problems', '0.106'), ('sets', '0.106'), ('combinatorics', '0.106')]
+2026-04-26 06:06:07,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,325 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,679 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:09,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:12,726 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:12,904 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:06:13,084 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:06:13,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:14,157 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:14,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+
Iter 21 GRPO groups: 40%|#### | 8/20 [03:21<04:33, 22.82s/q, loss=0.0015, mean_r=0.930, q_acc=100%, q_rew=0.693, skip=3]
Iter 21 GRPO groups: 45%|####5 | 9/20 [03:21<03:30, 19.15s/q, loss=0.0015, mean_r=0.930, q_acc=100%, q_rew=0.693, skip=3]2026-04-26 06:06:23,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10160' gold='4830' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:06:23,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='5150' gold='4830' | step_acc=78% lccp=44% (chain=4/9 ok_count=7) n_steps=9
+2026-04-26 06:06:41,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.877 = 0.50×0.85(prox=0.85) + 0.40×proc(0.881[fin=0.95,mean=0.78]) + 0.10×fmt(1.000) | pred='4665' gold='4830' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:06:42,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.807[fin=0.91,mean=0.66]) + 0.10×fmt(1.000) | pred='430' gold='4830' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 06:06:42,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.680[fin=0.59,mean=0.81]) + 0.10×fmt(1.000) | pred='-4830' gold='4830' | step_acc=90% lccp=80% (chain=8/10 ok_count=9) n_steps=10
+2026-04-26 06:06:42,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.733 = 0.50×0.78(prox=0.78) + 0.40×proc(0.613[fin=0.70,mean=0.49]) + 0.10×fmt(1.000) | pred='4130' gold='4830' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 06:06:57,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='1264' gold='4830' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 06:06:57,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.667 = 0.50×0.77(prox=0.77) + 0.40×proc(0.452[fin=0.48,mean=0.41]) + 0.10×fmt(1.000) | pred='4120' gold='4830' | step_acc=33% lccp=11% (chain=1/9 ok_count=3) n_steps=9
+2026-04-26 06:06:57,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.723[fin=0.77,mean=0.65]) + 0.10×fmt(1.000) | pred='900' gold='4830' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 06:06:58,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.912[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='485' gold='4830' | step_acc=89% lccp=56% (chain=5/9 ok_count=8) n_steps=9
+
Iter 21 GRPO groups: 45%|####5 | 9/20 [04:23<03:30, 19.15s/q, loss=-0.0005, mean_r=0.646, q_acc=100%, q_rew=0.693, skip=3]
Iter 21 GRPO groups: 50%|##### | 10/20 [04:23<05:25, 32.52s/q, loss=-0.0005, mean_r=0.646, q_acc=100%, q_rew=0.693, skip=3]2026-04-26 06:07:18,497 INFO src.rl.curriculum_manager - Topic probabilities (rollout 440): [('geometry', '0.092'), ('statistics', '0.092'), ('money_problems', '0.090'), ('time_distance', '0.090'), ('sets', '0.090')]
+2026-04-26 06:07:24,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.827 = clip(base=0.747 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.717 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:07:24,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.934 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.707 = clip(base=0.627 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.567 novelty=0.69 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:07:25,218 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.985 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:07:25,411 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:26,159 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:26,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,261 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.988 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:33,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,858 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.715 = clip(base=0.635 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.650 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.51)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:07:34,057 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.718 = clip(base=0.638 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.655 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.53)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:07:34,260 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.830 = clip(base=0.750 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.826 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:07:34,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:34,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:34,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:35,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.950 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+
Iter 21 GRPO groups: 50%|##### | 10/20 [04:42<05:25, 32.52s/q, loss=0.0017, mean_r=0.893, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 55%|#####5 | 11/20 [04:42<04:13, 28.18s/q, loss=0.0017, mean_r=0.893, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:07:42,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.706 = 0.50×1.00(exact) + 0.40×proc(0.264[fin=0.21,mean=0.34]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:07:42,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×1.00(exact) + 0.40×proc(0.295[fin=0.26,mean=0.35]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:07:42,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.37(prox=0.37) + 0.40×proc(0.283[fin=0.01,mean=0.69]) + 0.10×fmt(1.000) | pred='9' gold='56' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 06:07:42,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×1.00(exact) + 0.40×proc(0.111[fin=0.14,mean=0.06]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:07:47,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×1.00(exact) + 0.40×proc(0.115[fin=0.13,mean=0.09]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:07:47,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×1.00(exact) + 0.40×proc(0.649[fin=0.54,mean=0.82]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:07:47,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.663 = 0.50×0.43(prox=0.43) + 0.40×proc(0.870[fin=0.96,mean=0.73]) + 0.10×fmt(1.000) | pred='19' gold='56' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 06:07:47,718 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.636 = 0.50×1.00(exact) + 0.40×proc(0.177[fin=0.08,mean=0.33]) + 0.10×fmt(0.650) | pred='56' gold='56' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+2026-04-26 06:07:52,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.826 = 0.50×1.00(exact) + 0.40×proc(0.566[fin=0.65,mean=0.44]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:07:52,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.380 = 0.50×0.33(prox=0.33) + 0.40×proc(0.236[fin=0.12,mean=0.41]) + 0.10×fmt(1.000) | pred='0' gold='56' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+
Iter 21 GRPO groups: 55%|#####5 | 11/20 [04:59<04:13, 28.18s/q, loss=-0.0022, mean_r=0.658, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 60%|###### | 12/20 [04:59<03:19, 24.90s/q, loss=-0.0022, mean_r=0.658, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:08:00,415 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.812 = 0.50×0.64(prox=0.64) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:00,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.803 = 0.50×0.64(prox=0.64) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.804 = 0.50×0.64(prox=0.64) + 0.40×proc(0.964[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='$-1 \\frac{3}{7}$' gold='-2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:08:09,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.835 = 0.50×0.77(prox=0.77) + 0.40×proc(0.875[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='-1.7' gold='-2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:08:18,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 60%|###### | 12/20 [05:31<03:19, 24.90s/q, loss=-0.0023, mean_r=0.875, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 65%|######5 | 13/20 [05:31<03:09, 27.02s/q, loss=-0.0023, mean_r=0.875, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:08:26,116 INFO src.rl.curriculum_manager - Topic probabilities (rollout 460): [('money_problems', '0.121'), ('time_distance', '0.121'), ('sets', '0.121'), ('combinatorics', '0.121'), ('sequences', '0.121')]
+2026-04-26 06:08:36,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:08:36,564 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.661 = clip(base=0.581 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.450 novelty=0.72 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.62)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:36,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.645 = clip(base=0.565 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.443 novelty=0.72 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 06:08:36,972 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:37,190 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.640 = clip(base=0.560 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.438 novelty=0.72 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:08:37,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.643 = clip(base=0.563 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.442 novelty=0.72 | sol=0.45*prm_final(0.19)+0.35*prm_mean(0.67)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:08:37,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:37,810 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.789 novelty=0.72 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:38,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:08:38,236 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:51,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.718 novelty=0.75 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.80)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:51,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.595 novelty=0.75 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.57)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:08:51,727 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.878 novelty=0.75 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:51,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.499 novelty=0.75 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:08:52,140 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.990 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:52,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.392 novelty=0.75 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:08:52,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.491 = clip(base=0.411 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.253 novelty=0.75 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.27)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:08:52,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.732 novelty=0.75 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.84)+0.20*lccp(0.80) | steps=10
+2026-04-26 06:08:52,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.776 = clip(base=0.696 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.638 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.79)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:53,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.690 novelty=0.75 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.76)+0.20*lccp(0.60) | steps=5
+
Iter 21 GRPO groups: 65%|######5 | 13/20 [06:00<03:09, 27.02s/q, loss=0.0009, mean_r=0.804, q_acc=100%, q_rew=0.704, skip=3]
Iter 21 GRPO groups: 70%|####### | 14/20 [06:00<02:45, 27.55s/q, loss=0.0009, mean_r=0.804, q_acc=100%, q_rew=0.704, skip=3]2026-04-26 06:08:58,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.844[fin=0.98,mean=0.63]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:08:58,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:58,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.838[fin=0.97,mean=0.64]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 06:08:58,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.910[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:09:04,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:09:05,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:09:05,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:09:05,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:09:10,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.85(prox=0.85) + 0.40×proc(0.761[fin=0.98,mean=0.43]) + 0.10×fmt(1.000) | pred='113' gold='107' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:09:10,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 70%|####### | 14/20 [06:17<02:45, 27.55s/q, loss=-0.0007, mean_r=0.955, q_acc=100%, q_rew=0.704, skip=3]
Iter 21 GRPO groups: 75%|#######5 | 15/20 [06:17<02:02, 24.42s/q, loss=-0.0007, mean_r=0.955, q_acc=100%, q_rew=0.704, skip=3]2026-04-26 06:09:12,068 INFO src.rl.curriculum_manager - Topic probabilities (rollout 480): [('money_problems', '0.121'), ('time_distance', '0.121'), ('sets', '0.121'), ('combinatorics', '0.121'), ('sequences', '0.121')]
+2026-04-26 06:09:23,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:23,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:24,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:09:24,447 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.989 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:24,671 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.611 = clip(base=0.531 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.459 novelty=0.76 | sol=0.45*prm_final(0.26)+0.35*prm_mean(0.64)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:24,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:25,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.847 = clip(base=0.767 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.839 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.88)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:25,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:09:25,583 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:25,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:36,768 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.641 = clip(base=0.561 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.486 novelty=0.72 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.55)+0.20*lccp(0.12) | steps=8
+2026-04-26 06:09:36,994 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.675 novelty=0.72 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.76)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:09:37,229 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.742 novelty=0.72 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:09:37,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.672 = clip(base=0.592 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.578 novelty=0.72 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.53)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:37,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.779 novelty=0.72 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.75)+0.20*lccp(0.50) | steps=8
+2026-04-26 06:09:37,923 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.685 = clip(base=0.605 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.572 novelty=0.72 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.71)+0.20*lccp(0.78) | steps=9
+2026-04-26 06:09:38,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.773 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:38,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.661 novelty=0.72 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.67)+0.20*lccp(0.43) | steps=7
+2026-04-26 06:09:38,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.816 = clip(base=0.736 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.799 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.25) | steps=8
+2026-04-26 06:09:38,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.487 novelty=0.72 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.73)+0.20*lccp(0.71) | steps=7
+
Iter 21 GRPO groups: 75%|#######5 | 15/20 [06:46<02:02, 24.42s/q, loss=0.0008, mean_r=0.817, q_acc=100%, q_rew=0.695, skip=3]
Iter 21 GRPO groups: 80%|######## | 16/20 [06:46<01:42, 25.67s/q, loss=0.0008, mean_r=0.817, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 06:09:40,646 INFO src.rl.curriculum_manager - Topic probabilities (rollout 500): [('money_problems', '0.143'), ('time_distance', '0.143'), ('sets', '0.143'), ('combinatorics', '0.143'), ('sequences', '0.143')]
+2026-04-26 06:09:49,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.451 = clip(base=0.371 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.159 novelty=0.68 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.39)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:09:50,060 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.408 novelty=0.68 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:09:50,269 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.633 = clip(base=0.553 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.446 novelty=0.68 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.56)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:50,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.502 = clip(base=0.422 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.281 novelty=0.68 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.41)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:09:50,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.686 = clip(base=0.606 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.535 novelty=0.68 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.55)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:50,881 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:09:51,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.964 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:09:51,293 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.769 novelty=0.68 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.86)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:51,493 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.968 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:09:51,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.984 = clip(base=0.904 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.989 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:09:55,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,515 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,242 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.991 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,610 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,795 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,982 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.985 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+
Iter 21 GRPO groups: 80%|######## | 16/20 [07:04<01:42, 25.67s/q, loss=-0.0001, mean_r=0.858, q_acc=100%, q_rew=0.697, skip=3]
Iter 21 GRPO groups: 85%|########5 | 17/20 [07:04<01:10, 23.37s/q, loss=-0.0001, mean_r=0.858, q_acc=100%, q_rew=0.697, skip=3]2026-04-26 06:10:00,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:00,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 85%|########5 | 17/20 [07:22<01:10, 23.37s/q, loss=0var, mean_r=0.999, skip=4]
Iter 21 GRPO groups: 90%|######### | 18/20 [07:22<00:43, 21.90s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 06:10:22,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.50(prox=0.50) + 0.40×proc(0.269[fin=0.20,mean=0.38]) + 0.10×fmt(1.000) | pred='15' gold='30' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:10:22,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×0.36(prox=0.36) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='3' gold='30' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:10:22,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:10:23,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.399 = 0.50×0.00(prox=0.00) + 0.40×proc(0.748[fin=0.94,mean=0.46]) + 0.10×fmt(1.000) | pred='3 2/3' gold='30' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:10:31,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:10:31,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:31,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:31,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:36,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.96,mean=0.48]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:10:36,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 90%|######### | 18/20 [07:43<00:43, 21.90s/q, loss=0.0007, mean_r=0.841, q_acc=100%, q_rew=0.697, skip=4]
Iter 21 GRPO groups: 95%|#########5| 19/20 [07:43<00:21, 21.65s/q, loss=0.0007, mean_r=0.841, q_acc=100%, q_rew=0.697, skip=4]2026-04-26 06:10:42,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:42,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:48,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:48,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:48,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:10:48,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:11:00,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:11:00,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:00,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:11:00,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 21 GRPO groups: 95%|#########5| 19/20 [08:10<00:21, 21.65s/q, loss=0var, mean_r=0.989, skip=5]
Iter 21 GRPO groups: 100%|##########| 20/20 [08:10<00:00, 23.18s/q, loss=0var, mean_r=0.989, skip=5]
Iter 21 GRPO groups: 100%|##########| 20/20 [08:10<00:00, 24.52s/q, loss=0var, mean_r=0.989, skip=5]
+2026-04-26 06:11:04,949 INFO __main__ - Iter 21 | loss=0.0002 | reward mean=0.842 std=0.188 | gt_match=73.6% | grounded_acc=91.4% | step_acc=82.0% | lccp=69.3% | batch_acc=92.3% | phase=SELFPLAY_RAMP sp_ratio=29% | groups=21 skipped=5(0var=5) | lr=4.34e-06 | 490.4s
+2026-04-26 06:11:04,950 INFO __main__ - Question generation: 6/6 valid (100%) | q_reward=0.697 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.48 clarity=1.00 novelty=0.45 solvability=0.96
+2026-04-26 06:11:04,951 INFO __main__ - ======================================================================
+2026-04-26 06:11:04,951 INFO __main__ - GRPO ITERATION 22/60
+2026-04-26 06:11:04,951 INFO __main__ - ======================================================================
+2026-04-26 06:11:04,971 INFO __main__ - LR this iteration: 4.34e-06 | T=0.658 | MATH ratio=38%
+
Iter 22 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:11:04,973 INFO src.rl.curriculum_manager - Topic probabilities (rollout 520): [('money_problems', '0.107'), ('time_distance', '0.107'), ('sets', '0.107'), ('combinatorics', '0.107'), ('sequences', '0.107')]
+2026-04-26 06:11:12,213 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.968 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.960 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.981 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.823 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:11:13,079 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.700 = clip(base=0.620 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.627 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:11:13,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:14,167 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.981 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:49,889 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.951 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:11:50,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.736 = clip(base=0.656 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.654 novelty=0.76 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.70)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:11:50,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.710 = clip(base=0.630 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.634 novelty=0.76 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.62)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:11:50,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.439 = clip(base=0.359 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.229 novelty=0.76 | sol=0.45*prm_final(0.21)+0.35*prm_mean(0.24)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:11:50,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.689 = clip(base=0.609 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.594 novelty=0.76 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.52)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:11:50,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.684 = clip(base=0.604 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.580 novelty=0.76 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.55)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:11:51,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.867 = clip(base=0.787 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.915 novelty=0.76 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:51,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.730 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:11:51,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.702 = clip(base=0.622 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.604 novelty=0.76 | sol=0.45*prm_final(0.74)+0.35*prm_mean(0.68)+0.20*lccp(0.17) | steps=6
+
Iter 22 GRPO groups: 0%| | 0/20 [00:48, ?q/s, loss=-0.0004, mean_r=0.802, q_acc=100%, q_rew=0.608, skip=0]
Iter 22 GRPO groups: 5%|5 | 1/20 [00:48<15:20, 48.43s/q, loss=-0.0004, mean_r=0.802, q_acc=100%, q_rew=0.608, skip=0]2026-04-26 06:11:58,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:03,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:08,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:08,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 22 GRPO groups: 5%|5 | 1/20 [01:03<15:20, 48.43s/q, loss=0var, mean_r=1.000, skip=1]
Iter 22 GRPO groups: 10%|# | 2/20 [01:03<08:38, 28.82s/q, loss=0var, mean_r=1.000, skip=1]2026-04-26 06:12:42,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.98,mean=0.82]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:12:42,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.686 = 0.50×0.60(prox=0.60) + 0.40×proc(0.714[fin=0.87,mean=0.48]) + 0.10×fmt(1.000) | pred='1000' gold='751' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 06:12:54,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:54,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:54,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:54,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:13:19,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:13:19,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:19,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.85(prox=0.85) + 0.40×proc(0.873[fin=0.91,mean=0.82]) + 0.10×fmt(1.000) | pred='754' gold='751' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+
Iter 22 GRPO groups: 10%|# | 2/20 [02:16<08:38, 28.82s/q, loss=-0.0009, mean_r=0.941, q_acc=100%, q_rew=0.608, skip=1]
Iter 22 GRPO groups: 15%|#5 | 3/20 [02:16<13:49, 48.81s/q, loss=-0.0009, mean_r=0.941, q_acc=100%, q_rew=0.608, skip=1]2026-04-26 06:13:25,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:32,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:32,949 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:33,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:33,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:50,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 15%|#5 | 3/20 [02:45<13:49, 48.81s/q, loss=0var, mean_r=0.996, skip=2]
Iter 22 GRPO groups: 20%|## | 4/20 [02:45<10:57, 41.11s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 06:13:55,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.963 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:55,357 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:55,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:55,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.991 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:55,938 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.687 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:56,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:56,708 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:02,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:02,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:14:02,539 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:02,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:14:02,933 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:03,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.976 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:03,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.832 = clip(base=0.752 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.689 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 22 GRPO groups: 20%|## | 4/20 [03:00<10:57, 41.11s/q, loss=0.0053, mean_r=0.932, q_acc=100%, q_rew=0.735, skip=2]
Iter 22 GRPO groups: 25%|##5 | 5/20 [03:00<07:56, 31.76s/q, loss=0.0053, mean_r=0.932, q_acc=100%, q_rew=0.735, skip=2]2026-04-26 06:14:12,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:12,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:14:12,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:22,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:22,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.58(prox=0.58) + 0.40×proc(0.887[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='30' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:22,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:23,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×0.73(prox=0.73) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='18' gold='22' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:14:31,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:31,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:31,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 25%|##5 | 5/20 [03:27<07:56, 31.76s/q, loss=0.0003, mean_r=0.953, q_acc=100%, q_rew=0.735, skip=2]
Iter 22 GRPO groups: 30%|### | 6/20 [03:27<07:03, 30.25s/q, loss=0.0003, mean_r=0.953, q_acc=100%, q_rew=0.735, skip=2]2026-04-26 06:14:37,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:48,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:48,998 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:49,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:49,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:52,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:14:57,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 22 GRPO groups: 30%|### | 6/20 [03:52<07:03, 30.25s/q, loss=0var, mean_r=0.977, skip=3]
Iter 22 GRPO groups: 35%|###5 | 7/20 [03:52<06:08, 28.38s/q, loss=0var, mean_r=0.977, skip=3]2026-04-26 06:15:02,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:02,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:02,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:15:05,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:15:05,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:15:05,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:15:05,443 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:15:14,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:14,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:15:14,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 22 GRPO groups: 35%|###5 | 7/20 [04:10<06:08, 28.38s/q, loss=0.0028, mean_r=0.984, q_acc=100%, q_rew=0.735, skip=3]
Iter 22 GRPO groups: 40%|#### | 8/20 [04:10<05:01, 25.16s/q, loss=0.0028, mean_r=0.984, q_acc=100%, q_rew=0.735, skip=3]2026-04-26 06:15:18,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:22,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:22,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:23,076 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:23,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:34,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 22 GRPO groups: 40%|#### | 8/20 [04:29<05:01, 25.16s/q, loss=0var, mean_r=1.000, skip=4]
Iter 22 GRPO groups: 45%|####5 | 9/20 [04:29<04:13, 23.02s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 06:15:50,875 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:51,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:51,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:51,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.986 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:51,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.992 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:52,523 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.989 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:16:02,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:16:02,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.665 = clip(base=0.585 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.514 novelty=0.72 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.60)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:16:02,507 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.475 = clip(base=0.395 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.269 novelty=0.72 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.48)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:16:02,721 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.715 novelty=0.72 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:16:02,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.973 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:16:03,168 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:16:03,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:16:03,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.811 = clip(base=0.731 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.763 novelty=0.72 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:16:03,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:16:04,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.994 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+
Iter 22 GRPO groups: 45%|####5 | 9/20 [05:00<04:13, 23.02s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.722, skip=4]
Iter 22 GRPO groups: 50%|##### | 10/20 [05:00<04:17, 25.72s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.722, skip=4]2026-04-26 06:16:12,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:12,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:12,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.296 = 0.50×0.10(prox=0.10) + 0.40×proc(0.371[fin=0.39,mean=0.35]) + 0.10×fmt(1.000) | pred='86' gold='15' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 06:16:19,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:27,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.685 = 0.50×0.56(prox=0.56) + 0.40×proc(0.769[fin=0.89,mean=0.58]) + 0.10×fmt(1.000) | pred='21' gold='15' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:16:27,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:27,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 50%|##### | 10/20 [05:24<04:17, 25.72s/q, loss=-0.0005, mean_r=0.896, q_acc=100%, q_rew=0.722, skip=4]
Iter 22 GRPO groups: 55%|#####5 | 11/20 [05:24<03:44, 24.97s/q, loss=-0.0005, mean_r=0.896, q_acc=100%, q_rew=0.722, skip=4]2026-04-26 06:16:33,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,037 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:56,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 55%|#####5 | 11/20 [05:51<03:44, 24.97s/q, loss=0var, mean_r=0.994, skip=5]
Iter 22 GRPO groups: 60%|###### | 12/20 [05:51<03:25, 25.72s/q, loss=0var, mean_r=0.994, skip=5]2026-04-26 06:17:01,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:01,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.987 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:01,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,320 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.993 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.986 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,908 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:03,108 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:03,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.993 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,367 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.952 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,565 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.957 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,963 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.759 = clip(base=0.679 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.734 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:17:08,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.975 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.882 = clip(base=0.802 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.938 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.957 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:09,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 22 GRPO groups: 60%|###### | 12/20 [06:05<03:25, 25.72s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.692, skip=5]
Iter 22 GRPO groups: 65%|######5 | 13/20 [06:05<02:36, 22.29s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.692, skip=5]2026-04-26 06:17:18,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:17:19,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:17:19,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:17:31,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:31,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:17:32,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:32,185 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.897[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 06:17:43,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:43,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.916[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 06:17:43,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 22 GRPO groups: 65%|######5 | 13/20 [06:40<02:36, 22.29s/q, loss=0.0001, mean_r=0.861, q_acc=100%, q_rew=0.692, skip=5]
Iter 22 GRPO groups: 70%|####### | 14/20 [06:40<02:35, 25.93s/q, loss=0.0001, mean_r=0.861, q_acc=100%, q_rew=0.692, skip=5]2026-04-26 06:18:07,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.637 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.54)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:07,707 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.890 = clip(base=0.810 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.942 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:07,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.843 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=6
+2026-04-26 06:18:08,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.941 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:08,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.674 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:08,563 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.772 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.43) | steps=7
+2026-04-26 06:18:08,767 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.736 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:08,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.773 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:09,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.778 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:09,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.745 = clip(base=0.665 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.680 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:18:22,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.940 novelty=0.82 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:23,096 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.976 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:23,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.829 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:23,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.393 novelty=0.82 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:23,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.987 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:18:23,932 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.848 = clip(base=0.768 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.751 novelty=0.82 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.78)+0.20*lccp(0.50) | steps=6
+2026-04-26 06:18:24,146 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.769 novelty=0.82 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:18:24,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.839 = clip(base=0.759 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.717 novelty=0.82 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.63)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:24,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.724 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:24,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.982 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+
Iter 22 GRPO groups: 70%|####### | 14/20 [07:21<02:35, 25.93s/q, loss=0.0007, mean_r=0.839, q_acc=100%, q_rew=0.696, skip=5]
Iter 22 GRPO groups: 75%|#######5 | 15/20 [07:21<02:32, 30.56s/q, loss=0.0007, mean_r=0.839, q_acc=100%, q_rew=0.696, skip=5]2026-04-26 06:18:29,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:43,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:50,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 75%|#######5 | 15/20 [07:45<02:32, 30.56s/q, loss=0var, mean_r=0.999, skip=6]
Iter 22 GRPO groups: 80%|######## | 16/20 [07:45<01:54, 28.54s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 06:18:55,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.800 = 0.50×1.00(exact) + 0.40×proc(0.500[fin=0.42,mean=0.62]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:18:55,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:18:55,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:10,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,540 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 80%|######## | 16/20 [08:16<01:54, 28.54s/q, loss=0.0001, mean_r=0.976, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 85%|########5 | 17/20 [08:16<01:27, 29.22s/q, loss=0.0001, mean_r=0.976, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:19:27,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:38,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:38,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:38,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.631[fin=0.60,mean=0.68]) + 0.10×fmt(1.000) | pred='120' gold='480' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 06:19:38,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:56,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:56,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:56,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:57,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:20:06,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.50(prox=0.50) + 0.40×proc(0.973[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='720' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 22 GRPO groups: 85%|########5 | 17/20 [09:03<01:27, 29.22s/q, loss=-0.0003, mean_r=0.929, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 90%|######### | 18/20 [09:03<01:09, 34.51s/q, loss=-0.0003, mean_r=0.929, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:20:42,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 06:20:42,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.723 = 0.50×0.55(prox=0.55) + 0.40×proc(0.870[fin=0.90,mean=0.82]) + 0.10×fmt(1.000) | pred='29' gold='49' | step_acc=82% lccp=64% (chain=7/11 ok_count=9) n_steps=11
+2026-04-26 06:20:42,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=0.98,mean=0.98]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:20:44,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 06:20:45,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=75% lccp=33% (chain=4/12 ok_count=9) n_steps=12
+2026-04-26 06:20:45,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:20:45,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.365 = 0.50×0.30(prox=0.30) + 0.40×proc(0.282[fin=0.13,mean=0.51]) + 0.10×fmt(1.000) | pred='105' gold='49' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+
Iter 22 GRPO groups: 90%|######### | 18/20 [09:55<01:09, 34.51s/q, loss=0.0000, mean_r=0.862, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 95%|#########5| 19/20 [09:55<00:39, 39.79s/q, loss=0.0000, mean_r=0.862, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:21:06,268 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.726 = clip(base=0.646 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.600 novelty=0.70 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:06,468 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.794 = clip(base=0.714 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.698 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:06,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.640 novelty=0.70 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.52)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:06,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.752 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:07,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.654 novelty=0.70 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.641 novelty=0.70 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.60)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.589 = clip(base=0.509 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.394 novelty=0.70 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.34)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:21:07,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.795 = clip(base=0.715 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.712 novelty=0.70 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.697 = clip(base=0.617 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.546 novelty=0.70 | sol=0.45*prm_final(0.76)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:21:08,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.513 novelty=0.70 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.36)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:21:13,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:21:13,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.979 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:13,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.979 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:13,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.961 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:14,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,946 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:15,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 22 GRPO groups: 95%|#########5| 19/20 [10:11<00:39, 39.79s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
Iter 22 GRPO groups: 100%|##########| 20/20 [10:11<00:00, 32.87s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
Iter 22 GRPO groups: 100%|##########| 20/20 [10:11<00:00, 30.59s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
+2026-04-26 06:21:16,825 INFO __main__ - Iter 22 | loss=0.0006 | reward mean=0.918 std=0.124 | gt_match=90.4% | grounded_acc=98.5% | step_acc=96.5% | lccp=92.8% | batch_acc=98.4% | phase=SELFPLAY_RAMP sp_ratio=32% | groups=20 skipped=6(0var=6) | lr=4.24e-06 | 611.9s
+2026-04-26 06:21:16,825 INFO __main__ - Question generation: 6/6 valid (100%) | q_reward=0.700 | q_acc=100.0% (>0.5 quality) | topic=0.62 diff=0.39 clarity=1.00 novelty=0.46 solvability=0.98
+2026-04-26 06:21:16,826 INFO __main__ - ======================================================================
+2026-04-26 06:21:16,826 INFO __main__ - GRPO ITERATION 23/60
+2026-04-26 06:21:16,826 INFO __main__ - ======================================================================
+2026-04-26 06:21:16,847 INFO __main__ - LR this iteration: 4.24e-06 | T=0.651 | MATH ratio=40%
+
Iter 23 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:21:21,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,389 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,779 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,977 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,178 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,374 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,938 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:28,794 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:28,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,751 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,135 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 0%| | 0/20 [00:15, ?q/s, loss=0.0001, mean_r=0.967, q_acc=100%, q_rew=0.722, skip=0]
Iter 23 GRPO groups: 5%|5 | 1/20 [00:15<04:51, 15.36s/q, loss=0.0001, mean_r=0.967, q_acc=100%, q_rew=0.722, skip=0]2026-04-26 06:21:41,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.878 novelty=0.73 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:41,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.936 novelty=0.73 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:41,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.679 = clip(base=0.599 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.469 novelty=0.73 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.54)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:21:41,939 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.836 novelty=0.73 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.76)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:42,155 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.639 novelty=0.73 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:21:42,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:42,594 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.803 = clip(base=0.723 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.664 novelty=0.73 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.55)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:42,801 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.873 novelty=0.73 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:43,008 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.613 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.60)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:43,222 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.729 = clip(base=0.649 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.540 novelty=0.73 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:21:49,104 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.934 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:49,301 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.963 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:49,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.961 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:21:49,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.988 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:49,894 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,096 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.965 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,295 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.976 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.932 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:50,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 23 GRPO groups: 5%|5 | 1/20 [00:35<04:51, 15.36s/q, loss=-0.0003, mean_r=0.903, q_acc=100%, q_rew=0.748, skip=0]
Iter 23 GRPO groups: 10%|# | 2/20 [00:35<05:29, 18.32s/q, loss=-0.0003, mean_r=0.903, q_acc=100%, q_rew=0.748, skip=0]2026-04-26 06:21:57,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:21:57,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:21:57,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:21:57,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:22:04,899 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:22:04,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:05,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:05,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:11,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:22:11,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 10%|# | 2/20 [00:54<05:29, 18.32s/q, loss=0var, mean_r=0.986, skip=1]
Iter 23 GRPO groups: 15%|#5 | 3/20 [00:54<05:16, 18.64s/q, loss=0var, mean_r=0.986, skip=1]2026-04-26 06:22:16,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:22:16,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.599[fin=0.57,mean=0.64]) + 0.10×fmt(1.000) | pred='1.25' gold='20' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 06:22:28,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.930[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=88% lccp=50% (chain=4/8 ok_count=7) n_steps=8
+2026-04-26 06:22:29,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.71(prox=0.71) + 0.40×proc(0.884[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='16' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:22:29,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.843 = 0.50×0.71(prox=0.71) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='24' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:22:29,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.768[fin=0.91,mean=0.56]) + 0.10×fmt(1.000) | pred='5' gold='20' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 06:22:41,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:22:41,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.680[fin=0.88,mean=0.38]) + 0.10×fmt(1.000) | pred='6.67' gold='20' | step_acc=29% lccp=14% (chain=1/7 ok_count=2) n_steps=7
+2026-04-26 06:22:41,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.837[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='40' gold='20' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 06:22:41,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×0.76(prox=0.76) + 0.40×proc(0.599[fin=0.69,mean=0.46]) + 0.10×fmt(1.000) | pred='16.8' gold='20' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+
Iter 23 GRPO groups: 15%|#5 | 3/20 [01:38<05:16, 18.64s/q, loss=0.0001, mean_r=0.754, q_acc=100%, q_rew=0.748, skip=1]
Iter 23 GRPO groups: 20%|## | 4/20 [01:38<07:37, 28.58s/q, loss=0.0001, mean_r=0.754, q_acc=100%, q_rew=0.748, skip=1]2026-04-26 06:23:29,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='13' gold='8' | step_acc=100% lccp=100% (chain=16/16 ok_count=16) n_steps=16
+2026-04-26 06:23:29,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.855 = 0.50×0.80(prox=0.80) + 0.40×proc(0.888[fin=0.88,mean=0.91]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=91% lccp=82% (chain=9/11 ok_count=10) n_steps=11
+
Iter 23 GRPO groups: 20%|## | 4/20 [02:12<07:37, 28.58s/q, loss=-0.0013, mean_r=0.703, q_acc=100%, q_rew=0.748, skip=1]
Iter 23 GRPO groups: 25%|##5 | 5/20 [02:12<07:38, 30.54s/q, loss=-0.0013, mean_r=0.703, q_acc=100%, q_rew=0.748, skip=1]2026-04-26 06:23:35,502 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:35,701 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:35,900 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,104 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.648 = clip(base=0.568 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.509 novelty=0.66 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.57)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:23:36,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,924 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.602 = clip(base=0.522 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.491 novelty=0.66 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:23:37,127 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:37,328 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:44,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:44,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.963 novelty=0.66 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:44,781 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:23:44,984 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.945 novelty=0.66 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.962 novelty=0.66 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.934 = clip(base=0.854 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.960 novelty=0.66 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.66 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.993 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.857 novelty=0.66 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:46,187 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.66 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 25%|##5 | 5/20 [02:31<07:38, 30.54s/q, loss=-0.0004, mean_r=0.917, q_acc=100%, q_rew=0.731, skip=1]
Iter 23 GRPO groups: 30%|### | 6/20 [02:31<06:11, 26.52s/q, loss=-0.0004, mean_r=0.917, q_acc=100%, q_rew=0.731, skip=1]2026-04-26 06:23:52,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:52,522 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 30%|### | 6/20 [02:54<06:11, 26.52s/q, loss=0var, mean_r=0.999, skip=2]
Iter 23 GRPO groups: 35%|###5 | 7/20 [02:54<05:29, 25.37s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 06:24:14,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:24,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:24,301 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 35%|###5 | 7/20 [03:07<05:29, 25.37s/q, loss=0var, mean_r=0.998, skip=3]
Iter 23 GRPO groups: 40%|#### | 8/20 [03:07<04:17, 21.47s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 06:24:28,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.942 = 0.50×1.00(exact) + 0.40×proc(0.855[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:24:28,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:24:38,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 40%|#### | 8/20 [03:26<04:17, 21.47s/q, loss=0var, mean_r=0.980, skip=4]
Iter 23 GRPO groups: 45%|####5 | 9/20 [03:26<03:47, 20.71s/q, loss=0var, mean_r=0.980, skip=4]2026-04-26 06:24:47,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.367 = 0.50×0.50(prox=0.50) + 0.40×proc(0.043[fin=0.03,mean=0.06]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:24:47,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='4' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:24:50,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.217 = 0.50×0.00(prox=0.00) + 0.40×proc(0.380[fin=0.47,mean=0.24]) + 0.10×fmt(0.650) | pred='4/7' gold='4' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 06:24:50,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.281 = 0.50×0.00(prox=0.00) + 0.40×proc(0.358[fin=0.38,mean=0.32]) + 0.10×fmt(1.000) | pred='4/3' gold='4' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:24:51,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:24:51,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.143 = 0.50×0.00(prox=0.00) + 0.40×proc(0.182[fin=0.15,mean=0.24]) + 0.10×fmt(0.700) | pred='' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:25:01,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.212 = 0.50×0.00(prox=0.00) + 0.40×proc(0.279[fin=0.29,mean=0.26]) + 0.10×fmt(1.000) | pred='2/3' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:25:01,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 45%|####5 | 9/20 [03:45<03:47, 20.71s/q, loss=0.0001, mean_r=0.517, q_acc=100%, q_rew=0.731, skip=4]
Iter 23 GRPO groups: 50%|##### | 10/20 [03:45<03:23, 20.34s/q, loss=0.0001, mean_r=0.517, q_acc=100%, q_rew=0.731, skip=4]2026-04-26 06:25:09,211 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:09,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:09,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:09,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,230 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,434 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,851 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:11,056 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:14,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.958 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.977 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,470 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.958 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,676 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:15,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:16,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:16,266 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:16,458 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:16,652 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.988 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+
Iter 23 GRPO groups: 50%|##### | 10/20 [04:01<03:23, 20.34s/q, loss=0.0013, mean_r=0.956, q_acc=100%, q_rew=0.724, skip=4]
Iter 23 GRPO groups: 55%|#####5 | 11/20 [04:01<02:49, 18.85s/q, loss=0.0013, mean_r=0.956, q_acc=100%, q_rew=0.724, skip=4]2026-04-26 06:25:19,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:20,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,732 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 55%|#####5 | 11/20 [04:17<02:49, 18.85s/q, loss=0var, mean_r=0.997, skip=5]
Iter 23 GRPO groups: 60%|###### | 12/20 [04:17<02:23, 17.95s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 06:25:38,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.783 = 0.50×0.64(prox=0.64) + 0.40×proc(0.912[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:25:38,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.64(prox=0.64) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 06:25:38,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.64(prox=0.64) + 0.40×proc(0.828[fin=0.93,mean=0.67]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:38,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.853 = 0.50×0.78(prox=0.78) + 0.40×proc(0.910[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:25:48,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.781 = 0.50×0.64(prox=0.64) + 0.40×proc(0.906[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:48,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.78(prox=0.78) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:48,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.78(prox=0.78) + 0.40×proc(0.833[fin=1.00,mean=0.59]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:25:48,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.853[fin=0.91,mean=0.77]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:26:01,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.880 = 0.50×0.78(prox=0.78) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:01,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.867 = 0.50×0.78(prox=0.78) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 60%|###### | 12/20 [04:46<02:23, 17.95s/q, loss=0.0008, mean_r=0.832, q_acc=100%, q_rew=0.724, skip=5]
Iter 23 GRPO groups: 65%|######5 | 13/20 [04:46<02:28, 21.28s/q, loss=0.0008, mean_r=0.832, q_acc=100%, q_rew=0.724, skip=5]2026-04-26 06:26:07,369 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:07,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:13,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:13,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:13,693 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:13,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:21,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:21,201 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:21,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:21,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 65%|######5 | 13/20 [05:13<02:28, 21.28s/q, loss=0var, mean_r=1.000, skip=6]
Iter 23 GRPO groups: 70%|####### | 14/20 [05:13<02:18, 23.02s/q, loss=0var, mean_r=1.000, skip=6]2026-04-26 06:26:33,759 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:33,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,352 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,734 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,121 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,315 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,524 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:39,581 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.790 = clip(base=0.710 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:26:39,772 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:39,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.992 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.989 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:26:40,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.772 = clip(base=0.692 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:26:40,539 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,922 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:41,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:41,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.772 = clip(base=0.692 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+
Iter 23 GRPO groups: 70%|####### | 14/20 [05:26<02:18, 23.02s/q, loss=0.0040, mean_r=0.926, q_acc=100%, q_rew=0.717, skip=6]
Iter 23 GRPO groups: 75%|#######5 | 15/20 [05:26<01:39, 19.94s/q, loss=0.0040, mean_r=0.926, q_acc=100%, q_rew=0.717, skip=6]2026-04-26 06:26:46,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:46,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:46,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:46,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:26:51,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:57,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:57,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 75%|#######5 | 15/20 [05:40<01:39, 19.94s/q, loss=0var, mean_r=0.987, skip=7]
Iter 23 GRPO groups: 80%|######## | 16/20 [05:40<01:12, 18.21s/q, loss=0var, mean_r=0.987, skip=7]2026-04-26 06:27:05,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.968 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:27:05,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.716 = clip(base=0.636 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.583 novelty=0.71 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:27:05,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.724 novelty=0.71 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:27:06,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.849 = clip(base=0.769 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.809 novelty=0.71 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.80)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:27:06,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.409 novelty=0.71 | sol=0.45*prm_final(0.40)+0.35*prm_mean(0.43)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:27:06,515 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.677 = clip(base=0.597 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.519 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.55)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:27:06,721 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.611 = clip(base=0.531 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.485 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:27:06,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.729 = clip(base=0.649 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.600 novelty=0.71 | sol=0.45*prm_final(0.57)+0.35*prm_mean(0.63)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:27:07,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.839 = clip(base=0.759 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.791 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:27:07,335 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.649 novelty=0.71 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:27:14,793 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,658 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,122 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+
Iter 23 GRPO groups: 80%|######## | 16/20 [06:01<01:12, 18.21s/q, loss=-0.0001, mean_r=0.859, q_acc=100%, q_rew=0.716, skip=7]
Iter 23 GRPO groups: 85%|########5 | 17/20 [06:01<00:57, 19.13s/q, loss=-0.0001, mean_r=0.859, q_acc=100%, q_rew=0.716, skip=7]2026-04-26 06:27:22,060 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:22,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,218 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 85%|########5 | 17/20 [06:18<00:57, 19.13s/q, loss=0var, mean_r=0.999, skip=8]
Iter 23 GRPO groups: 90%|######### | 18/20 [06:18<00:36, 18.44s/q, loss=0var, mean_r=0.999, skip=8]2026-04-26 06:27:42,106 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.985 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:27:42,296 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.795 = clip(base=0.715 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.678 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:27:42,491 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.729 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:27:42,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.966 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:42,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.694 = clip(base=0.614 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.578 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:27:43,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.963 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,474 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:27:43,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.985 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.715 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.76)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:28:20,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:20,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:28:20,708 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.841 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:28:20,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.984 = clip(base=0.904 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.987 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,105 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.943 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:21,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.959 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:21,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,893 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.994 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 90%|######### | 18/20 [07:06<00:36, 18.44s/q, loss=0.0010, mean_r=0.944, q_acc=100%, q_rew=0.726, skip=8]
Iter 23 GRPO groups: 95%|#########5| 19/20 [07:06<00:27, 27.38s/q, loss=0.0010, mean_r=0.944, q_acc=100%, q_rew=0.726, skip=8]2026-04-26 06:28:27,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:36,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:36,155 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:28:36,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:36,318 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:41,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:41,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 95%|#########5| 19/20 [07:24<00:27, 27.38s/q, loss=0var, mean_r=0.998, skip=9]
Iter 23 GRPO groups: 100%|##########| 20/20 [07:24<00:00, 24.45s/q, loss=0var, mean_r=0.998, skip=9]
Iter 23 GRPO groups: 100%|##########| 20/20 [07:24<00:00, 22.21s/q, loss=0var, mean_r=0.998, skip=9]
+2026-04-26 06:28:41,148 INFO __main__ - Iter 23 | loss=0.0006 | reward mean=0.921 std=0.147 | gt_match=80.3% | grounded_acc=95.1% | step_acc=90.8% | lccp=84.8% | batch_acc=97.7% | phase=SELFPLAY_RAMP sp_ratio=36% | groups=18 skipped=9(0var=9) | lr=4.14e-06 | 444.3s
+2026-04-26 06:28:41,149 WARNING __main__ - STARVATION: 33% of groups skipped (zero variance). grounded_acc=95.1% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 06:28:41,149 INFO __main__ - Question generation: 7/7 valid (100%) | q_reward=0.726 | q_acc=100.0% (>0.5 quality) | topic=0.56 diff=0.66 clarity=1.00 novelty=0.44 solvability=0.99
+2026-04-26 06:28:41,150 INFO __main__ - ======================================================================
+2026-04-26 06:28:41,150 INFO __main__ - GRPO ITERATION 24/60
+2026-04-26 06:28:41,150 INFO __main__ - ======================================================================
+2026-04-26 06:28:41,169 INFO __main__ - LR this iteration: 4.14e-06 | T=0.644 | MATH ratio=42%
+
Iter 24 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:28:45,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,755 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,321 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,506 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:47,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,449 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.964 novelty=0.71 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.880 = clip(base=0.800 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.952 novelty=0.71 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.945 novelty=0.71 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.974 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:54,027 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.974 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 0%| | 0/20 [00:14, ?q/s, loss=-0.0001, mean_r=0.899, q_acc=100%, q_rew=0.575, skip=0]
Iter 24 GRPO groups: 5%|5 | 1/20 [00:14<04:35, 14.52s/q, loss=-0.0001, mean_r=0.899, q_acc=100%, q_rew=0.575, skip=0]2026-04-26 06:28:58,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:58,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.713 = 0.50×0.67(prox=0.67) + 0.40×proc(0.786[fin=0.97,mean=0.51]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:06,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:29:06,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.679 = 0.50×0.67(prox=0.67) + 0.40×proc(0.702[fin=0.87,mean=0.45]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:06,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:29:06,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.67(prox=0.67) + 0.40×proc(0.796[fin=0.98,mean=0.51]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:15,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.693 = 0.50×0.67(prox=0.67) + 0.40×proc(0.738[fin=0.91,mean=0.47]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:15,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:29:15,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.67(prox=0.67) + 0.40×proc(0.777[fin=0.98,mean=0.48]) + 0.10×fmt(1.000) | pred='12' gold='16' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:29:15,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 24 GRPO groups: 5%|5 | 1/20 [00:43<04:35, 14.52s/q, loss=0.0018, mean_r=0.850, q_acc=100%, q_rew=0.575, skip=0]
Iter 24 GRPO groups: 10%|# | 2/20 [00:43<06:57, 23.18s/q, loss=0.0018, mean_r=0.850, q_acc=100%, q_rew=0.575, skip=0]2026-04-26 06:29:31,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:31,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:29:31,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:31,514 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=88% lccp=12% (chain=1/8 ok_count=7) n_steps=8
+2026-04-26 06:29:38,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:29:46,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:46,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 10%|# | 2/20 [01:05<06:57, 23.18s/q, loss=0var, mean_r=0.991, skip=1]
Iter 24 GRPO groups: 15%|#5 | 3/20 [01:05<06:21, 22.44s/q, loss=0var, mean_r=0.991, skip=1]2026-04-26 06:29:54,343 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.981 novelty=0.61 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:29:54,541 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.713 novelty=0.61 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:29:54,741 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.390 = clip(base=0.310 + mod=+0.080, cap=1.00) | Q=0.50 sol=0.184 novelty=0.61 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.31)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:54,945 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.509 = clip(base=0.429 + mod=+0.080, cap=1.00) | Q=0.49 sol=0.386 novelty=0.61 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.38)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:55,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.698 = clip(base=0.618 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.649 novelty=0.61 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.72)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:29:55,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.412 = clip(base=0.332 + mod=+0.080, cap=1.00) | Q=0.49 sol=0.224 novelty=0.61 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:55,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.985 novelty=0.61 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:29:55,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.636 = clip(base=0.556 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.531 novelty=0.61 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.64)+0.20*lccp(0.57) | steps=7
+2026-04-26 06:29:55,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.667 = clip(base=0.587 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.603 novelty=0.61 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.63)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:29:56,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.973 novelty=0.61 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:02,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.957 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:02,646 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.934 = clip(base=0.854 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.969 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:02,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.978 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.962 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:03,255 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:03,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.792 = clip(base=0.712 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.732 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:30:04,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:04,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+
Iter 24 GRPO groups: 15%|#5 | 3/20 [01:24<06:21, 22.44s/q, loss=0.0005, mean_r=0.801, q_acc=100%, q_rew=0.597, skip=1]
Iter 24 GRPO groups: 20%|## | 4/20 [01:24<05:40, 21.27s/q, loss=0.0005, mean_r=0.801, q_acc=100%, q_rew=0.597, skip=1]2026-04-26 06:30:15,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:15,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:15,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.66 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.989 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:16,771 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:17,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:17,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.975 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:23,279 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.691 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.725 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,688 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.689 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,895 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.742 = clip(base=0.662 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.668 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.713 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,313 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.702 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.768 = clip(base=0.688 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.708 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.724 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.722 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:25,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.788 = clip(base=0.708 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.726 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.00) | steps=3
+
Iter 24 GRPO groups: 20%|## | 4/20 [01:45<05:40, 21.27s/q, loss=0.0003, mean_r=0.861, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 25%|##5 | 5/20 [01:45<05:17, 21.14s/q, loss=0.0003, mean_r=0.861, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:30:35,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.386 = 0.50×0.00(prox=0.00) + 0.40×proc(0.714[fin=0.92,mean=0.41]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:30:35,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.00(prox=0.00) + 0.40×proc(0.873[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:30:47,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.380 = 0.50×0.47(prox=0.47) + 0.40×proc(0.109[fin=0.11,mean=0.10]) + 0.10×fmt(1.000) | pred='4' gold='9' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:30:47,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.844 = 0.50×0.82(prox=0.82) + 0.40×proc(0.837[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='8' gold='9' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 06:30:47,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.117 = 0.50×0.00(prox=0.00) + 0.40×proc(0.117[fin=0.08,mean=0.18]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+2026-04-26 06:30:47,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:30:56,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.987[fin=0.99,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:30:56,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:30:56,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.365 = 0.50×0.00(prox=0.00) + 0.40×proc(0.663[fin=0.91,mean=0.30]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:30:56,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 25%|##5 | 5/20 [02:23<05:17, 21.14s/q, loss=0.0002, mean_r=0.602, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 30%|### | 6/20 [02:23<06:15, 26.84s/q, loss=0.0002, mean_r=0.602, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:31:09,543 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:09,627 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.558 = 0.50×0.34(prox=0.34) + 0.40×proc(0.716[fin=0.82,mean=0.56]) + 0.10×fmt(1.000) | pred='0.8' gold='20' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:31:09,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:09,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.928 = 0.50×1.00(exact) + 0.40×proc(0.820[fin=0.82,mean=0.83]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:31:16,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:16,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.809[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:16,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.631 = 0.50×0.38(prox=0.38) + 0.40×proc(0.847[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='4' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:16,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.38(prox=0.38) + 0.40×proc(0.699[fin=0.87,mean=0.44]) + 0.10×fmt(1.000) | pred='4' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:23,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:23,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 30%|### | 6/20 [02:43<06:15, 26.84s/q, loss=-0.0022, mean_r=0.859, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 35%|###5 | 7/20 [02:43<05:19, 24.61s/q, loss=-0.0022, mean_r=0.859, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:31:31,924 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.954 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,576 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,440 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,663 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:43,891 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:44,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:31:44,322 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.971 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:44,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.392 = clip(base=0.312 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.173 novelty=0.77 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.27)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:31:44,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.791 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.75)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:31:44,986 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.682 = clip(base=0.602 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.617 novelty=0.77 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.58)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:31:45,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:31:45,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.651 = clip(base=0.571 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.564 novelty=0.77 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.69)+0.20*lccp(0.67) | steps=6
+2026-04-26 06:31:45,656 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.971 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:31:45,889 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.634 = clip(base=0.554 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.521 novelty=0.77 | sol=0.45*prm_final(0.48)+0.35*prm_mean(0.58)+0.20*lccp(0.50) | steps=4
+
Iter 24 GRPO groups: 35%|###5 | 7/20 [03:06<05:19, 24.61s/q, loss=0.0002, mean_r=0.892, q_acc=100%, q_rew=0.650, skip=2]
Iter 24 GRPO groups: 40%|#### | 8/20 [03:06<04:48, 24.02s/q, loss=0.0002, mean_r=0.892, q_acc=100%, q_rew=0.650, skip=2]2026-04-26 06:31:53,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:31:53,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:32:02,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:02,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:03,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:32:03,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:12,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:13,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:13,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:13,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 24 GRPO groups: 40%|#### | 8/20 [03:45<04:48, 24.02s/q, loss=-0.0007, mean_r=0.864, q_acc=100%, q_rew=0.650, skip=2]
Iter 24 GRPO groups: 45%|####5 | 9/20 [03:45<05:17, 28.82s/q, loss=-0.0007, mean_r=0.864, q_acc=100%, q_rew=0.650, skip=2]2026-04-26 06:32:36,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:36,436 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.964 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,821 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,010 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.453 = clip(base=0.373 + mod=+0.080, cap=1.00) | Q=0.50 sol=0.285 novelty=0.71 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.44)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:37,209 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,400 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,590 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:37,784 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:47,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.972 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:47,991 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.884 = clip(base=0.804 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:48,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.482 = clip(base=0.402 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.312 novelty=0.70 | sol=0.45*prm_final(0.10)+0.35*prm_mean(0.47)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:48,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.757 = clip(base=0.677 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.738 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:32:48,604 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.461 = clip(base=0.381 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.277 novelty=0.70 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.46)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:48,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.869 = clip(base=0.789 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.925 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:49,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.873 = clip(base=0.793 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.940 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:49,219 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.869 = clip(base=0.789 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.933 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:49,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.536 novelty=0.70 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.40)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:32:49,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.884 = clip(base=0.804 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+
Iter 24 GRPO groups: 45%|####5 | 9/20 [04:10<05:17, 28.82s/q, loss=-0.0018, mean_r=0.809, q_acc=100%, q_rew=0.634, skip=2]
Iter 24 GRPO groups: 50%|##### | 10/20 [04:10<04:34, 27.45s/q, loss=-0.0018, mean_r=0.809, q_acc=100%, q_rew=0.634, skip=2]2026-04-26 06:32:54,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:54,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:55,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:55,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.85(prox=0.85) + 0.40×proc(0.871[fin=0.98,mean=0.70]) + 0.10×fmt(1.000) | pred='242' gold='238' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:33:00,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.85(prox=0.85) + 0.40×proc(0.870[fin=0.98,mean=0.70]) + 0.10×fmt(1.000) | pred='242' gold='238' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:33:05,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:05,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 50%|##### | 10/20 [04:25<04:34, 27.45s/q, loss=0.0010, mean_r=0.974, q_acc=100%, q_rew=0.634, skip=2]
Iter 24 GRPO groups: 55%|#####5 | 11/20 [04:25<03:34, 23.79s/q, loss=0.0010, mean_r=0.974, q_acc=100%, q_rew=0.634, skip=2]2026-04-26 06:33:10,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:10,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.986 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.973 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,271 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,463 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.376 novelty=0.69 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.27)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:33:11,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.693 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:33:11,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:12,042 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:33:12,241 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:12,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:18,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.594 = clip(base=0.514 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.432 novelty=0.63 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.499 = clip(base=0.419 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.298 novelty=0.63 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,492 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.682 novelty=0.63 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.70)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,697 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.478 = clip(base=0.398 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.312 novelty=0.63 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.45)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.421 = clip(base=0.341 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.210 novelty=0.63 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.31)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:33:19,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.427 novelty=0.63 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.68)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:33:19,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.565 = clip(base=0.485 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.409 novelty=0.63 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.66)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:33:19,496 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.473 = clip(base=0.393 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.306 novelty=0.63 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.48)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:19,689 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.520 = clip(base=0.440 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.334 novelty=0.63 | sol=0.45*prm_final(0.10)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:19,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.553 = clip(base=0.473 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.397 novelty=0.63 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.66)+0.20*lccp(0.75) | steps=4
+
Iter 24 GRPO groups: 55%|#####5 | 11/20 [04:40<03:34, 23.79s/q, loss=-0.0001, mean_r=0.694, q_acc=100%, q_rew=0.625, skip=2]
Iter 24 GRPO groups: 60%|###### | 12/20 [04:40<02:49, 21.15s/q, loss=-0.0001, mean_r=0.694, q_acc=100%, q_rew=0.625, skip=2]2026-04-26 06:33:25,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:25,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:33,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:40,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,008 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 60%|###### | 12/20 [05:06<02:49, 21.15s/q, loss=0var, mean_r=0.998, skip=3]
Iter 24 GRPO groups: 65%|######5 | 13/20 [05:06<02:38, 22.70s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 06:33:56,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.990 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:33:56,564 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:56,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.964 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:33:56,951 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:33:57,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,780 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:33:57,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:58,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.989 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:34:07,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:34:07,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,457 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:08,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.571 novelty=0.71 | sol=0.45*prm_final(0.56)+0.35*prm_mean(0.62)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:34:08,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.751 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:34:08,474 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:34:08,683 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:08,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 65%|######5 | 13/20 [05:29<02:38, 22.70s/q, loss=-0.0013, mean_r=0.940, q_acc=100%, q_rew=0.637, skip=3]
Iter 24 GRPO groups: 70%|####### | 14/20 [05:29<02:15, 22.61s/q, loss=-0.0013, mean_r=0.940, q_acc=100%, q_rew=0.637, skip=3]2026-04-26 06:34:15,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:15,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:34:15,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:34:16,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:28,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:34:28,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:34:28,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:34:28,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:41,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:34:41,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 24 GRPO groups: 70%|####### | 14/20 [06:00<02:15, 22.61s/q, loss=0var, mean_r=0.999, skip=4]
Iter 24 GRPO groups: 75%|#######5 | 15/20 [06:00<02:05, 25.02s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 06:34:44,922 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:45,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:52,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 75%|#######5 | 15/20 [06:24<02:05, 25.02s/q, loss=0var, mean_r=0.998, skip=5]
Iter 24 GRPO groups: 80%|######## | 16/20 [06:24<01:39, 24.99s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 06:35:12,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:35:12,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 06:35:12,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:35:12,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:27,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:28,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:28,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:28,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:36,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:36,505 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 24 GRPO groups: 80%|######## | 16/20 [06:55<01:39, 24.99s/q, loss=0var, mean_r=0.999, skip=6]
Iter 24 GRPO groups: 85%|########5 | 17/20 [06:55<01:19, 26.62s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 06:35:40,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:40,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,524 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 85%|########5 | 17/20 [07:14<01:19, 26.62s/q, loss=0var, mean_r=0.995, skip=7]
Iter 24 GRPO groups: 90%|######### | 18/20 [07:14<00:48, 24.33s/q, loss=0var, mean_r=0.995, skip=7]2026-04-26 06:36:01,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:01,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.399 = 0.50×0.40(prox=0.40) + 0.40×proc(0.155[fin=0.08,mean=0.27]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:36:01,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.40(prox=0.40) + 0.40×proc(0.308[fin=0.27,mean=0.37]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:36:01,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.906[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 06:36:08,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40%' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:17,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.40(prox=0.40) + 0.40×proc(0.154[fin=0.04,mean=0.33]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 06:36:17,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='60' gold='40' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+
Iter 24 GRPO groups: 90%|######### | 18/20 [07:37<00:48, 24.33s/q, loss=0.0003, mean_r=0.747, q_acc=100%, q_rew=0.637, skip=7]
Iter 24 GRPO groups: 95%|#########5| 19/20 [07:37<00:24, 24.05s/q, loss=0.0003, mean_r=0.747, q_acc=100%, q_rew=0.637, skip=7]2026-04-26 06:36:24,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.593 = clip(base=0.513 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.278 novelty=0.76 | sol=0.45*prm_final(0.00)+0.35*prm_mean(0.50)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:36:25,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:25,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.976 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:25,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:25,838 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.983 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:26,046 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,702 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.979 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:26,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:30,816 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.948 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,411 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 95%|#########5| 19/20 [07:51<00:24, 24.05s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
Iter 24 GRPO groups: 100%|##########| 20/20 [07:51<00:00, 21.01s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
Iter 24 GRPO groups: 100%|##########| 20/20 [07:51<00:00, 23.58s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
+2026-04-26 06:36:32,850 INFO __main__ - Iter 24 | loss=-0.0002 | reward mean=0.880 std=0.173 | gt_match=79.2% | grounded_acc=93.3% | step_acc=89.9% | lccp=81.2% | batch_acc=93.6% | phase=SELFPLAY_RAMP sp_ratio=39% | groups=20 skipped=8(0var=8) | lr=4.03e-06 | 471.7s
+2026-04-26 06:36:32,851 INFO __main__ - Question generation: 8/8 valid (100%) | q_reward=0.662 | q_acc=100.0% (>0.5 quality) | topic=0.53 diff=0.31 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 06:36:32,852 INFO __main__ - ======================================================================
+2026-04-26 06:36:32,852 INFO __main__ - GRPO ITERATION 25/60
+2026-04-26 06:36:32,852 INFO __main__ - ======================================================================
+2026-04-26 06:36:32,871 INFO __main__ - LR this iteration: 4.03e-06 | T=0.637 | MATH ratio=44%
+
Iter 25 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:36:37,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,165 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,370 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,570 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,600 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,808 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:43,677 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:43,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,070 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,268 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,881 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,284 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 0%| | 0/20 [00:14, ?q/s, loss=-0.0001, mean_r=0.906, q_acc=100%, q_rew=0.567, skip=0]
Iter 25 GRPO groups: 5%|5 | 1/20 [00:14<04:31, 14.28s/q, loss=-0.0001, mean_r=0.906, q_acc=100%, q_rew=0.567, skip=0]2026-04-26 06:36:53,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.650 = clip(base=0.570 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.505 novelty=0.75 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.45)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:36:54,186 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.536 = clip(base=0.456 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.350 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.37)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:36:54,387 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.431 = clip(base=0.351 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.173 novelty=0.75 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.33)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:36:54,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.754 novelty=0.75 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.79)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:36:54,777 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.766 novelty=0.75 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.76)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:36:54,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.307 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.25)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:36:55,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.921 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:36:55,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.956 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:55,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.574 = clip(base=0.494 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.427 novelty=0.75 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:36:55,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.782 = clip(base=0.702 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.688 novelty=0.75 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.61)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:37:10,210 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.575 = clip(base=0.495 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.368 novelty=0.74 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:37:10,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.636 = clip(base=0.556 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.456 novelty=0.74 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.68)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:10,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.644 = clip(base=0.564 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.462 novelty=0.74 | sol=0.45*prm_final(0.56)+0.35*prm_mean(0.50)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:37:10,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.533 = clip(base=0.453 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.340 novelty=0.74 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.47)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:37:11,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.493 = clip(base=0.413 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.266 novelty=0.74 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.50)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:11,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.659 = clip(base=0.579 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.568 novelty=0.74 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:37:11,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.547 = clip(base=0.467 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.299 novelty=0.74 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.52)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:11,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.535 novelty=0.74 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.54)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:37:11,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.608 novelty=0.74 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:37:12,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.671 = clip(base=0.591 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.543 novelty=0.74 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=8
+
Iter 25 GRPO groups: 5%|5 | 1/20 [00:41<04:31, 14.28s/q, loss=-0.0009, mean_r=0.656, q_acc=100%, q_rew=0.618, skip=0]
Iter 25 GRPO groups: 10%|# | 2/20 [00:41<06:29, 21.66s/q, loss=-0.0009, mean_r=0.656, q_acc=100%, q_rew=0.618, skip=0]2026-04-26 06:37:19,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.761 = clip(base=0.681 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:19,702 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:19,887 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,075 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.776 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:37:20,459 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,840 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:21,027 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:21,218 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:26,454 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:26,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:26,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,664 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:37:28,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:28,262 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 10%|# | 2/20 [00:57<06:29, 21.66s/q, loss=0.0017, mean_r=0.891, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 15%|#5 | 3/20 [00:57<05:23, 19.06s/q, loss=0.0017, mean_r=0.891, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:37:35,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×1.00(exact) + 0.40×proc(0.371[fin=0.24,mean=0.56]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 06:37:35,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×1.00(exact) + 0.40×proc(0.795[fin=0.81,mean=0.77]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:37:43,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:37:43,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.851 = 0.50×1.00(exact) + 0.40×proc(0.628[fin=0.64,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 06:37:44,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:37:44,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:37:56,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:37:56,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.518[fin=0.41,mean=0.68]) + 0.10×fmt(1.000) | pred='10' gold='5' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:37:56,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:37:57,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 25 GRPO groups: 15%|#5 | 3/20 [01:35<05:23, 19.06s/q, loss=0.0010, mean_r=0.901, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 20%|## | 4/20 [01:35<07:07, 26.75s/q, loss=0.0010, mean_r=0.901, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:38:19,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:19,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:20,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:38:20,153 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:27,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:27,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:38:28,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:28,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:38:33,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:33,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 25 GRPO groups: 20%|## | 4/20 [02:01<07:07, 26.75s/q, loss=-0.0005, mean_r=0.908, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 25%|##5 | 5/20 [02:01<06:37, 26.53s/q, loss=-0.0005, mean_r=0.908, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:38:42,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:38:42,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.862 = 0.50×0.82(prox=0.82) + 0.40×proc(0.875[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:38:53,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:53,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.870 = 0.50×0.82(prox=0.82) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:38:54,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.820 = 0.50×0.82(prox=0.82) + 0.40×proc(0.771[fin=0.91,mean=0.56]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:38:54,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.869 = 0.50×0.82(prox=0.82) + 0.40×proc(0.892[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:39:07,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:39:07,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.82(prox=0.82) + 0.40×proc(0.931[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=88% lccp=25% (chain=2/8 ok_count=7) n_steps=8
+2026-04-26 06:39:07,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.78(prox=0.78) + 0.40×proc(0.851[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='12' gold='14' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:39:07,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.82(prox=0.82) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=89% lccp=22% (chain=2/9 ok_count=8) n_steps=9
+
Iter 25 GRPO groups: 25%|##5 | 5/20 [02:59<06:37, 26.53s/q, loss=0.0014, mean_r=0.900, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 30%|### | 6/20 [02:59<08:39, 37.08s/q, loss=0.0014, mean_r=0.900, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:39:38,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×0.67(prox=0.67) + 0.40×proc(0.912[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:38,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.43(prox=0.43) + 0.40×proc(0.375[fin=0.26,mean=0.54]) + 0.10×fmt(1.000) | pred='66.67' gold='200' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 06:39:38,998 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:39,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.770 = 0.50×0.67(prox=0.67) + 0.40×proc(0.841[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:49,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:49,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.790 = 0.50×0.67(prox=0.67) + 0.40×proc(0.892[fin=0.96,mean=0.79]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:49,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:49,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:57,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.67(prox=0.67) + 0.40×proc(0.257[fin=0.02,mean=0.61]) + 0.10×fmt(1.000) | pred='150' gold='200' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:39:57,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.67(prox=0.67) + 0.40×proc(0.902[fin=0.97,mean=0.80]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 25 GRPO groups: 30%|### | 6/20 [03:26<08:39, 37.08s/q, loss=-0.0004, mean_r=0.818, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 35%|###5 | 7/20 [03:26<07:20, 33.87s/q, loss=-0.0004, mean_r=0.818, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:40:10,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:10,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:14,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:15,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×1.00(exact) + 0.40×proc(0.798[fin=1.00,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:40:15,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:15,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:21,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=0.90,mean=0.90]) + 0.10×fmt(0.300) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=1/1 ok_count=1) n_steps=1
+2026-04-26 06:40:21,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:21,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.825[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:40:21,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+
Iter 25 GRPO groups: 35%|###5 | 7/20 [03:55<07:20, 33.87s/q, loss=0.0004, mean_r=0.274, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 40%|#### | 8/20 [03:55<06:27, 32.27s/q, loss=0.0004, mean_r=0.274, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:40:34,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:34,916 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:35,122 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:35,524 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,333 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,531 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.993 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.744 novelty=0.75 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:40:43,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.989 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,086 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.991 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,920 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 40%|#### | 8/20 [04:13<06:27, 32.27s/q, loss=0.0003, mean_r=0.960, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 45%|####5 | 9/20 [04:13<05:07, 27.91s/q, loss=0.0003, mean_r=0.960, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:41:20,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:41:20,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.504[fin=0.29,mean=0.83]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=80% lccp=80% (chain=8/10 ok_count=8) n_steps=10
+2026-04-26 06:41:20,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:41:20,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=0.98,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:41:29,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:41:29,460 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=94% lccp=11% (chain=2/18 ok_count=17) n_steps=18
+2026-04-26 06:41:29,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:41:29,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.912 = 0.50×1.00(exact) + 0.40×proc(0.779[fin=0.82,mean=0.72]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:41:34,582 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.50(prox=0.50) + 0.40×proc(0.863[fin=0.87,mean=0.85]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=83% lccp=44% (chain=8/18 ok_count=15) n_steps=18
+
Iter 25 GRPO groups: 45%|####5 | 9/20 [05:03<05:07, 27.91s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 50%|##### | 10/20 [05:03<05:45, 34.52s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:41:49,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:41:49,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:41:49,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:05,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:06,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:06,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.301 = 0.50×0.00(prox=0.00) + 0.40×proc(0.576[fin=0.54,mean=0.64]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 06:42:06,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:18,445 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:42:18,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:42:18,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 25 GRPO groups: 50%|##### | 10/20 [05:47<05:45, 34.52s/q, loss=0.0003, mean_r=0.705, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 55%|#####5 | 11/20 [05:47<05:37, 37.48s/q, loss=0.0003, mean_r=0.705, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:42:27,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.648 novelty=0.73 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:27,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:27,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:27,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,062 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.650 novelty=0.73 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.57)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:28,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,485 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,692 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.735 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:29,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.226 novelty=0.73 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.38)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:34,145 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,362 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,593 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,805 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,012 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,222 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.689 = clip(base=0.609 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.577 novelty=0.66 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:42:35,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,636 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,843 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:36,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 55%|#####5 | 11/20 [06:04<05:37, 37.48s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.655, skip=0]
Iter 25 GRPO groups: 60%|###### | 12/20 [06:04<04:11, 31.46s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.655, skip=0]2026-04-26 06:42:43,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:42:52,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:42:52,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:42:52,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:42:52,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:08,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 25 GRPO groups: 60%|###### | 12/20 [06:35<04:11, 31.46s/q, loss=0var, mean_r=0.995, skip=1]
Iter 25 GRPO groups: 65%|######5 | 13/20 [06:35<03:38, 31.27s/q, loss=0var, mean_r=0.995, skip=1]2026-04-26 06:43:21,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:21,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.841[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:43:21,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:26,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:43:26,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.448 = 0.50×0.33(prox=0.33) + 0.40×proc(0.452[fin=0.61,mean=0.22]) + 0.10×fmt(1.000) | pred='-12' gold='-6' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:43:26,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.00(prox=0.00) + 0.40×proc(0.907[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='-8/3' gold='-6' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:43:26,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.267 = 0.50×0.00(prox=0.00) + 0.40×proc(0.342[fin=0.38,mean=0.28]) + 0.10×fmt(1.000) | pred='$\\frac{4}{9}$' gold='-6' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+:1: SyntaxWarning: 'int' object is not callable; perhaps you missed a comma?
+2026-04-26 06:43:34,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:34,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.831[fin=1.00,mean=0.58]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 06:43:34,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 25 GRPO groups: 65%|######5 | 13/20 [07:03<03:38, 31.27s/q, loss=0.0017, mean_r=0.799, q_acc=100%, q_rew=0.655, skip=1]
Iter 25 GRPO groups: 70%|####### | 14/20 [07:03<03:01, 30.25s/q, loss=0.0017, mean_r=0.799, q_acc=100%, q_rew=0.655, skip=1]2026-04-26 06:43:43,057 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.633 = clip(base=0.553 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.481 novelty=0.72 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.64)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:43:43,260 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.835 = clip(base=0.755 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.875 novelty=0.72 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.969 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.665 = clip(base=0.585 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.601 novelty=0.72 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:43:44,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.948 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:44,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.637 = clip(base=0.557 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.528 novelty=0.72 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.56)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:43:44,512 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.315 = clip(base=0.235 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.024 novelty=0.72 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.04)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:43:44,725 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.663 = clip(base=0.583 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.599 novelty=0.72 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:43:44,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.773 = clip(base=0.693 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.756 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:43:49,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,040 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.953 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,430 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:50,627 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:50,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:43:51,016 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:51,217 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.858 = clip(base=0.778 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.909 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:51,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:43:51,610 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+
Iter 25 GRPO groups: 70%|####### | 14/20 [07:20<03:01, 30.25s/q, loss=0.0035, mean_r=0.811, q_acc=100%, q_rew=0.643, skip=1]
Iter 25 GRPO groups: 75%|#######5 | 15/20 [07:20<02:10, 26.20s/q, loss=0.0035, mean_r=0.811, q_acc=100%, q_rew=0.643, skip=1]2026-04-26 06:43:57,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.472 = 0.50×0.17(prox=0.17) + 0.40×proc(0.723[fin=0.91,mean=0.45]) + 0.10×fmt(1.000) | pred='14' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:44:04,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.40(prox=0.40) + 0.40×proc(0.460[fin=0.53,mean=0.36]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:44:04,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.505 = 0.50×0.67(prox=0.67) + 0.40×proc(0.179[fin=0.21,mean=0.13]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:44:04,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.683 = 0.50×0.67(prox=0.67) + 0.40×proc(0.625[fin=0.79,mean=0.38]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:44:04,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=0.99,mean=0.85]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:44:11,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.326 = 0.50×0.16(prox=0.16) + 0.40×proc(0.369[fin=0.42,mean=0.29]) + 0.10×fmt(1.000) | pred='14.67' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:44:11,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.875[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='3 1/3' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:44:11,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.85(prox=0.85) + 0.40×proc(0.537[fin=0.60,mean=0.45]) + 0.10×fmt(1.000) | pred='3.6666666666666665' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:44:11,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.410 = 0.50×0.17(prox=0.17) + 0.40×proc(0.566[fin=0.70,mean=0.37]) + 0.10×fmt(1.000) | pred='14' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:44:16,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.273 = 0.50×0.00(prox=0.00) + 0.40×proc(0.432[fin=0.50,mean=0.32]) + 0.10×fmt(1.000) | pred='3 2/3' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 25 GRPO groups: 75%|#######5 | 15/20 [07:45<02:10, 26.20s/q, loss=0.0012, mean_r=0.542, q_acc=100%, q_rew=0.643, skip=1]
Iter 25 GRPO groups: 80%|######## | 16/20 [07:45<01:43, 25.80s/q, loss=0.0012, mean_r=0.542, q_acc=100%, q_rew=0.643, skip=1]2026-04-26 06:44:26,314 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,517 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,138 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,757 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:28,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:34,427 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:34,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:34,831 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,238 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,444 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,644 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,848 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:36,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.500 = clip(base=0.420 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.341 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.26)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:44:36,258 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 80%|######## | 16/20 [08:05<01:43, 25.80s/q, loss=-0.0013, mean_r=0.893, q_acc=100%, q_rew=0.635, skip=1]
Iter 25 GRPO groups: 85%|########5 | 17/20 [08:05<01:11, 23.97s/q, loss=-0.0013, mean_r=0.893, q_acc=100%, q_rew=0.635, skip=1]2026-04-26 06:44:42,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.986 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:44:44,172 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:44,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.712 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:44:44,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.802 = clip(base=0.722 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.693 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:44:44,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:49,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.996 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:49,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,727 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.968 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,301 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 85%|########5 | 17/20 [08:20<01:11, 23.97s/q, loss=0.0015, mean_r=0.953, q_acc=100%, q_rew=0.648, skip=1]
Iter 25 GRPO groups: 90%|######### | 18/20 [08:20<00:42, 21.37s/q, loss=0.0015, mean_r=0.953, q_acc=100%, q_rew=0.648, skip=1]2026-04-26 06:44:57,614 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:57,845 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,045 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,439 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,019 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,439 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,812 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,003 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.898 = clip(base=0.818 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.981 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,197 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,761 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,949 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 90%|######### | 18/20 [08:33<00:42, 21.37s/q, loss=-0.0013, mean_r=0.934, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 95%|#########5| 19/20 [08:33<00:18, 18.97s/q, loss=-0.0013, mean_r=0.934, q_acc=100%, q_rew=0.647, skip=1]2026-04-26 06:45:08,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:08,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:08,659 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:11,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×1.00(exact) + 0.40×proc(0.582[fin=0.72,mean=0.37]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:45:11,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:11,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:11,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:15,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:15,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:16,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 25 GRPO groups: 95%|#########5| 19/20 [08:44<00:18, 18.97s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 100%|##########| 20/20 [08:44<00:00, 16.54s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 100%|##########| 20/20 [08:44<00:00, 26.23s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
+2026-04-26 06:45:17,509 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=112/10000)
+2026-04-26 06:45:17,510 INFO __main__ - Iter 25 | loss=0.0003 | reward mean=0.845 std=0.209 | gt_match=60.6% | grounded_acc=85.3% | step_acc=78.1% | lccp=63.7% | batch_acc=92.7% | phase=SELFPLAY_RAMP sp_ratio=43% | groups=28 skipped=1(0var=1) | lr=3.91e-06 | 524.7s
+2026-04-26 06:45:17,510 INFO __main__ - Question generation: 9/9 valid (100%) | q_reward=0.647 | q_acc=100.0% (>0.5 quality) | topic=0.51 diff=0.26 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 06:45:17,510 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:03<08:34, 3.45s/q, correct=1/1, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:08<10:51, 4.40s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:11<08:49, 3.60s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:13<07:31, 3.09s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:15<06:15, 2.59s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:20<08:34, 3.58s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:24<08:25, 3.54s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:26<07:32, 3.19s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:29<07:33, 3.22s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<08:34, 3.67s/q, correct=7/10, lccp=79.3%, score=0.890, step_acc=86.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<07:59, 3.45s/q, correct=8/11, lccp=81.2%, score=0.900, step_acc=87.9%]
GSM8K eval: 8%|8 | 12/150 [00:39<06:57, 3.02s/q, correct=9/12, lccp=82.8%, score=0.908, step_acc=88.9%]
GSM8K eval: 9%|8 | 13/150 [00:42<06:36, 2.89s/q, correct=10/13, lccp=84.1%, score=0.912, step_acc=89.7%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:40, 3.38s/q, correct=11/14, lccp=85.2%, score=0.918, step_acc=90.5%]
GSM8K eval: 10%|# | 15/150 [00:49<07:03, 3.14s/q, correct=12/15, lccp=86.2%, score=0.924, step_acc=91.1%]
GSM8K eval: 11%|# | 16/150 [00:51<06:32, 2.93s/q, correct=12/16, lccp=87.1%, score=0.900, step_acc=91.7%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:09, 3.23s/q, correct=13/17, lccp=87.8%, score=0.906, step_acc=92.2%]
GSM8K eval: 12%|#2 | 18/150 [01:01<08:45, 3.98s/q, correct=13/18, lccp=83.7%, score=0.895, step_acc=89.8%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:47, 3.57s/q, correct=14/19, lccp=84.5%, score=0.901, step_acc=90.4%]
GSM8K eval: 13%|#3 | 20/150 [01:07<07:53, 3.64s/q, correct=15/20, lccp=85.3%, score=0.906, step_acc=90.8%]
GSM8K eval: 14%|#4 | 21/150 [01:10<07:08, 3.32s/q, correct=16/21, lccp=86.0%, score=0.910, step_acc=91.3%]
GSM8K eval: 15%|#4 | 22/150 [01:13<06:42, 3.14s/q, correct=17/22, lccp=83.6%, score=0.905, step_acc=90.2%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:21, 3.48s/q, correct=18/23, lccp=84.3%, score=0.909, step_acc=90.6%]
GSM8K eval: 16%|#6 | 24/150 [01:19<06:44, 3.21s/q, correct=18/24, lccp=81.8%, score=0.893, step_acc=87.8%]
GSM8K eval: 17%|#6 | 25/150 [01:22<06:23, 3.07s/q, correct=18/25, lccp=79.6%, score=0.890, step_acc=87.3%]
GSM8K eval: 17%|#7 | 26/150 [01:27<07:12, 3.48s/q, correct=19/26, lccp=80.4%, score=0.894, step_acc=87.8%]
GSM8K eval: 18%|#8 | 27/150 [01:29<06:44, 3.29s/q, correct=19/27, lccp=81.1%, score=0.889, step_acc=88.3%]
GSM8K eval: 19%|#8 | 28/150 [01:32<06:01, 2.96s/q, correct=20/28, lccp=81.8%, score=0.893, step_acc=88.7%]
GSM8K eval: 19%|#9 | 29/150 [01:34<05:51, 2.90s/q, correct=21/29, lccp=82.4%, score=0.896, step_acc=89.1%]
GSM8K eval: 20%|## | 30/150 [01:38<06:23, 3.19s/q, correct=22/30, lccp=83.0%, score=0.900, step_acc=89.4%]
GSM8K eval: 21%|## | 31/150 [01:41<05:57, 3.00s/q, correct=23/31, lccp=83.5%, score=0.903, step_acc=89.8%]
GSM8K eval: 21%|##1 | 32/150 [01:43<05:11, 2.64s/q, correct=24/32, lccp=84.0%, score=0.905, step_acc=90.1%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:13, 2.68s/q, correct=25/33, lccp=84.5%, score=0.908, step_acc=90.4%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:48, 2.49s/q, correct=26/34, lccp=85.0%, score=0.911, step_acc=90.7%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:50, 2.52s/q, correct=27/35, lccp=85.4%, score=0.913, step_acc=91.0%]
GSM8K eval: 24%|##4 | 36/150 [01:54<05:21, 2.82s/q, correct=28/36, lccp=85.8%, score=0.915, step_acc=91.2%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:49, 2.57s/q, correct=29/37, lccp=86.2%, score=0.917, step_acc=91.4%]
GSM8K eval: 25%|##5 | 38/150 [01:59<05:02, 2.71s/q, correct=30/38, lccp=86.6%, score=0.919, step_acc=91.7%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:13, 3.36s/q, correct=31/39, lccp=86.9%, score=0.921, step_acc=91.9%]
GSM8K eval: 27%|##6 | 40/150 [02:10<07:46, 4.24s/q, correct=32/40, lccp=87.2%, score=0.923, step_acc=92.1%]
GSM8K eval: 27%|##7 | 41/150 [02:13<07:01, 3.86s/q, correct=32/41, lccp=87.5%, score=0.922, step_acc=92.3%]
GSM8K eval: 28%|##8 | 42/150 [02:16<06:37, 3.68s/q, correct=32/42, lccp=87.8%, score=0.921, step_acc=92.5%]
GSM8K eval: 29%|##8 | 43/150 [02:18<05:48, 3.25s/q, correct=33/43, lccp=88.1%, score=0.923, step_acc=92.6%]
GSM8K eval: 29%|##9 | 44/150 [02:25<07:26, 4.21s/q, correct=34/44, lccp=88.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 30%|### | 45/150 [02:28<06:50, 3.91s/q, correct=35/45, lccp=88.6%, score=0.926, step_acc=93.0%]
GSM8K eval: 31%|### | 46/150 [02:33<07:17, 4.21s/q, correct=35/46, lccp=86.7%, score=0.921, step_acc=92.9%]
GSM8K eval: 31%|###1 | 47/150 [02:36<06:39, 3.88s/q, correct=36/47, lccp=87.0%, score=0.923, step_acc=93.0%]
GSM8K eval: 32%|###2 | 48/150 [02:38<05:32, 3.26s/q, correct=37/48, lccp=87.3%, score=0.925, step_acc=93.2%]
GSM8K eval: 33%|###2 | 49/150 [02:41<05:39, 3.36s/q, correct=38/49, lccp=86.2%, score=0.926, step_acc=93.0%]
GSM8K eval: 33%|###3 | 50/150 [02:44<05:31, 3.31s/q, correct=38/50, lccp=85.5%, score=0.917, step_acc=92.1%]
GSM8K eval: 34%|###4 | 51/150 [02:46<04:31, 2.75s/q, correct=39/51, lccp=85.7%, score=0.919, step_acc=92.3%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:14, 3.21s/q, correct=39/52, lccp=84.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 35%|###5 | 53/150 [02:55<05:57, 3.68s/q, correct=39/53, lccp=83.6%, score=0.911, step_acc=91.5%]
GSM8K eval: 36%|###6 | 54/150 [02:58<05:42, 3.57s/q, correct=40/54, lccp=83.9%, score=0.912, step_acc=91.6%]
GSM8K eval: 37%|###6 | 55/150 [03:03<06:12, 3.92s/q, correct=41/55, lccp=84.2%, score=0.914, step_acc=91.8%]
GSM8K eval: 37%|###7 | 56/150 [03:07<05:58, 3.82s/q, correct=42/56, lccp=84.5%, score=0.915, step_acc=91.9%]
GSM8K eval: 38%|###8 | 57/150 [03:09<05:14, 3.38s/q, correct=43/57, lccp=84.8%, score=0.917, step_acc=92.1%]
GSM8K eval: 39%|###8 | 58/150 [03:13<05:32, 3.62s/q, correct=44/58, lccp=85.0%, score=0.918, step_acc=92.2%]
GSM8K eval: 39%|###9 | 59/150 [03:18<06:00, 3.96s/q, correct=44/59, lccp=83.6%, score=0.915, step_acc=91.8%]
GSM8K eval: 40%|#### | 60/150 [03:23<06:23, 4.27s/q, correct=45/60, lccp=83.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 41%|#### | 61/150 [03:26<05:52, 3.96s/q, correct=46/61, lccp=84.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 41%|####1 | 62/150 [03:29<05:26, 3.71s/q, correct=47/62, lccp=84.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 42%|####2 | 63/150 [03:33<05:13, 3.60s/q, correct=47/63, lccp=84.1%, score=0.913, step_acc=91.8%]
GSM8K eval: 43%|####2 | 64/150 [03:35<04:49, 3.37s/q, correct=48/64, lccp=84.4%, score=0.915, step_acc=91.9%]
GSM8K eval: 43%|####3 | 65/150 [03:38<04:32, 3.20s/q, correct=49/65, lccp=84.6%, score=0.916, step_acc=92.0%]
GSM8K eval: 44%|####4 | 66/150 [03:40<03:58, 2.84s/q, correct=50/66, lccp=84.8%, score=0.917, step_acc=92.2%]
GSM8K eval: 45%|####4 | 67/150 [03:42<03:41, 2.67s/q, correct=51/67, lccp=85.1%, score=0.918, step_acc=92.3%]
GSM8K eval: 45%|####5 | 68/150 [03:45<03:40, 2.68s/q, correct=52/68, lccp=85.3%, score=0.920, step_acc=92.4%]
GSM8K eval: 46%|####6 | 69/150 [03:47<03:10, 2.35s/q, correct=53/69, lccp=85.5%, score=0.921, step_acc=92.5%]
GSM8K eval: 47%|####6 | 70/150 [03:50<03:23, 2.54s/q, correct=54/70, lccp=84.3%, score=0.921, step_acc=92.3%]
GSM8K eval: 47%|####7 | 71/150 [03:53<03:35, 2.73s/q, correct=55/71, lccp=83.1%, score=0.922, step_acc=92.1%]
GSM8K eval: 48%|####8 | 72/150 [03:54<03:03, 2.35s/q, correct=56/72, lccp=83.3%, score=0.923, step_acc=92.3%]
GSM8K eval: 49%|####8 | 73/150 [03:56<02:44, 2.14s/q, correct=57/73, lccp=83.6%, score=0.924, step_acc=92.4%]
GSM8K eval: 49%|####9 | 74/150 [04:00<03:14, 2.56s/q, correct=58/74, lccp=83.8%, score=0.925, step_acc=92.5%]
GSM8K eval: 50%|##### | 75/150 [04:01<02:53, 2.31s/q, correct=59/75, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 51%|##### | 76/150 [04:08<04:26, 3.60s/q, correct=59/76, lccp=84.0%, score=0.921, step_acc=92.5%]
GSM8K eval: 51%|#####1 | 77/150 [04:12<04:29, 3.70s/q, correct=60/77, lccp=84.2%, score=0.922, step_acc=92.6%]
GSM8K eval: 52%|#####2 | 78/150 [04:14<04:00, 3.34s/q, correct=61/78, lccp=84.4%, score=0.923, step_acc=92.7%]
GSM8K eval: 53%|#####2 | 79/150 [04:17<03:50, 3.25s/q, correct=61/79, lccp=83.6%, score=0.917, step_acc=91.9%]
GSM8K eval: 53%|#####3 | 80/150 [04:20<03:42, 3.17s/q, correct=62/80, lccp=83.8%, score=0.918, step_acc=92.0%]
GSM8K eval: 54%|#####4 | 81/150 [04:23<03:22, 2.94s/q, correct=63/81, lccp=84.0%, score=0.919, step_acc=92.1%]
GSM8K eval: 55%|#####4 | 82/150 [04:26<03:19, 2.94s/q, correct=64/82, lccp=84.2%, score=0.920, step_acc=92.2%]
GSM8K eval: 55%|#####5 | 83/150 [04:29<03:14, 2.90s/q, correct=65/83, lccp=84.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 56%|#####6 | 84/150 [04:31<03:06, 2.83s/q, correct=66/84, lccp=84.6%, score=0.922, step_acc=92.4%]
GSM8K eval: 57%|#####6 | 85/150 [04:35<03:22, 3.12s/q, correct=67/85, lccp=84.7%, score=0.923, step_acc=92.5%]
GSM8K eval: 57%|#####7 | 86/150 [04:38<03:26, 3.22s/q, correct=68/86, lccp=84.9%, score=0.924, step_acc=92.6%]
GSM8K eval: 58%|#####8 | 87/150 [04:44<04:10, 3.97s/q, correct=69/87, lccp=85.1%, score=0.925, step_acc=92.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:46<03:27, 3.35s/q, correct=70/88, lccp=85.3%, score=0.926, step_acc=92.8%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:13, 3.16s/q, correct=71/89, lccp=85.4%, score=0.927, step_acc=92.8%]
GSM8K eval: 60%|###### | 90/150 [04:51<02:56, 2.94s/q, correct=72/90, lccp=85.6%, score=0.927, step_acc=92.9%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:21, 3.41s/q, correct=73/91, lccp=85.8%, score=0.928, step_acc=93.0%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:11, 3.30s/q, correct=74/92, lccp=85.9%, score=0.929, step_acc=93.1%]
GSM8K eval: 62%|######2 | 93/150 [05:06<04:22, 4.60s/q, correct=75/93, lccp=86.1%, score=0.929, step_acc=93.2%]
GSM8K eval: 63%|######2 | 94/150 [05:09<03:45, 4.03s/q, correct=75/94, lccp=85.1%, score=0.925, step_acc=92.2%]
GSM8K eval: 63%|######3 | 95/150 [05:15<04:04, 4.45s/q, correct=76/95, lccp=84.2%, score=0.925, step_acc=91.7%]
GSM8K eval: 64%|######4 | 96/150 [05:18<03:39, 4.07s/q, correct=76/96, lccp=83.7%, score=0.920, step_acc=91.1%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:14, 3.67s/q, correct=76/97, lccp=83.4%, score=0.918, step_acc=90.9%]
GSM8K eval: 65%|######5 | 98/150 [05:25<03:19, 3.84s/q, correct=76/98, lccp=83.0%, score=0.914, step_acc=90.7%]
GSM8K eval: 66%|######6 | 99/150 [05:27<02:52, 3.39s/q, correct=77/99, lccp=83.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 67%|######6 | 100/150 [05:29<02:28, 2.96s/q, correct=78/100, lccp=82.3%, score=0.915, step_acc=90.6%]
GSM8K eval: 67%|######7 | 101/150 [05:32<02:25, 2.96s/q, correct=78/101, lccp=82.0%, score=0.912, step_acc=90.4%]
GSM8K eval: 68%|######8 | 102/150 [05:33<02:00, 2.52s/q, correct=79/102, lccp=82.2%, score=0.912, step_acc=90.5%]
GSM8K eval: 69%|######8 | 103/150 [05:35<01:51, 2.37s/q, correct=80/103, lccp=82.3%, score=0.913, step_acc=90.6%]
GSM8K eval: 69%|######9 | 104/150 [05:40<02:22, 3.10s/q, correct=81/104, lccp=82.5%, score=0.914, step_acc=90.7%]
GSM8K eval: 70%|####### | 105/150 [05:43<02:12, 2.95s/q, correct=82/105, lccp=82.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 71%|####### | 106/150 [05:44<01:50, 2.52s/q, correct=83/106, lccp=82.8%, score=0.916, step_acc=90.9%]
GSM8K eval: 71%|#######1 | 107/150 [05:46<01:35, 2.23s/q, correct=84/107, lccp=83.0%, score=0.916, step_acc=91.0%]
GSM8K eval: 72%|#######2 | 108/150 [05:49<01:38, 2.35s/q, correct=85/108, lccp=83.1%, score=0.917, step_acc=91.1%]
GSM8K eval: 73%|#######2 | 109/150 [05:54<02:09, 3.16s/q, correct=85/109, lccp=82.7%, score=0.916, step_acc=91.0%]
GSM8K eval: 73%|#######3 | 110/150 [05:56<01:55, 2.89s/q, correct=86/110, lccp=82.8%, score=0.917, step_acc=91.1%]
GSM8K eval: 74%|#######4 | 111/150 [05:58<01:38, 2.53s/q, correct=87/111, lccp=83.0%, score=0.917, step_acc=91.2%]
GSM8K eval: 75%|#######4 | 112/150 [06:03<02:05, 3.31s/q, correct=87/112, lccp=83.2%, score=0.917, step_acc=91.2%]
GSM8K eval: 75%|#######5 | 113/150 [06:04<01:45, 2.86s/q, correct=88/113, lccp=83.3%, score=0.918, step_acc=91.3%]
GSM8K eval: 76%|#######6 | 114/150 [06:10<02:09, 3.59s/q, correct=89/114, lccp=82.8%, score=0.918, step_acc=91.3%]
GSM8K eval: 77%|#######6 | 115/150 [06:13<01:58, 3.39s/q, correct=90/115, lccp=83.0%, score=0.919, step_acc=91.3%]
GSM8K eval: 77%|#######7 | 116/150 [06:16<01:50, 3.24s/q, correct=91/116, lccp=83.1%, score=0.919, step_acc=91.4%]
GSM8K eval: 78%|#######8 | 117/150 [06:22<02:13, 4.06s/q, correct=92/117, lccp=83.3%, score=0.920, step_acc=91.5%]
GSM8K eval: 79%|#######8 | 118/150 [06:26<02:14, 4.19s/q, correct=92/118, lccp=82.6%, score=0.918, step_acc=91.4%]
GSM8K eval: 79%|#######9 | 119/150 [06:30<02:04, 4.01s/q, correct=92/119, lccp=82.7%, score=0.916, step_acc=91.5%]
GSM8K eval: 80%|######## | 120/150 [06:33<01:50, 3.68s/q, correct=93/120, lccp=82.8%, score=0.917, step_acc=91.6%]
GSM8K eval: 81%|######## | 121/150 [06:36<01:41, 3.50s/q, correct=94/121, lccp=83.0%, score=0.918, step_acc=91.6%]
GSM8K eval: 81%|########1 | 122/150 [06:39<01:34, 3.37s/q, correct=95/122, lccp=83.1%, score=0.918, step_acc=91.7%]
GSM8K eval: 82%|########2 | 123/150 [06:42<01:31, 3.39s/q, correct=96/123, lccp=83.3%, score=0.919, step_acc=91.8%]
GSM8K eval: 83%|########2 | 124/150 [06:44<01:19, 3.07s/q, correct=97/124, lccp=83.4%, score=0.920, step_acc=91.8%]
GSM8K eval: 83%|########3 | 125/150 [06:47<01:09, 2.76s/q, correct=98/125, lccp=83.5%, score=0.920, step_acc=91.9%]
GSM8K eval: 84%|########4 | 126/150 [06:49<01:06, 2.76s/q, correct=99/126, lccp=83.7%, score=0.921, step_acc=92.0%]
GSM8K eval: 85%|########4 | 127/150 [06:54<01:14, 3.26s/q, correct=100/127, lccp=83.8%, score=0.921, step_acc=92.0%]
GSM8K eval: 85%|########5 | 128/150 [06:57<01:09, 3.15s/q, correct=101/128, lccp=83.9%, score=0.922, step_acc=92.1%]
GSM8K eval: 86%|########6 | 129/150 [07:00<01:07, 3.21s/q, correct=102/129, lccp=84.0%, score=0.923, step_acc=92.1%]
GSM8K eval: 87%|########6 | 130/150 [07:02<00:55, 2.79s/q, correct=103/130, lccp=84.2%, score=0.923, step_acc=92.2%]
GSM8K eval: 87%|########7 | 131/150 [07:06<01:03, 3.34s/q, correct=104/131, lccp=84.3%, score=0.924, step_acc=92.3%]
GSM8K eval: 88%|########8 | 132/150 [07:08<00:50, 2.82s/q, correct=105/132, lccp=84.4%, score=0.924, step_acc=92.3%]
GSM8K eval: 89%|########8 | 133/150 [07:11<00:48, 2.85s/q, correct=106/133, lccp=84.5%, score=0.925, step_acc=92.4%]
GSM8K eval: 89%|########9 | 134/150 [07:15<00:53, 3.32s/q, correct=107/134, lccp=84.6%, score=0.925, step_acc=92.4%]
GSM8K eval: 90%|######### | 135/150 [07:18<00:48, 3.23s/q, correct=108/135, lccp=84.8%, score=0.926, step_acc=92.5%]
GSM8K eval: 91%|######### | 136/150 [07:23<00:50, 3.61s/q, correct=108/136, lccp=84.4%, score=0.925, step_acc=92.3%]
GSM8K eval: 91%|#########1| 137/150 [07:30<00:59, 4.56s/q, correct=109/137, lccp=84.5%, score=0.925, step_acc=92.4%]
GSM8K eval: 92%|#########2| 138/150 [07:34<00:52, 4.40s/q, correct=110/138, lccp=84.6%, score=0.926, step_acc=92.4%]
GSM8K eval: 93%|#########2| 139/150 [07:37<00:45, 4.12s/q, correct=111/139, lccp=84.7%, score=0.927, step_acc=92.5%]
GSM8K eval: 93%|#########3| 140/150 [07:41<00:41, 4.16s/q, correct=111/140, lccp=84.6%, score=0.923, step_acc=92.3%]
GSM8K eval: 94%|#########3| 141/150 [07:45<00:36, 4.07s/q, correct=112/141, lccp=84.7%, score=0.923, step_acc=92.3%]
GSM8K eval: 95%|#########4| 142/150 [07:50<00:33, 4.21s/q, correct=113/142, lccp=84.8%, score=0.924, step_acc=92.4%]
GSM8K eval: 95%|#########5| 143/150 [07:52<00:25, 3.63s/q, correct=114/143, lccp=84.9%, score=0.924, step_acc=92.4%]
GSM8K eval: 96%|#########6| 144/150 [07:54<00:19, 3.24s/q, correct=115/144, lccp=85.0%, score=0.925, step_acc=92.5%]
GSM8K eval: 97%|#########6| 145/150 [07:58<00:16, 3.22s/q, correct=115/145, lccp=84.4%, score=0.922, step_acc=92.0%]
GSM8K eval: 97%|#########7| 146/150 [08:00<00:12, 3.15s/q, correct=116/146, lccp=84.5%, score=0.922, step_acc=92.0%]
GSM8K eval: 98%|#########8| 147/150 [08:04<00:09, 3.31s/q, correct=117/147, lccp=84.6%, score=0.923, step_acc=92.1%]
GSM8K eval: 99%|#########8| 148/150 [08:08<00:06, 3.42s/q, correct=118/148, lccp=84.7%, score=0.923, step_acc=92.1%]
GSM8K eval: 99%|#########9| 149/150 [08:11<00:03, 3.45s/q, correct=119/149, lccp=84.8%, score=0.924, step_acc=92.2%]
GSM8K eval: 100%|##########| 150/150 [08:16<00:00, 3.87s/q, correct=119/150, lccp=84.7%, score=0.922, step_acc=92.0%]
GSM8K eval: 100%|##########| 150/150 [08:16<00:00, 3.31s/q, correct=119/150, lccp=84.7%, score=0.922, step_acc=92.0%]
+2026-04-26 06:53:34,257 INFO __main__ - Training Score [iter 25]: 0.9221 (best=0.9262) | n=150
+2026-04-26 06:53:34,257 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 06:53:34,257 INFO __main__ - Process score : prm_mean=0.903 prm_final=0.933 → weighted=0.921
+2026-04-26 06:53:34,257 INFO __main__ - Step accuracy : 92.0% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 06:53:34,258 INFO __main__ - Chain integrity (LCCP): 84.7% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 06:53:34,258 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 06:53:36,520 INFO __main__ - Pruned old checkpoint: iter_0005
+2026-04-26 06:53:36,527 INFO __main__ - ======================================================================
+2026-04-26 06:53:36,527 INFO __main__ - GRPO ITERATION 26/60
+2026-04-26 06:53:36,527 INFO __main__ - ======================================================================
+2026-04-26 06:53:36,548 INFO __main__ - LR this iteration: 3.91e-06 | T=0.631 | MATH ratio=46%
+
Iter 26 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:53:42,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.682 = 0.50×0.47(prox=0.47) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='110' gold='70' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 06:53:48,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×0.54(prox=0.54) + 0.40×proc(0.780[fin=0.97,mean=0.50]) + 0.10×fmt(0.650) | pred='100' gold='70' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:53:48,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.763 = 0.50×0.64(prox=0.64) + 0.40×proc(0.861[fin=0.97,mean=0.69]) + 0.10×fmt(1.000) | pred='50' gold='70' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:53:48,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:53:48,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.186 = 0.50×0.00(prox=0.00) + 0.40×proc(0.290[fin=0.32,mean=0.25]) + 0.10×fmt(0.700) | pred='' gold='70' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:53:57,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.886[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:53:57,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.78(prox=0.78) + 0.40×proc(0.696[fin=0.79,mean=0.55]) + 0.10×fmt(1.000) | pred='80' gold='70' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:53:57,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:53:57,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.732 = 0.50×0.54(prox=0.54) + 0.40×proc(0.906[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='100' gold='70' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:54:08,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.621 = 0.50×0.54(prox=0.54) + 0.40×proc(0.628[fin=0.80,mean=0.36]) + 0.10×fmt(1.000) | pred='100' gold='70' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 26 GRPO groups: 0%| | 0/20 [00:33, ?q/s, loss=0.0013, mean_r=0.728, skip=0]
Iter 26 GRPO groups: 5%|5 | 1/20 [00:33<10:43, 33.85s/q, loss=0.0013, mean_r=0.728, skip=0]2026-04-26 06:54:15,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:15,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:15,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:18,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:54:19,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:19,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:54:19,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:21,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:21,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:21,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 5%|5 | 1/20 [00:45<10:43, 33.85s/q, loss=0var, mean_r=0.984, skip=1]
Iter 26 GRPO groups: 10%|# | 2/20 [00:45<06:12, 20.67s/q, loss=0var, mean_r=0.984, skip=1]2026-04-26 06:54:29,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.603 novelty=0.74 | sol=0.45*prm_final(0.72)+0.35*prm_mean(0.57)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:54:29,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:30,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:30,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.425 = clip(base=0.345 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.215 novelty=0.74 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.44)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:54:30,572 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:30,772 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.483 = clip(base=0.403 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.303 novelty=0.74 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.41)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:54:30,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.931 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:54:31,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:31,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.990 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:31,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:36,639 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:36,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,629 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,827 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:38,022 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:38,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:38,426 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.784 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+
Iter 26 GRPO groups: 10%|# | 2/20 [01:03<06:12, 20.67s/q, loss=-0.0013, mean_r=0.880, q_acc=100%, q_rew=0.670, skip=1]
Iter 26 GRPO groups: 15%|#5 | 3/20 [01:03<05:33, 19.61s/q, loss=-0.0013, mean_r=0.880, q_acc=100%, q_rew=0.670, skip=1]2026-04-26 06:54:44,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,546 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:13,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 26 GRPO groups: 15%|#5 | 3/20 [01:36<05:33, 19.61s/q, loss=0var, mean_r=0.998, skip=2]
Iter 26 GRPO groups: 20%|## | 4/20 [01:36<06:37, 24.83s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 06:55:18,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:18,700 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:18,892 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:20,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:20,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:25,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,438 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,021 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,428 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.362 = clip(base=0.344 + mod=+0.018, cap=1.00) | Q=0.64 sol=0.147 novelty=0.63 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.13)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:55:26,812 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 20%|## | 4/20 [01:51<06:37, 24.83s/q, loss=0.0009, mean_r=0.923, q_acc=100%, q_rew=0.679, skip=2]
Iter 26 GRPO groups: 25%|##5 | 5/20 [01:51<05:21, 21.46s/q, loss=0.0009, mean_r=0.923, q_acc=100%, q_rew=0.679, skip=2]2026-04-26 06:55:40,159 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.964 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.968 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,553 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.984 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.720 = clip(base=0.640 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.508 novelty=0.80 | sol=0.45*prm_final(0.50)+0.35*prm_mean(0.52)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:40,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.601 novelty=0.80 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.59)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:55:41,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.960 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.741 novelty=0.80 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.65)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:41,556 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.972 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.969 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,961 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.960 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:46,010 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.982 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,984 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,368 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,558 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,757 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.988 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 25%|##5 | 5/20 [02:12<05:21, 21.46s/q, loss=-0.0003, mean_r=0.937, q_acc=100%, q_rew=0.703, skip=2]
Iter 26 GRPO groups: 30%|### | 6/20 [02:12<04:57, 21.28s/q, loss=-0.0003, mean_r=0.937, q_acc=100%, q_rew=0.703, skip=2]2026-04-26 06:55:58,228 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.401 novelty=0.65 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.63)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:58,451 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.862 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:58,668 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.601 = clip(base=0.521 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.423 novelty=0.65 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.52)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:55:58,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.436 = clip(base=0.356 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.223 novelty=0.65 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.30)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:55:59,086 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.664 = clip(base=0.584 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.569 novelty=0.65 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.56)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:55:59,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.950 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:55:59,500 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.397 novelty=0.65 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.68)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:59,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.704 = clip(base=0.624 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.634 novelty=0.65 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.70)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:59,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.438 = clip(base=0.358 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.222 novelty=0.65 | sol=0.45*prm_final(0.15)+0.35*prm_mean(0.30)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:56:00,125 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.448 novelty=0.65 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:56:05,131 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,330 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,334 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,534 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 26 GRPO groups: 30%|### | 6/20 [02:32<04:57, 21.28s/q, loss=0.0003, mean_r=0.768, q_acc=100%, q_rew=0.674, skip=2]
Iter 26 GRPO groups: 35%|###5 | 7/20 [02:32<04:27, 20.61s/q, loss=0.0003, mean_r=0.768, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 06:56:14,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:15,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:15,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:20,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.317 = 0.50×0.09(prox=0.09) + 0.40×proc(0.242[fin=0.06,mean=0.51]) + 0.10×fmt(1.000) | pred='18' gold='3' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:56:20,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:20,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:20,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 35%|###5 | 7/20 [02:50<04:27, 20.61s/q, loss=-0.0005, mean_r=0.930, q_acc=100%, q_rew=0.674, skip=2]
Iter 26 GRPO groups: 40%|#### | 8/20 [02:50<03:58, 19.85s/q, loss=-0.0005, mean_r=0.930, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 06:56:32,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:56:40,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:40,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:40,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:40,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:56:50,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:58,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 26 GRPO groups: 40%|#### | 8/20 [03:21<03:58, 19.85s/q, loss=0var, mean_r=0.991, skip=3]
Iter 26 GRPO groups: 45%|####5 | 9/20 [03:21<04:17, 23.39s/q, loss=0var, mean_r=0.991, skip=3]2026-04-26 06:57:01,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:01,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:01,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:57:07,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:07,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:08,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:08,150 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:12,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:57:12,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:12,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 26 GRPO groups: 45%|####5 | 9/20 [03:37<04:17, 23.39s/q, loss=0.0038, mean_r=0.980, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 50%|##### | 10/20 [03:37<03:31, 21.12s/q, loss=0.0038, mean_r=0.980, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:57:19,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.00(prox=0.00) + 0.40×proc(0.558[fin=0.64,mean=0.44]) + 0.10×fmt(1.000) | pred='11 1/7' gold='20' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:57:24,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:24,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:24,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.327 = 0.50×0.01(prox=0.01) + 0.40×proc(0.465[fin=0.53,mean=0.37]) + 0.10×fmt(1.000) | pred='1260' gold='20' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:57:24,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:27,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.54(prox=0.54) + 0.40×proc(0.327[fin=0.36,mean=0.27]) + 0.10×fmt(1.000) | pred='11.43' gold='20' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:57:28,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:28,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:28,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.560 = 0.50×0.20(prox=0.20) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='60' gold='20' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:57:32,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 26 GRPO groups: 50%|##### | 10/20 [03:57<03:31, 21.12s/q, loss=-0.0002, mean_r=0.774, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 55%|#####5 | 11/20 [03:57<03:07, 20.87s/q, loss=-0.0002, mean_r=0.774, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:57:45,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.817 = 0.50×0.85(prox=0.85) + 0.40×proc(0.730[fin=0.74,mean=0.72]) + 0.10×fmt(1.000) | pred='44' gold='41' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 06:57:45,295 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:57:45,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:57:58,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.295[fin=0.04,mean=0.68]) + 0.10×fmt(1.000) | pred='16' gold='41' | step_acc=78% lccp=78% (chain=7/9 ok_count=7) n_steps=9
+2026-04-26 06:57:58,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.542 = 0.50×0.67(prox=0.67) + 0.40×proc(0.265[fin=0.04,mean=0.61]) + 0.10×fmt(1.000) | pred='31' gold='41' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:57:58,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 06:57:58,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.32(prox=0.32) + 0.40×proc(0.355[fin=0.14,mean=0.68]) + 0.10×fmt(1.000) | pred='84' gold='41' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 06:58:22,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:58:22,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.558 = 0.50×0.63(prox=0.63) + 0.40×proc(0.357[fin=0.27,mean=0.48]) + 0.10×fmt(1.000) | pred='29' gold='41' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 06:58:22,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 26 GRPO groups: 55%|#####5 | 11/20 [04:47<03:07, 20.87s/q, loss=0.0014, mean_r=0.791, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 60%|###### | 12/20 [04:47<03:56, 29.52s/q, loss=0.0014, mean_r=0.791, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:58:57,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.577 = 0.50×0.78(prox=0.78) + 0.40×proc(0.221[fin=0.03,mean=0.51]) + 0.10×fmt(1.000) | pred='32' gold='28' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:59:05,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.78(prox=0.78) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='24' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:59:13,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 60%|###### | 12/20 [05:45<03:56, 29.52s/q, loss=-0.0000, mean_r=0.938, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 65%|######5 | 13/20 [05:45<04:27, 38.15s/q, loss=-0.0000, mean_r=0.938, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:59:36,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.616 novelty=0.76 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.48)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:59:36,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.721 = clip(base=0.641 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.599 novelty=0.76 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.48)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:59:37,090 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.947 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:59:37,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.966 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:37,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.921 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:59:37,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.986 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:37,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.967 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:59:38,166 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.979 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:38,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.815 novelty=0.76 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:59:38,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.965 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:57,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.783 = clip(base=0.703 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.626 novelty=0.75 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.43)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:57,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.577 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.46)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:57,614 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.588 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.41)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:57,835 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.651 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:59:58,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.679 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:58,312 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.830 = clip(base=0.750 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.695 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:59:58,549 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.741 = clip(base=0.661 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.596 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:58,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.249 novelty=0.75 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.53)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:58,994 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.799 = clip(base=0.719 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.643 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:59,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.719 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.58)+0.20*lccp(0.40) | steps=5
+
Iter 26 GRPO groups: 65%|######5 | 13/20 [06:24<04:27, 38.15s/q, loss=0.0012, mean_r=0.835, q_acc=100%, q_rew=0.695, skip=3]
Iter 26 GRPO groups: 70%|####### | 14/20 [06:24<03:50, 38.46s/q, loss=0.0012, mean_r=0.835, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 07:00:04,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:04,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:04,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.05(prox=0.05) + 0.40×proc(0.722[fin=0.88,mean=0.49]) + 0.10×fmt(1.000) | pred='443' gold='44' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:00:04,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.07(prox=0.07) + 0.40×proc(0.845[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='355' gold='44' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:00:11,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:11,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:11,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:12,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:16,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.214 = 0.50×0.07(prox=0.07) + 0.40×proc(0.203[fin=0.15,mean=0.28]) + 0.10×fmt(1.000) | pred='353' gold='44' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 07:00:16,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 26 GRPO groups: 70%|####### | 14/20 [06:41<03:50, 38.46s/q, loss=-0.0020, mean_r=0.804, q_acc=100%, q_rew=0.695, skip=3]
Iter 26 GRPO groups: 75%|#######5 | 15/20 [06:41<02:39, 31.98s/q, loss=-0.0020, mean_r=0.804, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 07:00:23,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.866 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:23,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.858 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:23,961 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.962 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:24,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.845 = clip(base=0.765 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.826 novelty=0.62 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:00:24,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.814 novelty=0.62 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.76)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:24,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.967 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:24,761 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.748 novelty=0.62 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.71)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:24,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.961 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:25,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.952 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:25,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.937 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:33,707 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.669 = clip(base=0.589 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.519 novelty=0.68 | sol=0.45*prm_final(0.74)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:33,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.697 = clip(base=0.617 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.602 novelty=0.68 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:34,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.186 novelty=0.68 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.31)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:00:34,318 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.676 = clip(base=0.596 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.558 novelty=0.68 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:34,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.755 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:34,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.851 = clip(base=0.771 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.824 novelty=0.68 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:34,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.387 novelty=0.68 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.34)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:00:35,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.435 = clip(base=0.355 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.149 novelty=0.68 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.21)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:00:35,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.392 = clip(base=0.312 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.107 novelty=0.68 | sol=0.45*prm_final(0.11)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:00:35,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.430 = clip(base=0.350 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.174 novelty=0.68 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.24)+0.20*lccp(0.33) | steps=3
+
Iter 26 GRPO groups: 75%|#######5 | 15/20 [07:00<02:39, 31.98s/q, loss=0.0016, mean_r=0.745, q_acc=100%, q_rew=0.692, skip=3]
Iter 26 GRPO groups: 80%|######## | 16/20 [07:00<01:52, 28.17s/q, loss=0.0016, mean_r=0.745, q_acc=100%, q_rew=0.692, skip=3]2026-04-26 07:00:44,289 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.424 = clip(base=0.344 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.199 novelty=0.66 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.33)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:44,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.736 novelty=0.66 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:00:44,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.501 novelty=0.66 | sol=0.45*prm_final(0.67)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:00:44,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.388 = clip(base=0.308 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.156 novelty=0.66 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.26)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.687 = clip(base=0.607 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.602 novelty=0.66 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,270 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.494 = clip(base=0.414 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.321 novelty=0.66 | sol=0.45*prm_final(0.21)+0.35*prm_mean(0.46)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:00:45,467 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.383 = clip(base=0.303 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.168 novelty=0.66 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.28)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:00:45,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.578 = clip(base=0.498 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.468 novelty=0.66 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.39)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,861 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.492 = clip(base=0.412 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.315 novelty=0.66 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:00:46,055 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.428 = clip(base=0.348 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.188 novelty=0.66 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.34)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:00:52,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.986 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:52,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.926 novelty=0.77 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:52,653 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:52,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:53,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.867 novelty=0.77 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:53,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.988 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:54,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 26 GRPO groups: 80%|######## | 16/20 [07:19<01:52, 28.17s/q, loss=0.0008, mean_r=0.738, q_acc=100%, q_rew=0.685, skip=3]
Iter 26 GRPO groups: 85%|########5 | 17/20 [07:19<01:15, 25.29s/q, loss=0.0008, mean_r=0.738, q_acc=100%, q_rew=0.685, skip=3]2026-04-26 07:01:06,608 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.670 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:01:06,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,018 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.969 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,229 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.798 = clip(base=0.718 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.773 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:01:07,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:08,081 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.675 novelty=0.71 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:01:08,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:08,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.655 = clip(base=0.575 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.527 novelty=0.71 | sol=0.45*prm_final(0.65)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:01:12,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,568 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,376 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 85%|########5 | 17/20 [07:39<01:15, 25.29s/q, loss=-0.0003, mean_r=0.876, q_acc=100%, q_rew=0.673, skip=3]
Iter 26 GRPO groups: 90%|######### | 18/20 [07:39<00:47, 23.91s/q, loss=-0.0003, mean_r=0.876, q_acc=100%, q_rew=0.673, skip=3]2026-04-26 07:01:22,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.855 = 0.50×0.75(prox=0.75) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='35' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:01:22,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:27,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:01:27,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:27,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:01:27,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:32,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.874[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:01:32,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 07:01:32,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.852[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:01:32,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 26 GRPO groups: 90%|######### | 18/20 [08:12<00:47, 23.91s/q, loss=0.0015, mean_r=0.969, q_acc=100%, q_rew=0.673, skip=3]
Iter 26 GRPO groups: 95%|#########5| 19/20 [08:12<00:26, 26.62s/q, loss=0.0015, mean_r=0.969, q_acc=100%, q_rew=0.673, skip=3]2026-04-26 07:01:54,781 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:54,996 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,728 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,415 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.991 = clip(base=0.911 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.969 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,813 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.355 novelty=0.77 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.40)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:02:03,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:03,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.963 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.587 = clip(base=0.507 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.348 novelty=0.77 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.60)+0.20*lccp(0.67) | steps=3
+2026-04-26 07:02:03,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.956 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,792 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.950 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:04,193 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.967 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 95%|#########5| 19/20 [08:29<00:26, 26.62s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
Iter 26 GRPO groups: 100%|##########| 20/20 [08:29<00:00, 23.66s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
Iter 26 GRPO groups: 100%|##########| 20/20 [08:29<00:00, 25.48s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
+2026-04-26 07:02:06,205 INFO __main__ - Iter 26 | loss=0.0004 | reward mean=0.867 std=0.180 | gt_match=78.9% | grounded_acc=92.7% | step_acc=89.0% | lccp=79.4% | batch_acc=92.0% | phase=SELFPLAY_RAMP sp_ratio=46% | groups=26 skipped=3(0var=3) | lr=3.80e-06 | 509.7s
+2026-04-26 07:02:06,206 INFO __main__ - Question generation: 9/9 valid (100%) | q_reward=0.679 | q_acc=100.0% (>0.5 quality) | topic=0.66 diff=0.25 clarity=1.00 novelty=0.45 solvability=0.93
+2026-04-26 07:02:06,207 INFO __main__ - ======================================================================
+2026-04-26 07:02:06,207 INFO __main__ - GRPO ITERATION 27/60
+2026-04-26 07:02:06,207 INFO __main__ - ======================================================================
+2026-04-26 07:02:06,226 INFO __main__ - LR this iteration: 3.80e-06 | T=0.624 | MATH ratio=48%
+
Iter 27 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:02:10,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:02:10,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.849[fin=0.80,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:10,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=0.93,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:11,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.832[fin=0.80,mean=0.89]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:18,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.904 = 0.50×1.00(exact) + 0.40×proc(0.761[fin=0.80,mean=0.70]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:02:18,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.862[fin=0.82,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:18,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=0.95,mean=0.93]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:18,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.911 = 0.50×1.00(exact) + 0.40×proc(0.779[fin=0.73,mean=0.85]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:28,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.905 = 0.50×1.00(exact) + 0.40×proc(0.762[fin=0.68,mean=0.89]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:28,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.901 = 0.50×1.00(exact) + 0.40×proc(0.752[fin=0.67,mean=0.87]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 27 GRPO groups: 0%| | 0/20 [00:23, ?q/s, loss=-0.0008, mean_r=0.933, skip=0]
Iter 27 GRPO groups: 5%|5 | 1/20 [00:23<07:28, 23.62s/q, loss=-0.0008, mean_r=0.933, skip=0]2026-04-26 07:02:36,835 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.954 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,036 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:37,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:37,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,232 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,632 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:42,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,125 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.972 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,535 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.983 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,930 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.973 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.982 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.777 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:02:44,734 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 5%|5 | 1/20 [00:40<07:28, 23.62s/q, loss=-0.0002, mean_r=0.991, q_acc=100%, q_rew=0.820, skip=1]
Iter 27 GRPO groups: 10%|# | 2/20 [00:40<05:50, 19.45s/q, loss=-0.0002, mean_r=0.991, q_acc=100%, q_rew=0.820, skip=1]2026-04-26 07:02:50,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:50,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:03:08,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:03:08,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:03:08,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:03:08,806 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 10%|# | 2/20 [01:09<05:50, 19.45s/q, loss=0var, mean_r=0.996, skip=2]
Iter 27 GRPO groups: 15%|#5 | 3/20 [01:09<06:45, 23.88s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 07:03:23,764 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:23,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.985 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,169 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.890 = clip(base=0.810 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.963 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:03:24,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.985 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,585 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.984 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.982 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.972 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,191 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.989 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.964 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:03:38,867 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:39,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.718 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:03:39,272 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.768 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.76)+0.20*lccp(0.38) | steps=8
+2026-04-26 07:03:39,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.766 = clip(base=0.686 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.755 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:03:39,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.948 novelty=0.73 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:39,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:40,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.753 = clip(base=0.673 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.716 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.12) | steps=8
+2026-04-26 07:03:40,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.988 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=11
+2026-04-26 07:03:40,477 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:40,700 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.802 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.89)+0.20*lccp(0.27) | steps=11
+
Iter 27 GRPO groups: 15%|#5 | 3/20 [01:36<06:45, 23.88s/q, loss=0.0002, mean_r=0.874, q_acc=100%, q_rew=0.709, skip=2]
Iter 27 GRPO groups: 20%|## | 4/20 [01:36<06:41, 25.06s/q, loss=0.0002, mean_r=0.874, q_acc=100%, q_rew=0.709, skip=2]2026-04-26 07:03:56,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:03:56,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.762 = 0.50×0.67(prox=0.67) + 0.40×proc(0.823[fin=1.00,mean=0.56]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:03:56,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:03:56,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.610 = 0.50×0.40(prox=0.40) + 0.40×proc(0.775[fin=0.98,mean=0.46]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:04:01,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.324 = 0.50×0.33(prox=0.33) + 0.40×proc(0.230[fin=0.29,mean=0.15]) + 0.10×fmt(0.650) | pred='0' gold='4' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 07:04:01,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.879[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='0' gold='4' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:04:01,703 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:04:01,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.623 = 0.50×0.40(prox=0.40) + 0.40×proc(0.807[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:04:04,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×1.00(exact) + 0.40×proc(0.739[fin=0.97,mean=0.39]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 07:04:04,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 20%|## | 4/20 [01:59<06:41, 25.06s/q, loss=0.0005, mean_r=0.775, q_acc=100%, q_rew=0.709, skip=2]
Iter 27 GRPO groups: 25%|##5 | 5/20 [01:59<06:07, 24.52s/q, loss=0.0005, mean_r=0.775, q_acc=100%, q_rew=0.709, skip=2]2026-04-26 07:04:15,120 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.974 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,308 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.904 novelty=0.77 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.77)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,495 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.687 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:15,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.910 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.76)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,872 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.767 = clip(base=0.687 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.582 novelty=0.77 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:16,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.481 = clip(base=0.401 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.208 novelty=0.77 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.39)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:04:16,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:04:16,441 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.691 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:04:16,627 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.689 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:16,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.907 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:25,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:25,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:04:26,593 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:04:27,002 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:27,219 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:27,425 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.973 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:27,630 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+
Iter 27 GRPO groups: 25%|##5 | 5/20 [02:23<06:07, 24.52s/q, loss=-0.0001, mean_r=0.891, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 30%|### | 6/20 [02:23<05:37, 24.14s/q, loss=-0.0001, mean_r=0.891, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:04:41,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:04:41,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.774 = 0.50×0.64(prox=0.64) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=80% lccp=20% (chain=2/10 ok_count=8) n_steps=10
+2026-04-26 07:05:05,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.769 = 0.50×0.64(prox=0.64) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:05:05,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(0.700) | pred='' gold='57' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:05:05,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.942 = 0.50×1.00(exact) + 0.40×proc(0.856[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:05:05,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:15,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.64(prox=0.64) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 07:05:15,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:15,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 07:05:15,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.64(prox=0.64) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 27 GRPO groups: 30%|### | 6/20 [03:16<05:37, 24.14s/q, loss=-0.0008, mean_r=0.858, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 35%|###5 | 7/20 [03:16<07:19, 33.80s/q, loss=-0.0008, mean_r=0.858, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:05:28,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:28,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.875[fin=0.89,mean=0.85]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:28,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:28,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:35,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:35,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:05:35,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.67(prox=0.67) + 0.40×proc(0.736[fin=0.88,mean=0.52]) + 0.10×fmt(1.000) | pred='30' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:05:36,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=0.94,mean=0.93]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:45,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:45,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 27 GRPO groups: 35%|###5 | 7/20 [03:40<07:19, 33.80s/q, loss=-0.0007, mean_r=0.961, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 40%|#### | 8/20 [03:40<06:08, 30.69s/q, loss=-0.0007, mean_r=0.961, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:05:55,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.948 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.993 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:05:56,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,619 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,831 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,049 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:05:57,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.976 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,687 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,901 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:04,223 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.962 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:04,434 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.803 = clip(base=0.723 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.672 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:06:04,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:06:04,855 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:05,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:06:05,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:05,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:05,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:05,905 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.729 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:06:06,118 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+
Iter 27 GRPO groups: 40%|#### | 8/20 [04:01<06:08, 30.69s/q, loss=-0.0006, mean_r=0.982, q_acc=100%, q_rew=0.745, skip=3]
Iter 27 GRPO groups: 45%|####5 | 9/20 [04:01<05:03, 27.55s/q, loss=-0.0006, mean_r=0.982, q_acc=100%, q_rew=0.745, skip=3]2026-04-26 07:06:13,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:13,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:06:21,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:21,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:21,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:06:21,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:26,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:26,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:26,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:06:26,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 45%|####5 | 9/20 [04:25<05:03, 27.55s/q, loss=0var, mean_r=0.995, skip=4]
Iter 27 GRPO groups: 50%|##### | 10/20 [04:25<04:24, 26.48s/q, loss=0var, mean_r=0.995, skip=4]2026-04-26 07:07:05,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:05,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.927[fin=0.96,mean=0.88]) + 0.10×fmt(1.000) | pred='3' gold='0' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 07:07:06,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:06,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:14,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.01(prox=0.01) + 0.40×proc(0.949[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='84' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:14,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:07:15,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 07:07:15,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.949[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:25,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 27 GRPO groups: 50%|##### | 10/20 [05:20<04:24, 26.48s/q, loss=0.0008, mean_r=0.647, q_acc=100%, q_rew=0.745, skip=4]
Iter 27 GRPO groups: 55%|#####5 | 11/20 [05:20<05:17, 35.27s/q, loss=0.0008, mean_r=0.647, q_acc=100%, q_rew=0.745, skip=4]2026-04-26 07:07:32,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:32,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,270 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.785 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:07:33,466 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,672 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:07:33,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,073 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:37,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:37,834 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,009 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,188 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,367 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,917 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:39,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:39,281 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+
Iter 27 GRPO groups: 55%|#####5 | 11/20 [05:34<05:17, 35.27s/q, loss=-0.0004, mean_r=0.898, q_acc=100%, q_rew=0.711, skip=4]
Iter 27 GRPO groups: 60%|###### | 12/20 [05:34<03:50, 28.79s/q, loss=-0.0004, mean_r=0.898, q_acc=100%, q_rew=0.711, skip=4]2026-04-26 07:07:49,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.59(prox=0.59) + 0.40×proc(0.908[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='230' gold='350' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 07:07:49,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:49,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:08:00,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×0.74(prox=0.74) + 0.40×proc(0.664[fin=0.68,mean=0.64]) + 0.10×fmt(1.000) | pred='410' gold='350' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:08:00,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:08:00,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.863[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:08:00,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.926[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:08:09,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.476 = 0.50×0.54(prox=0.54) + 0.40×proc(0.267[fin=0.18,mean=0.40]) + 0.10×fmt(1.000) | pred='200' gold='350' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 07:08:09,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:08:09,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.779[fin=0.96,mean=0.50]) + 0.10×fmt(1.000) | pred='70' gold='350' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 27 GRPO groups: 60%|###### | 12/20 [06:04<03:50, 28.79s/q, loss=-0.0007, mean_r=0.843, q_acc=100%, q_rew=0.711, skip=4]
Iter 27 GRPO groups: 65%|######5 | 13/20 [06:04<03:23, 29.06s/q, loss=-0.0007, mean_r=0.843, q_acc=100%, q_rew=0.711, skip=4]2026-04-26 07:08:17,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,069 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,182 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,404 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,847 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,444 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,646 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.977 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,464 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:28,081 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.747 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:08:28,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 27 GRPO groups: 65%|######5 | 13/20 [06:23<03:23, 29.06s/q, loss=-0.0009, mean_r=0.902, q_acc=100%, q_rew=0.689, skip=4]
Iter 27 GRPO groups: 70%|####### | 14/20 [06:23<02:36, 26.11s/q, loss=-0.0009, mean_r=0.902, q_acc=100%, q_rew=0.689, skip=4]2026-04-26 07:08:41,304 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.974 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:08:41,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.857 = clip(base=0.777 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.900 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:08:41,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.826 novelty=0.74 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.79)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:41,887 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.622 = clip(base=0.542 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.490 novelty=0.74 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.58)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:08:42,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.814 novelty=0.74 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.80)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:42,269 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.950 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:08:42,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.940 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:08:42,650 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.782 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:08:42,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.842 novelty=0.74 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.82)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:43,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.837 novelty=0.74 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:57,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.483 = clip(base=0.403 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.281 novelty=0.73 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.20)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:57,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.584 = clip(base=0.504 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.485 novelty=0.73 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.46)+0.20*lccp(0.14) | steps=7
+2026-04-26 07:08:58,140 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.454 = clip(base=0.374 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.272 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.37)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:08:58,369 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.470 novelty=0.73 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:58,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.356 = clip(base=0.276 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.110 novelty=0.73 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:58,821 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.369 = clip(base=0.289 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.123 novelty=0.73 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.18)+0.20*lccp(0.00) | steps=10
+2026-04-26 07:08:59,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.549 = clip(base=0.469 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.383 novelty=0.73 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.61)+0.20*lccp(0.11) | steps=9
+2026-04-26 07:08:59,282 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.376 = clip(base=0.296 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.148 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.24)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:59,506 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.644 = clip(base=0.564 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.553 novelty=0.73 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.64)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:59,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.695 = clip(base=0.615 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.639 novelty=0.73 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=5
+
Iter 27 GRPO groups: 70%|####### | 14/20 [06:55<02:36, 26.11s/q, loss=-0.0002, mean_r=0.665, q_acc=100%, q_rew=0.673, skip=4]
Iter 27 GRPO groups: 75%|#######5 | 15/20 [06:55<02:18, 27.72s/q, loss=-0.0002, mean_r=0.665, q_acc=100%, q_rew=0.673, skip=4]2026-04-26 07:09:11,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:21,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:09:21,795 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:21,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.97,mean=0.82]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:09:21,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,478 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:09:35,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.73(prox=0.73) + 0.40×proc(0.332[fin=0.19,mean=0.55]) + 0.10×fmt(1.000) | pred='19' gold='16' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 07:09:43,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 27 GRPO groups: 75%|#######5 | 15/20 [07:39<02:18, 27.72s/q, loss=-0.0001, mean_r=0.956, q_acc=100%, q_rew=0.673, skip=4]
Iter 27 GRPO groups: 80%|######## | 16/20 [07:39<02:10, 32.59s/q, loss=-0.0001, mean_r=0.956, q_acc=100%, q_rew=0.673, skip=4]2026-04-26 07:09:52,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:52,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,087 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,713 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,119 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:59,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.926 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:09:59,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.898 = clip(base=0.818 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:00,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.691 = clip(base=0.611 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.637 novelty=0.69 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.64)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:00,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.750 = clip(base=0.670 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.735 novelty=0.69 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:10:00,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.734 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.64)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:00,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:00,988 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.497 = clip(base=0.417 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.358 novelty=0.69 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.41)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:01,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.894 = clip(base=0.814 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.976 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:01,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.729 novelty=0.69 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:01,621 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.946 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 80%|######## | 16/20 [07:57<02:10, 32.59s/q, loss=-0.0004, mean_r=0.851, q_acc=100%, q_rew=0.661, skip=4]
Iter 27 GRPO groups: 85%|########5 | 17/20 [07:57<01:24, 28.21s/q, loss=-0.0004, mean_r=0.851, q_acc=100%, q_rew=0.661, skip=4]2026-04-26 07:10:39,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.967 + mod=+0.080, cap=1.00) | Q=0.93 sol=0.992 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:39,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:39,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.993 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,454 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.949 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:40,647 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.950 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:40,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.949 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:41,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:43,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.976 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:43,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.981 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,328 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,522 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.976 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,909 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.989 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.977 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,296 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.982 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.968 novelty=0.65 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 85%|########5 | 17/20 [08:40<01:24, 28.21s/q, loss=0.0001, mean_r=0.968, q_acc=100%, q_rew=0.674, skip=5]
Iter 27 GRPO groups: 90%|######### | 18/20 [08:40<01:05, 32.88s/q, loss=0.0001, mean_r=0.968, q_acc=100%, q_rew=0.674, skip=5]2026-04-26 07:10:59,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.11(prox=0.11) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='3' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:10:59,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1/8' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:10:59,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.14(prox=0.14) + 0.40×proc(0.847[fin=0.98,mean=0.65]) + 0.10×fmt(1.000) | pred='12' gold='3' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:11:12,171 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:11:12,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:11:12,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.706 = 0.50×0.50(prox=0.50) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='1.5' gold='3' | step_acc=75% lccp=62% (chain=5/8 ok_count=6) n_steps=8
+2026-04-26 07:11:12,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.169 = 0.50×0.02(prox=0.02) + 0.40×proc(0.141[fin=0.06,mean=0.26]) + 0.10×fmt(1.000) | pred='63' gold='3' | step_acc=14% lccp=0% (chain=0/7 ok_count=1) n_steps=7
+2026-04-26 07:11:35,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.203 = 0.50×0.00(prox=0.00) + 0.40×proc(0.252[fin=0.33,mean=0.13]) + 0.10×fmt(1.000) | pred='315' gold='3' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 07:11:35,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.333 = 0.50×0.11(prox=0.11) + 0.40×proc(0.443[fin=0.55,mean=0.28]) + 0.10×fmt(1.000) | pred='15' gold='3' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:11:36,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.300 = 0.50×0.02(prox=0.02) + 0.40×proc(0.355[fin=0.30,mean=0.44]) + 0.10×fmt(1.000) | pred='90' gold='3' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 27 GRPO groups: 90%|######### | 18/20 [09:31<01:05, 32.88s/q, loss=0.0003, mean_r=0.532, q_acc=100%, q_rew=0.674, skip=5]
Iter 27 GRPO groups: 95%|#########5| 19/20 [09:31<00:38, 38.15s/q, loss=0.0003, mean_r=0.532, q_acc=100%, q_rew=0.674, skip=5]2026-04-26 07:11:42,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.992 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:42,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.994 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,314 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,505 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,705 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,901 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,091 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.987 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:11:59,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.708 = clip(base=0.628 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.499 novelty=0.74 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.87)+0.20*lccp(0.87) | steps=15
+2026-04-26 07:12:00,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.898 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(0.71) | steps=7
+2026-04-26 07:12:00,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.739 = clip(base=0.659 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.601 novelty=0.74 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:12:00,617 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.893 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.91)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:12:00,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.550 = clip(base=0.470 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.288 novelty=0.74 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.44)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:12:01,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.942 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:12:01,291 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.978 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:12:01,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.684 novelty=0.74 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.80)+0.20*lccp(0.71) | steps=7
+2026-04-26 07:12:01,746 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.742 = clip(base=0.662 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.551 novelty=0.74 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.60)+0.20*lccp(0.14) | steps=7
+2026-04-26 07:12:01,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.667 = clip(base=0.587 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.446 novelty=0.74 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.50)+0.20*lccp(0.40) | steps=5
+
Iter 27 GRPO groups: 95%|#########5| 19/20 [09:57<00:38, 38.15s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
Iter 27 GRPO groups: 100%|##########| 20/20 [09:57<00:00, 34.56s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
Iter 27 GRPO groups: 100%|##########| 20/20 [09:57<00:00, 29.87s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
+2026-04-26 07:12:03,730 INFO __main__ - Iter 27 | loss=-0.0002 | reward mean=0.878 std=0.163 | gt_match=68.7% | grounded_acc=93.9% | step_acc=86.2% | lccp=74.1% | batch_acc=95.6% | phase=SELFPLAY_RAMP sp_ratio=50% | groups=25 skipped=5(0var=5) | lr=3.67e-06 | 597.5s
+2026-04-26 07:12:03,730 INFO __main__ - Question generation: 10/10 valid (100%) | q_reward=0.683 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.36 clarity=1.00 novelty=0.46 solvability=0.98
+2026-04-26 07:12:03,731 INFO __main__ - ======================================================================
+2026-04-26 07:12:03,731 INFO __main__ - GRPO ITERATION 28/60
+2026-04-26 07:12:03,731 INFO __main__ - ======================================================================
+2026-04-26 07:12:03,751 INFO __main__ - LR this iteration: 3.67e-06 | T=0.617 | MATH ratio=50%
+
Iter 28 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:12:09,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.879 = clip(base=0.799 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.925 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.740 = clip(base=0.660 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.723 novelty=0.65 | sol=0.45*prm_final(0.51)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.765 = clip(base=0.685 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.762 novelty=0.65 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.767 = clip(base=0.687 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.768 novelty=0.65 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,688 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.828 = clip(base=0.748 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.869 novelty=0.65 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.790 = clip(base=0.710 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.807 novelty=0.65 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,108 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.719 = clip(base=0.639 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.684 novelty=0.65 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:12:11,316 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.877 novelty=0.65 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.738 novelty=0.65 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.844 novelty=0.65 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:18,939 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,141 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,813 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,015 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 28 GRPO groups: 0%| | 0/20 [00:18, ?q/s, loss=0.0005, mean_r=0.872, q_acc=100%, q_rew=0.630, skip=0]
Iter 28 GRPO groups: 5%|5 | 1/20 [00:18<05:56, 18.76s/q, loss=0.0005, mean_r=0.872, q_acc=100%, q_rew=0.630, skip=0]2026-04-26 07:12:44,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.00(prox=0.00) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{62}$' gold='13' | step_acc=78% lccp=33% (chain=3/9 ok_count=7) n_steps=9
+2026-04-26 07:13:07,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×0.85(prox=0.85) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='12' gold='13' | step_acc=76% lccp=29% (chain=5/17 ok_count=13) n_steps=17
+2026-04-26 07:13:07,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.00(prox=0.00) + 0.40×proc(0.844[fin=1.00,mean=0.61]) + 0.10×fmt(1.000) | pred='$10 + \\sqrt{170}$' gold='13' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 07:13:07,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×1.00(exact) + 0.40×proc(0.345[fin=0.25,mean=0.48]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=42% lccp=17% (chain=2/12 ok_count=5) n_steps=12
+2026-04-26 07:13:07,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.62(prox=0.62) + 0.40×proc(0.830[fin=0.98,mean=0.61]) + 0.10×fmt(1.000) | pred='9' gold='13' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 07:13:28,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.692[fin=0.72,mean=0.65]) + 0.10×fmt(1.000) | pred='No solution' gold='13' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 07:13:28,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.682 = 0.50×0.52(prox=0.52) + 0.40×proc(0.805[fin=0.98,mean=0.54]) + 0.10×fmt(1.000) | pred='19' gold='13' | step_acc=45% lccp=0% (chain=0/11 ok_count=5) n_steps=11
+2026-04-26 07:13:28,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.290 = 0.50×0.00(prox=0.00) + 0.40×proc(0.551[fin=0.62,mean=0.45]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 07:13:28,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.00(prox=0.00) + 0.40×proc(0.834[fin=0.95,mean=0.65]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{37+4\\sqrt{14}}$' gold='13' | step_acc=69% lccp=23% (chain=3/13 ok_count=9) n_steps=13
+2026-04-26 07:13:52,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.00(prox=0.00) + 0.40×proc(0.858[fin=0.98,mean=0.68]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{129}$' gold='13' | step_acc=67% lccp=17% (chain=2/12 ok_count=8) n_steps=12
+
Iter 28 GRPO groups: 5%|5 | 1/20 [01:50<05:56, 18.76s/q, loss=0.0005, mean_r=0.562, q_acc=100%, q_rew=0.630, skip=0]
Iter 28 GRPO groups: 10%|# | 2/20 [01:50<18:26, 61.45s/q, loss=0.0005, mean_r=0.562, q_acc=100%, q_rew=0.630, skip=0]2026-04-26 07:14:03,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:03,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:03,748 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:14:03,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:04,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.339 novelty=0.69 | sol=0.45*prm_final(0.15)+0.35*prm_mean(0.58)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:14:04,353 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:04,560 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:04,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:04,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.985 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:14:05,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:22,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.608 novelty=0.68 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=9
+2026-04-26 07:14:22,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.779 novelty=0.68 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=8
+2026-04-26 07:14:23,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.748 novelty=0.68 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=9
+2026-04-26 07:14:23,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.698 novelty=0.68 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.69)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:14:23,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.781 = clip(base=0.701 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.672 novelty=0.68 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.89)+0.20*lccp(0.42) | steps=12
+2026-04-26 07:14:23,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.591 = clip(base=0.511 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.362 novelty=0.68 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.50)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:14:24,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.605 = clip(base=0.525 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.392 novelty=0.68 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.60)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:14:24,313 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.767 novelty=0.68 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.83)+0.20*lccp(0.38) | steps=8
+2026-04-26 07:14:24,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.312 novelty=0.68 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.63)+0.20*lccp(0.25) | steps=8
+2026-04-26 07:14:24,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.863 = clip(base=0.783 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.827 novelty=0.68 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=7
+
Iter 28 GRPO groups: 10%|# | 2/20 [02:22<18:26, 61.45s/q, loss=-0.0003, mean_r=0.830, q_acc=100%, q_rew=0.673, skip=0]
Iter 28 GRPO groups: 15%|#5 | 3/20 [02:22<13:41, 48.30s/q, loss=-0.0003, mean_r=0.830, q_acc=100%, q_rew=0.673, skip=0]2026-04-26 07:14:34,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.747 = clip(base=0.667 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.656 novelty=0.70 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,029 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.762 novelty=0.70 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.75)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:14:35,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.962 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:35,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.495 = clip(base=0.415 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.285 novelty=0.70 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.50)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.498 = clip(base=0.418 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.286 novelty=0.70 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,846 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.965 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.967 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:36,257 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.966 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.522 = clip(base=0.442 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.330 novelty=0.70 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.59)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:14:42,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:42,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.788 novelty=0.69 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.80)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:14:43,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:43,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:43,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.734 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.68)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:14:43,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.961 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,208 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,617 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 15%|#5 | 3/20 [02:42<13:41, 48.30s/q, loss=0.0003, mean_r=0.836, q_acc=100%, q_rew=0.663, skip=0]
Iter 28 GRPO groups: 20%|## | 4/20 [02:42<09:52, 37.04s/q, loss=0.0003, mean_r=0.836, q_acc=100%, q_rew=0.663, skip=0]2026-04-26 07:14:53,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.975 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:53,558 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.726 novelty=0.76 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:14:53,763 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.986 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:53,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.978 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:54,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.989 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,587 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,796 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:54,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:55,207 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:15:02,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.952 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:02,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.577 novelty=0.77 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:15:02,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.947 novelty=0.77 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:02,899 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.986 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,110 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:15:03,320 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,526 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.985 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.982 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:04,145 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.800 novelty=0.77 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=4
+
Iter 28 GRPO groups: 20%|## | 4/20 [03:02<09:52, 37.04s/q, loss=0.0001, mean_r=0.916, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 25%|##5 | 5/20 [03:02<07:40, 30.72s/q, loss=0.0001, mean_r=0.916, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:15:14,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.791[fin=0.92,mean=0.60]) + 0.10×fmt(1.000) | pred='180' gold='1008' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 07:15:14,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.08(prox=0.08) + 0.40×proc(0.796[fin=0.98,mean=0.52]) + 0.10×fmt(1.000) | pred='6720' gold='1008' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 07:15:15,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.898[fin=0.99,mean=0.76]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 07:15:38,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:38,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.753 = 0.50×0.54(prox=0.54) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1440' gold='1008' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:15:38,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.53(prox=0.53) + 0.40×proc(0.320[fin=0.21,mean=0.49]) + 0.10×fmt(1.000) | pred='1458' gold='1008' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 07:15:38,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.54(prox=0.54) + 0.40×proc(0.929[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='1440' gold='1008' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 07:15:50,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='1008' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:50,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.865[fin=0.96,mean=0.72]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:51,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='1008' gold='1008' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+
Iter 28 GRPO groups: 25%|##5 | 5/20 [03:48<07:40, 30.72s/q, loss=0.0002, mean_r=0.662, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 30%|### | 6/20 [03:48<08:26, 36.16s/q, loss=0.0002, mean_r=0.662, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:15:56,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.40(prox=0.40) + 0.40×proc(0.174[fin=0.05,mean=0.36]) + 0.10×fmt(1.000) | pred='8' gold='32' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 07:16:11,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 28 GRPO groups: 30%|### | 6/20 [04:09<08:26, 36.16s/q, loss=-0.0011, mean_r=0.941, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 35%|###5 | 7/20 [04:09<06:44, 31.15s/q, loss=-0.0011, mean_r=0.941, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:16:32,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.833 = 0.50×0.69(prox=0.69) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='155' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:16:32,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.700 = 0.50×0.85(prox=0.85) + 0.40×proc(0.436[fin=0.19,mean=0.80]) + 0.10×fmt(1.000) | pred='215' gold='200' | step_acc=85% lccp=46% (chain=6/13 ok_count=11) n_steps=13
+2026-04-26 07:16:32,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:16:48,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.839 = 0.50×0.80(prox=0.80) + 0.40×proc(0.846[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='225' gold='200' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:16:48,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:16:48,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.414 = 0.50×0.00(prox=0.00) + 0.40×proc(0.659[fin=0.63,mean=0.70]) + 0.10×fmt(1.000) | pred='125 or 335' gold='200' | step_acc=78% lccp=33% (chain=3/9 ok_count=7) n_steps=9
+2026-04-26 07:16:49,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.00(prox=0.00) + 0.40×proc(0.629[fin=0.52,mean=0.79]) + 0.10×fmt(1.000) | pred='50*sqrt(45)' gold='200' | step_acc=82% lccp=73% (chain=8/11 ok_count=9) n_steps=11
+2026-04-26 07:17:12,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.67(prox=0.67) + 0.40×proc(0.809[fin=0.92,mean=0.64]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:17:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.96,mean=0.41]) + 0.10×fmt(1.000) | pred='187.5' gold='200' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:17:13,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.921 = 0.50×1.00(exact) + 0.40×proc(0.803[fin=1.00,mean=0.51]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 28 GRPO groups: 35%|###5 | 7/20 [05:10<06:44, 31.15s/q, loss=-0.0019, mean_r=0.774, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 40%|#### | 8/20 [05:10<08:08, 40.74s/q, loss=-0.0019, mean_r=0.774, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:17:18,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:18,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.980 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:18,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.986 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.978 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,547 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,741 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:20,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.984 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:20,339 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.986 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.979 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,910 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.974 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.973 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,293 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.972 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.982 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.972 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:26,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.996 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:26,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.979 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 40%|#### | 8/20 [05:24<08:08, 40.74s/q, loss=0.0004, mean_r=0.920, q_acc=100%, q_rew=0.659, skip=0]
Iter 28 GRPO groups: 45%|####5 | 9/20 [05:24<05:53, 32.17s/q, loss=0.0004, mean_r=0.920, q_acc=100%, q_rew=0.659, skip=0]2026-04-26 07:17:34,946 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,182 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,392 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,594 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,794 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:44,113 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,335 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,764 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,417 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:46,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+
Iter 28 GRPO groups: 45%|####5 | 9/20 [05:44<05:53, 32.17s/q, loss=-0.0005, mean_r=0.951, q_acc=100%, q_rew=0.663, skip=0]
Iter 28 GRPO groups: 50%|##### | 10/20 [05:44<04:43, 28.35s/q, loss=-0.0005, mean_r=0.951, q_acc=100%, q_rew=0.663, skip=0]2026-04-26 07:17:51,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:17:57,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.860[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.929[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:01,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:01,250 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:01,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:18:01,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:01,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 28 GRPO groups: 50%|##### | 10/20 [05:57<04:43, 28.35s/q, loss=0var, mean_r=0.963, skip=1]
Iter 28 GRPO groups: 55%|#####5 | 11/20 [05:57<03:34, 23.88s/q, loss=0var, mean_r=0.963, skip=1]2026-04-26 07:18:07,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:07,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 07:18:07,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:12,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:12,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:12,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:12,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 07:18:15,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.00(prox=0.00) + 0.40×proc(0.273[fin=0.13,mean=0.49]) + 0.10×fmt(1.000) | pred='$50\\sqrt{41}$' gold='250' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 07:18:15,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:15,777 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 28 GRPO groups: 55%|#####5 | 11/20 [06:13<03:34, 23.88s/q, loss=-0.0013, mean_r=0.916, q_acc=100%, q_rew=0.663, skip=1]
Iter 28 GRPO groups: 60%|###### | 12/20 [06:13<02:51, 21.40s/q, loss=-0.0013, mean_r=0.916, q_acc=100%, q_rew=0.663, skip=1]2026-04-26 07:18:24,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:34,615 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:34,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:34,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:34,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:44,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.846[fin=0.96,mean=0.67]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:18:44,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:44,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:44,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:50,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 28 GRPO groups: 60%|###### | 12/20 [06:46<02:51, 21.40s/q, loss=0var, mean_r=0.993, skip=2]
Iter 28 GRPO groups: 65%|######5 | 13/20 [06:46<02:54, 24.87s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 07:18:54,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:54,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.67(prox=0.67) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='15' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:54,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:19:05,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:19:05,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:05,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:06,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:19:17,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:18,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:19:18,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 28 GRPO groups: 65%|######5 | 13/20 [07:15<02:54, 24.87s/q, loss=0.0008, mean_r=0.981, q_acc=100%, q_rew=0.663, skip=2]
Iter 28 GRPO groups: 70%|####### | 14/20 [07:15<02:37, 26.25s/q, loss=0.0008, mean_r=0.981, q_acc=100%, q_rew=0.663, skip=2]2026-04-26 07:19:29,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.915[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=91% lccp=9% (chain=1/11 ok_count=10) n_steps=11
+2026-04-26 07:19:54,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:19:54,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:19:54,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.00(prox=0.00) + 0.40×proc(0.821[fin=0.90,mean=0.70]) + 0.10×fmt(1.000) | pred='10101' gold='1' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:19:54,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.925[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=82% lccp=9% (chain=1/11 ok_count=9) n_steps=11
+2026-04-26 07:20:17,114 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 07:20:17,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.837[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+2026-04-26 07:20:17,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.872[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 07:20:17,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:20:41,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×1.00(exact) + 0.40×proc(0.764[fin=0.92,mean=0.53]) + 0.10×fmt(0.650) | pred='1' gold='1' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+
Iter 28 GRPO groups: 70%|####### | 14/20 [08:38<02:37, 26.25s/q, loss=0.0002, mean_r=0.793, q_acc=100%, q_rew=0.663, skip=2]
Iter 28 GRPO groups: 75%|#######5 | 15/20 [08:38<03:36, 43.37s/q, loss=0.0002, mean_r=0.793, q_acc=100%, q_rew=0.663, skip=2]2026-04-26 07:20:45,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,131 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,330 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,522 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,717 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,502 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,280 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:51,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:52,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,247 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:52,441 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.652 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,632 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.652 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,827 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:53,026 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 28 GRPO groups: 75%|#######5 | 15/20 [08:50<03:36, 43.37s/q, loss=0.0030, mean_r=0.945, q_acc=100%, q_rew=0.684, skip=2]
Iter 28 GRPO groups: 80%|######## | 16/20 [08:50<02:15, 33.97s/q, loss=0.0030, mean_r=0.945, q_acc=100%, q_rew=0.684, skip=2]2026-04-26 07:20:58,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:20:58,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,300 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.987 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.993 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.987 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,304 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,853 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,038 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,225 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,415 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 80%|######## | 16/20 [09:04<02:15, 33.97s/q, loss=-0.0000, mean_r=0.935, q_acc=100%, q_rew=0.679, skip=2]
Iter 28 GRPO groups: 85%|########5 | 17/20 [09:04<01:23, 27.96s/q, loss=-0.0000, mean_r=0.935, q_acc=100%, q_rew=0.679, skip=2]2026-04-26 07:21:16,448 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.974 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:16,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.988 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:16,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.987 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:17,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.980 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.967 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,692 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.990 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:17,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:18,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.993 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:18,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.615 = clip(base=0.535 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.357 novelty=0.70 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:21:25,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.995 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,686 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.291 novelty=0.73 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:21:26,088 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,295 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,496 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,893 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:27,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 85%|########5 | 17/20 [09:25<01:23, 27.96s/q, loss=-0.0016, mean_r=0.929, q_acc=100%, q_rew=0.685, skip=2]
Iter 28 GRPO groups: 90%|######### | 18/20 [09:25<00:51, 25.70s/q, loss=-0.0016, mean_r=0.929, q_acc=100%, q_rew=0.685, skip=2]2026-04-26 07:21:35,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:36,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.989 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:37,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:37,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.994 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:37,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.746 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:37,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.460 = clip(base=0.380 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.265 novelty=0.78 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.31)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:37,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.787 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:45,715 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.975 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:45,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.974 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:21:46,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.991 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:46,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:46,529 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:21:46,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.973 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:46,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:47,165 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.940 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:47,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:47,563 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.487 = clip(base=0.407 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.311 novelty=0.78 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+
Iter 28 GRPO groups: 90%|######### | 18/20 [09:45<00:51, 25.70s/q, loss=0.0006, mean_r=0.851, q_acc=100%, q_rew=0.675, skip=2]
Iter 28 GRPO groups: 95%|#########5| 19/20 [09:45<00:24, 24.02s/q, loss=0.0006, mean_r=0.851, q_acc=100%, q_rew=0.675, skip=2]2026-04-26 07:21:56,128 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.607 = clip(base=0.527 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.393 novelty=0.68 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:21:56,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.452 = clip(base=0.372 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.221 novelty=0.68 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:56,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.510 = clip(base=0.430 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.335 novelty=0.68 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.42)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:56,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.775 = clip(base=0.695 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.728 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:56,910 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.397 novelty=0.68 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.46)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.527 = clip(base=0.447 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.363 novelty=0.68 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.44)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.714 = clip(base=0.634 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.632 novelty=0.68 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.60)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,520 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.560 = clip(base=0.480 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.421 novelty=0.68 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.47)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.815 = clip(base=0.735 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.793 novelty=0.68 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.79)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:21:57,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.473 = clip(base=0.393 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.258 novelty=0.68 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.34)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:22:03,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.977 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:03,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.990 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:03,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:22:04,005 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.747 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:22:04,197 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.421 = clip(base=0.341 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.191 novelty=0.70 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.34)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:22:04,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:04,581 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.711 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:22:04,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:22:04,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.743 = clip(base=0.663 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.682 novelty=0.70 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:22:05,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.700 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+
Iter 28 GRPO groups: 95%|#########5| 19/20 [10:03<00:24, 24.02s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
Iter 28 GRPO groups: 100%|##########| 20/20 [10:03<00:00, 22.09s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
Iter 28 GRPO groups: 100%|##########| 20/20 [10:03<00:00, 30.15s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
+2026-04-26 07:22:06,833 INFO __main__ - Iter 28 | loss=0.0000 | reward mean=0.870 std=0.159 | gt_match=65.6% | grounded_acc=87.8% | step_acc=83.3% | lccp=61.9% | batch_acc=94.2% | phase=SELFPLAY_RAMP sp_ratio=54% | groups=29 skipped=2(0var=2) | lr=3.55e-06 | 603.1s
+2026-04-26 07:22:06,833 INFO __main__ - Question generation: 11/11 valid (100%) | q_reward=0.669 | q_acc=100.0% (>0.5 quality) | topic=0.59 diff=0.23 clarity=1.00 novelty=0.45 solvability=0.98
+2026-04-26 07:22:06,834 INFO __main__ - ======================================================================
+2026-04-26 07:22:06,834 INFO __main__ - GRPO ITERATION 29/60
+2026-04-26 07:22:06,834 INFO __main__ - ======================================================================
+2026-04-26 07:22:06,856 INFO __main__ - LR this iteration: 3.55e-06 | T=0.610 | MATH ratio=50%
+
Iter 29 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:22:16,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:16,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:22:16,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:26,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:26,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.51(prox=0.51) + 0.40×proc(0.521[fin=0.52,mean=0.52]) + 0.10×fmt(1.000) | pred='13400' gold='9050' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 07:22:27,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:27,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.799 = 0.50×0.69(prox=0.69) + 0.40×proc(0.881[fin=0.98,mean=0.74]) + 0.10×fmt(1.000) | pred='7050' gold='9050' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:22:33,238 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:33,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.585[fin=0.65,mean=0.49]) + 0.10×fmt(1.000) | pred='4050' gold='9050' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:22:33,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 29 GRPO groups: 0%| | 0/20 [00:28, ?q/s, loss=-0.0001, mean_r=0.886, skip=0]
Iter 29 GRPO groups: 5%|5 | 1/20 [00:28<08:52, 28.03s/q, loss=-0.0001, mean_r=0.886, skip=0]2026-04-26 07:22:40,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.71(prox=0.71) + 0.40×proc(0.746[fin=0.94,mean=0.46]) + 0.10×fmt(1.000) | pred='260' gold='325' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 07:22:49,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.833[fin=0.94,mean=0.68]) + 0.10×fmt(1.000) | pred='145' gold='325' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 07:22:49,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.793 = 0.50×0.63(prox=0.63) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='420' gold='325' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 07:22:49,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.761 = 0.50×0.63(prox=0.63) + 0.40×proc(0.865[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='420' gold='325' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 07:22:49,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.872[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:23:02,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:23:02,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.506 = 0.50×0.44(prox=0.44) + 0.40×proc(0.400[fin=0.39,mean=0.42]) + 0.10×fmt(1.000) | pred='120' gold='325' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 07:23:02,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:23:02,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.860[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:23:15,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.755 = 0.50×0.71(prox=0.71) + 0.40×proc(0.744[fin=0.90,mean=0.51]) + 0.10×fmt(1.000) | pred='260' gold='325' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+
Iter 29 GRPO groups: 5%|5 | 1/20 [01:10<08:52, 28.03s/q, loss=-0.0000, mean_r=0.801, skip=0]
Iter 29 GRPO groups: 10%|# | 2/20 [01:10<10:53, 36.33s/q, loss=-0.0000, mean_r=0.801, skip=0]2026-04-26 07:23:19,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:19,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:19,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:29,894 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:29,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:30,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 29 GRPO groups: 10%|# | 2/20 [01:23<10:53, 36.33s/q, loss=0var, mean_r=0.999, skip=1]
Iter 29 GRPO groups: 15%|#5 | 3/20 [01:23<07:16, 25.69s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 07:23:35,744 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:35,940 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,337 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,731 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,127 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,534 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:41,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:41,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,014 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,210 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.974 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,401 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:43,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:43,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 15%|#5 | 3/20 [01:38<07:16, 25.69s/q, loss=-0.0006, mean_r=0.965, q_acc=100%, q_rew=0.717, skip=1]
Iter 29 GRPO groups: 20%|## | 4/20 [01:38<05:43, 21.46s/q, loss=-0.0006, mean_r=0.965, q_acc=100%, q_rew=0.717, skip=1]2026-04-26 07:23:50,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:23:58,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:23:58,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:23:58,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:23:58,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:24:14,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.423 = 0.50×0.14(prox=0.14) + 0.40×proc(0.330[fin=0.03,mean=0.78]) + 0.10×fmt(1.000) | pred='32' gold='8' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 07:24:14,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:24:14,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:24:14,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:24:26,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 29 GRPO groups: 20%|## | 4/20 [02:21<05:43, 21.46s/q, loss=-0.0015, mean_r=0.942, q_acc=100%, q_rew=0.717, skip=1]
Iter 29 GRPO groups: 25%|##5 | 5/20 [02:21<07:19, 29.29s/q, loss=-0.0015, mean_r=0.942, q_acc=100%, q_rew=0.717, skip=1]2026-04-26 07:24:36,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:24:36,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,284 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,482 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:24:37,680 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:37,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:38,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:38,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:38,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:47,540 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.745 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:24:47,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:47,959 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:48,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:48,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:48,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:48,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,040 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,258 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.459 = clip(base=0.379 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.291 novelty=0.71 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.45)+0.20*lccp(0.50) | steps=4
+
Iter 29 GRPO groups: 25%|##5 | 5/20 [02:44<07:19, 29.29s/q, loss=0.0016, mean_r=0.902, q_acc=100%, q_rew=0.674, skip=1]
Iter 29 GRPO groups: 30%|### | 6/20 [02:44<06:19, 27.14s/q, loss=0.0016, mean_r=0.902, q_acc=100%, q_rew=0.674, skip=1]2026-04-26 07:24:55,347 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.946 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.987 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:56,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:56,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:56,468 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:56,651 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.946 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:56,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:57,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,723 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,895 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,245 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:25:01,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.993 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.995 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 29 GRPO groups: 30%|### | 6/20 [02:56<06:19, 27.14s/q, loss=0.0051, mean_r=0.904, q_acc=100%, q_rew=0.649, skip=1]
Iter 29 GRPO groups: 35%|###5 | 7/20 [02:56<04:50, 22.34s/q, loss=0.0051, mean_r=0.904, q_acc=100%, q_rew=0.649, skip=1]2026-04-26 07:25:10,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.79 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:10,923 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:11,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,565 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,774 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,987 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:12,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:12,409 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.658 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.59)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:25:12,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:20,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.979 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.941 novelty=0.67 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,500 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.928 novelty=0.67 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,709 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.939 novelty=0.67 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.992 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,138 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.977 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.991 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,995 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 29 GRPO groups: 35%|###5 | 7/20 [03:16<04:50, 22.34s/q, loss=-0.0004, mean_r=0.951, q_acc=100%, q_rew=0.667, skip=1]
Iter 29 GRPO groups: 40%|#### | 8/20 [03:16<04:19, 21.62s/q, loss=-0.0004, mean_r=0.951, q_acc=100%, q_rew=0.667, skip=1]2026-04-26 07:26:00,279 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.809 = clip(base=0.729 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.781 novelty=0.72 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.78)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:00,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.639 = clip(base=0.559 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.539 novelty=0.72 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.84)+0.20*lccp(0.36) | steps=11
+2026-04-26 07:26:00,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.773 novelty=0.72 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.77)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:00,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.961 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:26:01,169 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.594 = clip(base=0.514 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.453 novelty=0.72 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:01,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.850 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.84)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:01,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.629 = clip(base=0.549 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.503 novelty=0.72 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:01,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.555 = clip(base=0.475 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.397 novelty=0.72 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.67)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:02,044 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.606 = clip(base=0.526 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.473 novelty=0.72 | sol=0.45*prm_final(0.37)+0.35*prm_mean(0.72)+0.20*lccp(0.27) | steps=11
+2026-04-26 07:26:07,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,549 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,955 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,568 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,768 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=1.000 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:09,184 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.978 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 40%|#### | 8/20 [04:03<04:19, 21.62s/q, loss=-0.0003, mean_r=0.817, q_acc=100%, q_rew=0.652, skip=1]
Iter 29 GRPO groups: 45%|####5 | 9/20 [04:03<05:25, 29.60s/q, loss=-0.0003, mean_r=0.817, q_acc=100%, q_rew=0.652, skip=1]2026-04-26 07:26:16,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:16,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:16,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:32,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:32,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:26:32,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:26:32,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:26:44,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:26:44,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:44,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 29 GRPO groups: 45%|####5 | 9/20 [04:37<05:25, 29.60s/q, loss=0var, mean_r=1.000, skip=2]
Iter 29 GRPO groups: 50%|##### | 10/20 [04:37<05:08, 30.84s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 07:26:52,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.56(prox=0.56) + 0.40×proc(0.914[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='35' gold='25' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 07:27:04,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.96,mean=0.93]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:27:04,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:04,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:04,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='$\\sqrt{505}$' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×0.61(prox=0.61) + 0.40×proc(0.597[fin=0.59,mean=0.61]) + 0.10×fmt(1.000) | pred='17' gold='25' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:27:12,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.263 = 0.50×0.00(prox=0.00) + 0.40×proc(0.407[fin=0.34,mean=0.51]) + 0.10×fmt(1.000) | pred='sqrt(545)' gold='25' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 07:27:21,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.776 = 0.50×0.61(prox=0.61) + 0.40×proc(0.927[fin=0.98,mean=0.84]) + 0.10×fmt(1.000) | pred='17' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 29 GRPO groups: 50%|##### | 10/20 [05:16<05:08, 30.84s/q, loss=-0.0008, mean_r=0.793, q_acc=100%, q_rew=0.652, skip=2]
Iter 29 GRPO groups: 55%|#####5 | 11/20 [05:16<04:58, 33.18s/q, loss=-0.0008, mean_r=0.793, q_acc=100%, q_rew=0.652, skip=2]2026-04-26 07:27:32,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.778 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:27:32,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.781 novelty=0.76 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.74)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:27:32,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.668 novelty=0.76 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.63)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:27:33,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.874 = clip(base=0.794 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.804 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.77)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:27:33,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.765 = clip(base=0.685 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.615 novelty=0.76 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:27:33,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.592 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:27:33,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.590 novelty=0.76 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:27:34,075 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.843 = clip(base=0.763 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.775 novelty=0.76 | sol=0.45*prm_final(0.72)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:27:34,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.482 = clip(base=0.402 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.183 novelty=0.76 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.29)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:27:34,508 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.672 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.66)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:27:38,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:38,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.983 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,712 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,903 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:40,099 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.691 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:27:40,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.983 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:40,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 55%|#####5 | 11/20 [05:35<04:58, 33.18s/q, loss=-0.0001, mean_r=0.854, q_acc=100%, q_rew=0.665, skip=2]
Iter 29 GRPO groups: 60%|###### | 12/20 [05:35<03:51, 28.95s/q, loss=-0.0001, mean_r=0.854, q_acc=100%, q_rew=0.665, skip=2]2026-04-26 07:27:45,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:45,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:45,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:27:50,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,354 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 29 GRPO groups: 60%|###### | 12/20 [05:48<03:51, 28.95s/q, loss=0var, mean_r=0.999, skip=3]
Iter 29 GRPO groups: 65%|######5 | 13/20 [05:48<02:49, 24.18s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 07:28:02,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:02,948 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:03,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:03,774 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.601 = clip(base=0.521 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.470 novelty=0.69 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.66)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:28:04,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.860 novelty=0.69 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:04,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.583 = clip(base=0.503 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.453 novelty=0.69 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:28:04,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:11,111 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:11,316 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,319 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:12,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.658 = clip(base=0.578 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.580 novelty=0.73 | sol=0.45*prm_final(0.57)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:12,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+
Iter 29 GRPO groups: 65%|######5 | 13/20 [06:08<02:49, 24.18s/q, loss=-0.0006, mean_r=0.861, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 70%|####### | 14/20 [06:08<02:16, 22.77s/q, loss=-0.0006, mean_r=0.861, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:28:26,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.640 = clip(base=0.560 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.425 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.66)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:26,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.388 novelty=0.73 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.63)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:26,808 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.600 = clip(base=0.520 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.401 novelty=0.73 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.57)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:28:27,006 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.441 novelty=0.73 | sol=0.45*prm_final(0.51)+0.35*prm_mean(0.49)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,203 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.671 = clip(base=0.591 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.518 novelty=0.73 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:27,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.389 novelty=0.73 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.67)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:27,607 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.509 = clip(base=0.429 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.294 novelty=0.73 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,803 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.671 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.57)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.548 = clip(base=0.468 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.320 novelty=0.73 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:28,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.809 = clip(base=0.729 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.751 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:33,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,935 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,538 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.960 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,928 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:35,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.734 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+
Iter 29 GRPO groups: 70%|####### | 14/20 [06:29<02:16, 22.77s/q, loss=0.0013, mean_r=0.779, q_acc=100%, q_rew=0.655, skip=3]
Iter 29 GRPO groups: 75%|#######5 | 15/20 [06:29<01:52, 22.49s/q, loss=0.0013, mean_r=0.779, q_acc=100%, q_rew=0.655, skip=3]2026-04-26 07:28:46,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.944 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:46,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.460 novelty=0.74 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:28:46,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.733 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:46,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.608 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:28:47,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.993 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:47,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.989 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:47,533 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.462 = clip(base=0.382 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.284 novelty=0.74 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.33)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:28:47,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.914 novelty=0.74 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.79)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:47,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.542 = clip(base=0.462 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.362 novelty=0.74 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.51)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:28:48,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.408 = clip(base=0.328 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.187 novelty=0.74 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.33)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:29:24,148 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.462 = clip(base=0.382 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.225 novelty=0.79 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.40)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:29:24,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.536 = clip(base=0.456 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.410 novelty=0.79 | sol=0.45*prm_final(0.44)+0.35*prm_mean(0.44)+0.20*lccp(0.29) | steps=7
+2026-04-26 07:29:24,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.648 = clip(base=0.568 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.536 novelty=0.79 | sol=0.45*prm_final(0.50)+0.35*prm_mean(0.60)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:24,830 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.410 = clip(base=0.330 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.206 novelty=0.79 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:29:25,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.688 = clip(base=0.608 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.580 novelty=0.79 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:25,338 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.333 novelty=0.79 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:25,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.739 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.64)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:29:25,777 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.281 novelty=0.79 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:29:26,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.345 novelty=0.79 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.63)+0.20*lccp(0.43) | steps=7
+
Iter 29 GRPO groups: 75%|#######5 | 15/20 [07:20<01:52, 22.49s/q, loss=0.0001, mean_r=0.639, q_acc=100%, q_rew=0.647, skip=3]
Iter 29 GRPO groups: 80%|######## | 16/20 [07:20<02:04, 31.02s/q, loss=0.0001, mean_r=0.639, q_acc=100%, q_rew=0.647, skip=3]2026-04-26 07:29:31,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:31,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,945 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,338 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,528 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:39,560 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:39,759 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:39,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,161 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,953 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:41,148 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:41,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 29 GRPO groups: 80%|######## | 16/20 [07:36<02:04, 31.02s/q, loss=-0.0005, mean_r=0.957, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 85%|########5 | 17/20 [07:36<01:18, 26.33s/q, loss=-0.0005, mean_r=0.957, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:30:12,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.400 = 0.50×0.33(prox=0.33) + 0.40×proc(0.258[fin=0.06,mean=0.56]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 07:30:22,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=0.92,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=96% lccp=91% (chain=21/23 ok_count=22) n_steps=23
+2026-04-26 07:30:23,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:30:23,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:30:23,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:30:31,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.417[fin=0.08,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=91% lccp=91% (chain=21/23 ok_count=21) n_steps=23
+2026-04-26 07:30:31,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.33(prox=0.33) + 0.40×proc(0.370[fin=0.36,mean=0.39]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:30:31,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.199 = 0.50×0.02(prox=0.02) + 0.40×proc(0.156[fin=0.01,mean=0.38]) + 0.10×fmt(1.000) | pred='22' gold='1' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 07:30:31,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.33(prox=0.33) + 0.40×proc(0.578[fin=0.64,mean=0.48]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 07:30:35,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.33(prox=0.33) + 0.40×proc(0.579[fin=0.71,mean=0.37]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 29 GRPO groups: 85%|########5 | 17/20 [08:30<01:18, 26.33s/q, loss=0.0010, mean_r=0.651, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 90%|######### | 18/20 [08:30<01:09, 34.69s/q, loss=0.0010, mean_r=0.651, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:31:10,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.924[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=85% lccp=31% (chain=4/13 ok_count=11) n_steps=13
+2026-04-26 07:31:11,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=88% lccp=12% (chain=1/8 ok_count=7) n_steps=8
+2026-04-26 07:31:11,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='3' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 07:31:18,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.125 = 0.50×0.00(prox=0.00) + 0.40×proc(0.136[fin=0.03,mean=0.30]) + 0.10×fmt(0.700) | pred='' gold='3' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+
Iter 29 GRPO groups: 90%|######### | 18/20 [09:12<01:09, 34.69s/q, loss=0.0005, mean_r=0.517, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 95%|#########5| 19/20 [09:12<00:36, 36.96s/q, loss=0.0005, mean_r=0.517, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:31:30,861 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.771 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.73)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:31:31,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,509 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,723 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,935 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:32,576 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:42,748 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:42,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,213 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,677 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.973 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:43,908 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,142 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.989 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.987 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+
Iter 29 GRPO groups: 95%|#########5| 19/20 [09:39<00:36, 36.96s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
Iter 29 GRPO groups: 100%|##########| 20/20 [09:39<00:00, 33.99s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
Iter 29 GRPO groups: 100%|##########| 20/20 [09:39<00:00, 28.98s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
+2026-04-26 07:31:46,526 INFO __main__ - Iter 29 | loss=0.0004 | reward mean=0.867 std=0.171 | gt_match=72.6% | grounded_acc=89.3% | step_acc=86.7% | lccp=76.0% | batch_acc=94.7% | phase=SELFPLAY_RAMP sp_ratio=57% | groups=28 skipped=3(0var=3) | lr=3.42e-06 | 579.7s
+2026-04-26 07:31:46,527 INFO __main__ - Question generation: 11/11 valid (100%) | q_reward=0.650 | q_acc=100.0% (>0.5 quality) | topic=0.56 diff=0.17 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 07:31:46,528 INFO __main__ - ======================================================================
+2026-04-26 07:31:46,528 INFO __main__ - GRPO ITERATION 30/60
+2026-04-26 07:31:46,528 INFO __main__ - ======================================================================
+2026-04-26 07:31:46,548 INFO __main__ - LR this iteration: 3.42e-06 | T=0.603 | MATH ratio=50%
+
Iter 30 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:31:52,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:52,976 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,368 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:53,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:53,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,953 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,155 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:32:00,868 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,672 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,875 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,280 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,680 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 30 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0.0004, mean_r=0.955, q_acc=100%, q_rew=0.690, skip=0]
Iter 30 GRPO groups: 5%|5 | 1/20 [00:17<05:38, 17.82s/q, loss=0.0004, mean_r=0.955, q_acc=100%, q_rew=0.690, skip=0]2026-04-26 07:32:10,987 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.666 = clip(base=0.586 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.524 novelty=0.66 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.61)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.666 = clip(base=0.586 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.564 novelty=0.66 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.513 = clip(base=0.433 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.305 novelty=0.66 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.382 novelty=0.66 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.65)+0.20*lccp(0.75) | steps=4
+2026-04-26 07:32:11,838 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.687 = clip(base=0.607 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.598 novelty=0.66 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.506 = clip(base=0.426 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.305 novelty=0.66 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.56)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,256 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.455 novelty=0.66 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.58)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,466 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.715 = clip(base=0.635 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.645 novelty=0.66 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.66)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.506 = clip(base=0.426 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.288 novelty=0.66 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.992 novelty=0.66 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:32:17,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:17,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,033 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,255 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,892 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.982 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 5%|5 | 1/20 [00:34<05:38, 17.82s/q, loss=0.0001, mean_r=0.798, q_acc=100%, q_rew=0.682, skip=0]
Iter 30 GRPO groups: 10%|# | 2/20 [00:34<05:10, 17.27s/q, loss=0.0001, mean_r=0.798, q_acc=100%, q_rew=0.682, skip=0]2026-04-26 07:32:25,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.50(prox=0.50) + 0.40×proc(0.814[fin=0.95,mean=0.60]) + 0.10×fmt(1.000) | pred='4000' gold='8000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:32:25,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×1.00(exact) + 0.40×proc(0.807[fin=0.86,mean=0.73]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:32:25,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.880[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='4000' gold='8000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:32:28,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=0.98,mean=0.78]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:32:28,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:28,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.530 = 0.50×0.05(prox=0.05) + 0.40×proc(0.822[fin=0.90,mean=0.70]) + 0.10×fmt(1.000) | pred='80000' gold='8000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:32:29,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:35,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:35,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.609 = 0.50×0.50(prox=0.50) + 0.40×proc(0.735[fin=0.86,mean=0.54]) + 0.10×fmt(0.650) | pred='4000' gold='8000' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 07:32:35,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 30 GRPO groups: 10%|# | 2/20 [00:50<05:10, 17.27s/q, loss=0.0009, mean_r=0.833, q_acc=100%, q_rew=0.682, skip=0]
Iter 30 GRPO groups: 15%|#5 | 3/20 [00:50<04:44, 16.76s/q, loss=0.0009, mean_r=0.833, q_acc=100%, q_rew=0.682, skip=0]2026-04-26 07:32:43,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:32:54,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:32:54,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:32:55,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:32:55,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:03,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:03,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:33:03,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:04,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:13,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 30 GRPO groups: 15%|#5 | 3/20 [01:27<04:44, 16.76s/q, loss=0var, mean_r=0.998, skip=1]
Iter 30 GRPO groups: 20%|## | 4/20 [01:27<06:31, 24.48s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 07:33:49,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.525 = clip(base=0.445 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.369 novelty=0.77 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:33:50,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.518 = clip(base=0.438 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.344 novelty=0.77 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:33:50,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.561 = clip(base=0.481 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.405 novelty=0.77 | sol=0.45*prm_final(0.25)+0.35*prm_mean(0.55)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:33:50,531 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.825 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(0.43) | steps=7
+2026-04-26 07:33:50,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.713 = clip(base=0.633 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.639 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:33:50,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.496 = clip(base=0.416 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.307 novelty=0.77 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.54)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:33:51,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.602 = clip(base=0.522 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.484 novelty=0.77 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.74)+0.20*lccp(0.18) | steps=11
+2026-04-26 07:33:51,409 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.638 = clip(base=0.558 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.543 novelty=0.77 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=7
+2026-04-26 07:33:51,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.489 = clip(base=0.409 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.295 novelty=0.77 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:11,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.835 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:11,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.784 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:34:11,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.981 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,150 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.975 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=13
+2026-04-26 07:34:12,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.989 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.817 = clip(base=0.737 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.830 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:12,983 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.991 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:13,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.981 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:13,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.994 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 20%|## | 4/20 [02:28<06:31, 24.48s/q, loss=0.0003, mean_r=0.762, q_acc=100%, q_rew=0.657, skip=1]
Iter 30 GRPO groups: 25%|##5 | 5/20 [02:28<09:26, 37.78s/q, loss=0.0003, mean_r=0.762, q_acc=100%, q_rew=0.657, skip=1]2026-04-26 07:34:18,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:18,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:18,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,546 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 30 GRPO groups: 25%|##5 | 5/20 [02:49<09:26, 37.78s/q, loss=0var, mean_r=1.000, skip=2]
Iter 30 GRPO groups: 30%|### | 6/20 [02:49<07:28, 32.01s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 07:34:41,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:41,698 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:41,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,100 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,306 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:43,117 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:43,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:49,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:49,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:49,971 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:50,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:34:51,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:51,384 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 30%|### | 6/20 [03:06<07:28, 32.01s/q, loss=-0.0003, mean_r=0.967, q_acc=100%, q_rew=0.674, skip=2]
Iter 30 GRPO groups: 35%|###5 | 7/20 [03:06<05:53, 27.17s/q, loss=-0.0003, mean_r=0.967, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 07:34:58,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,448 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:59,035 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,426 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:34:59,620 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:00,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:06,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:35:06,194 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.410 novelty=0.60 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.40)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:35:06,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:35:06,578 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.758 = clip(base=0.678 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.715 novelty=0.60 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:35:06,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.710 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:06,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.991 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:35:07,157 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.626 = clip(base=0.546 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.538 novelty=0.60 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:07,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.984 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:35:07,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:35:07,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.886 = clip(base=0.806 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.929 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=2
+
Iter 30 GRPO groups: 35%|###5 | 7/20 [03:22<05:53, 27.17s/q, loss=-0.0015, mean_r=0.840, q_acc=100%, q_rew=0.659, skip=2]
Iter 30 GRPO groups: 40%|#### | 8/20 [03:22<04:44, 23.71s/q, loss=-0.0015, mean_r=0.840, q_acc=100%, q_rew=0.659, skip=2]2026-04-26 07:35:14,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:14,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:14,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,023 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.992 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,624 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,823 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.989 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:16,022 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:16,220 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,578 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.971 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:22,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 40%|#### | 8/20 [03:37<04:44, 23.71s/q, loss=0.0005, mean_r=0.935, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 45%|####5 | 9/20 [03:37<03:49, 20.90s/q, loss=0.0005, mean_r=0.935, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:35:32,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:41,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:41,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.18(prox=0.18) + 0.40×proc(0.868[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 07:35:42,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.18(prox=0.18) + 0.40×proc(0.839[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 07:35:42,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:35:49,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:49,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.532 = 0.50×0.18(prox=0.18) + 0.40×proc(0.852[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 07:35:49,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:49,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:58,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.80(prox=0.80) + 0.40×proc(0.938[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='16' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+
Iter 30 GRPO groups: 45%|####5 | 9/20 [04:13<03:49, 20.90s/q, loss=-0.0010, mean_r=0.840, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 50%|##### | 10/20 [04:13<04:14, 25.44s/q, loss=-0.0010, mean_r=0.840, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:36:28,156 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.179 = 0.50×0.00(prox=0.00) + 0.40×proc(0.210[fin=0.04,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 07:36:28,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:36:28,352 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:36:43,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.00(prox=0.00) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 07:36:43,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.708 = 0.50×0.57(prox=0.57) + 0.40×proc(0.814[fin=0.94,mean=0.63]) + 0.10×fmt(1.000) | pred='8' gold='13' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 07:36:43,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.85(prox=0.85) + 0.40×proc(0.875[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='12' gold='13' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 07:36:44,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.418 = 0.50×0.00(prox=0.00) + 0.40×proc(0.869[fin=0.97,mean=0.71]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 07:36:56,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:36:56,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.099 = 0.50×0.00(prox=0.00) + 0.40×proc(0.072[fin=0.09,mean=0.05]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 07:36:56,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×1.00(exact) + 0.40×proc(0.721[fin=0.92,mean=0.42]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+
Iter 30 GRPO groups: 50%|##### | 10/20 [05:11<04:14, 25.44s/q, loss=0.0001, mean_r=0.569, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 55%|#####5 | 11/20 [05:11<05:19, 35.52s/q, loss=0.0001, mean_r=0.569, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:37:04,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.448 = clip(base=0.368 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.144 novelty=0.71 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.26)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:04,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.694 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.60)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:05,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.507 = clip(base=0.427 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.266 novelty=0.71 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.40)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:37:05,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.500 = clip(base=0.420 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.245 novelty=0.71 | sol=0.45*prm_final(0.16)+0.35*prm_mean(0.39)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:37:05,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.793 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:37:05,661 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.748 = clip(base=0.668 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.637 novelty=0.71 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:05,857 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.555 = clip(base=0.475 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.350 novelty=0.71 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.47)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:37:06,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.732 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.72)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:06,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:06,459 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.958 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:11,097 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:11,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:11,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:11,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:11,847 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,036 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:12,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:12,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,804 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 30 GRPO groups: 55%|#####5 | 11/20 [05:27<05:19, 35.52s/q, loss=-0.0011, mean_r=0.810, q_acc=100%, q_rew=0.655, skip=2]
Iter 30 GRPO groups: 60%|###### | 12/20 [05:27<03:57, 29.70s/q, loss=-0.0011, mean_r=0.810, q_acc=100%, q_rew=0.655, skip=2]2026-04-26 07:37:16,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:31,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 30 GRPO groups: 60%|###### | 12/20 [05:44<03:57, 29.70s/q, loss=0var, mean_r=0.998, skip=3]
Iter 30 GRPO groups: 65%|######5 | 13/20 [05:44<03:00, 25.73s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 07:37:36,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.750 novelty=0.79 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.72)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:36,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.985 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:36,986 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.534 = clip(base=0.454 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.307 novelty=0.79 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:37,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:37,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:37,538 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.979 novelty=0.79 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:37,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:37,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:38,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.986 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:38,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.983 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:44,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:44,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.960 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:44,825 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.649 = clip(base=0.569 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.450 novelty=0.76 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:45,030 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:45,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.377 novelty=0.76 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.45)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:45,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.953 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:45,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.985 = clip(base=0.905 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.983 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:45,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:46,008 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.988 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:46,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 30 GRPO groups: 65%|######5 | 13/20 [06:01<03:00, 25.73s/q, loss=-0.0009, mean_r=0.922, q_acc=100%, q_rew=0.671, skip=3]
Iter 30 GRPO groups: 70%|####### | 14/20 [06:01<02:18, 23.04s/q, loss=-0.0009, mean_r=0.922, q_acc=100%, q_rew=0.671, skip=3]2026-04-26 07:37:55,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.793 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:37:55,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.807 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:37:56,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.634 = clip(base=0.554 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.489 novelty=0.75 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.37)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,309 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.446 = clip(base=0.366 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.171 novelty=0.75 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.14)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.448 = clip(base=0.368 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.174 novelty=0.75 | sol=0.45*prm_final(0.28)+0.35*prm_mean(0.14)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.738 = clip(base=0.658 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.608 novelty=0.75 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:37:56,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.653 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.57)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:37:57,137 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.941 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:57,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.762 = clip(base=0.682 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.647 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:37:57,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.965 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:38:03,779 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.861 = clip(base=0.781 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.790 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:38:03,977 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:38:04,176 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.650 novelty=0.68 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:38:04,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.701 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:38:04,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.374 novelty=0.68 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.37)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:38:04,790 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:38:04,998 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.964 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:05,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:05,398 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.671 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:38:05,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.421 novelty=0.68 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+
Iter 30 GRPO groups: 70%|####### | 14/20 [06:20<02:18, 23.04s/q, loss=-0.0000, mean_r=0.773, q_acc=100%, q_rew=0.674, skip=3]
Iter 30 GRPO groups: 75%|#######5 | 15/20 [06:20<01:49, 21.93s/q, loss=-0.0000, mean_r=0.773, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 07:38:34,645 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.825 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=12
+2026-04-26 07:38:34,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.872 = clip(base=0.792 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.786 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.83)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:35,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.782 = clip(base=0.702 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.630 novelty=0.76 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.72)+0.20*lccp(0.57) | steps=14
+2026-04-26 07:38:35,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.800 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.82)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:35,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.705 novelty=0.76 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.86)+0.20*lccp(0.67) | steps=9
+2026-04-26 07:38:35,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.882 = clip(base=0.802 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.802 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.81)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:36,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.894 = clip(base=0.814 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.826 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(0.36) | steps=14
+2026-04-26 07:38:36,373 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.823 = clip(base=0.743 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.700 novelty=0.76 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.75)+0.20*lccp(0.46) | steps=13
+2026-04-26 07:38:36,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.451 novelty=0.76 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.78)+0.20*lccp(0.70) | steps=10
+2026-04-26 07:38:36,843 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.891 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(0.64) | steps=11
+2026-04-26 07:38:46,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.948 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:46,780 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.985 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:38:47,016 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.660 novelty=0.76 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:38:47,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.718 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:38:47,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.732 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:38:47,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.735 = clip(base=0.655 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.634 novelty=0.76 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.69)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:38:47,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.696 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.60)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:38:48,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.856 = clip(base=0.776 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.858 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:38:48,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.978 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:48,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.488 = clip(base=0.408 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.230 novelty=0.76 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=5
+
Iter 30 GRPO groups: 75%|#######5 | 15/20 [07:03<01:49, 21.93s/q, loss=-0.0003, mean_r=0.822, q_acc=100%, q_rew=0.681, skip=3]
Iter 30 GRPO groups: 80%|######## | 16/20 [07:03<01:53, 28.28s/q, loss=-0.0003, mean_r=0.822, q_acc=100%, q_rew=0.681, skip=3]2026-04-26 07:38:54,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:38:54,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:38:54,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:02,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 30 GRPO groups: 80%|######## | 16/20 [07:22<01:53, 28.28s/q, loss=0var, mean_r=1.000, skip=4]
Iter 30 GRPO groups: 85%|########5 | 17/20 [07:22<01:16, 25.55s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 07:39:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:17,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:17,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:17,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:17,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:23,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:24,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:24,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:24,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:30,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 30 GRPO groups: 85%|########5 | 17/20 [07:44<01:16, 25.55s/q, loss=0var, mean_r=0.997, skip=5]
Iter 30 GRPO groups: 90%|######### | 18/20 [07:44<00:48, 24.26s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 07:39:37,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.727 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:37,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:37,902 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.977 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.947 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.873 = clip(base=0.793 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.938 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,746 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.988 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,959 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.964 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:39,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:39,383 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.850 = clip(base=0.770 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.900 novelty=0.73 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:45,343 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:45,547 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:45,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:45,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:46,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,756 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,963 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:47,168 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 90%|######### | 18/20 [08:02<00:48, 24.26s/q, loss=-0.0007, mean_r=0.892, q_acc=100%, q_rew=0.671, skip=5]
Iter 30 GRPO groups: 95%|#########5| 19/20 [08:02<00:22, 22.41s/q, loss=-0.0007, mean_r=0.892, q_acc=100%, q_rew=0.671, skip=5]2026-04-26 07:39:55,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.829 = clip(base=0.749 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.727 novelty=0.77 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.70)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:55,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.675 novelty=0.77 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.61)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:55,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.811 = clip(base=0.731 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.736 novelty=0.77 | sol=0.45*prm_final(0.63)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:56,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.706 novelty=0.77 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:39:56,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.960 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:56,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.540 = clip(base=0.460 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.329 novelty=0.77 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.23)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:39:56,793 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.673 novelty=0.77 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:39:57,000 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.477 = clip(base=0.397 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.219 novelty=0.77 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:57,205 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.332 novelty=0.77 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.32)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:57,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.958 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:40:05,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.997 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:06,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.971 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:40:06,412 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.436 novelty=0.78 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.52)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:40:06,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.982 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:06,848 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.983 novelty=0.78 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:07,061 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.967 novelty=0.78 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,282 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.994 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,717 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.992 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:07,930 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.992 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+
Iter 30 GRPO groups: 95%|#########5| 19/20 [08:23<00:22, 22.41s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
Iter 30 GRPO groups: 100%|##########| 20/20 [08:23<00:00, 21.92s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
Iter 30 GRPO groups: 100%|##########| 20/20 [08:23<00:00, 25.15s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
+2026-04-26 07:40:09,618 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=1% fallback=9% (cache_size=218/10000)
+2026-04-26 07:40:09,618 INFO __main__ - Iter 30 | loss=-0.0003 | reward mean=0.871 std=0.160 | gt_match=80.0% | grounded_acc=95.0% | step_acc=91.6% | lccp=80.0% | batch_acc=96.6% | phase=SELFPLAY_RAMP sp_ratio=61% | groups=27 skipped=5(0var=5) | lr=3.29e-06 | 503.1s
+2026-04-26 07:40:09,618 INFO __main__ - Question generation: 12/12 valid (100%) | q_reward=0.676 | q_acc=100.0% (>0.5 quality) | topic=0.68 diff=0.15 clarity=1.00 novelty=0.46 solvability=0.97
+2026-04-26 07:40:09,618 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:03<08:32, 3.44s/q, correct=1/1, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:08<10:38, 4.32s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:10<08:39, 3.53s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:13<07:23, 3.04s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:14<06:07, 2.54s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:20<08:36, 3.59s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:23<08:25, 3.53s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:26<07:31, 3.18s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:29<07:33, 3.22s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<08:47, 3.77s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<08:06, 3.50s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:06, 3.09s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:42<06:43, 2.95s/q, correct=10/13, lccp=85.6%, score=0.926, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:47, 3.44s/q, correct=11/14, lccp=86.7%, score=0.931, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:49<07:09, 3.18s/q, correct=12/15, lccp=87.6%, score=0.936, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:51<06:37, 2.97s/q, correct=12/16, lccp=88.3%, score=0.911, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:13, 3.26s/q, correct=13/17, lccp=89.0%, score=0.917, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<07:49, 3.55s/q, correct=13/18, lccp=85.0%, score=0.894, step_acc=91.1%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:08, 3.27s/q, correct=14/19, lccp=85.8%, score=0.899, step_acc=91.6%]
GSM8K eval: 13%|#3 | 20/150 [01:06<07:26, 3.43s/q, correct=15/20, lccp=86.5%, score=0.904, step_acc=92.0%]
GSM8K eval: 14%|#4 | 21/150 [01:09<06:51, 3.19s/q, correct=16/21, lccp=87.1%, score=0.909, step_acc=92.4%]
GSM8K eval: 15%|#4 | 22/150 [01:12<06:34, 3.08s/q, correct=17/22, lccp=84.7%, score=0.904, step_acc=91.2%]
GSM8K eval: 15%|#5 | 23/150 [01:15<06:48, 3.22s/q, correct=17/23, lccp=82.1%, score=0.887, step_acc=88.3%]
GSM8K eval: 16%|#6 | 24/150 [01:18<06:17, 3.00s/q, correct=17/24, lccp=79.7%, score=0.871, step_acc=85.7%]
GSM8K eval: 17%|#6 | 25/150 [01:20<06:05, 2.92s/q, correct=17/25, lccp=77.5%, score=0.868, step_acc=85.3%]
GSM8K eval: 17%|#7 | 26/150 [01:25<06:58, 3.38s/q, correct=18/26, lccp=78.4%, score=0.873, step_acc=85.8%]
GSM8K eval: 18%|#8 | 27/150 [01:28<06:35, 3.21s/q, correct=18/27, lccp=79.2%, score=0.869, step_acc=86.4%]
GSM8K eval: 19%|#8 | 28/150 [01:30<05:55, 2.91s/q, correct=19/28, lccp=79.9%, score=0.874, step_acc=86.8%]
GSM8K eval: 19%|#9 | 29/150 [01:33<05:47, 2.87s/q, correct=20/29, lccp=80.6%, score=0.878, step_acc=87.3%]
GSM8K eval: 20%|## | 30/150 [01:36<06:18, 3.16s/q, correct=21/30, lccp=81.3%, score=0.882, step_acc=87.7%]
GSM8K eval: 21%|## | 31/150 [01:39<05:54, 2.98s/q, correct=22/31, lccp=81.9%, score=0.886, step_acc=88.1%]
GSM8K eval: 21%|##1 | 32/150 [01:41<05:08, 2.62s/q, correct=23/32, lccp=82.4%, score=0.889, step_acc=88.5%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:11, 2.66s/q, correct=24/33, lccp=83.0%, score=0.893, step_acc=88.8%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:46, 2.47s/q, correct=25/34, lccp=83.5%, score=0.896, step_acc=89.2%]
GSM8K eval: 23%|##3 | 35/150 [01:48<04:46, 2.49s/q, correct=26/35, lccp=84.0%, score=0.899, step_acc=89.5%]
GSM8K eval: 24%|##4 | 36/150 [01:52<05:17, 2.78s/q, correct=27/36, lccp=84.4%, score=0.901, step_acc=89.8%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:47, 2.55s/q, correct=28/37, lccp=84.8%, score=0.903, step_acc=90.0%]
GSM8K eval: 25%|##5 | 38/150 [01:56<05:00, 2.68s/q, correct=29/38, lccp=85.2%, score=0.906, step_acc=90.3%]
GSM8K eval: 26%|##6 | 39/150 [02:01<06:09, 3.33s/q, correct=30/39, lccp=85.6%, score=0.908, step_acc=90.6%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:38, 4.17s/q, correct=31/40, lccp=86.0%, score=0.910, step_acc=90.8%]
GSM8K eval: 27%|##7 | 41/150 [02:11<06:58, 3.84s/q, correct=31/41, lccp=86.3%, score=0.910, step_acc=91.0%]
GSM8K eval: 28%|##8 | 42/150 [02:16<07:40, 4.27s/q, correct=32/42, lccp=85.0%, score=0.912, step_acc=90.8%]
GSM8K eval: 29%|##8 | 43/150 [02:18<06:26, 3.61s/q, correct=33/43, lccp=85.4%, score=0.914, step_acc=91.0%]
GSM8K eval: 29%|##9 | 44/150 [02:24<07:48, 4.42s/q, correct=34/44, lccp=85.7%, score=0.916, step_acc=91.2%]
GSM8K eval: 30%|### | 45/150 [02:28<07:15, 4.15s/q, correct=35/45, lccp=86.0%, score=0.918, step_acc=91.4%]
GSM8K eval: 31%|### | 46/150 [02:33<07:34, 4.37s/q, correct=35/46, lccp=84.2%, score=0.913, step_acc=91.4%]
GSM8K eval: 31%|###1 | 47/150 [02:36<06:49, 3.98s/q, correct=36/47, lccp=84.5%, score=0.915, step_acc=91.6%]
GSM8K eval: 32%|###2 | 48/150 [02:37<05:38, 3.32s/q, correct=37/48, lccp=84.8%, score=0.917, step_acc=91.7%]
GSM8K eval: 33%|###2 | 49/150 [02:41<05:42, 3.39s/q, correct=38/49, lccp=83.8%, score=0.918, step_acc=91.6%]
GSM8K eval: 33%|###3 | 50/150 [02:44<05:32, 3.32s/q, correct=38/50, lccp=83.1%, score=0.910, step_acc=90.7%]
GSM8K eval: 34%|###4 | 51/150 [02:46<04:33, 2.76s/q, correct=39/51, lccp=83.4%, score=0.911, step_acc=90.9%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:15, 3.22s/q, correct=39/52, lccp=81.8%, score=0.911, step_acc=90.8%]
GSM8K eval: 35%|###5 | 53/150 [02:55<05:54, 3.66s/q, correct=39/53, lccp=81.4%, score=0.904, step_acc=90.2%]
GSM8K eval: 36%|###6 | 54/150 [02:58<05:41, 3.55s/q, correct=40/54, lccp=81.8%, score=0.905, step_acc=90.4%]
GSM8K eval: 37%|###6 | 55/150 [03:02<06:05, 3.85s/q, correct=41/55, lccp=82.1%, score=0.907, step_acc=90.6%]
GSM8K eval: 37%|###7 | 56/150 [03:06<05:54, 3.77s/q, correct=42/56, lccp=82.4%, score=0.908, step_acc=90.7%]
GSM8K eval: 38%|###8 | 57/150 [03:08<05:09, 3.33s/q, correct=43/57, lccp=82.7%, score=0.910, step_acc=90.9%]
GSM8K eval: 39%|###8 | 58/150 [03:12<05:30, 3.59s/q, correct=44/58, lccp=83.0%, score=0.911, step_acc=91.0%]
GSM8K eval: 39%|###9 | 59/150 [03:17<05:59, 3.95s/q, correct=44/59, lccp=81.6%, score=0.909, step_acc=90.6%]
GSM8K eval: 40%|#### | 60/150 [03:22<06:21, 4.24s/q, correct=45/60, lccp=81.9%, score=0.910, step_acc=90.8%]
GSM8K eval: 41%|#### | 61/150 [03:25<05:49, 3.93s/q, correct=46/61, lccp=82.2%, score=0.912, step_acc=90.9%]
GSM8K eval: 41%|####1 | 62/150 [03:29<05:24, 3.68s/q, correct=47/62, lccp=82.5%, score=0.913, step_acc=91.1%]
GSM8K eval: 42%|####2 | 63/150 [03:32<05:15, 3.62s/q, correct=47/63, lccp=82.2%, score=0.907, step_acc=90.7%]
GSM8K eval: 43%|####2 | 64/150 [03:35<04:51, 3.39s/q, correct=48/64, lccp=82.5%, score=0.909, step_acc=90.8%]
GSM8K eval: 43%|####3 | 65/150 [03:38<04:32, 3.20s/q, correct=49/65, lccp=82.8%, score=0.910, step_acc=91.0%]
GSM8K eval: 44%|####4 | 66/150 [03:40<03:58, 2.84s/q, correct=50/66, lccp=83.1%, score=0.911, step_acc=91.1%]
GSM8K eval: 45%|####4 | 67/150 [03:42<03:41, 2.67s/q, correct=51/67, lccp=83.3%, score=0.913, step_acc=91.3%]
GSM8K eval: 45%|####5 | 68/150 [03:45<03:39, 2.68s/q, correct=52/68, lccp=83.6%, score=0.914, step_acc=91.4%]
GSM8K eval: 46%|####6 | 69/150 [03:46<03:09, 2.34s/q, correct=53/69, lccp=83.8%, score=0.915, step_acc=91.5%]
GSM8K eval: 47%|####6 | 70/150 [03:49<03:23, 2.55s/q, correct=54/70, lccp=82.6%, score=0.916, step_acc=91.3%]
GSM8K eval: 47%|####7 | 71/150 [03:52<03:34, 2.72s/q, correct=55/71, lccp=81.4%, score=0.917, step_acc=91.2%]
GSM8K eval: 48%|####8 | 72/150 [03:54<03:02, 2.34s/q, correct=56/72, lccp=81.7%, score=0.918, step_acc=91.3%]
GSM8K eval: 49%|####8 | 73/150 [03:55<02:45, 2.15s/q, correct=57/73, lccp=81.9%, score=0.919, step_acc=91.4%]
GSM8K eval: 49%|####9 | 74/150 [03:59<03:15, 2.57s/q, correct=58/74, lccp=82.2%, score=0.920, step_acc=91.5%]
GSM8K eval: 50%|##### | 75/150 [04:01<02:53, 2.31s/q, correct=59/75, lccp=82.4%, score=0.921, step_acc=91.7%]
GSM8K eval: 51%|##### | 76/150 [04:07<04:26, 3.60s/q, correct=59/76, lccp=82.5%, score=0.916, step_acc=91.6%]
GSM8K eval: 51%|#####1 | 77/150 [04:11<04:30, 3.70s/q, correct=60/77, lccp=82.7%, score=0.917, step_acc=91.7%]
GSM8K eval: 52%|#####2 | 78/150 [04:14<04:03, 3.38s/q, correct=61/78, lccp=82.9%, score=0.918, step_acc=91.8%]
GSM8K eval: 53%|#####2 | 79/150 [04:17<03:52, 3.28s/q, correct=61/79, lccp=82.1%, score=0.913, step_acc=91.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:20<03:46, 3.23s/q, correct=62/80, lccp=82.3%, score=0.914, step_acc=91.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:22<03:24, 2.97s/q, correct=63/81, lccp=82.5%, score=0.915, step_acc=91.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:25<03:21, 2.96s/q, correct=64/82, lccp=82.8%, score=0.916, step_acc=91.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:28<03:17, 2.95s/q, correct=65/83, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:31<03:09, 2.86s/q, correct=66/84, lccp=83.2%, score=0.918, step_acc=91.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:35<03:27, 3.19s/q, correct=67/85, lccp=83.4%, score=0.919, step_acc=91.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:38<03:29, 3.28s/q, correct=68/86, lccp=83.6%, score=0.920, step_acc=91.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:44<04:12, 4.02s/q, correct=69/87, lccp=83.7%, score=0.920, step_acc=91.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:46<03:29, 3.37s/q, correct=70/88, lccp=83.9%, score=0.921, step_acc=92.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:15, 3.21s/q, correct=71/89, lccp=84.1%, score=0.922, step_acc=92.1%]
GSM8K eval: 60%|###### | 90/150 [04:51<02:58, 2.98s/q, correct=72/90, lccp=84.3%, score=0.923, step_acc=92.2%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:23, 3.45s/q, correct=73/91, lccp=84.5%, score=0.924, step_acc=92.2%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:13, 3.34s/q, correct=74/92, lccp=84.6%, score=0.924, step_acc=92.3%]
GSM8K eval: 62%|######2 | 93/150 [05:06<04:22, 4.61s/q, correct=75/93, lccp=84.8%, score=0.925, step_acc=92.4%]
GSM8K eval: 63%|######2 | 94/150 [05:09<03:47, 4.06s/q, correct=75/94, lccp=83.9%, score=0.922, step_acc=91.8%]
GSM8K eval: 63%|######3 | 95/150 [05:14<03:51, 4.21s/q, correct=75/95, lccp=83.0%, score=0.919, step_acc=91.0%]
GSM8K eval: 64%|######4 | 96/150 [05:17<03:31, 3.91s/q, correct=75/96, lccp=82.5%, score=0.915, step_acc=90.4%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:07, 3.54s/q, correct=75/97, lccp=82.2%, score=0.912, step_acc=90.3%]
GSM8K eval: 65%|######5 | 98/150 [05:24<03:14, 3.74s/q, correct=75/98, lccp=81.8%, score=0.909, step_acc=90.1%]
GSM8K eval: 66%|######6 | 99/150 [05:26<02:49, 3.32s/q, correct=76/99, lccp=81.9%, score=0.909, step_acc=90.2%]
GSM8K eval: 67%|######6 | 100/150 [05:28<02:24, 2.90s/q, correct=77/100, lccp=81.1%, score=0.910, step_acc=89.9%]
GSM8K eval: 67%|######7 | 101/150 [05:31<02:25, 2.96s/q, correct=77/101, lccp=80.8%, score=0.906, step_acc=89.8%]
GSM8K eval: 68%|######8 | 102/150 [05:33<02:01, 2.53s/q, correct=78/102, lccp=81.0%, score=0.907, step_acc=89.9%]
GSM8K eval: 69%|######8 | 103/150 [05:35<01:51, 2.38s/q, correct=79/103, lccp=81.2%, score=0.908, step_acc=90.0%]
GSM8K eval: 69%|######9 | 104/150 [05:39<02:21, 3.07s/q, correct=80/104, lccp=81.4%, score=0.909, step_acc=90.1%]
GSM8K eval: 70%|####### | 105/150 [05:42<02:10, 2.89s/q, correct=81/105, lccp=81.5%, score=0.910, step_acc=90.2%]
GSM8K eval: 71%|####### | 106/150 [05:43<01:48, 2.47s/q, correct=82/106, lccp=81.7%, score=0.910, step_acc=90.3%]
GSM8K eval: 71%|#######1 | 107/150 [05:45<01:33, 2.18s/q, correct=83/107, lccp=81.9%, score=0.911, step_acc=90.4%]
GSM8K eval: 72%|#######2 | 108/150 [05:48<01:38, 2.34s/q, correct=84/108, lccp=82.1%, score=0.912, step_acc=90.5%]
GSM8K eval: 73%|#######2 | 109/150 [05:53<02:08, 3.14s/q, correct=84/109, lccp=81.6%, score=0.911, step_acc=90.4%]
GSM8K eval: 73%|#######3 | 110/150 [05:55<01:54, 2.85s/q, correct=85/110, lccp=81.8%, score=0.911, step_acc=90.5%]
GSM8K eval: 74%|#######4 | 111/150 [05:56<01:37, 2.49s/q, correct=86/111, lccp=81.9%, score=0.912, step_acc=90.6%]
GSM8K eval: 75%|#######4 | 112/150 [06:02<02:04, 3.28s/q, correct=86/112, lccp=82.1%, score=0.912, step_acc=90.7%]
GSM8K eval: 75%|#######5 | 113/150 [06:03<01:44, 2.83s/q, correct=87/113, lccp=82.3%, score=0.913, step_acc=90.7%]
GSM8K eval: 76%|#######6 | 114/150 [06:08<02:06, 3.50s/q, correct=88/114, lccp=81.8%, score=0.913, step_acc=90.7%]
GSM8K eval: 77%|#######6 | 115/150 [06:11<01:55, 3.31s/q, correct=89/115, lccp=81.9%, score=0.914, step_acc=90.8%]
GSM8K eval: 77%|#######7 | 116/150 [06:14<01:48, 3.19s/q, correct=90/116, lccp=82.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 78%|#######8 | 117/150 [06:20<02:13, 4.03s/q, correct=91/117, lccp=82.3%, score=0.915, step_acc=90.9%]
GSM8K eval: 79%|#######8 | 118/150 [06:25<02:13, 4.17s/q, correct=91/118, lccp=81.6%, score=0.913, step_acc=90.9%]
GSM8K eval: 79%|#######9 | 119/150 [06:28<02:04, 4.00s/q, correct=91/119, lccp=81.7%, score=0.912, step_acc=90.9%]
GSM8K eval: 80%|######## | 120/150 [06:31<01:49, 3.66s/q, correct=92/120, lccp=81.9%, score=0.912, step_acc=91.0%]
GSM8K eval: 81%|######## | 121/150 [06:34<01:40, 3.48s/q, correct=93/121, lccp=82.0%, score=0.913, step_acc=91.1%]
GSM8K eval: 81%|########1 | 122/150 [06:37<01:33, 3.35s/q, correct=94/122, lccp=82.2%, score=0.914, step_acc=91.2%]
GSM8K eval: 82%|########2 | 123/150 [06:41<01:30, 3.37s/q, correct=95/123, lccp=82.3%, score=0.914, step_acc=91.2%]
GSM8K eval: 83%|########2 | 124/150 [06:43<01:18, 3.02s/q, correct=96/124, lccp=82.5%, score=0.915, step_acc=91.3%]
GSM8K eval: 83%|########3 | 125/150 [06:45<01:08, 2.74s/q, correct=97/125, lccp=82.6%, score=0.916, step_acc=91.4%]
GSM8K eval: 84%|########4 | 126/150 [06:48<01:05, 2.72s/q, correct=98/126, lccp=82.7%, score=0.916, step_acc=91.4%]
GSM8K eval: 85%|########4 | 127/150 [06:52<01:14, 3.25s/q, correct=99/127, lccp=82.9%, score=0.917, step_acc=91.5%]
GSM8K eval: 85%|########5 | 128/150 [06:55<01:09, 3.17s/q, correct=100/128, lccp=83.0%, score=0.918, step_acc=91.6%]
GSM8K eval: 86%|########6 | 129/150 [06:59<01:09, 3.29s/q, correct=101/129, lccp=83.1%, score=0.918, step_acc=91.6%]
GSM8K eval: 87%|########6 | 130/150 [07:01<00:56, 2.85s/q, correct=102/130, lccp=83.3%, score=0.919, step_acc=91.7%]
GSM8K eval: 87%|########7 | 131/150 [07:05<01:03, 3.37s/q, correct=103/131, lccp=83.4%, score=0.919, step_acc=91.8%]
GSM8K eval: 88%|########8 | 132/150 [07:07<00:50, 2.83s/q, correct=104/132, lccp=83.5%, score=0.920, step_acc=91.8%]
GSM8K eval: 89%|########8 | 133/150 [07:10<00:48, 2.83s/q, correct=105/133, lccp=83.6%, score=0.921, step_acc=91.9%]
GSM8K eval: 89%|########9 | 134/150 [07:14<00:52, 3.30s/q, correct=106/134, lccp=83.8%, score=0.921, step_acc=92.0%]
GSM8K eval: 90%|######### | 135/150 [07:17<00:48, 3.23s/q, correct=107/135, lccp=83.9%, score=0.922, step_acc=92.0%]
GSM8K eval: 91%|######### | 136/150 [07:22<00:53, 3.83s/q, correct=108/136, lccp=84.0%, score=0.922, step_acc=92.1%]
GSM8K eval: 91%|#########1| 137/150 [07:29<01:01, 4.72s/q, correct=109/137, lccp=84.1%, score=0.923, step_acc=92.1%]
GSM8K eval: 92%|#########2| 138/150 [07:33<00:53, 4.48s/q, correct=110/138, lccp=84.2%, score=0.923, step_acc=92.2%]
GSM8K eval: 93%|#########2| 139/150 [07:36<00:46, 4.20s/q, correct=111/139, lccp=84.3%, score=0.924, step_acc=92.2%]
GSM8K eval: 93%|#########3| 140/150 [07:41<00:41, 4.19s/q, correct=111/140, lccp=84.2%, score=0.920, step_acc=92.1%]
GSM8K eval: 94%|#########3| 141/150 [07:44<00:36, 4.06s/q, correct=112/141, lccp=84.3%, score=0.921, step_acc=92.1%]
GSM8K eval: 95%|#########4| 142/150 [07:49<00:33, 4.18s/q, correct=113/142, lccp=84.4%, score=0.921, step_acc=92.2%]
GSM8K eval: 95%|#########5| 143/150 [07:51<00:25, 3.61s/q, correct=114/143, lccp=84.5%, score=0.922, step_acc=92.2%]
GSM8K eval: 96%|#########6| 144/150 [07:53<00:19, 3.23s/q, correct=115/144, lccp=84.7%, score=0.922, step_acc=92.3%]
GSM8K eval: 97%|#########6| 145/150 [07:56<00:14, 2.90s/q, correct=115/145, lccp=84.8%, score=0.920, step_acc=92.3%]
GSM8K eval: 97%|#########7| 146/150 [07:59<00:11, 2.93s/q, correct=116/146, lccp=84.9%, score=0.920, step_acc=92.4%]
GSM8K eval: 98%|#########8| 147/150 [08:02<00:09, 3.19s/q, correct=117/147, lccp=85.0%, score=0.921, step_acc=92.4%]
GSM8K eval: 99%|#########8| 148/150 [08:06<00:06, 3.34s/q, correct=118/148, lccp=85.1%, score=0.921, step_acc=92.5%]
GSM8K eval: 99%|#########9| 149/150 [08:09<00:03, 3.36s/q, correct=119/149, lccp=85.2%, score=0.922, step_acc=92.5%]
GSM8K eval: 100%|##########| 150/150 [08:14<00:00, 3.83s/q, correct=119/150, lccp=85.0%, score=0.920, step_acc=92.3%]
GSM8K eval: 100%|##########| 150/150 [08:14<00:00, 3.30s/q, correct=119/150, lccp=85.0%, score=0.920, step_acc=92.3%]
+2026-04-26 07:48:24,548 INFO __main__ - Training Score [iter 30]: 0.9204 (best=0.9262) | n=150
+2026-04-26 07:48:24,548 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 07:48:24,548 INFO __main__ - Process score : prm_mean=0.904 prm_final=0.929 → weighted=0.919
+2026-04-26 07:48:24,548 INFO __main__ - Step accuracy : 92.3% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 07:48:24,548 INFO __main__ - Chain integrity (LCCP): 85.0% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 07:48:24,549 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 07:48:26,758 INFO __main__ - Pruned old checkpoint: iter_0010
+2026-04-26 07:48:26,764 INFO __main__ - ======================================================================
+2026-04-26 07:48:26,764 INFO __main__ - GRPO ITERATION 31/60
+2026-04-26 07:48:26,764 INFO __main__ - ======================================================================
+2026-04-26 07:48:26,783 INFO __main__ - LR this iteration: 3.29e-06 | T=0.597 | MATH ratio=50%
+
Iter 31 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:48:33,771 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:33,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:34,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.991 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,363 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:48:34,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.485 = clip(base=0.405 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.320 novelty=0.63 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.23)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:48:35,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:35,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.976 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:35,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
diff --git a/logs/grpo/grpo_20260426_032827/config.json b/logs/grpo/grpo_20260426_032827/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f25129850954278e937e165aaabbe04d38d9528a
--- /dev/null
+++ b/logs/grpo/grpo_20260426_032827/config.json
@@ -0,0 +1,44 @@
+{
+ "base_model": "checkpoints/dual_task_v1",
+ "output_dir": "checkpoints/grpo",
+ "gsm8k_data": "data/sft/gsm8k_sft.jsonl",
+ "eval_data_path": "data/sft/gsm8k_test.jsonl",
+ "num_iterations": 60,
+ "group_size": 10,
+ "q_group_size": 2,
+ "questions_per_iter": 20,
+ "learning_rate": 5e-06,
+ "max_new_tokens": 1000,
+ "temperature": 0.8,
+ "eval_every": 5,
+ "eval_max_samples": 150,
+ "eval_max_new_tokens": 1000,
+ "eval_pass_at_k": 0,
+ "use_prm": true,
+ "prm_model": "Qwen/Qwen2.5-Math-PRM-7B",
+ "skip_initial_eval": false,
+ "run_name": "grpo_20260426_032827",
+ "max_grad_norm": 0.5,
+ "kl_coef": 0.06,
+ "math_data": null,
+ "math_mix_ratio": 0.3,
+ "math_mix_ratio_late": 0.5,
+ "math_ramp_start": 18,
+ "math_max_difficulty": 3,
+ "clip_eps": 0.2,
+ "warmup_iters": 8,
+ "min_lr_ratio": 0.1,
+ "difficulty_alpha": 3.5,
+ "overlong_filter": true,
+ "save_every": 5,
+ "keep_last": 4,
+ "self_play_ratio": 0.7,
+ "min_warmup": 12,
+ "selfplay_gt_thresh": 0.65,
+ "selfplay_grounded_thresh": 0.65,
+ "selfplay_step_thresh": 0.68,
+ "selfplay_ramp_iters": 28,
+ "grounded_floor": 0.55,
+ "extractor_model": "Qwen/Qwen2.5-0.5B-Instruct",
+ "extraction_cache": "data/extraction_cache.json"
+}
\ No newline at end of file
diff --git a/logs/grpo/grpo_20260426_032827/console_output.log b/logs/grpo/grpo_20260426_032827/console_output.log
new file mode 100644
index 0000000000000000000000000000000000000000..8aac480cbc594197a32ecc0bf704cd2afc4cc2a9
--- /dev/null
+++ b/logs/grpo/grpo_20260426_032827/console_output.log
@@ -0,0 +1,7426 @@
+2026-04-26 03:28:31,607 INFO __main__ - ======================================================================
+2026-04-26 03:28:31,607 INFO __main__ - GRPO run: grpo_20260426_032827
+2026-04-26 03:28:31,607 INFO __main__ - Checkpoints : checkpoints/grpo/grpo_20260426_032827
+2026-04-26 03:28:31,607 INFO __main__ - Logs : logs/grpo/grpo_20260426_032827
+2026-04-26 03:28:31,608 INFO __main__ - Console log : logs/grpo/grpo_20260426_032827/console_output.log
+2026-04-26 03:28:31,608 INFO __main__ - ======================================================================
+2026-04-26 03:28:31,727 INFO src.utils.attn_backend - Attention backend selected: flash_attention_2
+2026-04-26 03:28:31,727 INFO __main__ - Device: cuda:0 | attn: flash_attention_2
+2026-04-26 03:28:31,745 INFO __main__ - GPU: NVIDIA A100 80GB PCIe | 85.1 GB VRAM | capability sm_80
+2026-04-26 03:28:31,745 INFO __main__ - Run config: K=10 K_q=2 N=20 lr=5.0e-06 T=0.80 max_new=1000 | clip_eps=0.20 kl_coef=0.0600 warmup=8 | diff_alpha=3.5 | self_play=70% grounded=30% | math_mix=30% math_maxdiff=3 | overlong_filter=True | eval_every=5 eval_N=150 | grad_clip=0.50 save_every=5 keep_last=4 | question_GRPO=ENABLED (K_q=2)
+2026-04-26 03:28:31,745 INFO __main__ - Loading model from checkpoints/dual_task_v1 ...
+2026-04-26 03:28:32,465 INFO __main__ - Tokenizer has no chat_template; loading from base model Qwen/Qwen2.5-Math-1.5B-Instruct
+2026-04-26 03:28:32,841 INFO __main__ - Chat template loaded successfully.
+2026-04-26 03:28:32,842 INFO __main__ - Detected PEFT adapter — loading base Qwen/Qwen2.5-Math-1.5B-Instruct then merging checkpoints/dual_task_v1
+2026-04-26 03:28:34,358 WARNING __main__ - All parameters were frozen on load (PEFT merge_and_unload bug). Re-enabled requires_grad — any prior frozen runs were training nothing.
+2026-04-26 03:28:34,358 INFO __main__ - Flash-Attn 2 active — gradient checkpointing OFF (Flash already gives O(T) attention memory).
+2026-04-26 03:28:34,359 INFO __main__ - Trainable parameters: 1,543,714,304 / 1,543,714,304 (100.0%)
+2026-04-26 03:28:34,360 INFO __main__ - Creating frozen reference policy (kl_coef=0.0600, ~3.1 GB VRAM)...
+2026-04-26 03:28:34,425 INFO __main__ - Reference policy ready.
+2026-04-26 03:28:34,426 INFO __main__ - LR schedule: 5.0e-06 warmup(8 iters) → cosine decay(52 iters, min=5.0e-07)
+2026-04-26 03:28:34,538 INFO __main__ - Loaded 8792 QA pairs from data/sft/gsm8k_sft.jsonl
+2026-04-26 03:28:34,546 INFO __main__ - Loaded 4072 MATH pairs from data/math/math_numeric.jsonl
+2026-04-26 03:28:34,546 INFO __main__ - MATH mixing: 30% MATH (4072 problems) + 70% GSM8K (8792 problems)
+2026-04-26 03:28:34,546 INFO src.rl.prm_scorer - Loading PRM Qwen/Qwen2.5-Math-PRM-7B (4-bit=True, dtype=torch.bfloat16) on cuda:0 …
+
Loading checkpoint shards: 0%| | 0/4 [00:00, ?it/s]
Loading checkpoint shards: 25%|##5 | 1/4 [00:00<00:02, 1.17it/s]
Loading checkpoint shards: 50%|##### | 2/4 [00:01<00:01, 1.20it/s]
Loading checkpoint shards: 75%|#######5 | 3/4 [00:02<00:00, 1.20it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:03<00:00, 1.35it/s]
Loading checkpoint shards: 100%|##########| 4/4 [00:03<00:00, 1.29it/s]
+2026-04-26 03:28:38,414 INFO src.rl.prm_scorer - PRM ready. GPU memory allocated: 9.97 GB step_sep_id=151651
+2026-04-26 03:28:38,414 INFO __main__ - PRM loaded: Qwen/Qwen2.5-Math-PRM-7B (4-bit)
+2026-04-26 03:28:38,416 INFO src.rl.unified_accuracy - Extraction cache not found at data/extraction_cache.json — will build on first use
+2026-04-26 03:28:38,417 INFO __main__ - Unified accuracy calculator ready (extractor=Qwen/Qwen2.5-0.5B-Instruct, cache=data/extraction_cache.json)
+2026-04-26 03:28:38,417 INFO __main__ - Warming up step-chain extractor (eager load)...
+2026-04-26 03:28:38,417 INFO src.rl.unified_accuracy - Loading step chain extractor: Qwen/Qwen2.5-0.5B-Instruct
+2026-04-26 03:28:39,348 INFO src.rl.unified_accuracy - Step chain extractor loaded
+2026-04-26 03:28:39,348 INFO __main__ - Extractor warmup complete
+2026-04-26 03:28:39,349 INFO src.rl.llm_question_classifier - LLMQuestionClassifier ready (model=Qwen2ForCausalLM, cache=10000, topics=24)
+2026-04-26 03:28:40,949 INFO __main__ - Detected structured dataset (8792 records) — bootstrapping curriculum from skill_ids instead of keyword classifier.
+2026-04-26 03:28:40,954 INFO src.rl.curriculum_manager - Curriculum bootstrapped from 8792 records across 1 topics
+2026-04-26 03:28:40,954 INFO __main__ - ======================================================================
+2026-04-26 03:28:40,954 INFO __main__ - INITIAL EVALUATION (Iteration 0)
+2026-04-26 03:28:40,954 INFO __main__ - ======================================================================
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]2026-04-26 03:28:43,322 WARNING transformers_modules.Qwen.Qwen2.5-Math-PRM-7B.0610740060112df12585d00a1c5f4624d2f59051.modeling_qwen2_rm - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+
GSM8K eval: 1%| | 1/150 [00:02<06:08, 2.48s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:29, 3.85s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:59, 3.26s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<06:52, 2.83s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:44, 2.37s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:18<08:09, 3.40s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:03, 3.38s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:13, 3.05s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:27<07:16, 3.10s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:32<08:29, 3.64s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:35<07:51, 3.40s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:37<06:51, 2.98s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:40<06:41, 2.93s/q, correct=10/13, lccp=80.5%, score=0.925, step_acc=90.3%]
GSM8K eval: 9%|9 | 14/150 [00:44<07:45, 3.42s/q, correct=11/14, lccp=81.9%, score=0.930, step_acc=91.0%]
GSM8K eval: 10%|# | 15/150 [00:47<07:05, 3.15s/q, correct=12/15, lccp=83.1%, score=0.935, step_acc=91.6%]
GSM8K eval: 11%|# | 16/150 [00:49<06:32, 2.93s/q, correct=12/16, lccp=84.2%, score=0.911, step_acc=92.1%]
GSM8K eval: 11%|#1 | 17/150 [00:52<06:25, 2.90s/q, correct=13/17, lccp=85.1%, score=0.916, step_acc=92.5%]
GSM8K eval: 12%|#2 | 18/150 [00:58<08:10, 3.72s/q, correct=13/18, lccp=81.1%, score=0.904, step_acc=90.2%]
GSM8K eval: 13%|#2 | 19/150 [01:00<07:20, 3.36s/q, correct=14/19, lccp=82.1%, score=0.909, step_acc=90.7%]
GSM8K eval: 13%|#3 | 20/150 [01:04<07:19, 3.38s/q, correct=15/20, lccp=83.0%, score=0.914, step_acc=91.2%]
GSM8K eval: 14%|#4 | 21/150 [01:06<06:40, 3.10s/q, correct=16/21, lccp=83.8%, score=0.918, step_acc=91.6%]
GSM8K eval: 15%|#4 | 22/150 [01:11<07:53, 3.70s/q, correct=17/22, lccp=81.9%, score=0.919, step_acc=90.0%]
GSM8K eval: 15%|#5 | 23/150 [01:15<08:02, 3.80s/q, correct=18/23, lccp=82.7%, score=0.923, step_acc=90.5%]
GSM8K eval: 16%|#6 | 24/150 [01:18<07:09, 3.41s/q, correct=18/24, lccp=80.3%, score=0.906, step_acc=87.7%]
GSM8K eval: 17%|#6 | 25/150 [01:21<06:38, 3.19s/q, correct=18/25, lccp=78.1%, score=0.902, step_acc=87.2%]
GSM8K eval: 17%|#7 | 26/150 [01:25<07:13, 3.50s/q, correct=19/26, lccp=78.9%, score=0.905, step_acc=87.7%]
GSM8K eval: 18%|#8 | 27/150 [01:27<06:40, 3.25s/q, correct=19/27, lccp=79.7%, score=0.900, step_acc=88.2%]
GSM8K eval: 19%|#8 | 28/150 [01:30<05:58, 2.94s/q, correct=20/28, lccp=80.4%, score=0.904, step_acc=88.6%]
GSM8K eval: 19%|#9 | 29/150 [01:32<05:47, 2.88s/q, correct=21/29, lccp=81.1%, score=0.907, step_acc=89.0%]
GSM8K eval: 20%|## | 30/150 [01:36<06:17, 3.14s/q, correct=22/30, lccp=81.7%, score=0.910, step_acc=89.3%]
GSM8K eval: 21%|## | 31/150 [01:39<05:51, 2.95s/q, correct=23/31, lccp=82.3%, score=0.913, step_acc=89.7%]
GSM8K eval: 21%|##1 | 32/150 [01:40<05:04, 2.58s/q, correct=24/32, lccp=82.9%, score=0.915, step_acc=90.0%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:10, 2.65s/q, correct=25/33, lccp=83.4%, score=0.917, step_acc=90.3%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:45, 2.46s/q, correct=26/34, lccp=83.9%, score=0.920, step_acc=90.6%]
GSM8K eval: 23%|##3 | 35/150 [01:48<04:48, 2.50s/q, correct=27/35, lccp=84.3%, score=0.922, step_acc=90.9%]
GSM8K eval: 24%|##4 | 36/150 [01:51<05:19, 2.81s/q, correct=28/36, lccp=84.8%, score=0.924, step_acc=91.1%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:48, 2.55s/q, correct=29/37, lccp=85.2%, score=0.925, step_acc=91.4%]
GSM8K eval: 25%|##5 | 38/150 [01:56<05:01, 2.69s/q, correct=30/38, lccp=85.6%, score=0.927, step_acc=91.6%]
GSM8K eval: 26%|##6 | 39/150 [02:01<06:16, 3.39s/q, correct=31/39, lccp=85.9%, score=0.929, step_acc=91.8%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:34, 4.13s/q, correct=32/40, lccp=86.3%, score=0.931, step_acc=92.0%]
GSM8K eval: 27%|##7 | 41/150 [02:10<06:48, 3.75s/q, correct=32/41, lccp=86.6%, score=0.930, step_acc=92.2%]
GSM8K eval: 28%|##8 | 42/150 [02:15<07:28, 4.15s/q, correct=33/42, lccp=85.4%, score=0.931, step_acc=92.0%]
GSM8K eval: 29%|##8 | 43/150 [02:17<06:09, 3.45s/q, correct=34/43, lccp=85.7%, score=0.933, step_acc=92.2%]
GSM8K eval: 29%|##9 | 44/150 [02:23<07:30, 4.25s/q, correct=35/44, lccp=86.0%, score=0.934, step_acc=92.4%]
GSM8K eval: 30%|### | 45/150 [02:26<06:47, 3.88s/q, correct=36/45, lccp=86.3%, score=0.936, step_acc=92.5%]
GSM8K eval: 31%|### | 46/150 [02:31<07:09, 4.13s/q, correct=36/46, lccp=84.5%, score=0.931, step_acc=92.4%]
GSM8K eval: 31%|###1 | 47/150 [02:34<06:27, 3.77s/q, correct=37/47, lccp=84.8%, score=0.932, step_acc=92.6%]
GSM8K eval: 32%|###2 | 48/150 [02:35<05:21, 3.15s/q, correct=38/48, lccp=85.1%, score=0.933, step_acc=92.8%]
GSM8K eval: 33%|###2 | 49/150 [02:42<07:00, 4.16s/q, correct=38/49, lccp=84.0%, score=0.919, step_acc=91.5%]
GSM8K eval: 33%|###3 | 50/150 [02:45<06:22, 3.83s/q, correct=38/50, lccp=83.3%, score=0.911, step_acc=90.6%]
GSM8K eval: 34%|###4 | 51/150 [02:46<05:05, 3.09s/q, correct=39/51, lccp=83.6%, score=0.913, step_acc=90.8%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:32, 3.40s/q, correct=39/52, lccp=82.0%, score=0.912, step_acc=90.7%]
GSM8K eval: 35%|###5 | 53/150 [02:55<06:00, 3.72s/q, correct=39/53, lccp=81.6%, score=0.905, step_acc=90.1%]
GSM8K eval: 36%|###6 | 54/150 [02:57<05:14, 3.28s/q, correct=40/54, lccp=81.9%, score=0.907, step_acc=90.3%]
GSM8K eval: 37%|###6 | 55/150 [03:00<04:56, 3.12s/q, correct=41/55, lccp=82.3%, score=0.908, step_acc=90.4%]
GSM8K eval: 37%|###7 | 56/150 [03:03<05:02, 3.22s/q, correct=42/56, lccp=82.6%, score=0.910, step_acc=90.6%]
GSM8K eval: 38%|###8 | 57/150 [03:06<04:31, 2.92s/q, correct=43/57, lccp=82.9%, score=0.911, step_acc=90.8%]
GSM8K eval: 39%|###8 | 58/150 [03:10<04:57, 3.23s/q, correct=44/58, lccp=83.2%, score=0.913, step_acc=90.9%]
GSM8K eval: 39%|###9 | 59/150 [03:13<05:00, 3.30s/q, correct=44/59, lccp=81.8%, score=0.904, step_acc=89.7%]
GSM8K eval: 40%|#### | 60/150 [03:18<05:35, 3.73s/q, correct=45/60, lccp=82.1%, score=0.906, step_acc=89.9%]
GSM8K eval: 41%|#### | 61/150 [03:20<04:51, 3.28s/q, correct=46/61, lccp=82.4%, score=0.908, step_acc=90.1%]
GSM8K eval: 41%|####1 | 62/150 [03:23<04:41, 3.20s/q, correct=47/62, lccp=82.6%, score=0.909, step_acc=90.2%]
GSM8K eval: 42%|####2 | 63/150 [03:26<04:39, 3.22s/q, correct=47/63, lccp=82.4%, score=0.903, step_acc=89.9%]
GSM8K eval: 43%|####2 | 64/150 [03:29<04:25, 3.08s/q, correct=48/64, lccp=82.7%, score=0.905, step_acc=90.0%]
GSM8K eval: 43%|####3 | 65/150 [03:32<04:12, 2.97s/q, correct=49/65, lccp=82.9%, score=0.906, step_acc=90.2%]
GSM8K eval: 44%|####4 | 66/150 [03:33<03:30, 2.50s/q, correct=50/66, lccp=83.2%, score=0.907, step_acc=90.3%]
GSM8K eval: 45%|####4 | 67/150 [03:35<03:19, 2.41s/q, correct=51/67, lccp=83.4%, score=0.909, step_acc=90.5%]
GSM8K eval: 45%|####5 | 68/150 [03:38<03:21, 2.46s/q, correct=52/68, lccp=83.7%, score=0.910, step_acc=90.6%]
GSM8K eval: 46%|####6 | 69/150 [03:39<02:55, 2.17s/q, correct=53/69, lccp=83.9%, score=0.911, step_acc=90.7%]
GSM8K eval: 47%|####6 | 70/150 [03:42<03:10, 2.38s/q, correct=54/70, lccp=82.7%, score=0.912, step_acc=90.6%]
GSM8K eval: 47%|####7 | 71/150 [03:45<03:23, 2.57s/q, correct=55/71, lccp=81.6%, score=0.913, step_acc=90.4%]
GSM8K eval: 48%|####8 | 72/150 [03:47<02:53, 2.22s/q, correct=56/72, lccp=81.8%, score=0.914, step_acc=90.6%]
GSM8K eval: 49%|####8 | 73/150 [03:48<02:37, 2.04s/q, correct=57/73, lccp=82.1%, score=0.915, step_acc=90.7%]
GSM8K eval: 49%|####9 | 74/150 [03:52<03:06, 2.45s/q, correct=58/74, lccp=82.3%, score=0.917, step_acc=90.8%]
GSM8K eval: 50%|##### | 75/150 [03:53<02:46, 2.22s/q, correct=59/75, lccp=82.5%, score=0.918, step_acc=91.0%]
GSM8K eval: 51%|##### | 76/150 [04:00<04:14, 3.44s/q, correct=59/76, lccp=82.6%, score=0.913, step_acc=90.9%]
GSM8K eval: 51%|#####1 | 77/150 [04:03<04:18, 3.54s/q, correct=60/77, lccp=82.8%, score=0.914, step_acc=91.0%]
GSM8K eval: 52%|#####2 | 78/150 [04:06<03:48, 3.18s/q, correct=61/78, lccp=83.1%, score=0.915, step_acc=91.1%]
GSM8K eval: 53%|#####2 | 79/150 [04:09<03:36, 3.05s/q, correct=62/79, lccp=82.8%, score=0.914, step_acc=91.0%]
GSM8K eval: 53%|#####3 | 80/150 [04:11<03:27, 2.97s/q, correct=63/80, lccp=83.0%, score=0.915, step_acc=91.1%]
GSM8K eval: 54%|#####4 | 81/150 [04:14<03:10, 2.76s/q, correct=64/81, lccp=83.2%, score=0.916, step_acc=91.2%]
GSM8K eval: 55%|#####4 | 82/150 [04:16<03:08, 2.77s/q, correct=65/82, lccp=83.4%, score=0.917, step_acc=91.3%]
GSM8K eval: 55%|#####5 | 83/150 [04:19<03:03, 2.73s/q, correct=66/83, lccp=83.6%, score=0.918, step_acc=91.4%]
GSM8K eval: 56%|#####6 | 84/150 [04:22<02:57, 2.68s/q, correct=67/84, lccp=83.8%, score=0.919, step_acc=91.5%]
GSM8K eval: 57%|#####6 | 85/150 [04:25<03:15, 3.01s/q, correct=68/85, lccp=84.0%, score=0.920, step_acc=91.6%]
GSM8K eval: 57%|#####7 | 86/150 [04:29<03:17, 3.09s/q, correct=69/86, lccp=84.2%, score=0.921, step_acc=91.7%]
GSM8K eval: 58%|#####8 | 87/150 [04:34<03:55, 3.74s/q, correct=70/87, lccp=84.3%, score=0.922, step_acc=91.8%]
GSM8K eval: 59%|#####8 | 88/150 [04:36<03:15, 3.16s/q, correct=71/88, lccp=84.5%, score=0.922, step_acc=91.9%]
GSM8K eval: 59%|#####9 | 89/150 [04:38<03:03, 3.01s/q, correct=72/89, lccp=84.7%, score=0.923, step_acc=92.0%]
GSM8K eval: 60%|###### | 90/150 [04:41<02:48, 2.80s/q, correct=73/90, lccp=84.9%, score=0.924, step_acc=92.1%]
GSM8K eval: 61%|###### | 91/150 [04:45<03:09, 3.21s/q, correct=74/91, lccp=85.0%, score=0.925, step_acc=92.2%]
GSM8K eval: 61%|######1 | 92/150 [04:48<03:00, 3.12s/q, correct=75/92, lccp=85.2%, score=0.926, step_acc=92.3%]
GSM8K eval: 62%|######2 | 93/150 [04:54<03:58, 4.18s/q, correct=76/93, lccp=85.4%, score=0.926, step_acc=92.4%]
GSM8K eval: 63%|######2 | 94/150 [04:58<03:46, 4.04s/q, correct=76/94, lccp=84.4%, score=0.924, step_acc=91.9%]
GSM8K eval: 63%|######3 | 95/150 [05:03<03:58, 4.34s/q, correct=77/95, lccp=83.6%, score=0.924, step_acc=91.5%]
GSM8K eval: 64%|######4 | 96/150 [05:06<03:32, 3.93s/q, correct=77/96, lccp=83.0%, score=0.919, step_acc=90.9%]
GSM8K eval: 65%|######4 | 97/150 [05:09<03:05, 3.51s/q, correct=77/97, lccp=82.7%, score=0.917, step_acc=90.7%]
GSM8K eval: 65%|######5 | 98/150 [05:13<03:09, 3.65s/q, correct=77/98, lccp=82.3%, score=0.913, step_acc=90.5%]
GSM8K eval: 66%|######6 | 99/150 [05:15<02:44, 3.23s/q, correct=78/99, lccp=82.5%, score=0.914, step_acc=90.6%]
GSM8K eval: 67%|######6 | 100/150 [05:17<02:19, 2.79s/q, correct=79/100, lccp=81.6%, score=0.914, step_acc=90.4%]
GSM8K eval: 67%|######7 | 101/150 [05:20<02:17, 2.80s/q, correct=79/101, lccp=81.3%, score=0.911, step_acc=90.2%]
GSM8K eval: 68%|######8 | 102/150 [05:21<01:54, 2.39s/q, correct=80/102, lccp=81.5%, score=0.911, step_acc=90.3%]
GSM8K eval: 69%|######8 | 103/150 [05:23<01:46, 2.26s/q, correct=81/103, lccp=81.7%, score=0.912, step_acc=90.4%]
GSM8K eval: 69%|######9 | 104/150 [05:27<02:15, 2.95s/q, correct=82/104, lccp=81.9%, score=0.913, step_acc=90.5%]
GSM8K eval: 70%|####### | 105/150 [05:30<02:04, 2.77s/q, correct=83/105, lccp=82.0%, score=0.914, step_acc=90.6%]
GSM8K eval: 71%|####### | 106/150 [05:31<01:44, 2.37s/q, correct=84/106, lccp=82.2%, score=0.914, step_acc=90.7%]
GSM8K eval: 71%|#######1 | 107/150 [05:33<01:29, 2.09s/q, correct=85/107, lccp=82.4%, score=0.915, step_acc=90.7%]
GSM8K eval: 72%|#######2 | 108/150 [05:35<01:33, 2.23s/q, correct=86/108, lccp=82.5%, score=0.916, step_acc=90.8%]
GSM8K eval: 73%|#######2 | 109/150 [05:40<02:02, 2.99s/q, correct=86/109, lccp=82.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 73%|#######3 | 110/150 [05:42<01:49, 2.73s/q, correct=87/110, lccp=82.3%, score=0.915, step_acc=90.9%]
GSM8K eval: 74%|#######4 | 111/150 [05:44<01:33, 2.39s/q, correct=88/111, lccp=82.4%, score=0.916, step_acc=90.9%]
GSM8K eval: 75%|#######4 | 112/150 [05:49<01:58, 3.13s/q, correct=88/112, lccp=82.6%, score=0.916, step_acc=91.0%]
GSM8K eval: 75%|#######5 | 113/150 [05:50<01:39, 2.70s/q, correct=89/113, lccp=82.7%, score=0.916, step_acc=91.1%]
GSM8K eval: 76%|#######6 | 114/150 [05:53<01:42, 2.86s/q, correct=89/114, lccp=82.3%, score=0.913, step_acc=90.6%]
GSM8K eval: 77%|#######6 | 115/150 [05:56<01:34, 2.69s/q, correct=90/115, lccp=82.5%, score=0.913, step_acc=90.7%]
GSM8K eval: 77%|#######7 | 116/150 [05:58<01:30, 2.68s/q, correct=91/116, lccp=82.7%, score=0.914, step_acc=90.8%]
GSM8K eval: 78%|#######8 | 117/150 [06:04<01:54, 3.45s/q, correct=92/117, lccp=82.8%, score=0.915, step_acc=90.9%]
GSM8K eval: 79%|#######8 | 118/150 [06:07<01:50, 3.46s/q, correct=92/118, lccp=82.1%, score=0.912, step_acc=90.6%]
GSM8K eval: 79%|#######9 | 119/150 [06:11<01:46, 3.45s/q, correct=92/119, lccp=82.2%, score=0.910, step_acc=90.7%]
GSM8K eval: 80%|######## | 120/150 [06:13<01:36, 3.22s/q, correct=93/120, lccp=82.4%, score=0.911, step_acc=90.8%]
GSM8K eval: 81%|######## | 121/150 [06:16<01:30, 3.12s/q, correct=94/121, lccp=82.5%, score=0.912, step_acc=90.9%]
GSM8K eval: 81%|########1 | 122/150 [06:19<01:25, 3.05s/q, correct=95/122, lccp=82.7%, score=0.912, step_acc=90.9%]
GSM8K eval: 82%|########2 | 123/150 [06:22<01:23, 3.09s/q, correct=95/123, lccp=82.3%, score=0.912, step_acc=90.8%]
GSM8K eval: 83%|########2 | 124/150 [06:24<01:12, 2.78s/q, correct=96/124, lccp=82.5%, score=0.913, step_acc=90.9%]
GSM8K eval: 83%|########3 | 125/150 [06:26<01:03, 2.53s/q, correct=97/125, lccp=82.6%, score=0.914, step_acc=91.0%]
GSM8K eval: 84%|########4 | 126/150 [06:29<01:01, 2.54s/q, correct=98/126, lccp=82.8%, score=0.914, step_acc=91.1%]
GSM8K eval: 85%|########4 | 127/150 [06:33<01:09, 3.04s/q, correct=99/127, lccp=82.9%, score=0.915, step_acc=91.1%]
GSM8K eval: 85%|########5 | 128/150 [06:36<01:05, 2.97s/q, correct=100/128, lccp=83.0%, score=0.916, step_acc=91.2%]
GSM8K eval: 86%|########6 | 129/150 [06:39<01:04, 3.09s/q, correct=101/129, lccp=83.2%, score=0.916, step_acc=91.3%]
GSM8K eval: 87%|########6 | 130/150 [06:41<00:53, 2.68s/q, correct=102/130, lccp=83.3%, score=0.917, step_acc=91.3%]
GSM8K eval: 87%|########7 | 131/150 [06:45<01:00, 3.20s/q, correct=103/131, lccp=83.4%, score=0.917, step_acc=91.4%]
GSM8K eval: 88%|########8 | 132/150 [06:47<00:48, 2.70s/q, correct=104/132, lccp=83.5%, score=0.918, step_acc=91.5%]
GSM8K eval: 89%|########8 | 133/150 [06:50<00:46, 2.71s/q, correct=105/133, lccp=83.7%, score=0.919, step_acc=91.5%]
GSM8K eval: 89%|########9 | 134/150 [06:54<00:50, 3.16s/q, correct=106/134, lccp=83.8%, score=0.919, step_acc=91.6%]
GSM8K eval: 90%|######### | 135/150 [06:57<00:46, 3.09s/q, correct=107/135, lccp=83.9%, score=0.920, step_acc=91.7%]
GSM8K eval: 91%|######### | 136/150 [07:01<00:48, 3.49s/q, correct=107/136, lccp=83.5%, score=0.918, step_acc=91.3%]
GSM8K eval: 91%|#########1| 137/150 [07:08<00:57, 4.39s/q, correct=108/137, lccp=83.7%, score=0.919, step_acc=91.4%]
GSM8K eval: 92%|#########2| 138/150 [07:11<00:50, 4.22s/q, correct=109/138, lccp=83.8%, score=0.919, step_acc=91.5%]
GSM8K eval: 93%|#########2| 139/150 [07:14<00:40, 3.64s/q, correct=110/139, lccp=83.9%, score=0.920, step_acc=91.5%]
GSM8K eval: 93%|#########3| 140/150 [07:18<00:37, 3.76s/q, correct=110/140, lccp=83.8%, score=0.916, step_acc=91.4%]
GSM8K eval: 94%|#########3| 141/150 [07:21<00:33, 3.74s/q, correct=111/141, lccp=83.9%, score=0.917, step_acc=91.4%]
GSM8K eval: 95%|#########4| 142/150 [07:25<00:28, 3.56s/q, correct=112/142, lccp=84.0%, score=0.918, step_acc=91.5%]
GSM8K eval: 95%|#########5| 143/150 [07:27<00:21, 3.13s/q, correct=113/143, lccp=84.1%, score=0.918, step_acc=91.5%]
GSM8K eval: 96%|#########6| 144/150 [07:29<00:17, 2.86s/q, correct=114/144, lccp=84.2%, score=0.919, step_acc=91.6%]
GSM8K eval: 97%|#########6| 145/150 [07:32<00:14, 2.89s/q, correct=114/145, lccp=83.6%, score=0.915, step_acc=91.1%]
GSM8K eval: 97%|#########7| 146/150 [07:35<00:11, 2.89s/q, correct=115/146, lccp=83.8%, score=0.916, step_acc=91.1%]
GSM8K eval: 98%|#########8| 147/150 [07:38<00:09, 3.06s/q, correct=116/147, lccp=83.9%, score=0.917, step_acc=91.2%]
GSM8K eval: 99%|#########8| 148/150 [07:42<00:06, 3.20s/q, correct=117/148, lccp=84.0%, score=0.917, step_acc=91.3%]
GSM8K eval: 99%|#########9| 149/150 [07:45<00:03, 3.24s/q, correct=118/149, lccp=84.1%, score=0.918, step_acc=91.3%]
GSM8K eval: 100%|##########| 150/150 [07:50<00:00, 3.65s/q, correct=118/150, lccp=83.9%, score=0.916, step_acc=91.1%]
GSM8K eval: 100%|##########| 150/150 [07:50<00:00, 3.14s/q, correct=118/150, lccp=83.9%, score=0.916, step_acc=91.1%]
+2026-04-26 03:36:31,226 INFO __main__ - Training Score [INITIAL (iter 0)]: 0.9162 | n=150
+2026-04-26 03:36:31,227 INFO __main__ - Components : 0.50×correct(78.7%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 03:36:31,227 INFO __main__ - Process score : prm_mean=0.899 prm_final=0.927 → weighted=0.916
+2026-04-26 03:36:31,227 INFO __main__ - Step accuracy : 91.1% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 03:36:31,227 INFO __main__ - Chain integrity (LCCP): 83.9% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 03:36:31,227 INFO __main__ - (debug) final-answer accuracy: 78.7%
+2026-04-26 03:36:31,227 INFO __main__ - ======================================================================
+2026-04-26 03:36:31,227 INFO __main__ - GRPO ITERATION 1/60
+2026-04-26 03:36:31,228 INFO __main__ - ======================================================================
+2026-04-26 03:36:31,246 INFO __main__ - LR this iteration: 5.00e-07 | T=0.800 | MATH ratio=30%
+
Iter 1 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:36:36,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:36,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.09(prox=0.09) + 0.40×proc(0.606[fin=0.74,mean=0.40]) + 0.10×fmt(1.000) | pred='2470' gold='390' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:36:36,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:36,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.913 = 0.50×0.85(prox=0.85) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='370' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:36,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:37,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:37,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=0.98,mean=0.95]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:37,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='390' gold='390' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 0%| | 0/20 [00:07, ?q/s, loss=-0.0005, mean_r=0.928, skip=0]
Iter 1 GRPO groups: 5%|5 | 1/20 [00:07<02:30, 7.94s/q, loss=-0.0005, mean_r=0.928, skip=0]2026-04-26 03:36:46,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,306 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:36:46,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:46,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:36:46,873 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:36:46,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 5%|5 | 1/20 [00:15<02:30, 7.94s/q, loss=0var, mean_r=0.986, skip=1]
Iter 1 GRPO groups: 10%|# | 2/20 [00:15<02:21, 7.84s/q, loss=0var, mean_r=0.986, skip=1]2026-04-26 03:36:53,060 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:36:53,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,308 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:53,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:36:53,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:36:53,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='340' gold='340' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 10%|# | 2/20 [00:22<02:21, 7.84s/q, loss=0var, mean_r=0.998, skip=2]
Iter 1 GRPO groups: 15%|#5 | 3/20 [00:22<02:05, 7.39s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 03:37:00,254 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.67(prox=0.67) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.685 = 0.50×0.50(prox=0.50) + 0.40×proc(0.837[fin=0.97,mean=0.63]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 03:37:00,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.50(prox=0.50) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:37:00,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:37:00,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:00,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:00,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.40(prox=0.40) + 0.40×proc(0.350[fin=0.28,mean=0.46]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:37:00,915 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:00,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 15%|#5 | 3/20 [00:31<02:05, 7.39s/q, loss=0.0006, mean_r=0.866, skip=2]
Iter 1 GRPO groups: 20%|## | 4/20 [00:31<02:06, 7.88s/q, loss=0.0006, mean_r=0.866, skip=2]2026-04-26 03:37:06,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,478 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.60(prox=0.60) + 0.40×proc(0.674[fin=0.80,mean=0.48]) + 0.10×fmt(1.000) | pred='240' gold='360' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:06,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.712 = 0.50×0.60(prox=0.60) + 0.40×proc(0.780[fin=0.93,mean=0.55]) + 0.10×fmt(1.000) | pred='240' gold='360' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:37:06,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:06,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:07,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:07,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='360' gold='360' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 20%|## | 4/20 [00:37<02:06, 7.88s/q, loss=0.0003, mean_r=0.938, skip=2]
Iter 1 GRPO groups: 25%|##5 | 5/20 [00:37<01:48, 7.24s/q, loss=0.0003, mean_r=0.938, skip=2]2026-04-26 03:37:13,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.55(prox=0.55) + 0.40×proc(0.262[fin=0.00,mean=0.65]) + 0.10×fmt(1.000) | pred='754' gold='1274' | step_acc=67% lccp=67% (chain=2/3 ok_count=2) n_steps=3
+2026-04-26 03:37:13,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.24(prox=0.24) + 0.40×proc(0.695[fin=0.79,mean=0.56]) + 0.10×fmt(1.000) | pred='-726' gold='1274' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:37:13,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:13,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1274' gold='1274' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 25%|##5 | 5/20 [00:43<01:48, 7.24s/q, loss=0.0012, mean_r=0.899, skip=2]
Iter 1 GRPO groups: 30%|### | 6/20 [00:43<01:38, 7.03s/q, loss=0.0012, mean_r=0.899, skip=2]2026-04-26 03:37:20,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.76(prox=0.76) + 0.40×proc(0.631[fin=0.73,mean=0.48]) + 0.10×fmt(1.000) | pred='11' gold='13' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 03:37:20,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.862[fin=0.91,mean=0.80]) + 0.10×fmt(1.000) | pred='22' gold='13' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:37:20,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.729 = 0.50×0.68(prox=0.68) + 0.40×proc(0.716[fin=0.81,mean=0.58]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:21,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.635 = 0.50×0.68(prox=0.68) + 0.40×proc(0.482[fin=0.43,mean=0.56]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:37:21,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.76(prox=0.76) + 0.40×proc(0.622[fin=0.80,mean=0.35]) + 0.10×fmt(1.000) | pred='11' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:21,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.916 = 0.50×1.00(exact) + 0.40×proc(0.791[fin=0.96,mean=0.53]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:21,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.68(prox=0.68) + 0.40×proc(0.366[fin=0.28,mean=0.50]) + 0.10×fmt(1.000) | pred='10' gold='13' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:37:21,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.827[fin=1.00,mean=0.57]) + 0.10×fmt(1.000) | pred='24' gold='13' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:37:21,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.45(prox=0.45) + 0.40×proc(0.254[fin=0.14,mean=0.42]) + 0.10×fmt(1.000) | pred='5' gold='13' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:21,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.901 = 0.50×1.00(exact) + 0.40×proc(0.753[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 1 GRPO groups: 30%|### | 6/20 [00:51<01:38, 7.03s/q, loss=0.0000, mean_r=0.680, skip=2]
Iter 1 GRPO groups: 35%|###5 | 7/20 [00:51<01:34, 7.30s/q, loss=0.0000, mean_r=0.680, skip=2]2026-04-26 03:37:25,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,306 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:26,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='153' gold='153' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 35%|###5 | 7/20 [00:55<01:34, 7.30s/q, loss=0var, mean_r=0.998, skip=3]
Iter 1 GRPO groups: 40%|#### | 8/20 [00:55<01:13, 6.15s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 03:37:31,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.656 = 0.50×0.50(prox=0.50) + 0.40×proc(0.764[fin=0.87,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='140' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:37:31,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.341 = 0.50×0.25(prox=0.25) + 0.40×proc(0.289[fin=0.32,mean=0.24]) + 0.10×fmt(1.000) | pred='-70' gold='140' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:37:31,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:31,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:32,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 40%|#### | 8/20 [01:02<01:13, 6.15s/q, loss=-0.0002, mean_r=0.898, skip=3]
Iter 1 GRPO groups: 45%|####5 | 9/20 [01:02<01:09, 6.33s/q, loss=-0.0002, mean_r=0.898, skip=3]2026-04-26 03:37:40,582 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.73(prox=0.73) + 0.40×proc(0.306[fin=0.10,mean=0.62]) + 0.10×fmt(1.000) | pred='885' gold='1085' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:37:40,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:37:40,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:40,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:40,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.630 = 0.50×0.46(prox=0.46) + 0.40×proc(0.751[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='445' gold='1085' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:37:41,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:41,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.81(prox=0.81) + 0.40×proc(0.786[fin=0.96,mean=0.52]) + 0.10×fmt(1.000) | pred='960' gold='1085' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:37:41,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:37:41,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:37:41,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1085' gold='1085' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 1 GRPO groups: 45%|####5 | 9/20 [01:11<01:09, 6.33s/q, loss=0.0013, mean_r=0.902, skip=3]
Iter 1 GRPO groups: 50%|##### | 10/20 [01:11<01:12, 7.26s/q, loss=0.0013, mean_r=0.902, skip=3]2026-04-26 03:37:45,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:45,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:45,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 50%|##### | 10/20 [01:14<01:12, 7.26s/q, loss=0var, mean_r=0.998, skip=4]
Iter 1 GRPO groups: 55%|#####5 | 11/20 [01:14<00:53, 5.97s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 03:37:49,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:49,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:50,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:50,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='$2400' gold='2400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 55%|#####5 | 11/20 [01:19<00:53, 5.97s/q, loss=0var, mean_r=0.999, skip=5]
Iter 1 GRPO groups: 60%|###### | 12/20 [01:19<00:44, 5.62s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 03:37:54,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=1.00,mean=0.75]) + 0.10×fmt(0.650) | pred='78' gold='78' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:37:54,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×1.00(exact) + 0.40×proc(0.577[fin=0.75,mean=0.31]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:37:54,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.35(prox=0.35) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='150' gold='78' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:37:54,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.308 = 0.50×0.00(prox=0.00) + 0.40×proc(0.425[fin=0.46,mean=0.37]) + 0.10×fmt(1.000) | pred='475 5/6' gold='78' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:37:54,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:37:55,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.608 = 0.50×0.85(prox=0.85) + 0.40×proc(0.208[fin=0.06,mean=0.43]) + 0.10×fmt(1.000) | pred='84' gold='78' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:37:55,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:37:55,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='78' gold='78' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 60%|###### | 12/20 [01:25<00:44, 5.62s/q, loss=0.0008, mean_r=0.829, skip=5]
Iter 1 GRPO groups: 65%|######5 | 13/20 [01:25<00:40, 5.77s/q, loss=0.0008, mean_r=0.829, skip=5]2026-04-26 03:38:00,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:01,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='890' gold='890' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 65%|######5 | 13/20 [01:29<00:40, 5.77s/q, loss=0var, mean_r=0.999, skip=6]
Iter 1 GRPO groups: 70%|####### | 14/20 [01:29<00:31, 5.32s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 03:38:06,665 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:06,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:38:06,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:06,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.929 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:06,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.372 = 0.50×0.48(prox=0.48) + 0.40×proc(0.076[fin=0.10,mean=0.04]) + 0.10×fmt(1.000) | pred='2993' gold='6435' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:07,076 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.914 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:38:07,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:07,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=0.99,mean=0.85]) + 0.10×fmt(0.650) | pred='6435' gold='6435' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 1 GRPO groups: 70%|####### | 14/20 [01:37<00:31, 5.32s/q, loss=-0.0011, mean_r=0.894, skip=6]
Iter 1 GRPO groups: 75%|#######5 | 15/20 [01:37<00:30, 6.04s/q, loss=-0.0011, mean_r=0.894, skip=6]2026-04-26 03:38:12,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(1.000) | pred='120' gold='480' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:12,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:12,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.50(prox=0.50) + 0.40×proc(0.376[fin=0.42,mean=0.32]) + 0.10×fmt(1.000) | pred='240' gold='480' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:12,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.722 = 0.50×0.50(prox=0.50) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='240' gold='480' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:12,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.303 = 0.50×0.34(prox=0.34) + 0.40×proc(0.082[fin=0.01,mean=0.18]) + 0.10×fmt(1.000) | pred='16' gold='480' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:38:12,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1920' gold='480' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:13,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.33(prox=0.33) + 0.40×proc(0.574[fin=0.67,mean=0.43]) + 0.10×fmt(1.000) | pred='960' gold='480' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 1 GRPO groups: 75%|#######5 | 15/20 [01:43<00:30, 6.04s/q, loss=0.0027, mean_r=0.711, skip=6]
Iter 1 GRPO groups: 80%|######## | 16/20 [01:43<00:23, 5.96s/q, loss=0.0027, mean_r=0.711, skip=6]2026-04-26 03:38:21,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.832 = 0.50×0.71(prox=0.71) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.71(prox=0.71) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.71(prox=0.71) + 0.40×proc(0.885[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,382 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.905[fin=0.96,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='400' | step_acc=89% lccp=22% (chain=2/9 ok_count=8) n_steps=9
+2026-04-26 03:38:21,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.782 = 0.50×0.71(prox=0.71) + 0.40×proc(0.812[fin=0.99,mean=0.54]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:38:21,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.71(prox=0.71) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.840 = 0.50×0.71(prox=0.71) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:21,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.71(prox=0.71) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:21,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.71(prox=0.71) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:38:21,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.839 = 0.50×0.71(prox=0.71) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='320' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 1 GRPO groups: 80%|######## | 16/20 [01:52<00:23, 5.96s/q, loss=-0.0013, mean_r=0.797, skip=6]
Iter 1 GRPO groups: 85%|########5 | 17/20 [01:52<00:20, 6.80s/q, loss=-0.0013, mean_r=0.797, skip=6]2026-04-26 03:38:25,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:25,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:26,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:26,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 1 GRPO groups: 85%|########5 | 17/20 [01:55<00:20, 6.80s/q, loss=0var, mean_r=0.994, skip=7]
Iter 1 GRPO groups: 90%|######### | 18/20 [01:55<00:11, 5.69s/q, loss=0var, mean_r=0.994, skip=7]2026-04-26 03:38:30,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:30,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:30,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:30,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:30,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:31,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:31,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 1 GRPO groups: 90%|######### | 18/20 [01:59<00:11, 5.69s/q, loss=0var, mean_r=0.997, skip=8]
Iter 1 GRPO groups: 95%|#########5| 19/20 [01:59<00:05, 5.40s/q, loss=0var, mean_r=0.997, skip=8]2026-04-26 03:38:36,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:36,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:36,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.761 = 0.50×0.60(prox=0.60) + 0.40×proc(0.901[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:38:36,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:36,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:37,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:37,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:38:37,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:38:37,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:37,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 1 GRPO groups: 95%|#########5| 19/20 [02:07<00:05, 5.40s/q, loss=0.0036, mean_r=0.976, skip=8]
Iter 1 GRPO groups: 100%|##########| 20/20 [02:07<00:00, 6.09s/q, loss=0.0036, mean_r=0.976, skip=8]
Iter 1 GRPO groups: 100%|##########| 20/20 [02:07<00:00, 6.38s/q, loss=0.0036, mean_r=0.976, skip=8]
+2026-04-26 03:38:38,866 INFO __main__ - Iter 1 | loss=0.0006 | reward mean=0.914 std=0.164 | gt_match=78.0% | grounded_acc=96.0% | step_acc=89.5% | lccp=81.4% | batch_acc=96.0% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=1.06e-06 | 127.6s
+2026-04-26 03:38:38,866 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=96.0% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:38:38,867 INFO __main__ - ======================================================================
+2026-04-26 03:38:38,867 INFO __main__ - GRPO ITERATION 2/60
+2026-04-26 03:38:38,867 INFO __main__ - ======================================================================
+2026-04-26 03:38:38,884 INFO __main__ - LR this iteration: 1.06e-06 | T=0.793 | MATH ratio=30%
+
Iter 2 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:38:43,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:38:43,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.769 = 0.50×0.62(prox=0.62) + 0.40×proc(0.892[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='13' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:38:43,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.62(prox=0.62) + 0.40×proc(0.844[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='13' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:38:43,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:43,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.71(prox=0.71) + 0.40×proc(0.218[fin=0.05,mean=0.47]) + 0.10×fmt(1.000) | pred='12' gold='10' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:38:44,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:44,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:44,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0.0004, mean_r=0.906, skip=0]
Iter 2 GRPO groups: 5%|5 | 1/20 [00:06<02:07, 6.72s/q, loss=0.0004, mean_r=0.906, skip=0]2026-04-26 03:38:49,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:50,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:38:50,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:38:50,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='193' gold='193' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:50,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.20(prox=0.20) + 0.40×proc(0.633[fin=0.55,mean=0.76]) + 0.10×fmt(1.000) | pred='-193' gold='193' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 2 GRPO groups: 5%|5 | 1/20 [00:13<02:07, 6.72s/q, loss=-0.0008, mean_r=0.945, skip=0]
Iter 2 GRPO groups: 10%|# | 2/20 [00:13<01:58, 6.58s/q, loss=-0.0008, mean_r=0.945, skip=0]2026-04-26 03:38:59,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2,870' gold='2870' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:38:59,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:59,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:38:59,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:38:59,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.54(prox=0.54) + 0.40×proc(0.945[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='1670' gold='2870' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:00,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.570 = 0.50×0.38(prox=0.38) + 0.40×proc(0.697[fin=0.90,mean=0.39]) + 0.10×fmt(1.000) | pred='550' gold='2870' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 03:39:00,114 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:39:00,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.859 = 0.50×0.85(prox=0.85) + 0.40×proc(0.840[fin=0.94,mean=0.69]) + 0.10×fmt(1.000) | pred='3130' gold='2870' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:39:00,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:00,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.59(prox=0.59) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1870' gold='2870' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 10%|# | 2/20 [00:22<01:58, 6.58s/q, loss=0.0006, mean_r=0.894, skip=0]
Iter 2 GRPO groups: 15%|#5 | 3/20 [00:22<02:16, 8.02s/q, loss=0.0006, mean_r=0.894, skip=0]2026-04-26 03:39:08,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.94,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:08,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.67(prox=0.67) + 0.40×proc(0.894[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.525 = 0.50×0.00(prox=0.00) + 0.40×proc(0.875[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='5/2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:08,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:39:08,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.466 = 0.50×0.00(prox=0.00) + 0.40×proc(0.822[fin=0.98,mean=0.59]) + 0.10×fmt(1.000) | pred='14/3' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:39:09,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.00(prox=0.00) + 0.40×proc(0.911[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='11/6' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:39:09,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 2 GRPO groups: 15%|#5 | 3/20 [00:31<02:16, 8.02s/q, loss=0.0004, mean_r=0.820, skip=0]
Iter 2 GRPO groups: 20%|## | 4/20 [00:31<02:12, 8.29s/q, loss=0.0004, mean_r=0.820, skip=0]2026-04-26 03:39:14,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:14,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.35(prox=0.35) + 0.40×proc(0.205[fin=0.02,mean=0.49]) + 0.10×fmt(1.000) | pred='3' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:39:14,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:39:14,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.668[fin=0.78,mean=0.50]) + 0.10×fmt(1.000) | pred='72' gold='45' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:39:14,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:39:14,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.595 = 0.50×0.43(prox=0.43) + 0.40×proc(0.702[fin=0.93,mean=0.36]) + 0.10×fmt(1.000) | pred='15' gold='45' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 03:39:14,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.23(prox=0.23) + 0.40×proc(0.818[fin=0.95,mean=0.62]) + 0.10×fmt(1.000) | pred='120' gold='45' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:39:14,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:14,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.56(prox=0.56) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='27' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:14,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.411 = 0.50×0.41(prox=0.41) + 0.40×proc(0.168[fin=0.08,mean=0.31]) + 0.10×fmt(1.000) | pred='13' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 2 GRPO groups: 20%|## | 4/20 [00:37<02:12, 8.29s/q, loss=-0.0006, mean_r=0.722, skip=0]
Iter 2 GRPO groups: 25%|##5 | 5/20 [00:37<01:50, 7.36s/q, loss=-0.0006, mean_r=0.722, skip=0]2026-04-26 03:39:49,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.601 = 0.50×0.75(prox=0.75) + 0.40×proc(0.315[fin=0.17,mean=0.54]) + 0.10×fmt(1.000) | pred='5' gold='6' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 03:39:50,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.75(prox=0.75) + 0.40×proc(0.872[fin=0.93,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 03:39:50,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.564 = 0.50×0.33(prox=0.33) + 0.40×proc(0.743[fin=0.77,mean=0.70]) + 0.10×fmt(1.000) | pred='12' gold='6' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:39:50,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.75(prox=0.75) + 0.40×proc(0.701[fin=0.77,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:39:50,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:50,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.932 = 0.50×1.00(exact) + 0.40×proc(0.830[fin=0.97,mean=0.62]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:39:50,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×1.00(exact) + 0.40×proc(0.244[fin=0.01,mean=0.59]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:39:50,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.75(prox=0.75) + 0.40×proc(0.378[fin=0.34,mean=0.43]) + 0.10×fmt(1.000) | pred='5' gold='6' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 2 GRPO groups: 25%|##5 | 5/20 [01:12<01:50, 7.36s/q, loss=0.0004, mean_r=0.748, skip=0]
Iter 2 GRPO groups: 30%|### | 6/20 [01:12<03:57, 16.97s/q, loss=0.0004, mean_r=0.748, skip=0]2026-04-26 03:39:58,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,616 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×0.85(prox=0.85) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7.60' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:39:58,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:58,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×0.85(prox=0.85) + 0.40×proc(0.983[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='7.6' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:39:59,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:59,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:39:59,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 2 GRPO groups: 30%|### | 6/20 [01:21<03:57, 16.97s/q, loss=0.0005, mean_r=0.984, skip=0]
Iter 2 GRPO groups: 35%|###5 | 7/20 [01:21<03:05, 14.29s/q, loss=0.0005, mean_r=0.984, skip=0]2026-04-26 03:40:20,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 03:40:20,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.430 = 0.50×0.15(prox=0.15) + 0.40×proc(0.354[fin=0.09,mean=0.75]) + 0.10×fmt(1.000) | pred='290' gold='-150' | step_acc=77% lccp=77% (chain=10/13 ok_count=10) n_steps=13
+2026-04-26 03:40:20,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='150' gold='-150' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:40:20,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.389 = 0.50×0.33(prox=0.33) + 0.40×proc(0.253[fin=0.21,mean=0.32]) + 0.10×fmt(1.000) | pred='0' gold='-150' | step_acc=14% lccp=14% (chain=1/7 ok_count=1) n_steps=7
+2026-04-26 03:40:20,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.309 = 0.50×0.00(prox=0.00) + 0.40×proc(0.288[fin=0.03,mean=0.68]) + 0.10×fmt(1.000) | pred='78680' gold='-150' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 03:40:20,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:21,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-150' gold='-150' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:21,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.907[fin=0.93,mean=0.87]) + 0.10×fmt(1.000) | pred='0' gold='-150' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 03:40:21,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.02(prox=0.02) + 0.40×proc(0.756[fin=0.87,mean=0.58]) + 0.10×fmt(1.000) | pred='3975' gold='-150' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:40:21,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.02(prox=0.02) + 0.40×proc(0.927[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='4000' gold='-150' | step_acc=90% lccp=0% (chain=0/10 ok_count=9) n_steps=10
+
Iter 2 GRPO groups: 35%|###5 | 7/20 [01:43<03:05, 14.29s/q, loss=-0.0016, mean_r=0.610, skip=0]
Iter 2 GRPO groups: 40%|#### | 8/20 [01:43<03:21, 16.79s/q, loss=-0.0016, mean_r=0.610, skip=0]2026-04-26 03:40:25,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:25,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:26,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 40%|#### | 8/20 [01:47<03:21, 16.79s/q, loss=0var, mean_r=1.000, skip=1]
Iter 2 GRPO groups: 45%|####5 | 9/20 [01:47<02:18, 12.60s/q, loss=0var, mean_r=1.000, skip=1]2026-04-26 03:40:34,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:34,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:34,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.366 = 0.50×0.16(prox=0.16) + 0.40×proc(0.276[fin=0.12,mean=0.51]) + 0.10×fmt(1.000) | pred='453' gold='126' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:40:35,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.486 = 0.50×0.72(prox=0.72) + 0.40×proc(0.071[fin=0.05,mean=0.11]) + 0.10×fmt(1.000) | pred='151' gold='126' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:40:35,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.60(prox=0.60) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='168' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:35,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:40:35,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 45%|####5 | 9/20 [01:58<02:18, 12.60s/q, loss=0.0001, mean_r=0.854, skip=1]
Iter 2 GRPO groups: 50%|##### | 10/20 [01:58<02:00, 12.07s/q, loss=0.0001, mean_r=0.854, skip=1]2026-04-26 03:40:41,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.840 = 0.50×0.71(prox=0.71) + 0.40×proc(0.958[fin=0.97,mean=0.94]) + 0.10×fmt(1.000) | pred='200' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:41,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:40:41,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:41,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:41,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:42,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:42,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:40:42,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:40:42,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.831[fin=0.95,mean=0.66]) + 0.10×fmt(1.000) | pred='750' gold='250' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:40:42,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.56(prox=0.56) + 0.40×proc(0.248[fin=0.15,mean=0.39]) + 0.10×fmt(1.000) | pred='150' gold='250' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 2 GRPO groups: 50%|##### | 10/20 [02:04<02:00, 12.07s/q, loss=0.0008, mean_r=0.876, skip=1]
Iter 2 GRPO groups: 55%|#####5 | 11/20 [02:04<01:33, 10.42s/q, loss=0.0008, mean_r=0.876, skip=1]2026-04-26 03:40:48,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.295 = 0.50×0.14(prox=0.14) + 0.40×proc(0.216[fin=0.15,mean=0.32]) + 0.10×fmt(1.000) | pred='108' gold='27' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:40:48,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.521 = 0.50×0.08(prox=0.08) + 0.40×proc(0.833[fin=0.92,mean=0.70]) + 0.10×fmt(1.000) | pred='192' gold='27' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:40:48,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.07(prox=0.07) + 0.40×proc(0.847[fin=0.99,mean=0.63]) + 0.10×fmt(1.000) | pred='216' gold='27' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:48,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.842[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='64' gold='27' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:40:48,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.36(prox=0.36) + 0.40×proc(0.506[fin=0.52,mean=0.49]) + 0.10×fmt(1.000) | pred='3.375' gold='27' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:40:48,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 55%|#####5 | 11/20 [02:11<01:33, 10.42s/q, loss=-0.0015, mean_r=0.736, skip=1]
Iter 2 GRPO groups: 60%|###### | 12/20 [02:11<01:13, 9.22s/q, loss=-0.0015, mean_r=0.736, skip=1]2026-04-26 03:40:54,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.173 = 0.50×0.07(prox=0.07) + 0.40×proc(0.093[fin=0.09,mean=0.09]) + 0.10×fmt(1.000) | pred='300' gold='40' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:40:54,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:40:54,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 2 GRPO groups: 60%|###### | 12/20 [02:17<01:13, 9.22s/q, loss=0.0003, mean_r=0.917, skip=1]
Iter 2 GRPO groups: 65%|######5 | 13/20 [02:17<00:58, 8.30s/q, loss=0.0003, mean_r=0.917, skip=1]2026-04-26 03:41:03,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:41:03,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.65(prox=0.65) + 0.40×proc(0.281[fin=0.02,mean=0.68]) + 0.10×fmt(1.000) | pred='11' gold='15' | step_acc=70% lccp=10% (chain=1/10 ok_count=7) n_steps=10
+2026-04-26 03:41:03,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:41:03,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:04,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:41:04,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:41:04,312 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:04,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 2 GRPO groups: 65%|######5 | 13/20 [02:27<00:58, 8.30s/q, loss=0.0011, mean_r=0.953, skip=1]
Iter 2 GRPO groups: 70%|####### | 14/20 [02:27<00:51, 8.67s/q, loss=0.0011, mean_r=0.953, skip=1]2026-04-26 03:41:09,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.881 = 0.50×0.76(prox=0.76) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:09,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:09,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:09,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:41:09,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:10,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 70%|####### | 14/20 [02:32<00:51, 8.67s/q, loss=0.0001, mean_r=0.984, skip=1]
Iter 2 GRPO groups: 75%|#######5 | 15/20 [02:32<00:39, 7.83s/q, loss=0.0001, mean_r=0.984, skip=1]2026-04-26 03:41:30,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='154' gold='154' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:41:30,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.901[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='66' gold='154' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:41:30,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.858[fin=0.94,mean=0.73]) + 0.10×fmt(1.000) | pred='25' gold='154' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 03:41:30,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.945[fin=0.93,mean=0.97]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:31,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.972[fin=0.99,mean=0.95]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:31,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×0.85(prox=0.85) + 0.40×proc(0.849[fin=0.87,mean=0.81]) + 0.10×fmt(1.000) | pred='158' gold='154' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:41:31,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.00(prox=0.00) + 0.40×proc(0.710[fin=0.65,mean=0.80]) + 0.10×fmt(1.000) | pred='84 + 285i' gold='154' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:41:31,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='154' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:41:31,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.676[fin=0.79,mean=0.51]) + 0.10×fmt(1.000) | pred='338' gold='154' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:41:31,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.00(prox=0.00) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(0.700) | pred='' gold='154' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+
Iter 2 GRPO groups: 75%|#######5 | 15/20 [02:54<00:39, 7.83s/q, loss=-0.0003, mean_r=0.613, skip=1]
Iter 2 GRPO groups: 80%|######## | 16/20 [02:54<00:47, 11.85s/q, loss=-0.0003, mean_r=0.613, skip=1]2026-04-26 03:41:39,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:39,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.532 = 0.50×0.34(prox=0.34) + 0.40×proc(0.339[fin=0.02,mean=0.82]) + 0.10×fmt(1.000) | pred='1' gold='23' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 03:41:39,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:39,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:40,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 2 GRPO groups: 80%|######## | 16/20 [03:02<00:47, 11.85s/q, loss=-0.0009, mean_r=0.951, skip=1]
Iter 2 GRPO groups: 85%|########5 | 17/20 [03:02<00:32, 10.90s/q, loss=-0.0009, mean_r=0.951, skip=1]2026-04-26 03:41:43,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,467 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:41:44,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 2 GRPO groups: 85%|########5 | 17/20 [03:05<00:32, 10.90s/q, loss=0var, mean_r=0.999, skip=2]
Iter 2 GRPO groups: 90%|######### | 18/20 [03:05<00:17, 8.54s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 03:41:49,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=0.98,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.15(prox=0.15) + 0.40×proc(0.966[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 03:41:49,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:49,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.15(prox=0.15) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:49,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='30' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:49,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:41:50,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:41:50,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.494 = 0.50×0.15(prox=0.15) + 0.40×proc(0.570[fin=0.47,mean=0.71]) + 0.10×fmt(1.000) | pred='68' gold='18' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+
Iter 2 GRPO groups: 90%|######### | 18/20 [03:12<00:17, 8.54s/q, loss=0.0016, mean_r=0.808, skip=2]
Iter 2 GRPO groups: 95%|#########5| 19/20 [03:12<00:08, 8.03s/q, loss=0.0016, mean_r=0.808, skip=2]2026-04-26 03:41:56,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.604 = 0.50×0.50(prox=0.50) + 0.40×proc(0.635[fin=0.73,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.580 = 0.50×0.50(prox=0.50) + 0.40×proc(0.576[fin=0.77,mean=0.28]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.582 = 0.50×0.50(prox=0.50) + 0.40×proc(0.581[fin=0.77,mean=0.30]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.665 = 0.50×0.50(prox=0.50) + 0.40×proc(0.787[fin=0.98,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.33(prox=0.33) + 0.40×proc(0.762[fin=0.92,mean=0.52]) + 0.10×fmt(1.000) | pred='8' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.771[fin=0.97,mean=0.47]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.616 = 0.50×0.50(prox=0.50) + 0.40×proc(0.665[fin=0.84,mean=0.40]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:41:56,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.652 = 0.50×0.50(prox=0.50) + 0.40×proc(0.755[fin=0.93,mean=0.49]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.639 = 0.50×0.50(prox=0.50) + 0.40×proc(0.723[fin=0.91,mean=0.44]) + 0.10×fmt(1.000) | pred='6' gold='4' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:41:56,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.366 = 0.50×0.05(prox=0.05) + 0.40×proc(0.530[fin=0.65,mean=0.35]) + 0.10×fmt(1.000) | pred='44' gold='4' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 2 GRPO groups: 95%|#########5| 19/20 [03:19<00:08, 8.03s/q, loss=-0.0012, mean_r=0.618, skip=2]
Iter 2 GRPO groups: 100%|##########| 20/20 [03:19<00:00, 7.67s/q, loss=-0.0012, mean_r=0.618, skip=2]
Iter 2 GRPO groups: 100%|##########| 20/20 [03:19<00:00, 9.97s/q, loss=-0.0012, mean_r=0.618, skip=2]
+2026-04-26 03:41:58,386 INFO __main__ - Iter 2 | loss=-0.0000 | reward mean=0.848 std=0.216 | gt_match=65.2% | grounded_acc=91.4% | step_acc=86.7% | lccp=76.5% | batch_acc=91.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=18 skipped=2(0var=2) | lr=1.63e-06 | 199.5s
+2026-04-26 03:41:58,386 INFO __main__ - ======================================================================
+2026-04-26 03:41:58,386 INFO __main__ - GRPO ITERATION 3/60
+2026-04-26 03:41:58,386 INFO __main__ - ======================================================================
+2026-04-26 03:41:58,404 INFO __main__ - LR this iteration: 1.63e-06 | T=0.786 | MATH ratio=30%
+
Iter 3 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:42:02,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.56(prox=0.56) + 0.40×proc(0.197[fin=0.09,mean=0.36]) + 0.10×fmt(1.000) | pred='56' gold='92' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:42:02,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:42:02,167 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.56(prox=0.56) + 0.40×proc(0.206[fin=0.10,mean=0.37]) + 0.10×fmt(1.000) | pred='56' gold='92' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:42:02,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:02,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='92' gold='92' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 0%| | 0/20 [00:05, ?q/s, loss=0.0026, mean_r=0.883, skip=0]
Iter 3 GRPO groups: 5%|5 | 1/20 [00:05<01:48, 5.72s/q, loss=0.0026, mean_r=0.883, skip=0]2026-04-26 03:42:37,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:37,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.56(prox=0.56) + 0.40×proc(0.847[fin=0.89,mean=0.78]) + 0.10×fmt(1.000) | pred='15' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:42:37,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:42:38,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:38,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.38(prox=0.38) + 0.40×proc(0.221[fin=0.01,mean=0.54]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:42:38,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.306 = 0.50×0.38(prox=0.38) + 0.40×proc(0.035[fin=0.03,mean=0.04]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:42:38,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:38,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.548 = 0.50×0.56(prox=0.56) + 0.40×proc(0.426[fin=0.41,mean=0.45]) + 0.10×fmt(1.000) | pred='15' gold='25' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 03:42:38,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 3 GRPO groups: 5%|5 | 1/20 [00:41<01:48, 5.72s/q, loss=-0.0003, mean_r=0.776, skip=0]
Iter 3 GRPO groups: 10%|# | 2/20 [00:41<07:00, 23.33s/q, loss=-0.0003, mean_r=0.776, skip=0]2026-04-26 03:42:43,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:43,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.67(prox=0.67) + 0.40×proc(0.777[fin=0.95,mean=0.52]) + 0.10×fmt(1.000) | pred='15' gold='12' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:42:43,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:43,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.686 = 0.50×0.50(prox=0.50) + 0.40×proc(0.839[fin=0.97,mean=0.64]) + 0.10×fmt(1.000) | pred='6' gold='12' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:43,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.942[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:43,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 10%|# | 2/20 [00:46<07:00, 23.33s/q, loss=0.0006, mean_r=0.761, skip=0]
Iter 3 GRPO groups: 15%|#5 | 3/20 [00:46<04:17, 15.14s/q, loss=0.0006, mean_r=0.761, skip=0]2026-04-26 03:42:47,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:47,860 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:47,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:42:48,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 15%|#5 | 3/20 [00:50<04:17, 15.14s/q, loss=0var, mean_r=0.998, skip=1]
Iter 3 GRPO groups: 20%|## | 4/20 [00:50<02:47, 10.47s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:42:53,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×0.74(prox=0.74) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='46' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:53,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.844 = 0.50×0.76(prox=0.76) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='65' gold='56' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:42:53,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:42:53,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.364[fin=0.18,mean=0.64]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 03:42:53,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:54,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.672 = 0.50×0.40(prox=0.40) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:54,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.82(prox=0.82) + 0.40×proc(0.748[fin=0.96,mean=0.43]) + 0.10×fmt(1.000) | pred='62' gold='56' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:42:54,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.725 = 0.50×0.50(prox=0.50) + 0.40×proc(0.936[fin=0.99,mean=0.85]) + 0.10×fmt(1.000) | pred='28' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:54,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.67(prox=0.67) + 0.40×proc(0.886[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='42' gold='56' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:54,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='14' gold='56' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 20%|## | 4/20 [00:57<02:47, 10.47s/q, loss=0.0008, mean_r=0.780, skip=1]
Iter 3 GRPO groups: 25%|##5 | 5/20 [00:57<02:19, 9.32s/q, loss=0.0008, mean_r=0.780, skip=1]2026-04-26 03:42:58,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:58,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:58,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:42:59,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:42:59,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 25%|##5 | 5/20 [01:01<02:19, 9.32s/q, loss=0var, mean_r=0.972, skip=2]
Iter 3 GRPO groups: 30%|### | 6/20 [01:01<01:43, 7.39s/q, loss=0var, mean_r=0.972, skip=2]2026-04-26 03:43:08,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.732 = 0.50×0.50(prox=0.50) + 0.40×proc(0.954[fin=0.94,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.719 = 0.50×0.50(prox=0.50) + 0.40×proc(0.922[fin=0.89,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:08,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:43:08,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:43:09,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 3 GRPO groups: 30%|### | 6/20 [01:12<01:43, 7.39s/q, loss=0.0005, mean_r=0.944, skip=2]
Iter 3 GRPO groups: 35%|###5 | 7/20 [01:12<01:52, 8.69s/q, loss=0.0005, mean_r=0.944, skip=2]2026-04-26 03:43:44,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:43:44,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:43:44,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,616 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.276 = 0.50×0.03(prox=0.03) + 0.40×proc(0.212[fin=0.03,mean=0.49]) + 0.10×fmt(1.000) | pred='190' gold='12' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:43:44,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:44,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:45,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 35%|###5 | 7/20 [01:47<01:52, 8.69s/q, loss=-0.0011, mean_r=0.901, skip=2]
Iter 3 GRPO groups: 40%|#### | 8/20 [01:47<03:27, 17.25s/q, loss=-0.0011, mean_r=0.901, skip=2]2026-04-26 03:43:52,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.85(prox=0.85) + 0.40×proc(0.638[fin=0.74,mean=0.49]) + 0.10×fmt(1.000) | pred='1980' gold='2020' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:43:52,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×0.75(prox=0.75) + 0.40×proc(0.561[fin=0.60,mean=0.50]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:43:52,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.851 = 0.50×0.81(prox=0.81) + 0.40×proc(0.868[fin=0.98,mean=0.69]) + 0.10×fmt(1.000) | pred='1780' gold='2020' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:43:52,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.75(prox=0.75) + 0.40×proc(0.853[fin=0.96,mean=0.69]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:43:53,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.864[fin=0.89,mean=0.83]) + 0.10×fmt(1.000) | pred='2180' gold='2020' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:43:53,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.823 = 0.50×0.81(prox=0.81) + 0.40×proc(0.796[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='1780' gold='2020' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 03:43:53,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.741[fin=0.79,mean=0.67]) + 0.10×fmt(1.000) | pred='3428' gold='2020' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:43:53,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.77(prox=0.77) + 0.40×proc(0.715[fin=0.85,mean=0.51]) + 0.10×fmt(1.000) | pred='1720' gold='2020' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 03:43:53,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.697 = 0.50×0.56(prox=0.56) + 0.40×proc(0.796[fin=0.81,mean=0.78]) + 0.10×fmt(1.000) | pred='1220' gold='2020' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:43:53,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.775 = 0.50×0.75(prox=0.75) + 0.40×proc(0.752[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='1680' gold='2020' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 3 GRPO groups: 40%|#### | 8/20 [01:56<03:27, 17.25s/q, loss=-0.0006, mean_r=0.763, skip=2]
Iter 3 GRPO groups: 45%|####5 | 9/20 [01:56<02:39, 14.52s/q, loss=-0.0006, mean_r=0.763, skip=2]2026-04-26 03:43:58,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:58,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:43:59,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:59,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 03:43:59,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:43:59,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 45%|####5 | 9/20 [02:01<02:39, 14.52s/q, loss=0var, mean_r=0.977, skip=3]
Iter 3 GRPO groups: 50%|##### | 10/20 [02:01<01:54, 11.48s/q, loss=0var, mean_r=0.977, skip=3]2026-04-26 03:44:02,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:02,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:02,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 50%|##### | 10/20 [02:04<01:54, 11.48s/q, loss=0var, mean_r=0.993, skip=4]
Iter 3 GRPO groups: 55%|#####5 | 11/20 [02:04<01:21, 9.01s/q, loss=0var, mean_r=0.993, skip=4]2026-04-26 03:44:07,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:08,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 3 GRPO groups: 55%|#####5 | 11/20 [02:10<01:21, 9.01s/q, loss=0var, mean_r=0.998, skip=5]
Iter 3 GRPO groups: 60%|###### | 12/20 [02:10<01:04, 8.01s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 03:44:15,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.905 = 0.50×0.85(prox=0.85) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.85(prox=0.85) + 0.40×proc(0.896[fin=0.96,mean=0.80]) + 0.10×fmt(1.000) | pred='7600' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:44:15,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.910 = 0.50×0.85(prox=0.85) + 0.40×proc(0.962[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:15,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×0.85(prox=0.85) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8000' gold='7945' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:15,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.916 = 0.50×0.85(prox=0.85) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:44:15,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×0.85(prox=0.85) + 0.40×proc(0.850[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='7900' gold='7945' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:44:15,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.910 = 0.50×0.85(prox=0.85) + 0.40×proc(0.963[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='7920' gold='7945' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+2026-04-26 03:44:15,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×0.85(prox=0.85) + 0.40×proc(0.908[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='8000' gold='7945' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:44:16,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.54(prox=0.54) + 0.40×proc(0.734[fin=0.83,mean=0.59]) + 0.10×fmt(1.000) | pred='4500' gold='7945' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 3 GRPO groups: 60%|###### | 12/20 [02:19<01:04, 8.01s/q, loss=-0.0006, mean_r=0.876, skip=5]
Iter 3 GRPO groups: 65%|######5 | 13/20 [02:19<00:57, 8.26s/q, loss=-0.0006, mean_r=0.876, skip=5]2026-04-26 03:44:25,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.703 = 0.50×0.50(prox=0.50) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='24' gold='16' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 03:44:25,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.50(prox=0.50) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='16' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:44:25,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:44:26,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 03:44:26,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 03:44:26,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.432 = 0.50×0.33(prox=0.33) + 0.40×proc(0.302[fin=0.11,mean=0.59]) + 0.10×fmt(1.000) | pred='0' gold='16' | step_acc=60% lccp=30% (chain=3/10 ok_count=6) n_steps=10
+2026-04-26 03:44:26,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.810[fin=0.99,mean=0.54]) + 0.10×fmt(1.000) | pred='28' gold='16' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+2026-04-26 03:44:26,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×1.00(exact) + 0.40×proc(0.522[fin=0.43,mean=0.67]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:44:26,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.53(prox=0.53) + 0.40×proc(0.352[fin=0.09,mean=0.75]) + 0.10×fmt(1.000) | pred='8.8' gold='16' | step_acc=80% lccp=60% (chain=6/10 ok_count=8) n_steps=10
+2026-04-26 03:44:26,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 65%|######5 | 13/20 [02:29<00:57, 8.26s/q, loss=-0.0012, mean_r=0.773, skip=5]
Iter 3 GRPO groups: 70%|####### | 14/20 [02:29<00:53, 8.92s/q, loss=-0.0012, mean_r=0.773, skip=5]2026-04-26 03:44:32,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,813 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,895 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:32,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:33,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.689 = 0.50×0.60(prox=0.60) + 0.40×proc(0.724[fin=0.95,mean=0.39]) + 0.10×fmt(1.000) | pred='90' gold='135' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 3 GRPO groups: 70%|####### | 14/20 [02:36<00:53, 8.92s/q, loss=0.0025, mean_r=0.969, skip=5]
Iter 3 GRPO groups: 75%|#######5 | 15/20 [02:36<00:40, 8.19s/q, loss=0.0025, mean_r=0.969, skip=5]2026-04-26 03:44:39,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:39,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:39,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:39,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:40,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 75%|#######5 | 15/20 [02:41<00:40, 8.19s/q, loss=0var, mean_r=0.992, skip=6]
Iter 3 GRPO groups: 80%|######## | 16/20 [02:41<00:29, 7.42s/q, loss=0var, mean_r=0.992, skip=6]2026-04-26 03:44:44,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:44:44,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,703 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:44:44,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 3 GRPO groups: 80%|######## | 16/20 [02:46<00:29, 7.42s/q, loss=0var, mean_r=0.997, skip=7]
Iter 3 GRPO groups: 85%|########5 | 17/20 [02:46<00:19, 6.60s/q, loss=0var, mean_r=0.997, skip=7]2026-04-26 03:44:47,629 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 03:44:47,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:47,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:48,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 3 GRPO groups: 85%|########5 | 17/20 [02:49<00:19, 6.60s/q, loss=0var, mean_r=0.996, skip=8]
Iter 3 GRPO groups: 90%|######### | 18/20 [02:49<00:11, 5.69s/q, loss=0var, mean_r=0.996, skip=8]2026-04-26 03:44:57,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.85(prox=0.85) + 0.40×proc(0.252[fin=0.34,mean=0.12]) + 0.10×fmt(1.000) | pred='675' gold='671' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:44:57,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='671' gold='671' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:44:57,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.479 = 0.50×0.45(prox=0.45) + 0.40×proc(0.384[fin=0.44,mean=0.30]) + 0.10×fmt(1.000) | pred='261' gold='671' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 03:44:57,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.43(prox=0.43) + 0.40×proc(0.638[fin=0.79,mean=0.40]) + 0.10×fmt(0.650) | pred='235' gold='671' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:44:57,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.632 = 0.50×0.43(prox=0.43) + 0.40×proc(0.788[fin=0.94,mean=0.56]) + 0.10×fmt(1.000) | pred='233' gold='671' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 03:44:57,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.613[fin=0.44,mean=0.88]) + 0.10×fmt(1.000) | pred='55' gold='671' | step_acc=86% lccp=86% (chain=6/7 ok_count=6) n_steps=7
+2026-04-26 03:44:57,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.607 = 0.50×0.43(prox=0.43) + 0.40×proc(0.729[fin=0.91,mean=0.46]) + 0.10×fmt(1.000) | pred='229' gold='671' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 03:44:57,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.469 = 0.50×0.35(prox=0.35) + 0.40×proc(0.388[fin=0.33,mean=0.47]) + 0.10×fmt(1.000) | pred='55' gold='671' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:44:57,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.512 = 0.50×0.59(prox=0.59) + 0.40×proc(0.288[fin=0.37,mean=0.17]) + 0.10×fmt(1.000) | pred='901' gold='671' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:44:57,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.805[fin=0.90,mean=0.67]) + 0.10×fmt(1.000) | pred='671' gold='671' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 3 GRPO groups: 90%|######### | 18/20 [03:00<00:11, 5.69s/q, loss=0.0002, mean_r=0.631, skip=8]
Iter 3 GRPO groups: 95%|#########5| 19/20 [03:00<00:07, 7.24s/q, loss=0.0002, mean_r=0.631, skip=8]2026-04-26 03:45:06,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:45:06,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:06,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.435 = 0.50×0.21(prox=0.21) + 0.40×proc(0.509[fin=0.62,mean=0.33]) + 0.10×fmt(1.000) | pred='100' gold='35' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 03:45:06,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:06,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 3 GRPO groups: 95%|#########5| 19/20 [03:09<00:07, 7.24s/q, loss=0.0011, mean_r=0.935, skip=8]
Iter 3 GRPO groups: 100%|##########| 20/20 [03:09<00:00, 7.77s/q, loss=0.0011, mean_r=0.935, skip=8]
Iter 3 GRPO groups: 100%|##########| 20/20 [03:09<00:00, 9.49s/q, loss=0.0011, mean_r=0.935, skip=8]
+2026-04-26 03:45:08,223 INFO __main__ - Iter 3 | loss=0.0004 | reward mean=0.896 std=0.171 | gt_match=70.7% | grounded_acc=95.5% | step_acc=87.7% | lccp=76.5% | batch_acc=95.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=2.19e-06 | 189.8s
+2026-04-26 03:45:08,223 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=95.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:45:08,224 INFO __main__ - ======================================================================
+2026-04-26 03:45:08,224 INFO __main__ - GRPO ITERATION 4/60
+2026-04-26 03:45:08,224 INFO __main__ - ======================================================================
+2026-04-26 03:45:08,240 INFO __main__ - LR this iteration: 2.19e-06 | T=0.780 | MATH ratio=30%
+
Iter 4 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:45:12,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.772 = 0.50×0.67(prox=0.67) + 0.40×proc(0.848[fin=0.98,mean=0.65]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:45:12,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=0.97,mean=0.89]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.793 = 0.50×0.67(prox=0.67) + 0.40×proc(0.899[fin=0.95,mean=0.82]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:45:12,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.600 = 0.50×0.67(prox=0.67) + 0.40×proc(0.416[fin=0.31,mean=0.58]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:45:12,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:12,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.704 = 0.50×0.67(prox=0.67) + 0.40×proc(0.677[fin=0.68,mean=0.67]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:45:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×1.00(exact) + 0.40×proc(0.794[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:12,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:45:13,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.692 = 0.50×0.67(prox=0.67) + 0.40×proc(0.648[fin=0.63,mean=0.67]) + 0.10×fmt(1.000) | pred='180' gold='240' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 4 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0.0004, mean_r=0.845, skip=0]
Iter 4 GRPO groups: 5%|5 | 1/20 [00:06<01:58, 6.22s/q, loss=0.0004, mean_r=0.845, skip=0]2026-04-26 03:45:26,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.32(prox=0.32) + 0.40×proc(0.401[fin=0.25,mean=0.63]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:26,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.32(prox=0.32) + 0.40×proc(0.707[fin=0.85,mean=0.50]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=44% lccp=11% (chain=1/9 ok_count=4) n_steps=9
+2026-04-26 03:45:27,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.779[fin=0.83,mean=0.70]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:45:27,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.226 = 0.50×0.00(prox=0.00) + 0.40×proc(0.295[fin=0.05,mean=0.66]) + 0.10×fmt(0.700) | pred='' gold='24' | step_acc=62% lccp=25% (chain=2/8 ok_count=5) n_steps=8
+2026-04-26 03:45:27,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.32(prox=0.32) + 0.40×proc(0.464[fin=0.40,mean=0.56]) + 0.10×fmt(1.000) | pred='49' gold='24' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 03:45:27,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.892 = 0.50×1.00(exact) + 0.40×proc(0.730[fin=0.77,mean=0.66]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:45:27,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.876[fin=0.91,mean=0.83]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:45:27,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.791[fin=0.91,mean=0.62]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:45:27,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.761[fin=0.86,mean=0.62]) + 0.10×fmt(1.000) | pred='48' gold='24' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 03:45:27,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.46(prox=0.46) + 0.40×proc(0.404[fin=0.34,mean=0.50]) + 0.10×fmt(1.000) | pred='10' gold='24' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+
Iter 4 GRPO groups: 5%|5 | 1/20 [00:20<01:58, 6.22s/q, loss=-0.0004, mean_r=0.575, skip=0]
Iter 4 GRPO groups: 10%|# | 2/20 [00:20<03:21, 11.18s/q, loss=-0.0004, mean_r=0.575, skip=0]2026-04-26 03:45:39,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.08(prox=0.08) + 0.40×proc(0.884[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='-10' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:45:39,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.12(prox=0.12) + 0.40×proc(0.849[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='-5' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:45:40,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.363 = 0.50×0.00(prox=0.00) + 0.40×proc(0.658[fin=0.83,mean=0.40]) + 0.10×fmt(1.000) | pred='9/2' gold='2' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:45:40,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.12(prox=0.12) + 0.40×proc(0.754[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='-5' gold='2' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 03:45:40,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.09(prox=0.09) + 0.40×proc(0.351[fin=0.47,mean=0.17]) + 0.10×fmt(1.000) | pred='12.5' gold='2' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:45:40,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:40,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.502 = 0.50×0.08(prox=0.08) + 0.40×proc(0.834[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='-10' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:45:40,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:40,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:40,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.486 = 0.50×0.07(prox=0.07) + 0.40×proc(0.880[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='-12' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 4 GRPO groups: 10%|# | 2/20 [00:33<03:21, 11.18s/q, loss=0.0022, mean_r=0.568, skip=0]
Iter 4 GRPO groups: 15%|#5 | 3/20 [00:33<03:23, 11.99s/q, loss=0.0022, mean_r=0.568, skip=0]2026-04-26 03:45:45,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:45,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 15%|#5 | 3/20 [00:37<03:23, 11.99s/q, loss=0var, mean_r=0.994, skip=1]
Iter 4 GRPO groups: 20%|## | 4/20 [00:37<02:19, 8.73s/q, loss=0var, mean_r=0.994, skip=1]2026-04-26 03:45:49,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:49,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:45:49,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:45:49,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:49,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:50,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 20%|## | 4/20 [00:41<02:19, 8.73s/q, loss=0var, mean_r=0.994, skip=2]
Iter 4 GRPO groups: 25%|##5 | 5/20 [00:41<01:46, 7.12s/q, loss=0var, mean_r=0.994, skip=2]2026-04-26 03:45:54,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:45:54,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.736[fin=0.88,mean=0.51]) + 0.10×fmt(1.000) | pred='11' gold='6' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:45:54,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 25%|##5 | 5/20 [00:48<01:46, 7.12s/q, loss=0.0007, mean_r=0.954, skip=2]
Iter 4 GRPO groups: 30%|### | 6/20 [00:48<01:35, 6.82s/q, loss=0.0007, mean_r=0.954, skip=2]2026-04-26 03:46:29,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.453 = 0.50×0.00(prox=0.00) + 0.40×proc(0.689[fin=0.62,mean=0.79]) + 0.10×fmt(0.700) | pred='' gold='4' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 03:46:30,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.626 = 0.50×0.67(prox=0.67) + 0.40×proc(0.482[fin=0.32,mean=0.73]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=70% lccp=60% (chain=6/10 ok_count=7) n_steps=10
+2026-04-26 03:46:30,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.67(prox=0.67) + 0.40×proc(0.757[fin=0.86,mean=0.61]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 03:46:30,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.872[fin=0.95,mean=0.76]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:30,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.771 = 0.50×0.67(prox=0.67) + 0.40×proc(0.843[fin=0.95,mean=0.68]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=67% lccp=0% (chain=0/9 ok_count=6) n_steps=9
+2026-04-26 03:46:30,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.67(prox=0.67) + 0.40×proc(0.285[fin=0.10,mean=0.57]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:46:30,554 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.67(prox=0.67) + 0.40×proc(0.288[fin=0.12,mean=0.55]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 03:46:30,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.806 = 0.50×0.67(prox=0.67) + 0.40×proc(0.931[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=88% lccp=6% (chain=1/16 ok_count=14) n_steps=16
+2026-04-26 03:46:30,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.533 = 0.50×0.67(prox=0.67) + 0.40×proc(0.248[fin=0.10,mean=0.47]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 4 GRPO groups: 30%|### | 6/20 [01:23<01:35, 6.82s/q, loss=0.0007, mean_r=0.663, skip=2]
Iter 4 GRPO groups: 35%|###5 | 7/20 [01:23<03:32, 16.32s/q, loss=0.0007, mean_r=0.663, skip=2]2026-04-26 03:46:35,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:35,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='585' gold='585' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 35%|###5 | 7/20 [01:27<03:32, 16.32s/q, loss=0var, mean_r=0.998, skip=3]
Iter 4 GRPO groups: 40%|#### | 8/20 [01:27<02:27, 12.30s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 03:46:42,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:46:42,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.821[fin=0.93,mean=0.66]) + 0.10×fmt(1.000) | pred='24' gold='49' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 03:46:42,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.83(prox=0.83) + 0.40×proc(0.900[fin=0.98,mean=0.78]) + 0.10×fmt(1.000) | pred='54' gold='49' | step_acc=80% lccp=70% (chain=7/10 ok_count=8) n_steps=10
+2026-04-26 03:46:42,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:42,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:46:42,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:42,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.466 = 0.50×0.39(prox=0.39) + 0.40×proc(0.304[fin=0.20,mean=0.47]) + 0.10×fmt(1.000) | pred='10.5' gold='49' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 03:46:42,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 4 GRPO groups: 40%|#### | 8/20 [01:36<02:27, 12.30s/q, loss=0.0013, mean_r=0.888, skip=3]
Iter 4 GRPO groups: 45%|####5 | 9/20 [01:36<02:02, 11.11s/q, loss=0.0013, mean_r=0.888, skip=3]2026-04-26 03:46:47,097 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,252 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×1.00(exact) + 0.40×proc(0.728[fin=0.95,mean=0.39]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:46:47,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:47,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×1.00(exact) + 0.40×proc(0.613[fin=0.82,mean=0.30]) + 0.10×fmt(1.000) | pred='528' gold='528' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 4 GRPO groups: 45%|####5 | 9/20 [01:40<02:02, 11.11s/q, loss=0.0005, mean_r=0.972, skip=3]
Iter 4 GRPO groups: 50%|##### | 10/20 [01:40<01:31, 9.18s/q, loss=0.0005, mean_r=0.972, skip=3]2026-04-26 03:46:51,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:46:51,869 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 50%|##### | 10/20 [01:43<01:31, 9.18s/q, loss=0var, mean_r=0.997, skip=4]
Iter 4 GRPO groups: 55%|#####5 | 11/20 [01:43<01:04, 7.18s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 03:46:55,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:46:55,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:55,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:46:55,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:46:55,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:46:55,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:46:55,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:46:56,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:46:56,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:46:56,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 55%|#####5 | 11/20 [01:47<01:04, 7.18s/q, loss=0var, mean_r=0.985, skip=5]
Iter 4 GRPO groups: 60%|###### | 12/20 [01:47<00:50, 6.30s/q, loss=0var, mean_r=0.985, skip=5]2026-04-26 03:47:00,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:47:00,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:01,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:01,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:01,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 4 GRPO groups: 60%|###### | 12/20 [01:52<00:50, 6.30s/q, loss=0var, mean_r=0.993, skip=6]
Iter 4 GRPO groups: 65%|######5 | 13/20 [01:52<00:41, 5.92s/q, loss=0var, mean_r=0.993, skip=6]2026-04-26 03:47:07,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.537 = 0.50×0.50(prox=0.50) + 0.40×proc(0.468[fin=0.28,mean=0.76]) + 0.10×fmt(1.000) | pred='13' gold='26' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 03:47:07,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.22(prox=0.22) + 0.40×proc(0.516[fin=0.43,mean=0.65]) + 0.10×fmt(1.000) | pred='72' gold='26' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 03:47:07,862 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:07,946 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:08,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.612 = 0.50×0.50(prox=0.50) + 0.40×proc(0.656[fin=0.49,mean=0.91]) + 0.10×fmt(1.000) | pred='13' gold='26' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 03:47:08,115 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.628[fin=0.65,mean=0.59]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 03:47:08,201 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.474[fin=0.43,mean=0.54]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:47:08,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='26' gold='26' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:08,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.57(prox=0.57) + 0.40×proc(0.945[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='16' gold='26' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:47:08,467 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.889[fin=0.93,mean=0.83]) + 0.10×fmt(1.000) | pred='40' gold='26' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 4 GRPO groups: 65%|######5 | 13/20 [02:01<00:41, 5.92s/q, loss=0.0008, mean_r=0.704, skip=6]
Iter 4 GRPO groups: 70%|####### | 14/20 [02:01<00:40, 6.76s/q, loss=0.0008, mean_r=0.704, skip=6]2026-04-26 03:47:12,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:12,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:12,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:12,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 70%|####### | 14/20 [02:04<00:40, 6.76s/q, loss=0var, mean_r=0.998, skip=7]
Iter 4 GRPO groups: 75%|#######5 | 15/20 [02:04<00:28, 5.60s/q, loss=0var, mean_r=0.998, skip=7]2026-04-26 03:47:46,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.499 = 0.50×0.35(prox=0.35) + 0.40×proc(0.653[fin=0.77,mean=0.48]) + 0.10×fmt(0.650) | pred='120' gold='2220' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 03:47:46,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.253 = 0.50×0.36(prox=0.36) + 0.40×proc(0.026[fin=0.01,mean=0.05]) + 0.10×fmt(0.650) | pred='210' gold='2220' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:47:46,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.094 = 0.50×0.01(prox=0.01) + 0.40×proc(0.065[fin=0.07,mean=0.05]) + 0.10×fmt(0.650) | pred='200200' gold='2220' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:47:46,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.37(prox=0.37) + 0.40×proc(0.276[fin=0.09,mean=0.55]) + 0.10×fmt(1.000) | pred='300' gold='2220' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:47:46,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.541 = 0.50×0.35(prox=0.35) + 0.40×proc(0.445[fin=0.30,mean=0.66]) + 0.10×fmt(1.000) | pred='120' gold='2220' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:47:46,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.37(prox=0.37) + 0.40×proc(0.304[fin=0.15,mean=0.53]) + 0.10×fmt(1.000) | pred='300' gold='2220' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:47:46,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×1.00(exact) + 0.40×proc(0.105[fin=0.11,mean=0.09]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:47:46,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:47:47,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2220' gold='2220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 75%|#######5 | 15/20 [02:40<00:28, 5.60s/q, loss=0.0011, mean_r=0.548, skip=7]
Iter 4 GRPO groups: 80%|######## | 16/20 [02:40<00:58, 14.62s/q, loss=0.0011, mean_r=0.548, skip=7]2026-04-26 03:47:53,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:53,774 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.291 = 0.50×0.14(prox=0.14) + 0.40×proc(0.149[fin=0.01,mean=0.36]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:47:53,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.50(prox=0.50) + 0.40×proc(0.155[fin=0.06,mean=0.29]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:47:53,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.432 = 0.50×0.50(prox=0.50) + 0.40×proc(0.205[fin=0.07,mean=0.41]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:47:54,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.425 = 0.50×0.33(prox=0.33) + 0.40×proc(0.208[fin=0.02,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 03:47:54,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.50(prox=0.50) + 0.40×proc(0.249[fin=0.20,mean=0.32]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 03:47:54,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.50(prox=0.50) + 0.40×proc(0.206[fin=0.14,mean=0.31]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 03:47:54,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:47:54,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=0.99,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:47:54,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×1.00(exact) + 0.40×proc(0.529[fin=0.52,mean=0.55]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 4 GRPO groups: 80%|######## | 16/20 [02:47<00:58, 14.62s/q, loss=0.0012, mean_r=0.625, skip=7]
Iter 4 GRPO groups: 85%|########5 | 17/20 [02:47<00:37, 12.46s/q, loss=0.0012, mean_r=0.625, skip=7]2026-04-26 03:47:59,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:47:59,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.643 = 0.50×0.70(prox=0.70) + 0.40×proc(0.485[fin=0.49,mean=0.48]) + 0.10×fmt(1.000) | pred='51' gold='65' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 03:47:59,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:47:59,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:47:59,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='65' gold='65' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 85%|########5 | 17/20 [02:53<00:37, 12.46s/q, loss=0.0019, mean_r=0.959, skip=7]
Iter 4 GRPO groups: 90%|######### | 18/20 [02:53<00:20, 10.42s/q, loss=0.0019, mean_r=0.959, skip=7]2026-04-26 03:48:04,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:04,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:04,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:05,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:05,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='117' gold='117' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 4 GRPO groups: 90%|######### | 18/20 [02:57<00:20, 10.42s/q, loss=0var, mean_r=0.997, skip=8]
Iter 4 GRPO groups: 95%|#########5| 19/20 [02:57<00:08, 8.50s/q, loss=0var, mean_r=0.997, skip=8]2026-04-26 03:48:09,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:09,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:09,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:10,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:10,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:10,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 4 GRPO groups: 95%|#########5| 19/20 [03:02<00:08, 8.50s/q, loss=0var, mean_r=1.000, skip=9]
Iter 4 GRPO groups: 100%|##########| 20/20 [03:02<00:00, 7.39s/q, loss=0var, mean_r=1.000, skip=9]
Iter 4 GRPO groups: 100%|##########| 20/20 [03:02<00:00, 9.10s/q, loss=0var, mean_r=1.000, skip=9]
+2026-04-26 03:48:10,349 INFO __main__ - Iter 4 | loss=0.0009 | reward mean=0.865 std=0.219 | gt_match=73.2% | grounded_acc=89.4% | step_acc=85.9% | lccp=76.5% | batch_acc=89.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=11 skipped=9(0var=9) | lr=2.75e-06 | 182.1s
+2026-04-26 03:48:10,349 WARNING __main__ - STARVATION: 45% of groups skipped (zero variance). grounded_acc=89.4% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 03:48:10,350 INFO __main__ - ======================================================================
+2026-04-26 03:48:10,350 INFO __main__ - GRPO ITERATION 5/60
+2026-04-26 03:48:10,350 INFO __main__ - ======================================================================
+2026-04-26 03:48:10,367 INFO __main__ - LR this iteration: 2.75e-06 | T=0.773 | MATH ratio=30%
+
Iter 5 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:48:16,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.815[fin=0.99,mean=0.56]) + 0.10×fmt(1.000) | pred='4' gold='35' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:48:17,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:17,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:48:17,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.261 = 0.50×0.00(prox=0.00) + 0.40×proc(0.310[fin=0.29,mean=0.34]) + 0.10×fmt(1.000) | pred='350000' gold='35' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 03:48:17,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.387 = 0.50×0.01(prox=0.01) + 0.40×proc(0.618[fin=0.72,mean=0.47]) + 0.10×fmt(1.000) | pred='3500' gold='35' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 03:48:17,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 0%| | 0/20 [00:08, ?q/s, loss=-0.0006, mean_r=0.809, skip=0]
Iter 5 GRPO groups: 5%|5 | 1/20 [00:08<02:49, 8.90s/q, loss=-0.0006, mean_r=0.809, skip=0]2026-04-26 03:48:28,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.60(prox=0.60) + 0.40×proc(0.407[fin=0.54,mean=0.21]) + 0.10×fmt(1.000) | pred='54' gold='81' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:48:28,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 03:48:28,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:28,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:29,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:29,153 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='81' gold='81' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:29,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.06(prox=0.06) + 0.40×proc(0.613[fin=0.64,mean=0.57]) + 0.10×fmt(1.000) | pred='729' gold='81' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 03:48:29,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.14(prox=0.14) + 0.40×proc(0.709[fin=0.86,mean=0.48]) + 0.10×fmt(1.000) | pred='324' gold='81' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 5 GRPO groups: 5%|5 | 1/20 [00:20<02:49, 8.90s/q, loss=-0.0022, mean_r=0.844, skip=0]
Iter 5 GRPO groups: 10%|# | 2/20 [00:20<03:07, 10.43s/q, loss=-0.0022, mean_r=0.844, skip=0]2026-04-26 03:48:34,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:34,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.85(prox=0.85) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='52' gold='48' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:48:35,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.85(prox=0.85) + 0.40×proc(0.746[fin=0.74,mean=0.76]) + 0.10×fmt(1.000) | pred='44' gold='48' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.840[fin=0.87,mean=0.79]) + 0.10×fmt(1.000) | pred='44' gold='48' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:35,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:35,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 10%|# | 2/20 [00:26<03:07, 10.43s/q, loss=0.0001, mean_r=0.954, skip=0]
Iter 5 GRPO groups: 15%|#5 | 3/20 [00:26<02:25, 8.54s/q, loss=0.0001, mean_r=0.954, skip=0]2026-04-26 03:48:41,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,674 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.85(prox=0.85) + 0.40×proc(0.713[fin=0.90,mean=0.43]) + 0.10×fmt(1.000) | pred='20' gold='21' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:48:41,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:41,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:42,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:42,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:42,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:48:42,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 5 GRPO groups: 15%|#5 | 3/20 [00:33<02:25, 8.54s/q, loss=-0.0013, mean_r=0.981, skip=0]
Iter 5 GRPO groups: 20%|## | 4/20 [00:33<02:04, 7.78s/q, loss=-0.0013, mean_r=0.981, skip=0]2026-04-26 03:48:47,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:47,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:47,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:48:48,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:48,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 5 GRPO groups: 20%|## | 4/20 [00:38<02:04, 7.78s/q, loss=0var, mean_r=0.998, skip=1]
Iter 5 GRPO groups: 25%|##5 | 5/20 [00:38<01:40, 6.68s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:48:52,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:48:52,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.688 = 0.50×0.81(prox=0.81) + 0.40×proc(0.462[fin=0.33,mean=0.67]) + 0.10×fmt(1.000) | pred='22' gold='25' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 03:48:52,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.788[fin=0.80,mean=0.77]) + 0.10×fmt(1.000) | pred='5' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:48:52,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:52,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:48:53,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 25%|##5 | 5/20 [00:44<01:40, 6.68s/q, loss=0.0011, mean_r=0.924, skip=1]
Iter 5 GRPO groups: 30%|### | 6/20 [00:44<01:30, 6.47s/q, loss=0.0011, mean_r=0.924, skip=1]2026-04-26 03:49:00,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 03:49:01,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:01,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='700' gold='700' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 5 GRPO groups: 30%|### | 6/20 [00:51<01:30, 6.47s/q, loss=0var, mean_r=0.998, skip=2]
Iter 5 GRPO groups: 35%|###5 | 7/20 [00:51<01:27, 6.72s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 03:49:05,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.41(prox=0.41) + 0.40×proc(0.420[fin=0.38,mean=0.48]) + 0.10×fmt(1.000) | pred='7' gold='25' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:49:05,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=0.94,mean=0.93]) + 0.10×fmt(1.000) | pred='23' gold='25' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:49:05,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.277 = 0.50×0.11(prox=0.11) + 0.40×proc(0.308[fin=0.40,mean=0.17]) + 0.10×fmt(1.000) | pred='128' gold='25' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:49:05,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.282 = 0.50×0.20(prox=0.20) + 0.40×proc(0.201[fin=0.11,mean=0.34]) + 0.10×fmt(1.000) | pred='74' gold='25' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:49:06,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.862 = 0.50×0.85(prox=0.85) + 0.40×proc(0.843[fin=0.89,mean=0.77]) + 0.10×fmt(1.000) | pred='23' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 03:49:06,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.41(prox=0.41) + 0.40×proc(0.466[fin=0.56,mean=0.33]) + 0.10×fmt(1.000) | pred='7' gold='25' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:49:06,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.71(prox=0.71) + 0.40×proc(0.837[fin=0.88,mean=0.77]) + 0.10×fmt(1.000) | pred='30' gold='25' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:49:06,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.319 = 0.50×0.06(prox=0.06) + 0.40×proc(0.476[fin=0.62,mean=0.25]) + 0.10×fmt(1.000) | pred='233' gold='25' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 03:49:06,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.856 = 0.50×0.71(prox=0.71) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:06,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.58(prox=0.58) + 0.40×proc(0.833[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='16' gold='25' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+
Iter 5 GRPO groups: 35%|###5 | 7/20 [00:57<01:27, 6.72s/q, loss=0.0037, mean_r=0.598, skip=2]
Iter 5 GRPO groups: 40%|#### | 8/20 [00:57<01:18, 6.54s/q, loss=0.0037, mean_r=0.598, skip=2]2026-04-26 03:49:17,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.85(prox=0.85) + 0.40×proc(0.442[fin=0.24,mean=0.75]) + 0.10×fmt(1.000) | pred='50' gold='49' | step_acc=75% lccp=75% (chain=6/8 ok_count=6) n_steps=8
+2026-04-26 03:49:17,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:17,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:17,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:18,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:49:18,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:49:18,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.720 = 0.50×0.64(prox=0.64) + 0.40×proc(0.756[fin=0.81,mean=0.68]) + 0.10×fmt(1.000) | pred='63' gold='49' | step_acc=78% lccp=0% (chain=0/9 ok_count=7) n_steps=9
+
Iter 5 GRPO groups: 40%|#### | 8/20 [01:09<01:18, 6.54s/q, loss=-0.0000, mean_r=0.940, skip=2]
Iter 5 GRPO groups: 45%|####5 | 9/20 [01:09<01:30, 8.18s/q, loss=-0.0000, mean_r=0.940, skip=2]2026-04-26 03:49:25,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.950[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:25,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:25,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:26,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.803[fin=0.88,mean=0.70]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:49:26,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 03:49:26,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.584 = 0.50×0.73(prox=0.73) + 0.40×proc(0.294[fin=0.40,mean=0.14]) + 0.10×fmt(1.000) | pred='9' gold='11' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:49:26,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:26,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.48(prox=0.48) + 0.40×proc(0.806[fin=1.00,mean=0.52]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:49:26,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:49:26,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.48(prox=0.48) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5' gold='11' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+
Iter 5 GRPO groups: 45%|####5 | 9/20 [01:17<01:30, 8.18s/q, loss=-0.0017, mean_r=0.796, skip=2]
Iter 5 GRPO groups: 50%|##### | 10/20 [01:17<01:22, 8.23s/q, loss=-0.0017, mean_r=0.796, skip=2]2026-04-26 03:49:42,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:49:42,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:42,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='9' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 03:49:42,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:42,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.905[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='27' gold='9' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 03:49:42,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.896[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='27' gold='9' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 5 GRPO groups: 50%|##### | 10/20 [01:33<01:22, 8.23s/q, loss=-0.0011, mean_r=0.763, skip=2]
Iter 5 GRPO groups: 55%|#####5 | 11/20 [01:33<01:36, 10.70s/q, loss=-0.0011, mean_r=0.763, skip=2]2026-04-26 03:49:48,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,212 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:49:48,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:49:48,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='333200' gold='333200' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 5 GRPO groups: 55%|#####5 | 11/20 [01:38<01:36, 10.70s/q, loss=0var, mean_r=0.993, skip=3]
Iter 5 GRPO groups: 60%|###### | 12/20 [01:38<01:10, 8.83s/q, loss=0var, mean_r=0.993, skip=3]2026-04-26 03:50:02,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×1.00(exact) + 0.40×proc(0.721[fin=0.75,mean=0.68]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 03:50:02,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.33(prox=0.33) + 0.40×proc(0.340[fin=0.26,mean=0.46]) + 0.10×fmt(1.000) | pred='1' gold='0' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 03:50:02,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.00(prox=0.00) + 0.40×proc(0.421[fin=0.24,mean=0.69]) + 0.10×fmt(0.700) | pred='' gold='0' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 03:50:02,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:02,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.391 = 0.50×0.00(prox=0.00) + 0.40×proc(0.728[fin=0.83,mean=0.58]) + 0.10×fmt(0.700) | pred='' gold='0' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:50:02,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:50:02,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:02,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:02,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:50:02,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.00(prox=0.00) + 0.40×proc(0.882[fin=0.98,mean=0.73]) + 0.10×fmt(1.000) | pred='$1 - \\sqrt{3}$' gold='0' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+
Iter 5 GRPO groups: 60%|###### | 12/20 [01:54<01:10, 8.83s/q, loss=-0.0012, mean_r=0.752, skip=3]
Iter 5 GRPO groups: 65%|######5 | 13/20 [01:54<01:16, 10.87s/q, loss=-0.0012, mean_r=0.752, skip=3]2026-04-26 03:50:12,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,361 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=0.99,mean=0.81]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 03:50:12,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.60(prox=0.60) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='360' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:12,956 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='270' gold='270' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 5 GRPO groups: 65%|######5 | 13/20 [02:04<01:16, 10.87s/q, loss=0.0008, mean_r=0.974, skip=3]
Iter 5 GRPO groups: 70%|####### | 14/20 [02:04<01:03, 10.62s/q, loss=0.0008, mean_r=0.974, skip=3]2026-04-26 03:50:19,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.78(prox=0.78) + 0.40×proc(0.765[fin=0.89,mean=0.58]) + 0.10×fmt(1.000) | pred='16' gold='14' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 03:50:19,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:19,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.665 = 0.50×0.41(prox=0.41) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='4' gold='14' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 03:50:19,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.58(prox=0.58) + 0.40×proc(0.387[fin=0.35,mean=0.44]) + 0.10×fmt(1.000) | pred='19' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 03:50:19,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.848 = 0.50×1.00(exact) + 0.40×proc(0.620[fin=0.67,mean=0.55]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:50:19,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:19,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.523 = 0.50×0.44(prox=0.44) + 0.40×proc(0.510[fin=0.66,mean=0.28]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:50:19,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:19,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:50:19,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.508 = 0.50×0.44(prox=0.44) + 0.40×proc(0.348[fin=0.35,mean=0.34]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 5 GRPO groups: 70%|####### | 14/20 [02:10<01:03, 10.62s/q, loss=0.0019, mean_r=0.788, skip=3]
Iter 5 GRPO groups: 75%|#######5 | 15/20 [02:10<00:47, 9.47s/q, loss=0.0019, mean_r=0.788, skip=3]2026-04-26 03:50:24,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:50:24,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 03:50:24,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:24,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:24,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 03:50:25,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='82' gold='82' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 5 GRPO groups: 75%|#######5 | 15/20 [02:14<00:47, 9.47s/q, loss=0var, mean_r=0.989, skip=4]
Iter 5 GRPO groups: 80%|######## | 16/20 [02:14<00:31, 7.84s/q, loss=0var, mean_r=0.989, skip=4]2026-04-26 03:50:32,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:50:32,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 03:50:32,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.310 = 0.50×0.17(prox=0.17) + 0.40×proc(0.271[fin=0.18,mean=0.40]) + 0.10×fmt(1.000) | pred='7' gold='2' | step_acc=38% lccp=12% (chain=1/8 ok_count=3) n_steps=8
+2026-04-26 03:50:33,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 03:50:33,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=11/11 ok_count=11) n_steps=11
+2026-04-26 03:50:33,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:50:33,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+
Iter 5 GRPO groups: 80%|######## | 16/20 [02:24<00:31, 7.84s/q, loss=0.0012, mean_r=0.929, skip=4]
Iter 5 GRPO groups: 85%|########5 | 17/20 [02:24<00:25, 8.39s/q, loss=0.0012, mean_r=0.929, skip=4]2026-04-26 03:51:08,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.82(prox=0.82) + 0.40×proc(0.849[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='576' gold='648' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:51:08,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.873[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='864' gold='648' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 03:51:08,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,645 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 03:51:08,727 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='648' gold='648' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 5 GRPO groups: 85%|########5 | 17/20 [02:59<00:25, 8.39s/q, loss=0.0003, mean_r=0.951, skip=4]
Iter 5 GRPO groups: 90%|######### | 18/20 [02:59<00:32, 16.43s/q, loss=0.0003, mean_r=0.951, skip=4]2026-04-26 03:51:18,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:51:18,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 03:51:18,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.491[fin=0.40,mean=0.63]) + 0.10×fmt(1.000) | pred='146' gold='76' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 03:51:19,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 03:51:19,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.813[fin=0.88,mean=0.71]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 03:51:19,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.85(prox=0.85) + 0.40×proc(0.724[fin=0.70,mean=0.76]) + 0.10×fmt(1.000) | pred='74' gold='76' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:51:19,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×0.85(prox=0.85) + 0.40×proc(0.766[fin=0.75,mean=0.80]) + 0.10×fmt(1.000) | pred='74' gold='76' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 03:51:19,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 03:51:19,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.833 = 0.50×0.85(prox=0.85) + 0.40×proc(0.771[fin=0.88,mean=0.62]) + 0.10×fmt(1.000) | pred='82' gold='76' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 03:51:19,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.79(prox=0.79) + 0.40×proc(0.856[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='86' gold='76' | step_acc=67% lccp=33% (chain=3/9 ok_count=6) n_steps=9
+
Iter 5 GRPO groups: 90%|######### | 18/20 [03:10<00:32, 16.43s/q, loss=-0.0003, mean_r=0.874, skip=4]
Iter 5 GRPO groups: 95%|#########5| 19/20 [03:10<00:14, 14.75s/q, loss=-0.0003, mean_r=0.874, skip=4]2026-04-26 03:51:29,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.690 = 0.50×0.50(prox=0.50) + 0.40×proc(0.849[fin=0.93,mean=0.72]) + 0.10×fmt(1.000) | pred='90' gold='180' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 03:51:29,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.164 = 0.50×0.02(prox=0.02) + 0.40×proc(0.137[fin=0.09,mean=0.20]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:51:30,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.316 = 0.50×0.34(prox=0.34) + 0.40×proc(0.117[fin=0.07,mean=0.18]) + 0.10×fmt(1.000) | pred='4' gold='180' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 03:51:30,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.188 = 0.50×0.02(prox=0.02) + 0.40×proc(0.195[fin=0.14,mean=0.27]) + 0.10×fmt(1.000) | pred='4320' gold='180' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 03:51:30,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.362 = 0.50×0.04(prox=0.04) + 0.40×proc(0.327[fin=0.16,mean=0.57]) + 0.10×fmt(1.000) | pred='2520' gold='180' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 03:51:30,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.232 = 0.50×0.02(prox=0.02) + 0.40×proc(0.308[fin=0.29,mean=0.34]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 03:51:30,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.179 = 0.50×0.02(prox=0.02) + 0.40×proc(0.174[fin=0.14,mean=0.23]) + 0.10×fmt(1.000) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 03:51:30,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.307 = 0.50×0.04(prox=0.04) + 0.40×proc(0.471[fin=0.36,mean=0.64]) + 0.10×fmt(1.000) | pred='2520' gold='180' | step_acc=67% lccp=0% (chain=0/9 ok_count=6) n_steps=9
+2026-04-26 03:51:30,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.086 = 0.50×0.02(prox=0.02) + 0.40×proc(0.029[fin=0.03,mean=0.03]) + 0.10×fmt(0.650) | pred='5040' gold='180' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 03:51:30,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.60(prox=0.60) + 0.40×proc(0.073[fin=0.05,mean=0.10]) + 0.10×fmt(0.650) | pred='120' gold='180' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+
Iter 5 GRPO groups: 95%|#########5| 19/20 [03:21<00:14, 14.75s/q, loss=0.0005, mean_r=0.292, skip=4]
Iter 5 GRPO groups: 100%|##########| 20/20 [03:21<00:00, 13.63s/q, loss=0.0005, mean_r=0.292, skip=4]
Iter 5 GRPO groups: 100%|##########| 20/20 [03:21<00:00, 10.08s/q, loss=0.0005, mean_r=0.292, skip=4]
+2026-04-26 03:51:32,030 INFO __main__ - Iter 5 | loss=0.0001 | reward mean=0.857 std=0.239 | gt_match=69.3% | grounded_acc=88.4% | step_acc=83.5% | lccp=72.5% | batch_acc=88.4% | phase=GROUNDED_ONLY sp_ratio=0% | groups=16 skipped=4(0var=4) | lr=3.31e-06 | 201.7s
+2026-04-26 03:51:32,030 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:32, 2.23s/q, correct=1/1, lccp=100.0%, score=0.998, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:23, 3.81s/q, correct=2/2, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:59, 3.26s/q, correct=3/3, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:11<06:57, 2.86s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:48, 2.40s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:14, 3.43s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:08, 3.42s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:18, 3.09s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:21, 3.13s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<09:46, 4.19s/q, correct=8/10, lccp=85.3%, score=0.932, step_acc=92.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<08:45, 3.78s/q, correct=9/11, lccp=86.7%, score=0.938, step_acc=93.3%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:30, 3.26s/q, correct=10/12, lccp=87.8%, score=0.943, step_acc=93.9%]
GSM8K eval: 9%|8 | 13/150 [00:42<07:03, 3.09s/q, correct=11/13, lccp=88.7%, score=0.943, step_acc=94.4%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:57, 3.51s/q, correct=12/14, lccp=89.5%, score=0.947, step_acc=94.8%]
GSM8K eval: 10%|# | 15/150 [00:49<07:15, 3.22s/q, correct=13/15, lccp=90.2%, score=0.951, step_acc=95.1%]
GSM8K eval: 11%|# | 16/150 [00:51<06:42, 3.01s/q, correct=13/16, lccp=90.8%, score=0.926, step_acc=95.4%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:16, 3.28s/q, correct=14/17, lccp=91.4%, score=0.930, step_acc=95.7%]
GSM8K eval: 12%|#2 | 18/150 [00:59<07:44, 3.52s/q, correct=14/18, lccp=87.2%, score=0.907, step_acc=92.2%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:05, 3.25s/q, correct=15/19, lccp=87.9%, score=0.912, step_acc=92.6%]
GSM8K eval: 13%|#3 | 20/150 [01:05<07:17, 3.37s/q, correct=16/20, lccp=88.5%, score=0.916, step_acc=93.0%]
GSM8K eval: 14%|#4 | 21/150 [01:08<06:41, 3.12s/q, correct=17/21, lccp=89.0%, score=0.920, step_acc=93.3%]
GSM8K eval: 15%|#4 | 22/150 [01:11<06:21, 2.98s/q, correct=18/22, lccp=86.5%, score=0.915, step_acc=92.1%]
GSM8K eval: 15%|#5 | 23/150 [01:15<06:59, 3.30s/q, correct=19/23, lccp=87.1%, score=0.919, step_acc=92.5%]
GSM8K eval: 16%|#6 | 24/150 [01:17<06:27, 3.08s/q, correct=19/24, lccp=84.5%, score=0.902, step_acc=89.7%]
GSM8K eval: 17%|#6 | 25/150 [01:20<06:10, 2.97s/q, correct=19/25, lccp=82.1%, score=0.898, step_acc=89.1%]
GSM8K eval: 17%|#7 | 26/150 [01:24<06:58, 3.37s/q, correct=20/26, lccp=82.8%, score=0.902, step_acc=89.5%]
GSM8K eval: 18%|#8 | 27/150 [01:27<06:31, 3.19s/q, correct=20/27, lccp=83.5%, score=0.897, step_acc=89.9%]
GSM8K eval: 19%|#8 | 28/150 [01:29<05:51, 2.88s/q, correct=21/28, lccp=84.0%, score=0.900, step_acc=90.2%]
GSM8K eval: 19%|#9 | 29/150 [01:32<05:42, 2.83s/q, correct=22/29, lccp=84.6%, score=0.904, step_acc=90.6%]
GSM8K eval: 20%|## | 30/150 [01:36<06:12, 3.11s/q, correct=23/30, lccp=85.1%, score=0.907, step_acc=90.9%]
GSM8K eval: 21%|## | 31/150 [01:38<05:50, 2.95s/q, correct=24/31, lccp=85.6%, score=0.910, step_acc=91.2%]
GSM8K eval: 21%|##1 | 32/150 [01:40<05:03, 2.57s/q, correct=25/32, lccp=86.0%, score=0.912, step_acc=91.5%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:07, 2.63s/q, correct=26/33, lccp=86.5%, score=0.914, step_acc=91.7%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:41, 2.43s/q, correct=27/34, lccp=86.9%, score=0.917, step_acc=92.0%]
GSM8K eval: 23%|##3 | 35/150 [01:47<04:43, 2.47s/q, correct=28/35, lccp=87.2%, score=0.919, step_acc=92.2%]
GSM8K eval: 24%|##4 | 36/150 [01:51<05:15, 2.77s/q, correct=29/36, lccp=87.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:45, 2.53s/q, correct=30/37, lccp=87.9%, score=0.923, step_acc=92.6%]
GSM8K eval: 25%|##5 | 38/150 [01:56<04:57, 2.66s/q, correct=31/38, lccp=88.2%, score=0.925, step_acc=92.8%]
GSM8K eval: 26%|##6 | 39/150 [02:00<06:03, 3.28s/q, correct=32/39, lccp=88.5%, score=0.926, step_acc=93.0%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:36, 4.15s/q, correct=33/40, lccp=88.8%, score=0.928, step_acc=93.2%]
GSM8K eval: 27%|##7 | 41/150 [02:10<06:55, 3.81s/q, correct=33/41, lccp=89.1%, score=0.928, step_acc=93.3%]
GSM8K eval: 28%|##8 | 42/150 [02:14<07:07, 3.96s/q, correct=34/42, lccp=87.9%, score=0.929, step_acc=93.0%]
GSM8K eval: 29%|##8 | 43/150 [02:16<05:56, 3.33s/q, correct=35/43, lccp=88.2%, score=0.930, step_acc=93.2%]
GSM8K eval: 29%|##9 | 44/150 [02:22<07:25, 4.20s/q, correct=36/44, lccp=88.5%, score=0.932, step_acc=93.3%]
GSM8K eval: 30%|### | 45/150 [02:25<06:44, 3.85s/q, correct=37/45, lccp=88.7%, score=0.933, step_acc=93.5%]
GSM8K eval: 31%|### | 46/150 [02:30<07:10, 4.14s/q, correct=37/46, lccp=86.8%, score=0.928, step_acc=93.4%]
GSM8K eval: 31%|###1 | 47/150 [02:33<06:30, 3.79s/q, correct=38/47, lccp=87.1%, score=0.930, step_acc=93.5%]
GSM8K eval: 32%|###2 | 48/150 [02:35<05:24, 3.18s/q, correct=39/48, lccp=87.4%, score=0.931, step_acc=93.7%]
GSM8K eval: 33%|###2 | 49/150 [02:38<05:33, 3.30s/q, correct=40/49, lccp=86.3%, score=0.932, step_acc=93.4%]
GSM8K eval: 33%|###3 | 50/150 [02:41<05:25, 3.26s/q, correct=40/50, lccp=85.5%, score=0.923, step_acc=92.6%]
GSM8K eval: 34%|###4 | 51/150 [02:43<04:27, 2.70s/q, correct=41/51, lccp=85.8%, score=0.925, step_acc=92.7%]
GSM8K eval: 35%|###4 | 52/150 [02:47<05:19, 3.26s/q, correct=41/52, lccp=84.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 35%|###5 | 53/150 [02:52<05:57, 3.69s/q, correct=41/53, lccp=83.7%, score=0.917, step_acc=91.9%]
GSM8K eval: 36%|###6 | 54/150 [02:54<05:15, 3.28s/q, correct=42/54, lccp=84.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 37%|###6 | 55/150 [02:58<05:18, 3.36s/q, correct=43/55, lccp=84.3%, score=0.920, step_acc=92.2%]
GSM8K eval: 37%|###7 | 56/150 [03:01<05:21, 3.42s/q, correct=44/56, lccp=84.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 38%|###8 | 57/150 [03:04<04:46, 3.09s/q, correct=45/57, lccp=84.9%, score=0.922, step_acc=92.5%]
GSM8K eval: 39%|###8 | 58/150 [03:08<05:10, 3.37s/q, correct=46/58, lccp=85.1%, score=0.924, step_acc=92.6%]
GSM8K eval: 39%|###9 | 59/150 [03:12<05:17, 3.49s/q, correct=46/59, lccp=83.7%, score=0.917, step_acc=92.1%]
GSM8K eval: 40%|#### | 60/150 [03:16<05:52, 3.92s/q, correct=47/60, lccp=83.9%, score=0.918, step_acc=92.2%]
GSM8K eval: 41%|#### | 61/150 [03:20<05:27, 3.68s/q, correct=48/61, lccp=84.2%, score=0.920, step_acc=92.3%]
GSM8K eval: 41%|####1 | 62/150 [03:23<05:07, 3.50s/q, correct=49/62, lccp=84.5%, score=0.921, step_acc=92.5%]
GSM8K eval: 42%|####2 | 63/150 [03:26<04:58, 3.43s/q, correct=49/63, lccp=84.2%, score=0.915, step_acc=92.0%]
GSM8K eval: 43%|####2 | 64/150 [03:29<04:39, 3.25s/q, correct=50/64, lccp=84.4%, score=0.916, step_acc=92.2%]
GSM8K eval: 43%|####3 | 65/150 [03:31<04:24, 3.11s/q, correct=51/65, lccp=84.7%, score=0.917, step_acc=92.3%]
GSM8K eval: 44%|####4 | 66/150 [03:33<03:38, 2.61s/q, correct=52/66, lccp=84.9%, score=0.919, step_acc=92.4%]
GSM8K eval: 45%|####4 | 67/150 [03:35<03:27, 2.50s/q, correct=53/67, lccp=85.1%, score=0.920, step_acc=92.5%]
GSM8K eval: 45%|####5 | 68/150 [03:38<03:29, 2.55s/q, correct=54/68, lccp=85.3%, score=0.921, step_acc=92.6%]
GSM8K eval: 46%|####6 | 69/150 [03:39<03:01, 2.24s/q, correct=55/69, lccp=85.6%, score=0.922, step_acc=92.7%]
GSM8K eval: 47%|####6 | 70/150 [03:42<03:15, 2.44s/q, correct=56/70, lccp=84.3%, score=0.923, step_acc=92.6%]
GSM8K eval: 47%|####7 | 71/150 [03:45<03:28, 2.64s/q, correct=57/71, lccp=83.1%, score=0.924, step_acc=92.4%]
GSM8K eval: 48%|####8 | 72/150 [03:47<02:57, 2.27s/q, correct=58/72, lccp=83.4%, score=0.925, step_acc=92.5%]
GSM8K eval: 49%|####8 | 73/150 [03:48<02:40, 2.09s/q, correct=59/73, lccp=83.6%, score=0.926, step_acc=92.6%]
GSM8K eval: 49%|####9 | 74/150 [03:52<03:12, 2.53s/q, correct=60/74, lccp=83.8%, score=0.927, step_acc=92.7%]
GSM8K eval: 50%|##### | 75/150 [03:54<02:51, 2.29s/q, correct=61/75, lccp=84.0%, score=0.928, step_acc=92.8%]
GSM8K eval: 51%|##### | 76/150 [04:00<04:26, 3.60s/q, correct=61/76, lccp=84.1%, score=0.923, step_acc=92.7%]
GSM8K eval: 51%|#####1 | 77/150 [04:04<04:29, 3.69s/q, correct=62/77, lccp=84.3%, score=0.924, step_acc=92.8%]
GSM8K eval: 52%|#####2 | 78/150 [04:07<03:58, 3.31s/q, correct=63/78, lccp=84.5%, score=0.924, step_acc=92.9%]
GSM8K eval: 53%|#####2 | 79/150 [04:10<03:48, 3.22s/q, correct=63/79, lccp=83.6%, score=0.919, step_acc=92.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:13<03:38, 3.13s/q, correct=64/80, lccp=83.8%, score=0.920, step_acc=92.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:15<03:19, 2.89s/q, correct=65/81, lccp=84.0%, score=0.921, step_acc=92.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:18<03:16, 2.89s/q, correct=66/82, lccp=84.2%, score=0.922, step_acc=92.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:21<03:10, 2.84s/q, correct=67/83, lccp=84.4%, score=0.923, step_acc=92.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:23<03:03, 2.78s/q, correct=68/84, lccp=84.6%, score=0.923, step_acc=92.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:27<03:18, 3.06s/q, correct=69/85, lccp=84.8%, score=0.924, step_acc=92.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:30<03:22, 3.16s/q, correct=70/86, lccp=85.0%, score=0.925, step_acc=92.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:36<04:02, 3.85s/q, correct=71/87, lccp=85.1%, score=0.926, step_acc=92.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:38<03:21, 3.25s/q, correct=72/88, lccp=85.3%, score=0.927, step_acc=93.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:40<03:08, 3.09s/q, correct=73/89, lccp=85.5%, score=0.928, step_acc=93.0%]
GSM8K eval: 60%|###### | 90/150 [04:43<02:52, 2.88s/q, correct=74/90, lccp=85.6%, score=0.928, step_acc=93.1%]
GSM8K eval: 61%|###### | 91/150 [04:47<03:14, 3.30s/q, correct=75/91, lccp=85.8%, score=0.929, step_acc=93.2%]
GSM8K eval: 61%|######1 | 92/150 [04:50<03:07, 3.23s/q, correct=76/92, lccp=86.0%, score=0.930, step_acc=93.3%]
GSM8K eval: 62%|######2 | 93/150 [04:57<04:06, 4.33s/q, correct=77/93, lccp=86.1%, score=0.930, step_acc=93.3%]
GSM8K eval: 63%|######2 | 94/150 [04:58<03:15, 3.48s/q, correct=77/94, lccp=85.2%, score=0.925, step_acc=92.3%]
GSM8K eval: 63%|######3 | 95/150 [05:04<03:42, 4.05s/q, correct=78/95, lccp=84.3%, score=0.925, step_acc=91.9%]
GSM8K eval: 64%|######4 | 96/150 [05:07<03:23, 3.77s/q, correct=78/96, lccp=83.8%, score=0.921, step_acc=91.3%]
GSM8K eval: 65%|######4 | 97/150 [05:10<03:01, 3.42s/q, correct=78/97, lccp=83.2%, score=0.919, step_acc=90.9%]
GSM8K eval: 65%|######5 | 98/150 [05:14<03:08, 3.62s/q, correct=78/98, lccp=82.7%, score=0.915, step_acc=90.7%]
GSM8K eval: 66%|######6 | 99/150 [05:16<02:44, 3.22s/q, correct=79/99, lccp=82.9%, score=0.916, step_acc=90.8%]
GSM8K eval: 67%|######6 | 100/150 [05:18<02:20, 2.81s/q, correct=80/100, lccp=82.1%, score=0.916, step_acc=90.5%]
GSM8K eval: 67%|######7 | 101/150 [05:21<02:19, 2.84s/q, correct=80/101, lccp=81.8%, score=0.913, step_acc=90.4%]
GSM8K eval: 68%|######8 | 102/150 [05:22<01:56, 2.43s/q, correct=81/102, lccp=81.9%, score=0.913, step_acc=90.5%]
GSM8K eval: 69%|######8 | 103/150 [05:24<01:48, 2.30s/q, correct=82/103, lccp=82.1%, score=0.914, step_acc=90.5%]
GSM8K eval: 69%|######9 | 104/150 [05:29<02:18, 3.00s/q, correct=83/104, lccp=82.3%, score=0.915, step_acc=90.6%]
GSM8K eval: 70%|####### | 105/150 [05:31<02:08, 2.86s/q, correct=84/105, lccp=82.5%, score=0.916, step_acc=90.7%]
GSM8K eval: 71%|####### | 106/150 [05:33<01:47, 2.45s/q, correct=85/106, lccp=82.6%, score=0.917, step_acc=90.8%]
GSM8K eval: 71%|#######1 | 107/150 [05:34<01:32, 2.16s/q, correct=86/107, lccp=82.8%, score=0.917, step_acc=90.9%]
GSM8K eval: 72%|#######2 | 108/150 [05:37<01:36, 2.30s/q, correct=87/108, lccp=83.0%, score=0.918, step_acc=91.0%]
GSM8K eval: 73%|#######2 | 109/150 [05:42<02:06, 3.08s/q, correct=87/109, lccp=82.5%, score=0.917, step_acc=90.9%]
GSM8K eval: 73%|#######3 | 110/150 [05:44<01:51, 2.80s/q, correct=88/110, lccp=82.7%, score=0.917, step_acc=91.0%]
GSM8K eval: 74%|#######4 | 111/150 [05:46<01:35, 2.46s/q, correct=89/111, lccp=82.8%, score=0.918, step_acc=91.1%]
GSM8K eval: 75%|#######4 | 112/150 [05:51<02:03, 3.25s/q, correct=89/112, lccp=83.0%, score=0.918, step_acc=91.2%]
GSM8K eval: 75%|#######5 | 113/150 [05:53<01:44, 2.81s/q, correct=90/113, lccp=83.1%, score=0.918, step_acc=91.2%]
GSM8K eval: 76%|#######6 | 114/150 [05:56<01:46, 2.96s/q, correct=90/114, lccp=82.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 77%|#######6 | 115/150 [05:58<01:37, 2.77s/q, correct=91/115, lccp=82.9%, score=0.915, step_acc=90.9%]
GSM8K eval: 77%|#######7 | 116/150 [06:01<01:34, 2.76s/q, correct=92/116, lccp=83.0%, score=0.916, step_acc=90.9%]
GSM8K eval: 78%|#######8 | 117/150 [06:07<01:59, 3.63s/q, correct=93/117, lccp=83.2%, score=0.917, step_acc=91.0%]
GSM8K eval: 79%|#######8 | 118/150 [06:11<02:02, 3.84s/q, correct=93/118, lccp=82.5%, score=0.914, step_acc=91.0%]
GSM8K eval: 79%|#######9 | 119/150 [06:14<01:55, 3.72s/q, correct=93/119, lccp=82.6%, score=0.913, step_acc=91.0%]
GSM8K eval: 80%|######## | 120/150 [06:17<01:42, 3.42s/q, correct=94/120, lccp=82.8%, score=0.914, step_acc=91.1%]
GSM8K eval: 81%|######## | 121/150 [06:20<01:36, 3.32s/q, correct=95/121, lccp=82.9%, score=0.914, step_acc=91.2%]
GSM8K eval: 81%|########1 | 122/150 [06:23<01:31, 3.26s/q, correct=96/122, lccp=83.0%, score=0.915, step_acc=91.3%]
GSM8K eval: 82%|########2 | 123/150 [06:27<01:28, 3.29s/q, correct=96/123, lccp=82.7%, score=0.915, step_acc=91.2%]
GSM8K eval: 83%|########2 | 124/150 [06:29<01:17, 2.99s/q, correct=97/124, lccp=82.8%, score=0.916, step_acc=91.2%]
GSM8K eval: 83%|########3 | 125/150 [06:31<01:07, 2.71s/q, correct=98/125, lccp=83.0%, score=0.916, step_acc=91.3%]
GSM8K eval: 84%|########4 | 126/150 [06:34<01:05, 2.72s/q, correct=99/126, lccp=83.1%, score=0.917, step_acc=91.4%]
GSM8K eval: 85%|########4 | 127/150 [06:38<01:14, 3.26s/q, correct=100/127, lccp=83.2%, score=0.917, step_acc=91.4%]
GSM8K eval: 85%|########5 | 128/150 [06:41<01:09, 3.18s/q, correct=101/128, lccp=83.4%, score=0.918, step_acc=91.5%]
GSM8K eval: 86%|########6 | 129/150 [06:45<01:09, 3.31s/q, correct=102/129, lccp=83.5%, score=0.919, step_acc=91.6%]
GSM8K eval: 87%|########6 | 130/150 [06:47<00:57, 2.87s/q, correct=103/130, lccp=83.6%, score=0.919, step_acc=91.6%]
GSM8K eval: 87%|########7 | 131/150 [06:51<01:04, 3.41s/q, correct=104/131, lccp=83.8%, score=0.920, step_acc=91.7%]
GSM8K eval: 88%|########8 | 132/150 [06:53<00:51, 2.88s/q, correct=105/132, lccp=83.9%, score=0.920, step_acc=91.8%]
GSM8K eval: 89%|########8 | 133/150 [06:56<00:48, 2.87s/q, correct=106/133, lccp=84.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 89%|########9 | 134/150 [07:00<00:53, 3.33s/q, correct=107/134, lccp=84.1%, score=0.922, step_acc=91.9%]
GSM8K eval: 90%|######### | 135/150 [07:03<00:47, 3.19s/q, correct=108/135, lccp=84.2%, score=0.922, step_acc=91.9%]
GSM8K eval: 91%|######### | 136/150 [07:08<00:50, 3.59s/q, correct=108/136, lccp=83.9%, score=0.921, step_acc=91.8%]
GSM8K eval: 91%|#########1| 137/150 [07:14<00:59, 4.55s/q, correct=109/137, lccp=84.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 92%|#########2| 138/150 [07:18<00:52, 4.38s/q, correct=110/138, lccp=84.1%, score=0.922, step_acc=91.9%]
GSM8K eval: 93%|#########2| 139/150 [07:22<00:45, 4.10s/q, correct=111/139, lccp=84.2%, score=0.923, step_acc=91.9%]
GSM8K eval: 93%|#########3| 140/150 [07:26<00:41, 4.17s/q, correct=111/140, lccp=84.1%, score=0.919, step_acc=91.8%]
GSM8K eval: 94%|#########3| 141/150 [07:30<00:36, 4.07s/q, correct=112/141, lccp=84.2%, score=0.919, step_acc=91.8%]
GSM8K eval: 95%|#########4| 142/150 [07:33<00:30, 3.85s/q, correct=113/142, lccp=84.3%, score=0.920, step_acc=91.9%]
GSM8K eval: 95%|#########5| 143/150 [07:36<00:23, 3.38s/q, correct=114/143, lccp=84.4%, score=0.921, step_acc=91.9%]
GSM8K eval: 96%|#########6| 144/150 [07:38<00:18, 3.08s/q, correct=115/144, lccp=84.5%, score=0.921, step_acc=92.0%]
GSM8K eval: 97%|#########6| 145/150 [07:43<00:18, 3.72s/q, correct=115/145, lccp=84.0%, score=0.919, step_acc=91.8%]
GSM8K eval: 97%|#########7| 146/150 [07:46<00:13, 3.49s/q, correct=116/146, lccp=84.2%, score=0.919, step_acc=91.9%]
GSM8K eval: 98%|#########8| 147/150 [07:50<00:10, 3.59s/q, correct=117/147, lccp=84.3%, score=0.920, step_acc=92.0%]
GSM8K eval: 99%|#########8| 148/150 [07:54<00:07, 3.62s/q, correct=118/148, lccp=84.4%, score=0.920, step_acc=92.0%]
GSM8K eval: 99%|#########9| 149/150 [07:57<00:03, 3.57s/q, correct=119/149, lccp=84.5%, score=0.921, step_acc=92.1%]
GSM8K eval: 100%|##########| 150/150 [08:02<00:00, 3.96s/q, correct=119/150, lccp=84.3%, score=0.919, step_acc=91.9%]
GSM8K eval: 100%|##########| 150/150 [08:02<00:00, 3.22s/q, correct=119/150, lccp=84.3%, score=0.919, step_acc=91.9%]
+2026-04-26 03:59:34,576 INFO __main__ - Training Score [iter 5]: 0.9192 (best=0.9162) | n=150
+2026-04-26 03:59:34,576 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(0.998)
+2026-04-26 03:59:34,576 INFO __main__ - Process score : prm_mean=0.903 prm_final=0.930 → weighted=0.920
+2026-04-26 03:59:34,577 INFO __main__ - Step accuracy : 91.8% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 03:59:34,577 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 03:59:34,577 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 03:59:36,811 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9192 > 0.9162)
+2026-04-26 03:59:39,019 INFO __main__ - ======================================================================
+2026-04-26 03:59:39,019 INFO __main__ - GRPO ITERATION 6/60
+2026-04-26 03:59:39,020 INFO __main__ - ======================================================================
+2026-04-26 03:59:39,040 INFO __main__ - LR this iteration: 3.31e-06 | T=0.766 | MATH ratio=30%
+
Iter 6 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 03:59:45,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,311 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 03:59:45,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='101' gold='101' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 6 GRPO groups: 0%| | 0/20 [00:06, ?q/s, loss=0var, mean_r=0.998, skip=1]
Iter 6 GRPO groups: 5%|5 | 1/20 [00:06<02:10, 6.88s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 03:59:49,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 03:59:49,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 5%|5 | 1/20 [00:10<02:10, 6.88s/q, loss=0var, mean_r=0.997, skip=2]
Iter 6 GRPO groups: 10%|# | 2/20 [00:10<01:32, 5.12s/q, loss=0var, mean_r=0.997, skip=2]2026-04-26 04:00:01,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:00:01,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:00:01,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.897[fin=0.96,mean=0.79]) + 0.10×fmt(1.000) | pred='26' gold='25' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:00:01,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:01,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:01,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:00:01,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 04:00:02,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.85(prox=0.85) + 0.40×proc(0.567[fin=0.42,mean=0.79]) + 0.10×fmt(1.000) | pred='26' gold='25' | step_acc=75% lccp=75% (chain=6/8 ok_count=6) n_steps=8
+
Iter 6 GRPO groups: 10%|# | 2/20 [00:24<01:32, 5.12s/q, loss=0.0001, mean_r=0.954, skip=2]
Iter 6 GRPO groups: 15%|#5 | 3/20 [00:24<02:34, 9.10s/q, loss=0.0001, mean_r=0.954, skip=2]2026-04-26 04:00:07,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:07,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='$240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=0.97,mean=0.80]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:00:08,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.674 = 0.50×0.45(prox=0.45) + 0.40×proc(0.867[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='384000' gold='240000' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:00:08,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,445 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:00:08,527 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.45(prox=0.45) + 0.40×proc(0.784[fin=0.97,mean=0.51]) + 0.10×fmt(1.000) | pred='384000' gold='240000' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:00:08,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240000' gold='240000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 15%|#5 | 3/20 [00:31<02:34, 9.10s/q, loss=-0.0007, mean_r=0.925, skip=2]
Iter 6 GRPO groups: 20%|## | 4/20 [00:31<02:09, 8.09s/q, loss=-0.0007, mean_r=0.925, skip=2]2026-04-26 04:00:18,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:18,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:00:18,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:18,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:00:18,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.938[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:00:19,156 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.877[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='25' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:00:19,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.832[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='0' gold='10' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:00:19,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.508 = 0.50×0.05(prox=0.05) + 0.40×proc(0.857[fin=0.99,mean=0.65]) + 0.10×fmt(1.000) | pred='115' gold='10' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+
Iter 6 GRPO groups: 20%|## | 4/20 [00:41<02:09, 8.09s/q, loss=0.0007, mean_r=0.681, skip=2]
Iter 6 GRPO groups: 25%|##5 | 5/20 [00:41<02:15, 9.03s/q, loss=0.0007, mean_r=0.681, skip=2]2026-04-26 04:00:24,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:00:25,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='74' gold='74' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 25%|##5 | 5/20 [00:46<02:15, 9.03s/q, loss=0var, mean_r=0.999, skip=3]
Iter 6 GRPO groups: 30%|### | 6/20 [00:46<01:46, 7.61s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:00:59,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.381 = 0.50×0.00(prox=0.00) + 0.40×proc(0.609[fin=0.71,mean=0.45]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:00:59,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.646[fin=0.69,mean=0.58]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:00:59,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.50(prox=0.50) + 0.40×proc(0.306[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='60' gold='119' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:00:59,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.037 = 0.50×0.00(prox=0.00) + 0.40×proc(0.006[fin=0.01,mean=0.01]) + 0.10×fmt(0.350) | pred='' gold='119' | step_acc=0% lccp=0% (chain=0/1 ok_count=0) n_steps=1
+2026-04-26 04:00:59,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.347 = 0.50×0.00(prox=0.00) + 0.40×proc(0.706[fin=0.88,mean=0.45]) + 0.10×fmt(0.650) | pred='C' gold='119' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:00:59,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.161 = 0.50×0.00(prox=0.00) + 0.40×proc(0.153[fin=0.04,mean=0.32]) + 0.10×fmt(0.700) | pred='' gold='119' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:01:00,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=100% lccp=100% (chain=16/16 ok_count=16) n_steps=16
+2026-04-26 04:01:00,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.00(prox=0.00) + 0.40×proc(0.628[fin=0.58,mean=0.70]) + 0.10×fmt(1.000) | pred='C' gold='119' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:01:00,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.475 = 0.50×0.00(prox=0.00) + 0.40×proc(0.953[fin=1.00,mean=0.89]) + 0.10×fmt(0.700) | pred='' gold='119' | step_acc=92% lccp=16% (chain=4/25 ok_count=23) n_steps=25
+
Iter 6 GRPO groups: 30%|### | 6/20 [01:22<01:46, 7.61s/q, loss=-0.0006, mean_r=0.360, skip=3]
Iter 6 GRPO groups: 35%|###5 | 7/20 [01:22<03:39, 16.92s/q, loss=-0.0006, mean_r=0.360, skip=3]2026-04-26 04:01:24,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.485[fin=0.44,mean=0.56]) + 0.10×fmt(1.000) | pred='3' gold='8' | step_acc=62% lccp=25% (chain=2/8 ok_count=5) n_steps=8
+2026-04-26 04:01:24,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(0.700) | pred='' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:01:24,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.330 = 0.50×0.00(prox=0.00) + 0.40×proc(0.525[fin=0.47,mean=0.61]) + 0.10×fmt(0.700) | pred='' gold='8' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:01:25,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.560[fin=0.64,mean=0.43]) + 0.10×fmt(1.000) | pred='2.40824' gold='8' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:01:25,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.754 = 0.50×0.57(prox=0.57) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='11' gold='8' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:01:25,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:01:25,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.671 = 0.50×0.80(prox=0.80) + 0.40×proc(0.428[fin=0.55,mean=0.24]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:01:25,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.856 = 0.50×0.80(prox=0.80) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:01:25,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:01:25,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.591 = 0.50×0.67(prox=0.67) + 0.40×proc(0.393[fin=0.53,mean=0.19]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 6 GRPO groups: 35%|###5 | 7/20 [01:48<03:39, 16.92s/q, loss=0.0011, mean_r=0.681, skip=3]
Iter 6 GRPO groups: 40%|#### | 8/20 [01:48<03:55, 19.59s/q, loss=0.0011, mean_r=0.681, skip=3]2026-04-26 04:01:35,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:35,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:01:35,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.314 = 0.50×0.34(prox=0.34) + 0.40×proc(0.112[fin=0.07,mean=0.18]) + 0.10×fmt(1.000) | pred='1' gold='50' | step_acc=12% lccp=0% (chain=0/8 ok_count=1) n_steps=8
+2026-04-26 04:01:36,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.431 = 0.50×0.57(prox=0.57) + 0.40×proc(0.117[fin=0.04,mean=0.24]) + 0.10×fmt(1.000) | pred='31' gold='50' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:01:36,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:36,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.295 = 0.50×0.02(prox=0.02) + 0.40×proc(0.270[fin=0.10,mean=0.52]) + 0.10×fmt(1.000) | pred='1100' gold='50' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:01:36,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.895 = 0.50×1.00(exact) + 0.40×proc(0.738[fin=0.85,mean=0.57]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:01:36,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:01:36,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:01:36,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 6 GRPO groups: 40%|#### | 8/20 [01:58<03:55, 19.59s/q, loss=0.0008, mean_r=0.793, skip=3]
Iter 6 GRPO groups: 45%|####5 | 9/20 [01:58<03:05, 16.85s/q, loss=0.0008, mean_r=0.793, skip=3]2026-04-26 04:01:44,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.452 = 0.50×0.27(prox=0.27) + 0.40×proc(0.258[fin=0.01,mean=0.63]) + 0.10×fmt(1.000) | pred='315' gold='135' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:01:44,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,354 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.53(prox=0.53) + 0.40×proc(0.761[fin=0.82,mean=0.68]) + 0.10×fmt(1.000) | pred='195' gold='135' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:01:44,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,613 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:01:44,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.27(prox=0.27) + 0.40×proc(0.284[fin=0.04,mean=0.66]) + 0.10×fmt(1.000) | pred='315' gold='135' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:01:44,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='135' gold='135' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 45%|####5 | 9/20 [02:07<03:05, 16.85s/q, loss=0.0002, mean_r=0.857, skip=3]
Iter 6 GRPO groups: 50%|##### | 10/20 [02:07<02:22, 14.25s/q, loss=0.0002, mean_r=0.857, skip=3]2026-04-26 04:01:54,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.487 = 0.50×0.36(prox=0.36) + 0.40×proc(0.270[fin=0.02,mean=0.64]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:01:54,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.846[fin=0.93,mean=0.72]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:01:54,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:01:54,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:01:54,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.515 = 0.50×0.36(prox=0.36) + 0.40×proc(0.374[fin=0.22,mean=0.61]) + 0.10×fmt(1.000) | pred='5.5' gold='50' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:01:54,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.36(prox=0.36) + 0.40×proc(0.241[fin=0.02,mean=0.57]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:01:54,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.659 = 0.50×0.85(prox=0.85) + 0.40×proc(0.334[fin=0.03,mean=0.80]) + 0.10×fmt(1.000) | pred='49.5' gold='50' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 04:01:54,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.925[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='5.75' gold='50' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:01:55,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×0.85(prox=0.85) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='51' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:01:55,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 6 GRPO groups: 50%|##### | 10/20 [02:17<02:22, 14.25s/q, loss=-0.0004, mean_r=0.752, skip=3]
Iter 6 GRPO groups: 55%|#####5 | 11/20 [02:17<01:57, 13.01s/q, loss=-0.0004, mean_r=0.752, skip=3]2026-04-26 04:01:59,602 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.44(prox=0.44) + 0.40×proc(0.868[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='65' gold='40' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:01:59,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.666 = 0.50×0.44(prox=0.44) + 0.40×proc(0.858[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='65' gold='40' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:01:59,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:59,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:01:59,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,171 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:00,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 55%|#####5 | 11/20 [02:22<01:57, 13.01s/q, loss=-0.0002, mean_r=0.927, skip=3]
Iter 6 GRPO groups: 60%|###### | 12/20 [02:22<01:25, 10.64s/q, loss=-0.0002, mean_r=0.927, skip=3]2026-04-26 04:02:05,789 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:05,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:05,956 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:06,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.953[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 60%|###### | 12/20 [02:27<01:25, 10.64s/q, loss=0var, mean_r=0.991, skip=4]
Iter 6 GRPO groups: 65%|######5 | 13/20 [02:27<01:01, 8.85s/q, loss=0var, mean_r=0.991, skip=4]2026-04-26 04:02:11,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.758 = 0.50×0.58(prox=0.58) + 0.40×proc(0.916[fin=0.98,mean=0.82]) + 0.10×fmt(1.000) | pred='38' gold='28' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:02:11,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:11,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:02:12,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 65%|######5 | 13/20 [02:34<01:01, 8.85s/q, loss=0.0004, mean_r=0.971, skip=4]
Iter 6 GRPO groups: 70%|####### | 14/20 [02:34<00:49, 8.29s/q, loss=0.0004, mean_r=0.971, skip=4]2026-04-26 04:02:16,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,290 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:02:17,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 6 GRPO groups: 70%|####### | 14/20 [02:38<00:49, 8.29s/q, loss=0var, mean_r=0.999, skip=5]
Iter 6 GRPO groups: 75%|#######5 | 15/20 [02:38<00:35, 7.05s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:02:23,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:23,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:23,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:02:23,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:23,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:23,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:02:24,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:02:24,141 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:02:24,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:02:24,308 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 6 GRPO groups: 75%|#######5 | 15/20 [02:45<00:35, 7.05s/q, loss=0var, mean_r=0.998, skip=6]
Iter 6 GRPO groups: 80%|######## | 16/20 [02:45<00:27, 6.92s/q, loss=0var, mean_r=0.998, skip=6]2026-04-26 04:02:30,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:30,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:30,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:02:31,384 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 6 GRPO groups: 80%|######## | 16/20 [02:52<00:27, 6.92s/q, loss=0var, mean_r=1.000, skip=7]
Iter 6 GRPO groups: 85%|########5 | 17/20 [02:52<00:20, 6.97s/q, loss=0var, mean_r=1.000, skip=7]2026-04-26 04:02:34,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:02:34,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:34,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:35,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.57(prox=0.57) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='88' gold='64' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 85%|########5 | 17/20 [02:57<00:20, 6.97s/q, loss=-0.0011, mean_r=0.968, skip=7]
Iter 6 GRPO groups: 90%|######### | 18/20 [02:57<00:12, 6.41s/q, loss=-0.0011, mean_r=0.968, skip=7]2026-04-26 04:02:39,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,665 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:39,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='230' gold='230' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 6 GRPO groups: 90%|######### | 18/20 [03:00<00:12, 6.41s/q, loss=0var, mean_r=0.999, skip=8]
Iter 6 GRPO groups: 95%|#########5| 19/20 [03:00<00:05, 5.51s/q, loss=0var, mean_r=0.999, skip=8]2026-04-26 04:02:50,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:02:50,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.521 = 0.50×0.71(prox=0.71) + 0.40×proc(0.159[fin=0.10,mean=0.25]) + 0.10×fmt(1.000) | pred='6' gold='5' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+2026-04-26 04:02:50,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.56(prox=0.56) + 0.40×proc(0.152[fin=0.19,mean=0.10]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:02:50,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.678 = 0.50×0.71(prox=0.71) + 0.40×proc(0.551[fin=0.73,mean=0.28]) + 0.10×fmt(1.000) | pred='6' gold='5' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:02:50,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.882 = 0.50×1.00(exact) + 0.40×proc(0.705[fin=0.75,mean=0.64]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:02:50,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:02:50,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.45(prox=0.45) + 0.40×proc(0.407[fin=0.24,mean=0.66]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=64% lccp=9% (chain=1/11 ok_count=7) n_steps=11
+2026-04-26 04:02:50,756 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.511 = 0.50×0.56(prox=0.56) + 0.40×proc(0.332[fin=0.36,mean=0.30]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:02:50,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:02:50,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.339 = 0.50×0.38(prox=0.38) + 0.40×proc(0.118[fin=0.06,mean=0.20]) + 0.10×fmt(1.000) | pred='1' gold='5' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+
Iter 6 GRPO groups: 95%|#########5| 19/20 [03:13<00:05, 5.51s/q, loss=-0.0010, mean_r=0.684, skip=8]
Iter 6 GRPO groups: 100%|##########| 20/20 [03:13<00:00, 7.59s/q, loss=-0.0010, mean_r=0.684, skip=8]
Iter 6 GRPO groups: 100%|##########| 20/20 [03:13<00:00, 9.67s/q, loss=-0.0010, mean_r=0.684, skip=8]
+2026-04-26 04:02:52,370 INFO __main__ - Iter 6 | loss=-0.0001 | reward mean=0.879 std=0.215 | gt_match=74.9% | grounded_acc=91.0% | step_acc=88.5% | lccp=80.6% | batch_acc=91.0% | phase=GROUNDED_ONLY sp_ratio=0% | groups=12 skipped=8(0var=8) | lr=3.88e-06 | 193.4s
+2026-04-26 04:02:52,370 WARNING __main__ - STARVATION: 40% of groups skipped (zero variance). grounded_acc=91.0% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:02:52,371 INFO __main__ - ======================================================================
+2026-04-26 04:02:52,371 INFO __main__ - GRPO ITERATION 7/60
+2026-04-26 04:02:52,371 INFO __main__ - ======================================================================
+2026-04-26 04:02:52,392 INFO __main__ - LR this iteration: 3.88e-06 | T=0.759 | MATH ratio=30%
+
Iter 7 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:02:57,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.567 = 0.50×0.43(prox=0.43) + 0.40×proc(0.631[fin=0.79,mean=0.39]) + 0.10×fmt(1.000) | pred='12' gold='36' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:02:58,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.643 = 0.50×0.50(prox=0.50) + 0.40×proc(0.734[fin=0.91,mean=0.47]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:02:58,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.305 = 0.50×0.00(prox=0.00) + 0.40×proc(0.400[fin=0.25,mean=0.62]) + 0.10×fmt(0.700) | pred='' gold='36' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+2026-04-26 04:02:58,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.651 = 0.50×0.50(prox=0.50) + 0.40×proc(0.752[fin=0.92,mean=0.50]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:02:58,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.50(prox=0.50) + 0.40×proc(0.362[fin=0.43,mean=0.26]) + 0.10×fmt(0.650) | pred='18' gold='36' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:02:58,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.664 = 0.50×0.50(prox=0.50) + 0.40×proc(0.786[fin=0.89,mean=0.62]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:02:58,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.50(prox=0.50) + 0.40×proc(0.614[fin=0.66,mean=0.54]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:02:58,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×1.00(exact) + 0.40×proc(0.553[fin=0.64,mean=0.43]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:02:58,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.657 = 0.50×0.50(prox=0.50) + 0.40×proc(0.766[fin=0.94,mean=0.51]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:02:58,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.50(prox=0.50) + 0.40×proc(0.813[fin=0.95,mean=0.61]) + 0.10×fmt(1.000) | pred='18' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 7 GRPO groups: 0%| | 0/20 [00:07, ?q/s, loss=0.0016, mean_r=0.604, skip=0]
Iter 7 GRPO groups: 5%|5 | 1/20 [00:07<02:29, 7.85s/q, loss=0.0016, mean_r=0.604, skip=0]2026-04-26 04:03:04,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:04,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:04,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:03:04,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.64(prox=0.64) + 0.40×proc(0.848[fin=0.95,mean=0.70]) + 0.10×fmt(1.000) | pred='10.71' gold='15' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 7 GRPO groups: 5%|5 | 1/20 [00:13<02:29, 7.85s/q, loss=0.0010, mean_r=0.970, skip=0]
Iter 7 GRPO groups: 10%|# | 2/20 [00:13<02:01, 6.74s/q, loss=0.0010, mean_r=0.970, skip=0]2026-04-26 04:03:09,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:09,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:03:09,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:10,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:10,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:10,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:10,301 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:10,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 7 GRPO groups: 10%|# | 2/20 [00:17<02:01, 6.74s/q, loss=0var, mean_r=0.993, skip=1]
Iter 7 GRPO groups: 15%|#5 | 3/20 [00:17<01:34, 5.57s/q, loss=0var, mean_r=0.993, skip=1]2026-04-26 04:03:16,274 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,852 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:16,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:03:17,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 15%|#5 | 3/20 [00:24<01:34, 5.57s/q, loss=0var, mean_r=0.999, skip=2]
Iter 7 GRPO groups: 20%|## | 4/20 [00:24<01:35, 5.99s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:03:26,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.925[fin=0.98,mean=0.83]) + 0.10×fmt(1.000) | pred='0' gold='2000' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:03:26,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:03:26,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.421 = 0.50×0.22(prox=0.22) + 0.40×proc(0.319[fin=0.15,mean=0.57]) + 0.10×fmt(1.000) | pred='-1640' gold='2000' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:03:27,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=0.93,mean=0.81]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:03:27,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.22(prox=0.22) + 0.40×proc(0.376[fin=0.25,mean=0.57]) + 0.10×fmt(1.000) | pred='-1640' gold='2000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:03:27,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.436 = 0.50×0.00(prox=0.00) + 0.40×proc(0.700[fin=0.73,mean=0.65]) + 0.10×fmt(1.000) | pred='Thomas needs $1640 more to buy the car.' gold='2000' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:03:27,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='180' gold='2000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:27,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.07(prox=0.07) + 0.40×proc(0.508[fin=0.43,mean=0.62]) + 0.10×fmt(1.000) | pred='14640' gold='2000' | step_acc=62% lccp=62% (chain=5/8 ok_count=5) n_steps=8
+2026-04-26 04:03:27,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='2000' gold='2000' | step_acc=89% lccp=44% (chain=4/9 ok_count=8) n_steps=9
+2026-04-26 04:03:27,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.23(prox=0.23) + 0.40×proc(0.797[fin=0.85,mean=0.72]) + 0.10×fmt(1.000) | pred='5280' gold='2000' | step_acc=78% lccp=67% (chain=6/9 ok_count=7) n_steps=9
+
Iter 7 GRPO groups: 20%|## | 4/20 [00:36<01:35, 5.99s/q, loss=-0.0001, mean_r=0.631, skip=2]
Iter 7 GRPO groups: 25%|##5 | 5/20 [00:36<02:02, 8.15s/q, loss=-0.0001, mean_r=0.631, skip=2]2026-04-26 04:03:35,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:03:35,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:35,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:03:35,866 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:03:35,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:03:36,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.182 = 0.50×0.06(prox=0.06) + 0.40×proc(0.216[fin=0.19,mean=0.26]) + 0.10×fmt(0.650) | pred='113' gold='13' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:03:36,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:03:36,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:03:36,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:03:36,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 7 GRPO groups: 25%|##5 | 5/20 [00:45<02:02, 8.15s/q, loss=0.0042, mean_r=0.910, skip=2]
Iter 7 GRPO groups: 30%|### | 6/20 [00:45<01:57, 8.37s/q, loss=0.0042, mean_r=0.910, skip=2]2026-04-26 04:04:11,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.385 = 0.50×0.33(prox=0.33) + 0.40×proc(0.297[fin=0.40,mean=0.14]) + 0.10×fmt(1.000) | pred='0' gold='13535' | step_acc=0% lccp=0% (chain=0/7 ok_count=0) n_steps=7
+2026-04-26 04:04:11,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.457 = 0.50×0.01(prox=0.01) + 0.40×proc(0.844[fin=0.92,mean=0.73]) + 0.10×fmt(1.000) | pred='629663' gold='13535' | step_acc=73% lccp=9% (chain=1/11 ok_count=8) n_steps=11
+2026-04-26 04:04:11,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.600 = 0.50×0.36(prox=0.36) + 0.40×proc(0.801[fin=0.91,mean=0.64]) + 0.10×fmt(1.000) | pred='1471' gold='13535' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:04:11,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.35(prox=0.35) + 0.40×proc(0.379[fin=0.12,mean=0.77]) + 0.10×fmt(1.000) | pred='960' gold='13535' | step_acc=75% lccp=31% (chain=5/16 ok_count=12) n_steps=16
+2026-04-26 04:04:11,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.196 = 0.50×0.00(prox=0.00) + 0.40×proc(0.267[fin=0.07,mean=0.57]) + 0.10×fmt(0.700) | pred='' gold='13535' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:04:11,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.573 = 0.50×0.35(prox=0.35) + 0.40×proc(0.748[fin=0.84,mean=0.62]) + 0.10×fmt(1.000) | pred='828' gold='13535' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 04:04:11,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.529 = 0.50×0.58(prox=0.58) + 0.40×proc(0.346[fin=0.44,mean=0.20]) + 0.10×fmt(1.000) | pred='8639' gold='13535' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 04:04:12,037 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.726[fin=0.70,mean=0.76]) + 0.10×fmt(1.000) | pred='20617' gold='13535' | step_acc=80% lccp=7% (chain=1/15 ok_count=12) n_steps=15
+2026-04-26 04:04:12,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.359 = 0.50×0.08(prox=0.08) + 0.40×proc(0.542[fin=0.68,mean=0.33]) + 0.10×fmt(1.000) | pred='86445' gold='13535' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 7 GRPO groups: 30%|### | 6/20 [01:21<01:57, 8.37s/q, loss=0.0016, mean_r=0.458, skip=2]
Iter 7 GRPO groups: 35%|###5 | 7/20 [01:21<03:44, 17.30s/q, loss=0.0016, mean_r=0.458, skip=2]2026-04-26 04:04:18,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.49(prox=0.49) + 0.40×proc(0.299[fin=0.14,mean=0.54]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:04:18,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.53(prox=0.53) + 0.40×proc(0.780[fin=0.82,mean=0.71]) + 0.10×fmt(1.000) | pred='192' gold='348' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:04:18,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:18,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:19,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.497 = 0.50×0.49(prox=0.49) + 0.40×proc(0.192[fin=0.03,mean=0.43]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:04:19,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.715 = 0.50×0.49(prox=0.49) + 0.40×proc(0.924[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='168' gold='348' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:04:19,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:04:19,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='348' gold='348' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 7 GRPO groups: 35%|###5 | 7/20 [01:28<03:44, 17.30s/q, loss=0.0011, mean_r=0.837, skip=2]
Iter 7 GRPO groups: 40%|#### | 8/20 [01:28<02:49, 14.09s/q, loss=0.0011, mean_r=0.837, skip=2]2026-04-26 04:04:26,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:26,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:04:27,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:04:27,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 40%|#### | 8/20 [01:35<02:49, 14.09s/q, loss=0var, mean_r=0.994, skip=3]
Iter 7 GRPO groups: 45%|####5 | 9/20 [01:35<02:09, 11.80s/q, loss=0var, mean_r=0.994, skip=3]2026-04-26 04:05:00,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.09(prox=0.09) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='0' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:05:00,891 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.169 = 0.50×0.00(prox=0.00) + 0.40×proc(0.173[fin=0.16,mean=0.19]) + 0.10×fmt(1.000) | pred='$x^3 + 3x^2 + 3x + 4$' gold='0' | step_acc=14% lccp=0% (chain=0/7 ok_count=1) n_steps=7
+2026-04-26 04:05:00,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.09(prox=0.09) + 0.40×proc(0.853[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='5' gold='0' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:05:01,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:01,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.252 = 0.50×0.00(prox=0.00) + 0.40×proc(0.286[fin=0.16,mean=0.47]) + 0.10×fmt(1.000) | pred='$x^{44} + x^{33} + x^{22} + x^{11} + 1$' gold='0' | step_acc=38% lccp=25% (chain=2/8 ok_count=3) n_steps=8
+2026-04-26 04:05:01,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×1.00(exact) + 0.40×proc(0.797[fin=0.94,mean=0.58]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:05:01,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.546 = 0.50×0.33(prox=0.33) + 0.40×proc(0.699[fin=0.92,mean=0.37]) + 0.10×fmt(1.000) | pred='1' gold='0' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:05:01,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.930[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=80% lccp=40% (chain=4/10 ok_count=8) n_steps=10
+
Iter 7 GRPO groups: 45%|####5 | 9/20 [02:10<02:09, 11.80s/q, loss=0.0004, mean_r=0.615, skip=3]
Iter 7 GRPO groups: 50%|##### | 10/20 [02:10<03:10, 19.03s/q, loss=0.0004, mean_r=0.615, skip=3]2026-04-26 04:05:06,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:07,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:07,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,285 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:05:07,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:05:07,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:07,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 7 GRPO groups: 50%|##### | 10/20 [02:15<03:10, 19.03s/q, loss=0var, mean_r=0.996, skip=4]
Iter 7 GRPO groups: 55%|#####5 | 11/20 [02:15<02:12, 14.74s/q, loss=0var, mean_r=0.996, skip=4]2026-04-26 04:05:18,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.62(prox=0.62) + 0.40×proc(0.820[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='8.4' gold='12' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:05:18,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.710 = 0.50×0.62(prox=0.62) + 0.40×proc(0.744[fin=0.95,mean=0.43]) + 0.10×fmt(1.000) | pred='8.4' gold='12' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:05:18,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.60(prox=0.60) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='12' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:05:18,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:05:18,601 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.55(prox=0.55) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='17' gold='12' | step_acc=78% lccp=0% (chain=0/9 ok_count=7) n_steps=9
+2026-04-26 04:05:18,693 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:05:18,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 55%|#####5 | 11/20 [02:27<02:12, 14.74s/q, loss=0.0018, mean_r=0.893, skip=4]
Iter 7 GRPO groups: 60%|###### | 12/20 [02:27<01:52, 14.09s/q, loss=0.0018, mean_r=0.893, skip=4]2026-04-26 04:05:27,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:27,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:05:27,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:05:28,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=0.99,mean=0.97]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:28,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:28,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:05:28,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=0.99,mean=0.95]) + 0.10×fmt(0.650) | pred='126' gold='126' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 7 GRPO groups: 60%|###### | 12/20 [02:36<01:52, 14.09s/q, loss=0var, mean_r=0.971, skip=5]
Iter 7 GRPO groups: 65%|######5 | 13/20 [02:36<01:26, 12.32s/q, loss=0var, mean_r=0.971, skip=5]2026-04-26 04:05:33,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.787 = 0.50×0.60(prox=0.60) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:33,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.60(prox=0.60) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:34,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.75(prox=0.75) + 0.40×proc(0.881[fin=0.93,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='18' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:05:34,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.778 = 0.50×0.60(prox=0.60) + 0.40×proc(0.945[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:34,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.60(prox=0.60) + 0.40×proc(0.840[fin=0.90,mean=0.75]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:05:34,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.478 = 0.50×0.43(prox=0.43) + 0.40×proc(0.410[fin=0.38,mean=0.46]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:05:34,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.753 = 0.50×0.60(prox=0.60) + 0.40×proc(0.882[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:05:34,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.462 = 0.50×0.04(prox=0.04) + 0.40×proc(0.856[fin=0.99,mean=0.66]) + 0.10×fmt(1.000) | pred='240' gold='18' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:05:34,560 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.354 = 0.50×0.07(prox=0.07) + 0.40×proc(0.550[fin=0.62,mean=0.45]) + 0.10×fmt(1.000) | pred='142' gold='18' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:05:34,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.60(prox=0.60) + 0.40×proc(0.859[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+
Iter 7 GRPO groups: 65%|######5 | 13/20 [02:43<01:26, 12.32s/q, loss=-0.0013, mean_r=0.670, skip=5]
Iter 7 GRPO groups: 70%|####### | 14/20 [02:43<01:05, 10.89s/q, loss=-0.0013, mean_r=0.670, skip=5]2026-04-26 04:05:45,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×1.00(exact) + 0.40×proc(0.650[fin=0.79,mean=0.45]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:05:45,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.910[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:05:46,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:46,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.781[fin=0.90,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='350' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:05:46,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.747 = 0.50×0.76(prox=0.76) + 0.40×proc(0.670[fin=0.68,mean=0.65]) + 0.10×fmt(1.000) | pred='406' gold='350' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:05:46,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.76(prox=0.76) + 0.40×proc(0.899[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='406' gold='350' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 04:05:46,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.734[fin=0.91,mean=0.46]) + 0.10×fmt(1.000) | pred='736' gold='350' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:05:46,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.83(prox=0.83) + 0.40×proc(0.781[fin=0.91,mean=0.58]) + 0.10×fmt(1.000) | pred='385' gold='350' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 04:05:46,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:05:46,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.38(prox=0.38) + 0.40×proc(0.758[fin=0.83,mean=0.65]) + 0.10×fmt(1.000) | pred='60' gold='350' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+
Iter 7 GRPO groups: 70%|####### | 14/20 [02:55<01:05, 10.89s/q, loss=0.0002, mean_r=0.792, skip=5]
Iter 7 GRPO groups: 75%|#######5 | 15/20 [02:55<00:56, 11.22s/q, loss=0.0002, mean_r=0.792, skip=5]2026-04-26 04:05:53,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.48(prox=0.48) + 0.40×proc(0.391[fin=0.49,mean=0.25]) + 0.10×fmt(1.000) | pred='1536' gold='1000' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 04:05:53,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.97,mean=0.79]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:53,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:05:53,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.531 = 0.50×0.38(prox=0.38) + 0.40×proc(0.597[fin=0.70,mean=0.44]) + 0.10×fmt(1.000) | pred='200' gold='1000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:05:53,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.476 = 0.50×0.71(prox=0.71) + 0.40×proc(0.048[fin=0.03,mean=0.08]) + 0.10×fmt(1.000) | pred='800' gold='1000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:05:54,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1000' gold='1000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:05:54,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.38(prox=0.38) + 0.40×proc(0.374[fin=0.45,mean=0.26]) + 0.10×fmt(1.000) | pred='200' gold='1000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:05:54,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.45(prox=0.45) + 0.40×proc(0.783[fin=0.92,mean=0.58]) + 0.10×fmt(1.000) | pred='400' gold='1000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 7 GRPO groups: 75%|#######5 | 15/20 [03:03<00:56, 11.22s/q, loss=0.0024, mean_r=0.754, skip=5]
Iter 7 GRPO groups: 80%|######## | 16/20 [03:03<00:40, 10.20s/q, loss=0.0024, mean_r=0.754, skip=5]2026-04-26 04:06:01,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:01,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:06:01,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:01,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:02,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=86% lccp=43% (chain=3/7 ok_count=6) n_steps=7
+2026-04-26 04:06:02,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:02,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.81(prox=0.81) + 0.40×proc(0.713[fin=0.93,mean=0.39]) + 0.10×fmt(1.000) | pred='46' gold='52' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:06:02,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:06:02,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:02,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 7 GRPO groups: 80%|######## | 16/20 [03:11<00:40, 10.20s/q, loss=0.0026, mean_r=0.964, skip=5]
Iter 7 GRPO groups: 85%|########5 | 17/20 [03:11<00:28, 9.53s/q, loss=0.0026, mean_r=0.964, skip=5]2026-04-26 04:06:08,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:06:08,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.554 = 0.50×0.60(prox=0.60) + 0.40×proc(0.386[fin=0.30,mean=0.52]) + 0.10×fmt(1.000) | pred='30' gold='45' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:06:08,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.82(prox=0.82) + 0.40×proc(0.607[fin=0.71,mean=0.45]) + 0.10×fmt(1.000) | pred='40' gold='45' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:06:08,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.82(prox=0.82) + 0.40×proc(0.909[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='40' gold='45' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:06:09,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:09,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='45' gold='45' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:09,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.786 = 0.50×0.69(prox=0.69) + 0.40×proc(0.849[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='35' gold='45' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:06:09,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.565 = 0.50×0.45(prox=0.45) + 0.40×proc(0.595[fin=0.79,mean=0.31]) + 0.10×fmt(1.000) | pred='18' gold='45' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:06:09,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.320 = 0.50×0.23(prox=0.23) + 0.40×proc(0.167[fin=0.08,mean=0.30]) + 0.10×fmt(1.000) | pred='-30' gold='45' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:06:09,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.741[fin=0.94,mean=0.44]) + 0.10×fmt(1.000) | pred='15' gold='45' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 7 GRPO groups: 85%|########5 | 17/20 [03:18<00:28, 9.53s/q, loss=-0.0003, mean_r=0.731, skip=5]
Iter 7 GRPO groups: 90%|######### | 18/20 [03:18<00:17, 8.76s/q, loss=-0.0003, mean_r=0.731, skip=5]2026-04-26 04:06:14,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.504 = 0.50×0.33(prox=0.33) + 0.40×proc(0.593[fin=0.78,mean=0.31]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:06:14,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:14,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,369 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.33(prox=0.33) + 0.40×proc(0.479[fin=0.64,mean=0.24]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:06:14,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:14,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 7 GRPO groups: 90%|######### | 18/20 [03:23<00:17, 8.76s/q, loss=-0.0002, mean_r=0.895, skip=5]
Iter 7 GRPO groups: 95%|#########5| 19/20 [03:23<00:07, 7.73s/q, loss=-0.0002, mean_r=0.895, skip=5]2026-04-26 04:06:19,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:19,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:20,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4500' gold='4500' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 7 GRPO groups: 95%|#########5| 19/20 [03:28<00:07, 7.73s/q, loss=0var, mean_r=0.997, skip=6]
Iter 7 GRPO groups: 100%|##########| 20/20 [03:28<00:00, 6.72s/q, loss=0var, mean_r=0.997, skip=6]
Iter 7 GRPO groups: 100%|##########| 20/20 [03:28<00:00, 10.41s/q, loss=0var, mean_r=0.997, skip=6]
+2026-04-26 04:06:20,595 INFO __main__ - Iter 7 | loss=0.0011 | reward mean=0.838 std=0.223 | gt_match=64.0% | grounded_acc=88.3% | step_acc=81.3% | lccp=65.8% | batch_acc=88.3% | phase=GROUNDED_ONLY sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.44e-06 | 208.2s
+2026-04-26 04:06:20,596 INFO __main__ - ======================================================================
+2026-04-26 04:06:20,596 INFO __main__ - GRPO ITERATION 8/60
+2026-04-26 04:06:20,596 INFO __main__ - ======================================================================
+2026-04-26 04:06:20,614 INFO __main__ - LR this iteration: 4.44e-06 | T=0.753 | MATH ratio=30%
+
Iter 8 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:06:30,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,218 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.472 = 0.50×0.00(prox=0.00) + 0.40×proc(0.930[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='4*6**0.5' gold='24' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:06:30,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:30,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:30,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.00(prox=0.00) + 0.40×proc(0.835[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='$4\\sqrt{15}$' gold='24' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 04:06:30,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 8 GRPO groups: 0%| | 0/20 [00:11, ?q/s, loss=-0.0017, mean_r=0.895, skip=0]
Iter 8 GRPO groups: 5%|5 | 1/20 [00:11<03:41, 11.64s/q, loss=-0.0017, mean_r=0.895, skip=0]2026-04-26 04:06:35,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,002 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:36,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='120' gold='120' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 5%|5 | 1/20 [00:16<03:41, 11.64s/q, loss=0var, mean_r=0.999, skip=1]
Iter 8 GRPO groups: 10%|# | 2/20 [00:16<02:13, 7.39s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:06:40,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.467 = 0.50×0.33(prox=0.33) + 0.40×proc(0.313[fin=0.15,mean=0.56]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.550[fin=0.46,mean=0.68]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.465 = 0.50×0.33(prox=0.33) + 0.40×proc(0.308[fin=0.11,mean=0.61]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:40,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:40,960 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.99,mean=0.76]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:06:41,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.873[fin=0.92,mean=0.80]) + 0.10×fmt(1.000) | pred='900' gold='1350' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:06:41,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:41,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.33(prox=0.33) + 0.40×proc(0.305[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:41,298 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.463 = 0.50×0.33(prox=0.33) + 0.40×proc(0.304[fin=0.12,mean=0.58]) + 0.10×fmt(1.000) | pred='2700' gold='1350' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:06:41,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1350' gold='1350' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 10%|# | 2/20 [00:22<02:13, 7.39s/q, loss=-0.0000, mean_r=0.710, skip=1]
Iter 8 GRPO groups: 15%|#5 | 3/20 [00:22<01:55, 6.82s/q, loss=-0.0000, mean_r=0.710, skip=1]2026-04-26 04:06:46,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:46,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:06:46,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:46,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 15%|#5 | 3/20 [00:26<01:55, 6.82s/q, loss=0var, mean_r=0.999, skip=2]
Iter 8 GRPO groups: 20%|## | 4/20 [00:26<01:32, 5.77s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:06:51,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:51,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.652 = 0.50×0.41(prox=0.41) + 0.40×proc(0.864[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='2' gold='7' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:06:51,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.835[fin=1.00,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:06:51,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.850[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:06:51,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:06:51,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:06:51,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.882[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:06:51,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 20%|## | 4/20 [00:32<01:32, 5.77s/q, loss=0.0002, mean_r=0.945, skip=2]
Iter 8 GRPO groups: 25%|##5 | 5/20 [00:32<01:29, 5.98s/q, loss=0.0002, mean_r=0.945, skip=2]2026-04-26 04:07:07,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:07:07,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.838 = 0.50×0.74(prox=0.74) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='81.25' gold='69' | step_acc=88% lccp=50% (chain=4/8 ok_count=7) n_steps=8
+2026-04-26 04:07:07,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×0.85(prox=0.85) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='63' gold='69' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:07:07,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:07,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.902 = 0.50×0.85(prox=0.85) + 0.40×proc(0.944[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='75' gold='69' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 04:07:07,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:08,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.745 = 0.50×0.52(prox=0.52) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='37' gold='69' | step_acc=91% lccp=55% (chain=6/11 ok_count=10) n_steps=11
+2026-04-26 04:07:08,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.580[fin=0.46,mean=0.76]) + 0.10×fmt(0.700) | pred='' gold='69' | step_acc=67% lccp=50% (chain=6/12 ok_count=8) n_steps=12
+2026-04-26 04:07:08,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.854[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='13' gold='69' | step_acc=73% lccp=18% (chain=2/11 ok_count=8) n_steps=11
+2026-04-26 04:07:08,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 8 GRPO groups: 25%|##5 | 5/20 [00:49<01:29, 5.98s/q, loss=-0.0013, mean_r=0.830, skip=2]
Iter 8 GRPO groups: 30%|### | 6/20 [00:49<02:13, 9.53s/q, loss=-0.0013, mean_r=0.830, skip=2]2026-04-26 04:07:12,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:12,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 30%|### | 6/20 [00:52<02:13, 9.53s/q, loss=0var, mean_r=0.997, skip=3]
Iter 8 GRPO groups: 35%|###5 | 7/20 [00:52<01:37, 7.48s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 04:07:16,858 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:16,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,274 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:17,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:17,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 35%|###5 | 7/20 [00:56<01:37, 7.48s/q, loss=0var, mean_r=1.000, skip=4]
Iter 8 GRPO groups: 40%|#### | 8/20 [00:56<01:18, 6.56s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 04:07:24,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.50(prox=0.50) + 0.40×proc(0.469[fin=0.61,mean=0.26]) + 0.10×fmt(1.000) | pred='70' gold='140' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:07:24,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:07:24,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.906 = 0.50×1.00(exact) + 0.40×proc(0.852[fin=0.99,mean=0.65]) + 0.10×fmt(0.650) | pred='140' gold='140' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:07:24,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:24,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:24,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:24,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:24,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.37(prox=0.37) + 0.40×proc(0.874[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='260' gold='140' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:24,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.641 = 0.50×0.38(prox=0.38) + 0.40×proc(0.872[fin=0.97,mean=0.72]) + 0.10×fmt(1.000) | pred='28' gold='140' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:24,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='140' gold='140' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 40%|#### | 8/20 [01:05<01:18, 6.56s/q, loss=0.0008, mean_r=0.870, skip=4]
Iter 8 GRPO groups: 45%|####5 | 9/20 [01:05<01:19, 7.26s/q, loss=0.0008, mean_r=0.870, skip=4]2026-04-26 04:07:32,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.786 = 0.50×0.70(prox=0.70) + 0.40×proc(0.835[fin=0.89,mean=0.75]) + 0.10×fmt(1.000) | pred='23' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.876 = 0.50×0.83(prox=0.83) + 0.40×proc(0.907[fin=0.95,mean=0.84]) + 0.10×fmt(1.000) | pred='21' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:07:32,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.868 = 0.50×0.83(prox=0.83) + 0.40×proc(0.889[fin=0.95,mean=0.80]) + 0.10×fmt(1.000) | pred='21' gold='19' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:07:32,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.85(prox=0.85) + 0.40×proc(0.605[fin=0.70,mean=0.47]) + 0.10×fmt(1.000) | pred='20' gold='19' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:07:32,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:07:32,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='19' gold='19' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 8 GRPO groups: 45%|####5 | 9/20 [01:13<01:19, 7.26s/q, loss=0.0010, mean_r=0.925, skip=4]
Iter 8 GRPO groups: 50%|##### | 10/20 [01:13<01:14, 7.48s/q, loss=0.0010, mean_r=0.925, skip=4]2026-04-26 04:07:37,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:07:37,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:37,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:38,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:38,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='43' gold='43' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 50%|##### | 10/20 [01:17<01:14, 7.48s/q, loss=0var, mean_r=0.999, skip=5]
Iter 8 GRPO groups: 55%|#####5 | 11/20 [01:17<00:56, 6.33s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:07:41,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.364 = 0.50×0.40(prox=0.40) + 0.40×proc(0.161[fin=0.19,mean=0.11]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:41,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.462 = 0.50×0.40(prox=0.40) + 0.40×proc(0.404[fin=0.50,mean=0.26]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:41,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:07:42,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.40(prox=0.40) + 0.40×proc(0.183[fin=0.21,mean=0.14]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:42,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:07:42,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.354 = 0.50×0.40(prox=0.40) + 0.40×proc(0.136[fin=0.17,mean=0.09]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:07:42,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 8 GRPO groups: 55%|#####5 | 11/20 [01:23<00:56, 6.33s/q, loss=-0.0002, mean_r=0.751, skip=5]
Iter 8 GRPO groups: 60%|###### | 12/20 [01:23<00:49, 6.18s/q, loss=-0.0002, mean_r=0.751, skip=5]2026-04-26 04:08:15,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.622 = 0.50×0.34(prox=0.34) + 0.40×proc(0.876[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='19787' gold='10100' | step_acc=80% lccp=0% (chain=0/10 ok_count=8) n_steps=10
+2026-04-26 04:08:15,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.886[fin=0.92,mean=0.83]) + 0.10×fmt(1.000) | pred='2525' gold='10100' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:08:15,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.49(prox=0.49) + 0.40×proc(0.602[fin=0.62,mean=0.58]) + 0.10×fmt(1.000) | pred='4899' gold='10100' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:08:15,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.85(prox=0.85) + 0.40×proc(0.651[fin=0.75,mean=0.51]) + 0.10×fmt(1.000) | pred='9414' gold='10100' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 04:08:15,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.816 = 0.50×0.85(prox=0.85) + 0.40×proc(0.727[fin=0.80,mean=0.61]) + 0.10×fmt(1.000) | pred='10088' gold='10100' | step_acc=67% lccp=22% (chain=2/9 ok_count=6) n_steps=9
+2026-04-26 04:08:15,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.817[fin=0.93,mean=0.64]) + 0.10×fmt(1.000) | pred='100' gold='10100' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:08:15,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.443 = 0.50×0.39(prox=0.39) + 0.40×proc(0.365[fin=0.29,mean=0.48]) + 0.10×fmt(1.000) | pred='2304' gold='10100' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:08:15,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.830[fin=0.94,mean=0.66]) + 0.10×fmt(1.000) | pred='1608' gold='10100' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:08:15,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.698 = 0.50×0.50(prox=0.50) + 0.40×proc(0.870[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='5049' gold='10100' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:08:16,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.762[fin=0.85,mean=0.63]) + 0.10×fmt(1.000) | pred='200' gold='10100' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+
Iter 8 GRPO groups: 60%|###### | 12/20 [01:56<00:49, 6.18s/q, loss=0.0000, mean_r=0.615, skip=5]
Iter 8 GRPO groups: 65%|######5 | 13/20 [01:56<01:41, 14.49s/q, loss=0.0000, mean_r=0.615, skip=5]2026-04-26 04:08:21,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.729 = 0.50×0.62(prox=0.62) + 0.40×proc(0.792[fin=0.91,mean=0.61]) + 0.10×fmt(1.000) | pred='70' gold='100' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:08:21,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:21,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:21,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:22,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:22,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:22,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 65%|######5 | 13/20 [02:02<01:41, 14.49s/q, loss=-0.0012, mean_r=0.972, skip=5]
Iter 8 GRPO groups: 70%|####### | 14/20 [02:02<01:11, 11.95s/q, loss=-0.0012, mean_r=0.972, skip=5]2026-04-26 04:08:33,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.902[fin=0.99,mean=0.77]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=83% lccp=50% (chain=6/12 ok_count=10) n_steps=12
+2026-04-26 04:08:33,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.50(prox=0.50) + 0.40×proc(0.335[fin=0.29,mean=0.40]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=43% lccp=29% (chain=2/7 ok_count=3) n_steps=7
+2026-04-26 04:08:33,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.261 = 0.50×0.08(prox=0.08) + 0.40×proc(0.260[fin=0.13,mean=0.46]) + 0.10×fmt(1.000) | pred='14' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:08:33,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.884[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 04:08:33,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.868[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='6' gold='2' | step_acc=67% lccp=33% (chain=4/12 ok_count=8) n_steps=12
+2026-04-26 04:08:34,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.09(prox=0.09) + 0.40×proc(0.815[fin=0.95,mean=0.61]) + 0.10×fmt(1.000) | pred='12' gold='2' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 04:08:34,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.878[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='5' gold='2' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:08:34,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:34,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.809[fin=0.90,mean=0.68]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:08:34,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.17(prox=0.17) + 0.40×proc(0.693[fin=0.79,mean=0.55]) + 0.10×fmt(1.000) | pred='7' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+
Iter 8 GRPO groups: 70%|####### | 14/20 [02:15<01:11, 11.95s/q, loss=-0.0008, mean_r=0.628, skip=5]
Iter 8 GRPO groups: 75%|#######5 | 15/20 [02:15<01:00, 12.01s/q, loss=-0.0008, mean_r=0.628, skip=5]2026-04-26 04:08:39,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:39,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:08:40,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:08:40,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 75%|#######5 | 15/20 [02:19<01:00, 12.01s/q, loss=0var, mean_r=0.997, skip=6]
Iter 8 GRPO groups: 80%|######## | 16/20 [02:19<00:38, 9.71s/q, loss=0var, mean_r=0.997, skip=6]2026-04-26 04:08:46,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:46,970 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:47,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:08:47,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 8 GRPO groups: 80%|######## | 16/20 [02:26<00:38, 9.71s/q, loss=0var, mean_r=0.999, skip=7]
Iter 8 GRPO groups: 85%|########5 | 17/20 [02:26<00:26, 8.90s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 04:08:52,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.594 = 0.50×0.35(prox=0.35) + 0.40×proc(0.801[fin=0.99,mean=0.51]) + 0.10×fmt(1.000) | pred='66' gold='1056' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:08:52,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.802[fin=0.81,mean=0.79]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:08:52,626 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:08:52,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:08:52,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1056' gold='1056' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:08:52,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.36(prox=0.36) + 0.40×proc(0.311[fin=0.30,mean=0.33]) + 0.10×fmt(1.000) | pred='111.9965' gold='1056' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 04:08:52,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.27(prox=0.27) + 0.40×proc(0.345[fin=0.12,mean=0.68]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.497 = 0.50×0.27(prox=0.27) + 0.40×proc(0.402[fin=0.21,mean=0.69]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.27(prox=0.27) + 0.40×proc(0.281[fin=0.04,mean=0.64]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:08:53,208 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.27(prox=0.27) + 0.40×proc(0.342[fin=0.13,mean=0.66]) + 0.10×fmt(1.000) | pred='2464' gold='1056' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+
Iter 8 GRPO groups: 85%|########5 | 17/20 [02:34<00:26, 8.90s/q, loss=0.0008, mean_r=0.640, skip=7]
Iter 8 GRPO groups: 90%|######### | 18/20 [02:34<00:16, 8.48s/q, loss=0.0008, mean_r=0.640, skip=7]2026-04-26 04:09:01,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=0.98,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.942[fin=0.98,mean=0.88]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=0.96,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:01,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.71(prox=0.71) + 0.40×proc(0.772[fin=0.88,mean=0.62]) + 0.10×fmt(1.000) | pred='24' gold='20' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:09:02,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.431 = 0.50×0.44(prox=0.44) + 0.40×proc(0.272[fin=0.22,mean=0.36]) + 0.10×fmt(1.000) | pred='32.5' gold='20' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:02,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:09:02,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.885[fin=0.91,mean=0.85]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 8 GRPO groups: 90%|######### | 18/20 [02:43<00:16, 8.48s/q, loss=-0.0012, mean_r=0.899, skip=7]
Iter 8 GRPO groups: 95%|#########5| 19/20 [02:43<00:08, 8.64s/q, loss=-0.0012, mean_r=0.899, skip=7]2026-04-26 04:09:08,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×0.82(prox=0.82) + 0.40×proc(0.883[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.876 = 0.50×0.82(prox=0.82) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,168 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.82(prox=0.82) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.82(prox=0.82) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×0.82(prox=0.82) + 0.40×proc(0.833[fin=0.93,mean=0.69]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.850 = 0.50×0.82(prox=0.82) + 0.40×proc(0.845[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:09:09,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.887 = 0.50×0.82(prox=0.82) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='280' gold='280' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:09:09,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.82(prox=0.82) + 0.40×proc(0.908[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='310' gold='280' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:09:09,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.45(prox=0.45) + 0.40×proc(0.225[fin=0.08,mean=0.44]) + 0.10×fmt(1.000) | pred='107.5' gold='280' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 8 GRPO groups: 95%|#########5| 19/20 [02:50<00:08, 8.64s/q, loss=0.0002, mean_r=0.843, skip=7]
Iter 8 GRPO groups: 100%|##########| 20/20 [02:50<00:00, 8.30s/q, loss=0.0002, mean_r=0.843, skip=7]
Iter 8 GRPO groups: 100%|##########| 20/20 [02:50<00:00, 8.53s/q, loss=0.0002, mean_r=0.843, skip=7]
+2026-04-26 04:09:11,192 INFO __main__ - Iter 8 | loss=-0.0003 | reward mean=0.876 std=0.200 | gt_match=69.0% | grounded_acc=89.5% | step_acc=86.5% | lccp=74.8% | batch_acc=89.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=13 skipped=7(0var=7) | lr=5.00e-06 | 170.6s
+2026-04-26 04:09:11,192 WARNING __main__ - STARVATION: 35% of groups skipped (zero variance). grounded_acc=89.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:09:11,193 INFO __main__ - ======================================================================
+2026-04-26 04:09:11,193 INFO __main__ - GRPO ITERATION 9/60
+2026-04-26 04:09:11,193 INFO __main__ - ======================================================================
+2026-04-26 04:09:11,213 INFO __main__ - LR this iteration: 5.00e-06 | T=0.746 | MATH ratio=30%
+
Iter 9 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:09:19,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.47(prox=0.47) + 0.40×proc(0.296[fin=0.17,mean=0.48]) + 0.10×fmt(1.000) | pred='6' gold='14' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:09:19,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.54(prox=0.54) + 0.40×proc(0.199[fin=0.16,mean=0.26]) + 0.10×fmt(1.000) | pred='8' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:19,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=0.97,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:09:19,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:19,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:09:19,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:09:19,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:19,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.512 = 0.50×0.37(prox=0.37) + 0.40×proc(0.569[fin=0.75,mean=0.30]) + 0.10×fmt(1.000) | pred='2' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:09:19,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:09:19,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 0%| | 0/20 [00:10, ?q/s, loss=-0.0001, mean_r=0.839, skip=0]
Iter 9 GRPO groups: 5%|5 | 1/20 [00:10<03:13, 10.17s/q, loss=-0.0001, mean_r=0.839, skip=0]2026-04-26 04:09:55,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.606 = 0.50×0.85(prox=0.85) + 0.40×proc(0.202[fin=0.01,mean=0.49]) + 0.10×fmt(1.000) | pred='989' gold='990' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:09:55,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:55,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:55,320 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:09:55,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.290 = 0.50×0.35(prox=0.35) + 0.40×proc(0.031[fin=0.01,mean=0.06]) + 0.10×fmt(1.000) | pred='90' gold='990' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:09:55,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:09:55,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(0.650) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:09:55,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='990' gold='990' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:09:55,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.509 = 0.50×0.85(prox=0.85) + 0.40×proc(0.048[fin=0.06,mean=0.03]) + 0.10×fmt(0.650) | pred='991' gold='990' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+
Iter 9 GRPO groups: 5%|5 | 1/20 [00:45<03:13, 10.17s/q, loss=-0.0033, mean_r=0.805, skip=0]
Iter 9 GRPO groups: 10%|# | 2/20 [00:45<07:33, 25.19s/q, loss=-0.0033, mean_r=0.805, skip=0]2026-04-26 04:10:04,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.810 = 0.50×0.77(prox=0.77) + 0.40×proc(0.814[fin=0.91,mean=0.67]) + 0.10×fmt(1.000) | pred='85' gold='100' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:10:04,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='200' gold='100' | step_acc=75% lccp=12% (chain=1/8 ok_count=6) n_steps=8
+2026-04-26 04:10:04,784 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=88% lccp=25% (chain=2/8 ok_count=7) n_steps=8
+2026-04-26 04:10:04,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.648 = 0.50×0.64(prox=0.64) + 0.40×proc(0.575[fin=0.55,mean=0.60]) + 0.10×fmt(1.000) | pred='71.43' gold='100' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:10:04,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.453 = 0.50×0.33(prox=0.33) + 0.40×proc(0.467[fin=0.57,mean=0.32]) + 0.10×fmt(1.000) | pred='0' gold='100' | step_acc=38% lccp=0% (chain=0/8 ok_count=3) n_steps=8
+2026-04-26 04:10:05,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.548 = 0.50×0.54(prox=0.54) + 0.40×proc(0.448[fin=0.46,mean=0.43]) + 0.10×fmt(1.000) | pred='142.85714' gold='100' | step_acc=25% lccp=25% (chain=2/8 ok_count=2) n_steps=8
+2026-04-26 04:10:05,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.684 = 0.50×0.54(prox=0.54) + 0.40×proc(0.787[fin=0.98,mean=0.50]) + 0.10×fmt(1.000) | pred='143' gold='100' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:10:05,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:05,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.300 = 0.50×0.00(prox=0.00) + 0.40×proc(0.437[fin=0.52,mean=0.31]) + 0.10×fmt(1.000) | pred='83 1/3' gold='100' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:10:05,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.754 = 0.50×0.82(prox=0.82) + 0.40×proc(0.614[fin=0.66,mean=0.54]) + 0.10×fmt(1.000) | pred='111.12' gold='100' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+
Iter 9 GRPO groups: 10%|# | 2/20 [00:55<07:33, 25.19s/q, loss=0.0017, mean_r=0.672, skip=0]
Iter 9 GRPO groups: 15%|#5 | 3/20 [00:55<05:08, 18.13s/q, loss=0.0017, mean_r=0.672, skip=0]2026-04-26 04:10:11,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:11,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:12,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 9 GRPO groups: 15%|#5 | 3/20 [01:01<05:08, 18.13s/q, loss=0var, mean_r=0.999, skip=1]
Iter 9 GRPO groups: 20%|## | 4/20 [01:01<03:29, 13.12s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:10:15,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:15,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:15,844 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='27' gold='27' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 20%|## | 4/20 [01:04<03:29, 13.12s/q, loss=0var, mean_r=0.996, skip=2]
Iter 9 GRPO groups: 25%|##5 | 5/20 [01:04<02:25, 9.68s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 04:10:22,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,788 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:10:22,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.832[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='31' gold='88' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:10:22,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 25%|##5 | 5/20 [01:13<02:25, 9.68s/q, loss=-0.0025, mean_r=0.950, skip=2]
Iter 9 GRPO groups: 30%|### | 6/20 [01:13<02:10, 9.31s/q, loss=-0.0025, mean_r=0.950, skip=2]2026-04-26 04:10:49,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:49,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=0.99,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:49,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.785 = 0.50×0.60(prox=0.60) + 0.40×proc(0.962[fin=0.94,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='12' | step_acc=100% lccp=100% (chain=24/24 ok_count=24) n_steps=24
+2026-04-26 04:10:50,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.401 = 0.50×0.33(prox=0.33) + 0.40×proc(0.211[fin=0.12,mean=0.35]) + 0.10×fmt(1.000) | pred='24' gold='12' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:10:50,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:10:50,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 30%|### | 6/20 [01:40<02:10, 9.31s/q, loss=0.0001, mean_r=0.913, skip=2]
Iter 9 GRPO groups: 35%|###5 | 7/20 [01:40<03:17, 15.22s/q, loss=0.0001, mean_r=0.913, skip=2]2026-04-26 04:10:59,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:10:59,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.651 = 0.50×0.50(prox=0.50) + 0.40×proc(0.752[fin=0.87,mean=0.58]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:10:59,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:10:59,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.50(prox=0.50) + 0.40×proc(0.731[fin=0.70,mean=0.78]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:10:59,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:00,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:00,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:00,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 35%|###5 | 7/20 [01:50<03:17, 15.22s/q, loss=0.0008, mean_r=0.929, skip=2]
Iter 9 GRPO groups: 40%|#### | 8/20 [01:50<02:42, 13.57s/q, loss=0.0008, mean_r=0.929, skip=2]2026-04-26 04:11:07,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,567 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.928 = 0.50×1.00(exact) + 0.40×proc(0.820[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:11:07,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:07,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:08,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:08,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.542[fin=0.60,mean=0.46]) + 0.10×fmt(1.000) | pred='132' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 9 GRPO groups: 40%|#### | 8/20 [01:58<02:42, 13.57s/q, loss=-0.0014, mean_r=0.945, skip=2]
Iter 9 GRPO groups: 45%|####5 | 9/20 [01:58<02:09, 11.75s/q, loss=-0.0014, mean_r=0.945, skip=2]2026-04-26 04:11:13,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.535 = 0.50×0.08(prox=0.08) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='420' gold='60' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:11:13,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:13,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:13,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 45%|####5 | 9/20 [02:03<02:09, 11.75s/q, loss=-0.0010, mean_r=0.950, skip=2]
Iter 9 GRPO groups: 50%|##### | 10/20 [02:03<01:38, 9.85s/q, loss=-0.0010, mean_r=0.950, skip=2]2026-04-26 04:11:19,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.811[fin=0.98,mean=0.56]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:11:19,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:19,899 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:19,981 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:20,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:20,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:20,233 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:20,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.614 = 0.50×1.00(exact) + 0.40×proc(0.035[fin=0.01,mean=0.07]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:11:20,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:20,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='160' gold='160' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 50%|##### | 10/20 [02:10<01:38, 9.85s/q, loss=0.0009, mean_r=0.952, skip=2]
Iter 9 GRPO groups: 55%|#####5 | 11/20 [02:10<01:20, 8.90s/q, loss=0.0009, mean_r=0.952, skip=2]2026-04-26 04:11:28,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.619 = 0.50×0.85(prox=0.85) + 0.40×proc(0.234[fin=0.05,mean=0.51]) + 0.10×fmt(1.000) | pred='132' gold='136' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:11:28,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:11:28,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=0.96,mean=0.82]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:11:28,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='136' gold='136' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 9 GRPO groups: 55%|#####5 | 11/20 [02:19<01:20, 8.90s/q, loss=0.0001, mean_r=0.958, skip=2]
Iter 9 GRPO groups: 60%|###### | 12/20 [02:19<01:10, 8.83s/q, loss=0.0001, mean_r=0.958, skip=2]2026-04-26 04:11:34,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:11:34,728 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:34,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='76' gold='76' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 60%|###### | 12/20 [02:23<01:10, 8.83s/q, loss=0var, mean_r=0.995, skip=3]
Iter 9 GRPO groups: 65%|######5 | 13/20 [02:23<00:51, 7.43s/q, loss=0var, mean_r=0.995, skip=3]2026-04-26 04:11:38,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.428 = 0.50×0.43(prox=0.43) + 0.40×proc(0.160[fin=0.06,mean=0.31]) + 0.10×fmt(1.000) | pred='14' gold='42' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:11:39,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:39,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='42' gold='42' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 65%|######5 | 13/20 [02:29<00:51, 7.43s/q, loss=0.0032, mean_r=0.939, skip=3]
Iter 9 GRPO groups: 70%|####### | 14/20 [02:29<00:42, 7.08s/q, loss=0.0032, mean_r=0.939, skip=3]2026-04-26 04:11:44,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:44,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='900' gold='900' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 9 GRPO groups: 70%|####### | 14/20 [02:33<00:42, 7.08s/q, loss=0var, mean_r=0.995, skip=4]
Iter 9 GRPO groups: 75%|#######5 | 15/20 [02:33<00:30, 6.05s/q, loss=0var, mean_r=0.995, skip=4]2026-04-26 04:11:51,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:51,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:51,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:51,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:51,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:52,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:11:52,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.06(prox=0.06) + 0.40×proc(0.901[fin=0.96,mean=0.81]) + 0.10×fmt(1.000) | pred='8' gold='0' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:11:52,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:11:52,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:11:52,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 9 GRPO groups: 75%|#######5 | 15/20 [02:42<00:30, 6.05s/q, loss=0.0023, mean_r=0.951, skip=4]
Iter 9 GRPO groups: 80%|######## | 16/20 [02:42<00:27, 6.94s/q, loss=0.0023, mean_r=0.951, skip=4]2026-04-26 04:12:26,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×1.00(exact) + 0.40×proc(0.496[fin=0.37,mean=0.68]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:12:26,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.10(prox=0.10) + 0.40×proc(0.945[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='198' gold='36' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='108' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:12:27,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.895 = 0.50×1.00(exact) + 0.40×proc(0.737[fin=0.70,mean=0.79]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.20(prox=0.20) + 0.40×proc(0.804[fin=0.86,mean=0.72]) + 0.10×fmt(1.000) | pred='108' gold='36' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 04:12:27,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.631 = 0.50×0.85(prox=0.85) + 0.40×proc(0.265[fin=0.13,mean=0.47]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:12:27,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×0.85(prox=0.85) + 0.40×proc(0.928[fin=0.98,mean=0.85]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:12:27,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.85(prox=0.85) + 0.40×proc(0.578[fin=0.55,mean=0.61]) + 0.10×fmt(1.000) | pred='33' gold='36' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:12:27,530 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.870 = 0.50×1.00(exact) + 0.40×proc(0.674[fin=0.59,mean=0.79]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 9 GRPO groups: 80%|######## | 16/20 [03:17<00:27, 6.94s/q, loss=-0.0005, mean_r=0.719, skip=4]
Iter 9 GRPO groups: 85%|########5 | 17/20 [03:17<00:46, 15.43s/q, loss=-0.0005, mean_r=0.719, skip=4]2026-04-26 04:12:32,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:32,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:32,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=0.99,mean=0.81]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:12:33,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:12:33,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:33,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 9 GRPO groups: 85%|########5 | 17/20 [03:21<00:46, 15.43s/q, loss=0var, mean_r=0.986, skip=5]
Iter 9 GRPO groups: 90%|######### | 18/20 [03:21<00:24, 12.07s/q, loss=0var, mean_r=0.986, skip=5]2026-04-26 04:12:41,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.60(prox=0.60) + 0.40×proc(0.305[fin=0.25,mean=0.39]) + 0.10×fmt(1.000) | pred='200000' gold='150000' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:12:41,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:12:41,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.349 = 0.50×0.23(prox=0.23) + 0.40×proc(0.239[fin=0.19,mean=0.32]) + 0.10×fmt(1.000) | pred='400000' gold='150000' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:12:41,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.874[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='150000' gold='150000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:12:41,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:12:41,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.820[fin=0.95,mean=0.63]) + 0.10×fmt(1.000) | pred='200000' gold='150000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:41,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:12:41,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='150000' gold='150000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:42,057 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.60(prox=0.60) + 0.40×proc(0.856[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='100000' gold='150000' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:12:42,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='300000' gold='150000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+
Iter 9 GRPO groups: 90%|######### | 18/20 [03:32<00:24, 12.07s/q, loss=0.0005, mean_r=0.648, skip=5]
Iter 9 GRPO groups: 95%|#########5| 19/20 [03:32<00:11, 11.56s/q, loss=0.0005, mean_r=0.648, skip=5]2026-04-26 04:12:50,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:12:50,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:50,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:50,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:12:50,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.720 = 0.50×0.54(prox=0.54) + 0.40×proc(0.875[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='188' gold='328' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:12:50,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.735[fin=0.75,mean=0.72]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:12:50,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:50,922 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:12:51,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:12:51,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=0.99,mean=0.96]) + 0.10×fmt(1.000) | pred='328' gold='328' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 9 GRPO groups: 95%|#########5| 19/20 [03:41<00:11, 11.56s/q, loss=-0.0000, mean_r=0.958, skip=5]
Iter 9 GRPO groups: 100%|##########| 20/20 [03:41<00:00, 10.78s/q, loss=-0.0000, mean_r=0.958, skip=5]
Iter 9 GRPO groups: 100%|##########| 20/20 [03:41<00:00, 11.07s/q, loss=-0.0000, mean_r=0.958, skip=5]
+2026-04-26 04:12:52,544 INFO __main__ - Iter 9 | loss=0.0001 | reward mean=0.907 std=0.177 | gt_match=80.3% | grounded_acc=96.5% | step_acc=89.4% | lccp=81.8% | batch_acc=96.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=15 skipped=5(0var=5) | lr=5.00e-06 | 221.4s
+2026-04-26 04:12:52,545 INFO __main__ - ======================================================================
+2026-04-26 04:12:52,545 INFO __main__ - GRPO ITERATION 10/60
+2026-04-26 04:12:52,545 INFO __main__ - ======================================================================
+2026-04-26 04:12:52,566 INFO __main__ - LR this iteration: 5.00e-06 | T=0.739 | MATH ratio=30%
+
Iter 10 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:12:56,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:56,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:12:56,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.97,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:12:56,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:12:56,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 10 GRPO groups: 0%| | 0/20 [00:04, ?q/s, loss=0var, mean_r=0.972, skip=1]
Iter 10 GRPO groups: 5%|5 | 1/20 [00:04<01:22, 4.35s/q, loss=0var, mean_r=0.972, skip=1]2026-04-26 04:13:01,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:01,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:01,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:01,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:01,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:13:02,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 5%|5 | 1/20 [00:09<01:22, 4.35s/q, loss=0var, mean_r=0.996, skip=2]
Iter 10 GRPO groups: 10%|# | 2/20 [00:09<01:26, 4.79s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 04:13:35,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.50(prox=0.50) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:13:35,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.529 = 0.50×0.44(prox=0.44) + 0.40×proc(0.525[fin=0.68,mean=0.30]) + 0.10×fmt(1.000) | pred='5' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.78(prox=0.78) + 0.40×proc(0.669[fin=0.86,mean=0.39]) + 0.10×fmt(1.000) | pred='16' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.839[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:13:35,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.78(prox=0.78) + 0.40×proc(0.697[fin=0.90,mean=0.39]) + 0.10×fmt(1.000) | pred='12' gold='14' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:13:35,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.50(prox=0.50) + 0.40×proc(0.711[fin=0.84,mean=0.51]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:13:35,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:13:35,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.655 = 0.50×0.50(prox=0.50) + 0.40×proc(0.762[fin=0.96,mean=0.46]) + 0.10×fmt(1.000) | pred='7' gold='14' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:13:35,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 10 GRPO groups: 10%|# | 2/20 [00:44<01:26, 4.79s/q, loss=0.0015, mean_r=0.775, skip=2]
Iter 10 GRPO groups: 15%|#5 | 3/20 [00:44<05:17, 18.70s/q, loss=0.0015, mean_r=0.775, skip=2]2026-04-26 04:13:41,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:41,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:42,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:42,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:42,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.901[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 15%|#5 | 3/20 [00:49<05:17, 18.70s/q, loss=0var, mean_r=0.992, skip=3]
Iter 10 GRPO groups: 20%|## | 4/20 [00:49<03:33, 13.35s/q, loss=0var, mean_r=0.992, skip=3]2026-04-26 04:13:45,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,519 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:13:46,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 20%|## | 4/20 [00:54<03:33, 13.35s/q, loss=0var, mean_r=0.999, skip=4]
Iter 10 GRPO groups: 25%|##5 | 5/20 [00:54<02:31, 10.08s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 04:13:52,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:52,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:52,756 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:13:52,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:52,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×1.00(exact) + 0.40×proc(0.710[fin=0.86,mean=0.48]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:13:53,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.395 = 0.50×0.14(prox=0.14) + 0.40×proc(0.277[fin=0.00,mean=0.69]) + 0.10×fmt(1.000) | pred='8' gold='2' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:13:53,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:53,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.657 = 0.50×0.50(prox=0.50) + 0.40×proc(0.769[fin=0.89,mean=0.59]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:13:53,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:13:53,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.926 = 0.50×1.00(exact) + 0.40×proc(0.815[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 10 GRPO groups: 25%|##5 | 5/20 [01:02<02:31, 10.08s/q, loss=0.0002, mean_r=0.882, skip=4]
Iter 10 GRPO groups: 30%|### | 6/20 [01:02<02:11, 9.40s/q, loss=0.0002, mean_r=0.882, skip=4]2026-04-26 04:13:58,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:58,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:58,949 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:13:59,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='77' gold='77' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 30%|### | 6/20 [01:06<02:11, 9.40s/q, loss=0var, mean_r=0.998, skip=5]
Iter 10 GRPO groups: 35%|###5 | 7/20 [01:06<01:42, 7.88s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:14:02,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.71(prox=0.71) + 0.40×proc(0.845[fin=0.96,mean=0.66]) + 0.10×fmt(1.000) | pred='8' gold='10' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:14:02,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:02,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.808 = 0.50×0.71(prox=0.71) + 0.40×proc(0.878[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='8' gold='10' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 10 GRPO groups: 35%|###5 | 7/20 [01:11<01:42, 7.88s/q, loss=0.0022, mean_r=0.955, skip=5]
Iter 10 GRPO groups: 40%|#### | 8/20 [01:11<01:22, 6.87s/q, loss=0.0022, mean_r=0.955, skip=5]2026-04-26 04:14:19,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=0.98,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.637 = 0.50×0.67(prox=0.67) + 0.40×proc(0.510[fin=0.43,mean=0.63]) + 0.10×fmt(1.000) | pred='7.5' gold='10' | step_acc=67% lccp=8% (chain=1/12 ok_count=8) n_steps=12
+2026-04-26 04:14:19,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.859 = 0.50×1.00(exact) + 0.40×proc(0.648[fin=0.60,mean=0.72]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:14:19,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:19,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 10 GRPO groups: 40%|#### | 8/20 [01:28<01:22, 6.87s/q, loss=-0.0007, mean_r=0.903, skip=5]
Iter 10 GRPO groups: 45%|####5 | 9/20 [01:28<01:50, 10.05s/q, loss=-0.0007, mean_r=0.903, skip=5]2026-04-26 04:14:25,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,442 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,689 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:25,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='100' gold='100' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 10 GRPO groups: 45%|####5 | 9/20 [01:33<01:50, 10.05s/q, loss=0var, mean_r=1.000, skip=6]
Iter 10 GRPO groups: 50%|##### | 10/20 [01:33<01:23, 8.34s/q, loss=0var, mean_r=1.000, skip=6]2026-04-26 04:14:29,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:29,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:14:30,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:30,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:14:30,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:14:30,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:30,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(0.650) | pred='-13' gold='-13' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 10 GRPO groups: 50%|##### | 10/20 [01:39<01:23, 8.34s/q, loss=0.0028, mean_r=0.984, skip=6]
Iter 10 GRPO groups: 55%|#####5 | 11/20 [01:39<01:08, 7.62s/q, loss=0.0028, mean_r=0.984, skip=6]2026-04-26 04:14:36,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:36,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:14:37,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.929 = 0.50×1.00(exact) + 0.40×proc(0.823[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 10 GRPO groups: 55%|#####5 | 11/20 [01:46<01:08, 7.62s/q, loss=-0.0017, mean_r=0.992, skip=6]
Iter 10 GRPO groups: 60%|###### | 12/20 [01:46<00:59, 7.46s/q, loss=-0.0017, mean_r=0.992, skip=6]2026-04-26 04:14:51,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.80,mean=0.72]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=75% lccp=62% (chain=5/8 ok_count=6) n_steps=8
+2026-04-26 04:14:52,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.902 = 0.50×1.00(exact) + 0.40×proc(0.756[fin=0.72,mean=0.81]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:14:52,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.737 = 0.50×0.50(prox=0.50) + 0.40×proc(0.969[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='175' gold='350' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+2026-04-26 04:14:52,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.627 = 0.50×0.85(prox=0.85) + 0.40×proc(0.254[fin=0.08,mean=0.52]) + 0.10×fmt(1.000) | pred='380' gold='350' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:14:52,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.804 = 0.50×0.80(prox=0.80) + 0.40×proc(0.766[fin=0.73,mean=0.81]) + 0.10×fmt(1.000) | pred='305' gold='350' | step_acc=90% lccp=80% (chain=8/10 ok_count=9) n_steps=10
+2026-04-26 04:14:52,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×0.85(prox=0.85) + 0.40×proc(0.772[fin=0.83,mean=0.68]) + 0.10×fmt(1.000) | pred='380' gold='350' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:14:52,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.53(prox=0.53) + 0.40×proc(0.513[fin=0.46,mean=0.60]) + 0.10×fmt(1.000) | pred='197.5' gold='350' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:14:52,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.50(prox=0.50) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='175' gold='350' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:14:52,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.606 = 0.50×0.80(prox=0.80) + 0.40×proc(0.270[fin=0.01,mean=0.65]) + 0.10×fmt(1.000) | pred='395' gold='350' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:14:52,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.609 = 0.50×0.80(prox=0.80) + 0.40×proc(0.279[fin=0.07,mean=0.60]) + 0.10×fmt(1.000) | pred='395' gold='350' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 60%|###### | 12/20 [02:01<00:59, 7.46s/q, loss=0.0005, mean_r=0.734, skip=6]
Iter 10 GRPO groups: 65%|######5 | 13/20 [02:01<01:09, 9.87s/q, loss=0.0005, mean_r=0.734, skip=6]2026-04-26 04:15:01,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×0.85(prox=0.85) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='54.5' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:01,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.865[fin=0.90,mean=0.81]) + 0.10×fmt(1.000) | pred='49.5' gold='50' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:15:01,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:15:01,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.904 = 0.50×0.85(prox=0.85) + 0.40×proc(0.948[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='48.5' gold='50' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:15:01,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.470 = 0.50×0.36(prox=0.36) + 0.40×proc(0.254[fin=0.02,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:15:01,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.897 = 0.50×0.85(prox=0.85) + 0.40×proc(0.929[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:15:01,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:02,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:02,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.832 = 0.50×0.85(prox=0.85) + 0.40×proc(0.767[fin=0.84,mean=0.66]) + 0.10×fmt(1.000) | pred='46.2' gold='50' | step_acc=62% lccp=50% (chain=4/8 ok_count=5) n_steps=8
+2026-04-26 04:15:02,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='105.5' gold='50' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+
Iter 10 GRPO groups: 65%|######5 | 13/20 [02:11<01:09, 9.87s/q, loss=0.0015, mean_r=0.842, skip=6]
Iter 10 GRPO groups: 70%|####### | 14/20 [02:11<00:58, 9.73s/q, loss=0.0015, mean_r=0.842, skip=6]2026-04-26 04:15:09,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:09,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.85(prox=0.85) + 0.40×proc(0.578[fin=0.63,mean=0.50]) + 0.10×fmt(1.000) | pred='39' gold='36' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:15:09,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:15:09,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:09,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.75(prox=0.75) + 0.40×proc(0.293[fin=0.08,mean=0.61]) + 0.10×fmt(1.000) | pred='30' gold='36' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:15:09,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:15:09,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:09,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.75(prox=0.75) + 0.40×proc(0.754[fin=0.83,mean=0.64]) + 0.10×fmt(1.000) | pred='30' gold='36' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 70%|####### | 14/20 [02:18<00:58, 9.73s/q, loss=0.0010, mean_r=0.903, skip=6]
Iter 10 GRPO groups: 75%|#######5 | 15/20 [02:18<00:45, 9.06s/q, loss=0.0010, mean_r=0.903, skip=6]2026-04-26 04:15:17,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.727 = 0.50×0.60(prox=0.60) + 0.40×proc(0.817[fin=0.90,mean=0.69]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.60(prox=0.60) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.680 = 0.50×0.60(prox=0.60) + 0.40×proc(0.701[fin=0.83,mean=0.51]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:15:17,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=1.00,mean=0.53]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:15:17,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.667 = 0.50×0.60(prox=0.60) + 0.40×proc(0.667[fin=0.79,mean=0.49]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:15:17,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.60(prox=0.60) + 0.40×proc(0.676[fin=0.69,mean=0.66]) + 0.10×fmt(1.000) | pred='40' gold='60' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:15:17,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.52(prox=0.52) + 0.40×proc(0.571[fin=0.77,mean=0.28]) + 0.10×fmt(1.000) | pred='32' gold='60' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:15:18,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:18,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.727 = 0.50×0.64(prox=0.64) + 0.40×proc(0.769[fin=0.92,mean=0.54]) + 0.10×fmt(1.000) | pred='43' gold='60' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:15:18,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 75%|#######5 | 15/20 [02:27<00:45, 9.06s/q, loss=0.0004, mean_r=0.773, skip=6]
Iter 10 GRPO groups: 80%|######## | 16/20 [02:27<00:35, 8.88s/q, loss=0.0004, mean_r=0.773, skip=6]2026-04-26 04:15:30,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.819[fin=0.94,mean=0.64]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:15:30,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.704 = 0.50×0.55(prox=0.55) + 0.40×proc(0.822[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='14.81' gold='25' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:15:30,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.881 = 0.50×0.85(prox=0.85) + 0.40×proc(0.891[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='23.53' gold='25' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:15:30,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.956[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:15:30,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:15:30,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.60(prox=0.60) + 0.40×proc(0.872[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:15:31,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25%' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:15:31,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.759 = 0.50×0.60(prox=0.60) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='33.3' gold='25' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+
Iter 10 GRPO groups: 80%|######## | 16/20 [02:39<00:35, 8.88s/q, loss=-0.0011, mean_r=0.835, skip=6]
Iter 10 GRPO groups: 85%|########5 | 17/20 [02:39<00:30, 10.09s/q, loss=-0.0011, mean_r=0.835, skip=6]2026-04-26 04:15:41,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.485 = 0.50×0.20(prox=0.20) + 0.40×proc(0.525[fin=0.49,mean=0.58]) + 0.10×fmt(1.000) | pred='648' gold='216' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 04:15:41,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='216' gold='216' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:15:41,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.37(prox=0.37) + 0.40×proc(0.322[fin=0.13,mean=0.62]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 04:15:41,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.450 = 0.50×0.37(prox=0.37) + 0.40×proc(0.330[fin=0.22,mean=0.50]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:15:41,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.39(prox=0.39) + 0.40×proc(0.435[fin=0.34,mean=0.58]) + 0.10×fmt(1.000) | pred='48' gold='216' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:15:41,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.406 = 0.50×0.35(prox=0.35) + 0.40×proc(0.207[fin=0.04,mean=0.46]) + 0.10×fmt(1.000) | pred='12' gold='216' | step_acc=33% lccp=33% (chain=2/6 ok_count=2) n_steps=6
+2026-04-26 04:15:42,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.506 = 0.50×0.37(prox=0.37) + 0.40×proc(0.405[fin=0.32,mean=0.54]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=38% lccp=38% (chain=3/8 ok_count=3) n_steps=8
+2026-04-26 04:15:42,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.455 = 0.50×0.39(prox=0.39) + 0.40×proc(0.237[fin=0.11,mean=0.43]) + 0.10×fmt(1.000) | pred='48' gold='216' | step_acc=43% lccp=43% (chain=3/7 ok_count=3) n_steps=7
+2026-04-26 04:15:42,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.489 = 0.50×0.20(prox=0.20) + 0.40×proc(0.670[fin=0.73,mean=0.58]) + 0.10×fmt(1.000) | pred='648' gold='216' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:15:42,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.880[fin=0.93,mean=0.81]) + 0.10×fmt(1.000) | pred='36' gold='216' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+
Iter 10 GRPO groups: 85%|########5 | 17/20 [02:51<00:30, 10.09s/q, loss=0.0008, mean_r=0.532, skip=6]
Iter 10 GRPO groups: 90%|######### | 18/20 [02:51<00:20, 10.40s/q, loss=0.0008, mean_r=0.532, skip=6]2026-04-26 04:15:49,639 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:49,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.50(prox=0.50) + 0.40×proc(0.465[fin=0.39,mean=0.57]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:15:50,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:15:50,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 10 GRPO groups: 90%|######### | 18/20 [02:59<00:20, 10.40s/q, loss=-0.0009, mean_r=0.953, skip=6]
Iter 10 GRPO groups: 95%|#########5| 19/20 [02:59<00:09, 9.73s/q, loss=-0.0009, mean_r=0.953, skip=6]2026-04-26 04:15:59,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.716 = 0.50×0.85(prox=0.85) + 0.40×proc(0.477[fin=0.41,mean=0.57]) + 0.10×fmt(1.000) | pred='55' gold='59' | step_acc=43% lccp=43% (chain=3/7 ok_count=3) n_steps=7
+2026-04-26 04:15:59,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.490[fin=0.36,mean=0.68]) + 0.10×fmt(1.000) | pred='124' gold='59' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 04:15:59,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.516 = 0.50×0.36(prox=0.36) + 0.40×proc(0.588[fin=0.68,mean=0.46]) + 0.10×fmt(1.000) | pred='111' gold='59' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+2026-04-26 04:15:59,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.25(prox=0.25) + 0.40×proc(0.583[fin=0.72,mean=0.38]) + 0.10×fmt(1.000) | pred='146.36' gold='59' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 04:15:59,635 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.63(prox=0.63) + 0.40×proc(0.829[fin=0.96,mean=0.63]) + 0.10×fmt(1.000) | pred='76' gold='59' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 04:15:59,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.576 = 0.50×0.21(prox=0.21) + 0.40×proc(0.929[fin=0.97,mean=0.87]) + 0.10×fmt(1.000) | pred='171' gold='59' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:15:59,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.36(prox=0.36) + 0.40×proc(0.791[fin=0.93,mean=0.58]) + 0.10×fmt(1.000) | pred='6.5' gold='59' | step_acc=75% lccp=0% (chain=0/8 ok_count=6) n_steps=8
+2026-04-26 04:15:59,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.562 = 0.50×0.46(prox=0.46) + 0.40×proc(0.577[fin=0.77,mean=0.29]) + 0.10×fmt(1.000) | pred='93.3' gold='59' | step_acc=29% lccp=0% (chain=0/7 ok_count=2) n_steps=7
+2026-04-26 04:15:59,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.584 = 0.50×0.38(prox=0.38) + 0.40×proc(0.733[fin=0.85,mean=0.56]) + 0.10×fmt(1.000) | pred='11' gold='59' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:16:00,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.719[fin=0.82,mean=0.57]) + 0.10×fmt(1.000) | pred='14' gold='59' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+
Iter 10 GRPO groups: 95%|#########5| 19/20 [03:08<00:09, 9.73s/q, loss=-0.0005, mean_r=0.586, skip=6]
Iter 10 GRPO groups: 100%|##########| 20/20 [03:08<00:00, 9.71s/q, loss=-0.0005, mean_r=0.586, skip=6]
Iter 10 GRPO groups: 100%|##########| 20/20 [03:08<00:00, 9.45s/q, loss=-0.0005, mean_r=0.586, skip=6]
+2026-04-26 04:16:01,527 INFO __main__ - Iter 10 | loss=0.0004 | reward mean=0.881 std=0.176 | gt_match=68.3% | grounded_acc=95.5% | step_acc=84.7% | lccp=72.2% | batch_acc=95.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.98e-06 | 189.0s
+2026-04-26 04:16:01,528 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:25, 2.18s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:21, 3.79s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<07:57, 3.25s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:11<06:55, 2.84s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:49, 2.41s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:18<08:13, 3.42s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:09, 3.43s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:24<07:22, 3.12s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:27, 3.17s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:42, 3.73s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:04, 3.48s/q, correct=8/11, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:38<07:05, 3.08s/q, correct=9/12, lccp=84.4%, score=0.924, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:50, 3.00s/q, correct=10/13, lccp=85.6%, score=0.925, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:45<07:41, 3.39s/q, correct=11/14, lccp=86.7%, score=0.930, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:47<07:03, 3.13s/q, correct=12/15, lccp=87.6%, score=0.935, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:50<06:33, 2.94s/q, correct=12/16, lccp=88.3%, score=0.911, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:54<07:11, 3.24s/q, correct=13/17, lccp=89.0%, score=0.916, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<08:48, 4.01s/q, correct=13/18, lccp=84.8%, score=0.904, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:49, 3.59s/q, correct=14/19, lccp=85.6%, score=0.909, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:06<07:52, 3.63s/q, correct=15/20, lccp=86.3%, score=0.914, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:09<07:08, 3.32s/q, correct=16/21, lccp=86.9%, score=0.918, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:14<08:20, 3.91s/q, correct=17/22, lccp=84.9%, score=0.919, step_acc=91.5%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:57, 3.76s/q, correct=17/23, lccp=82.3%, score=0.902, step_acc=88.6%]
GSM8K eval: 16%|#6 | 24/150 [01:20<07:08, 3.40s/q, correct=17/24, lccp=79.9%, score=0.886, step_acc=86.0%]
GSM8K eval: 17%|#6 | 25/150 [01:23<06:41, 3.21s/q, correct=17/25, lccp=77.7%, score=0.882, step_acc=85.6%]
GSM8K eval: 17%|#7 | 26/150 [01:27<07:19, 3.55s/q, correct=18/26, lccp=78.6%, score=0.887, step_acc=86.1%]
GSM8K eval: 18%|#8 | 27/150 [01:30<06:49, 3.33s/q, correct=18/27, lccp=79.4%, score=0.882, step_acc=86.6%]
GSM8K eval: 19%|#8 | 28/150 [01:32<06:03, 2.98s/q, correct=19/28, lccp=80.1%, score=0.887, step_acc=87.1%]
GSM8K eval: 19%|#9 | 29/150 [01:35<05:53, 2.92s/q, correct=20/29, lccp=80.8%, score=0.890, step_acc=87.5%]
GSM8K eval: 20%|## | 30/150 [01:38<06:22, 3.19s/q, correct=21/30, lccp=81.5%, score=0.894, step_acc=88.0%]
GSM8K eval: 21%|## | 31/150 [01:41<05:56, 3.00s/q, correct=22/31, lccp=82.1%, score=0.897, step_acc=88.3%]
GSM8K eval: 21%|##1 | 32/150 [01:43<05:08, 2.61s/q, correct=23/32, lccp=82.6%, score=0.899, step_acc=88.7%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:11, 2.66s/q, correct=24/33, lccp=83.1%, score=0.903, step_acc=89.1%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:45, 2.46s/q, correct=25/34, lccp=83.6%, score=0.905, step_acc=89.4%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:45, 2.49s/q, correct=26/35, lccp=84.1%, score=0.908, step_acc=89.7%]
GSM8K eval: 24%|##4 | 36/150 [01:53<05:16, 2.77s/q, correct=27/36, lccp=84.5%, score=0.911, step_acc=90.0%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:47, 2.54s/q, correct=28/37, lccp=85.0%, score=0.912, step_acc=90.2%]
GSM8K eval: 25%|##5 | 38/150 [01:59<05:00, 2.69s/q, correct=29/38, lccp=85.4%, score=0.914, step_acc=90.5%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:14, 3.37s/q, correct=30/39, lccp=85.7%, score=0.916, step_acc=90.7%]
GSM8K eval: 27%|##6 | 40/150 [02:10<07:45, 4.23s/q, correct=31/40, lccp=86.1%, score=0.919, step_acc=91.0%]
GSM8K eval: 27%|##7 | 41/150 [02:13<07:01, 3.87s/q, correct=31/41, lccp=86.4%, score=0.918, step_acc=91.2%]
GSM8K eval: 28%|##8 | 42/150 [02:18<07:40, 4.27s/q, correct=32/42, lccp=85.2%, score=0.920, step_acc=91.0%]
GSM8K eval: 29%|##8 | 43/150 [02:20<06:20, 3.56s/q, correct=33/43, lccp=85.5%, score=0.922, step_acc=91.2%]
GSM8K eval: 29%|##9 | 44/150 [02:26<07:47, 4.41s/q, correct=34/44, lccp=85.8%, score=0.923, step_acc=91.4%]
GSM8K eval: 30%|### | 45/150 [02:29<07:01, 4.01s/q, correct=35/45, lccp=86.2%, score=0.925, step_acc=91.6%]
GSM8K eval: 31%|### | 46/150 [02:34<07:21, 4.25s/q, correct=35/46, lccp=84.3%, score=0.920, step_acc=91.5%]
GSM8K eval: 31%|###1 | 47/150 [02:37<06:38, 3.87s/q, correct=36/47, lccp=84.6%, score=0.922, step_acc=91.7%]
GSM8K eval: 32%|###2 | 48/150 [02:39<05:30, 3.24s/q, correct=37/48, lccp=84.9%, score=0.923, step_acc=91.9%]
GSM8K eval: 33%|###2 | 49/150 [02:46<07:11, 4.27s/q, correct=37/49, lccp=83.8%, score=0.910, step_acc=90.6%]
GSM8K eval: 33%|###3 | 50/150 [02:49<06:32, 3.93s/q, correct=37/50, lccp=83.1%, score=0.901, step_acc=89.8%]
GSM8K eval: 34%|###4 | 51/150 [02:50<05:14, 3.18s/q, correct=38/51, lccp=83.4%, score=0.903, step_acc=90.0%]
GSM8K eval: 35%|###4 | 52/150 [02:54<05:41, 3.48s/q, correct=38/52, lccp=81.8%, score=0.903, step_acc=89.9%]
GSM8K eval: 35%|###5 | 53/150 [03:00<06:39, 4.12s/q, correct=39/53, lccp=82.2%, score=0.905, step_acc=90.1%]
GSM8K eval: 36%|###6 | 54/150 [03:02<05:44, 3.58s/q, correct=40/54, lccp=82.5%, score=0.907, step_acc=90.2%]
GSM8K eval: 37%|###6 | 55/150 [03:06<05:39, 3.57s/q, correct=41/55, lccp=82.8%, score=0.908, step_acc=90.4%]
GSM8K eval: 37%|###7 | 56/150 [03:09<05:35, 3.57s/q, correct=42/56, lccp=83.1%, score=0.910, step_acc=90.6%]
GSM8K eval: 38%|###8 | 57/150 [03:12<04:55, 3.18s/q, correct=43/57, lccp=83.4%, score=0.911, step_acc=90.8%]
GSM8K eval: 39%|###8 | 58/150 [03:16<05:16, 3.44s/q, correct=44/58, lccp=83.7%, score=0.913, step_acc=90.9%]
GSM8K eval: 39%|###9 | 59/150 [03:19<05:17, 3.49s/q, correct=44/59, lccp=82.3%, score=0.904, step_acc=89.7%]
GSM8K eval: 40%|#### | 60/150 [03:24<05:52, 3.91s/q, correct=45/60, lccp=82.6%, score=0.906, step_acc=89.9%]
GSM8K eval: 41%|#### | 61/150 [03:27<05:28, 3.69s/q, correct=46/61, lccp=82.9%, score=0.908, step_acc=90.0%]
GSM8K eval: 41%|####1 | 62/150 [03:30<05:09, 3.51s/q, correct=47/62, lccp=83.2%, score=0.909, step_acc=90.2%]
GSM8K eval: 42%|####2 | 63/150 [03:34<05:00, 3.46s/q, correct=47/63, lccp=82.9%, score=0.903, step_acc=89.8%]
GSM8K eval: 43%|####2 | 64/150 [03:37<04:40, 3.27s/q, correct=48/64, lccp=83.2%, score=0.905, step_acc=90.0%]
GSM8K eval: 43%|####3 | 65/150 [03:39<04:24, 3.11s/q, correct=49/65, lccp=83.4%, score=0.906, step_acc=90.1%]
GSM8K eval: 44%|####4 | 66/150 [03:41<03:39, 2.62s/q, correct=50/66, lccp=83.7%, score=0.907, step_acc=90.3%]
GSM8K eval: 45%|####4 | 67/150 [03:43<03:29, 2.52s/q, correct=51/67, lccp=83.9%, score=0.909, step_acc=90.4%]
GSM8K eval: 45%|####5 | 68/150 [03:46<03:30, 2.56s/q, correct=52/68, lccp=84.2%, score=0.910, step_acc=90.6%]
GSM8K eval: 46%|####6 | 69/150 [03:47<03:02, 2.25s/q, correct=53/69, lccp=84.4%, score=0.911, step_acc=90.7%]
GSM8K eval: 47%|####6 | 70/150 [03:50<03:18, 2.48s/q, correct=54/70, lccp=83.2%, score=0.912, step_acc=90.6%]
GSM8K eval: 47%|####7 | 71/150 [03:53<03:31, 2.68s/q, correct=55/71, lccp=82.0%, score=0.913, step_acc=90.4%]
GSM8K eval: 48%|####8 | 72/150 [03:55<03:01, 2.33s/q, correct=56/72, lccp=82.3%, score=0.914, step_acc=90.6%]
GSM8K eval: 49%|####8 | 73/150 [03:57<02:44, 2.13s/q, correct=57/73, lccp=82.5%, score=0.915, step_acc=90.7%]
GSM8K eval: 49%|####9 | 74/150 [04:00<03:13, 2.55s/q, correct=58/74, lccp=82.7%, score=0.917, step_acc=90.8%]
GSM8K eval: 50%|##### | 75/150 [04:02<02:52, 2.31s/q, correct=59/75, lccp=83.0%, score=0.918, step_acc=90.9%]
GSM8K eval: 51%|##### | 76/150 [04:08<04:25, 3.59s/q, correct=59/76, lccp=83.0%, score=0.913, step_acc=90.9%]
GSM8K eval: 51%|#####1 | 77/150 [04:12<04:29, 3.70s/q, correct=60/77, lccp=83.2%, score=0.914, step_acc=91.0%]
GSM8K eval: 52%|#####2 | 78/150 [04:15<03:58, 3.31s/q, correct=61/78, lccp=83.5%, score=0.915, step_acc=91.1%]
GSM8K eval: 53%|#####2 | 79/150 [04:18<03:49, 3.23s/q, correct=61/79, lccp=83.0%, score=0.910, step_acc=90.8%]
GSM8K eval: 53%|#####3 | 80/150 [04:21<03:41, 3.16s/q, correct=62/80, lccp=83.2%, score=0.911, step_acc=90.9%]
GSM8K eval: 54%|#####4 | 81/150 [04:23<03:22, 2.94s/q, correct=63/81, lccp=83.5%, score=0.912, step_acc=91.0%]
GSM8K eval: 55%|#####4 | 82/150 [04:26<03:20, 2.94s/q, correct=64/82, lccp=83.7%, score=0.913, step_acc=91.1%]
GSM8K eval: 55%|#####5 | 83/150 [04:29<03:14, 2.91s/q, correct=65/83, lccp=83.9%, score=0.914, step_acc=91.3%]
GSM8K eval: 56%|#####6 | 84/150 [04:32<03:07, 2.84s/q, correct=66/84, lccp=84.0%, score=0.915, step_acc=91.4%]
GSM8K eval: 57%|#####6 | 85/150 [04:36<03:23, 3.14s/q, correct=67/85, lccp=84.2%, score=0.916, step_acc=91.5%]
GSM8K eval: 57%|#####7 | 86/150 [04:39<03:27, 3.24s/q, correct=68/86, lccp=84.4%, score=0.917, step_acc=91.6%]
GSM8K eval: 58%|#####8 | 87/150 [04:45<04:08, 3.95s/q, correct=69/87, lccp=84.6%, score=0.918, step_acc=91.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:47<03:25, 3.32s/q, correct=70/88, lccp=84.8%, score=0.919, step_acc=91.7%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:11, 3.13s/q, correct=71/89, lccp=84.9%, score=0.920, step_acc=91.8%]
GSM8K eval: 60%|###### | 90/150 [04:52<02:54, 2.91s/q, correct=72/90, lccp=85.1%, score=0.921, step_acc=91.9%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:17, 3.35s/q, correct=73/91, lccp=85.3%, score=0.922, step_acc=92.0%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:09, 3.27s/q, correct=74/92, lccp=85.4%, score=0.922, step_acc=92.1%]
GSM8K eval: 62%|######2 | 93/150 [05:07<04:17, 4.52s/q, correct=75/93, lccp=85.6%, score=0.923, step_acc=92.2%]
GSM8K eval: 63%|######2 | 94/150 [05:38<11:51, 12.71s/q, correct=75/94, lccp=84.7%, score=0.916, step_acc=92.2%]
GSM8K eval: 63%|######3 | 95/150 [05:43<09:27, 10.31s/q, correct=76/95, lccp=83.8%, score=0.916, step_acc=91.6%]
GSM8K eval: 64%|######4 | 96/150 [05:48<07:48, 8.67s/q, correct=77/96, lccp=84.0%, score=0.917, step_acc=91.7%]
GSM8K eval: 65%|######4 | 97/150 [05:51<06:03, 6.86s/q, correct=77/97, lccp=83.3%, score=0.916, step_acc=91.3%]
GSM8K eval: 65%|######5 | 98/150 [05:55<05:14, 6.04s/q, correct=77/98, lccp=82.9%, score=0.912, step_acc=91.1%]
GSM8K eval: 66%|######6 | 99/150 [05:57<04:11, 4.94s/q, correct=78/99, lccp=83.1%, score=0.913, step_acc=91.2%]
GSM8K eval: 67%|######6 | 100/150 [05:59<03:21, 4.02s/q, correct=79/100, lccp=82.3%, score=0.913, step_acc=90.9%]
GSM8K eval: 67%|######7 | 101/150 [06:02<03:01, 3.70s/q, correct=79/101, lccp=82.0%, score=0.909, step_acc=90.8%]
GSM8K eval: 68%|######8 | 102/150 [06:03<02:25, 3.03s/q, correct=80/102, lccp=82.1%, score=0.910, step_acc=90.8%]
GSM8K eval: 69%|######8 | 103/150 [06:05<02:07, 2.72s/q, correct=81/103, lccp=82.3%, score=0.911, step_acc=90.9%]
GSM8K eval: 69%|######9 | 104/150 [06:10<02:31, 3.30s/q, correct=82/104, lccp=82.5%, score=0.912, step_acc=91.0%]
GSM8K eval: 70%|####### | 105/150 [06:12<02:18, 3.07s/q, correct=83/105, lccp=82.6%, score=0.913, step_acc=91.1%]
GSM8K eval: 71%|####### | 106/150 [06:14<01:54, 2.59s/q, correct=84/106, lccp=82.8%, score=0.913, step_acc=91.2%]
GSM8K eval: 71%|#######1 | 107/150 [06:15<01:37, 2.26s/q, correct=85/107, lccp=83.0%, score=0.914, step_acc=91.3%]
GSM8K eval: 72%|#######2 | 108/150 [06:18<01:40, 2.39s/q, correct=86/108, lccp=83.1%, score=0.915, step_acc=91.3%]
GSM8K eval: 73%|#######2 | 109/150 [06:23<02:10, 3.17s/q, correct=86/109, lccp=82.7%, score=0.914, step_acc=91.3%]
GSM8K eval: 73%|#######3 | 110/150 [06:25<01:55, 2.88s/q, correct=87/110, lccp=82.8%, score=0.914, step_acc=91.4%]
GSM8K eval: 74%|#######4 | 111/150 [06:27<01:38, 2.52s/q, correct=88/111, lccp=83.0%, score=0.915, step_acc=91.4%]
GSM8K eval: 75%|#######4 | 112/150 [06:32<02:06, 3.32s/q, correct=88/112, lccp=83.1%, score=0.915, step_acc=91.5%]
GSM8K eval: 75%|#######5 | 113/150 [06:34<01:45, 2.85s/q, correct=89/113, lccp=83.3%, score=0.915, step_acc=91.6%]
GSM8K eval: 76%|#######6 | 114/150 [06:39<02:08, 3.58s/q, correct=90/114, lccp=82.8%, score=0.916, step_acc=91.5%]
GSM8K eval: 77%|#######6 | 115/150 [06:42<01:57, 3.35s/q, correct=91/115, lccp=83.0%, score=0.917, step_acc=91.6%]
GSM8K eval: 77%|#######7 | 116/150 [06:45<01:48, 3.19s/q, correct=92/116, lccp=83.1%, score=0.917, step_acc=91.7%]
GSM8K eval: 78%|#######8 | 117/150 [06:51<02:10, 3.96s/q, correct=93/117, lccp=83.2%, score=0.918, step_acc=91.8%]
GSM8K eval: 79%|#######8 | 118/150 [06:55<02:11, 4.10s/q, correct=93/118, lccp=82.5%, score=0.916, step_acc=91.7%]
GSM8K eval: 79%|#######9 | 119/150 [06:59<02:01, 3.92s/q, correct=93/119, lccp=82.7%, score=0.914, step_acc=91.7%]
GSM8K eval: 80%|######## | 120/150 [07:01<01:47, 3.59s/q, correct=94/120, lccp=82.8%, score=0.915, step_acc=91.8%]
GSM8K eval: 81%|######## | 121/150 [07:04<01:38, 3.41s/q, correct=95/121, lccp=83.0%, score=0.916, step_acc=91.9%]
GSM8K eval: 81%|########1 | 122/150 [07:07<01:32, 3.30s/q, correct=96/122, lccp=83.1%, score=0.916, step_acc=92.0%]
GSM8K eval: 82%|########2 | 123/150 [07:11<01:28, 3.29s/q, correct=96/123, lccp=82.8%, score=0.916, step_acc=91.9%]
GSM8K eval: 83%|########2 | 124/150 [07:13<01:17, 2.96s/q, correct=97/124, lccp=82.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 83%|########3 | 125/150 [07:15<01:07, 2.68s/q, correct=98/125, lccp=83.0%, score=0.917, step_acc=92.0%]
GSM8K eval: 84%|########4 | 126/150 [07:18<01:04, 2.70s/q, correct=99/126, lccp=83.2%, score=0.918, step_acc=92.0%]
GSM8K eval: 85%|########4 | 127/150 [07:22<01:14, 3.22s/q, correct=100/127, lccp=83.3%, score=0.919, step_acc=92.1%]
GSM8K eval: 85%|########5 | 128/150 [07:25<01:08, 3.13s/q, correct=101/128, lccp=83.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 86%|########6 | 129/150 [07:29<01:08, 3.26s/q, correct=102/129, lccp=83.6%, score=0.920, step_acc=92.2%]
GSM8K eval: 87%|########6 | 130/150 [07:30<00:56, 2.84s/q, correct=103/130, lccp=83.7%, score=0.920, step_acc=92.3%]
GSM8K eval: 87%|########7 | 131/150 [07:35<01:03, 3.37s/q, correct=104/131, lccp=83.8%, score=0.921, step_acc=92.4%]
GSM8K eval: 88%|########8 | 132/150 [07:37<00:51, 2.84s/q, correct=105/132, lccp=83.9%, score=0.922, step_acc=92.4%]
GSM8K eval: 89%|########8 | 133/150 [07:39<00:48, 2.84s/q, correct=106/133, lccp=84.1%, score=0.922, step_acc=92.5%]
GSM8K eval: 89%|########9 | 134/150 [07:44<00:52, 3.29s/q, correct=107/134, lccp=84.2%, score=0.923, step_acc=92.5%]
GSM8K eval: 90%|######### | 135/150 [07:47<00:47, 3.15s/q, correct=108/135, lccp=84.3%, score=0.923, step_acc=92.6%]
GSM8K eval: 91%|######### | 136/150 [07:51<00:49, 3.51s/q, correct=108/136, lccp=83.9%, score=0.922, step_acc=92.4%]
GSM8K eval: 91%|#########1| 137/150 [07:58<00:57, 4.45s/q, correct=109/137, lccp=84.0%, score=0.923, step_acc=92.4%]
GSM8K eval: 92%|#########2| 138/150 [08:02<00:51, 4.28s/q, correct=110/138, lccp=84.2%, score=0.923, step_acc=92.5%]
GSM8K eval: 93%|#########2| 139/150 [08:05<00:43, 4.00s/q, correct=111/139, lccp=84.3%, score=0.924, step_acc=92.6%]
GSM8K eval: 93%|#########3| 140/150 [08:09<00:40, 4.04s/q, correct=111/140, lccp=84.1%, score=0.920, step_acc=92.4%]
GSM8K eval: 94%|#########3| 141/150 [08:13<00:35, 3.95s/q, correct=112/141, lccp=84.3%, score=0.921, step_acc=92.4%]
GSM8K eval: 95%|#########4| 142/150 [08:16<00:30, 3.76s/q, correct=113/142, lccp=84.4%, score=0.922, step_acc=92.5%]
GSM8K eval: 95%|#########5| 143/150 [08:18<00:23, 3.29s/q, correct=114/143, lccp=84.5%, score=0.922, step_acc=92.5%]
GSM8K eval: 96%|#########6| 144/150 [08:21<00:17, 3.00s/q, correct=115/144, lccp=84.6%, score=0.923, step_acc=92.6%]
GSM8K eval: 97%|#########6| 145/150 [08:24<00:15, 3.04s/q, correct=115/145, lccp=84.0%, score=0.919, step_acc=92.1%]
GSM8K eval: 97%|#########7| 146/150 [08:27<00:12, 3.01s/q, correct=116/146, lccp=84.1%, score=0.920, step_acc=92.1%]
GSM8K eval: 98%|#########8| 147/150 [08:30<00:09, 3.22s/q, correct=117/147, lccp=84.2%, score=0.920, step_acc=92.2%]
GSM8K eval: 99%|#########8| 148/150 [08:34<00:06, 3.40s/q, correct=118/148, lccp=84.3%, score=0.921, step_acc=92.2%]
GSM8K eval: 99%|#########9| 149/150 [08:38<00:03, 3.44s/q, correct=119/149, lccp=84.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 100%|##########| 150/150 [08:43<00:00, 3.86s/q, correct=119/150, lccp=84.3%, score=0.920, step_acc=92.1%]
GSM8K eval: 100%|##########| 150/150 [08:43<00:00, 3.49s/q, correct=119/150, lccp=84.3%, score=0.920, step_acc=92.1%]
+2026-04-26 04:24:44,562 INFO __main__ - Training Score [iter 10]: 0.9199 (best=0.9192) | n=150
+2026-04-26 04:24:44,563 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(0.998)
+2026-04-26 04:24:44,563 INFO __main__ - Process score : prm_mean=0.907 prm_final=0.941 → weighted=0.927
+2026-04-26 04:24:44,563 INFO __main__ - Step accuracy : 92.0% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 04:24:44,563 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 04:24:44,563 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 04:24:47,497 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9199 > 0.9192)
+2026-04-26 04:24:49,713 INFO __main__ - ======================================================================
+2026-04-26 04:24:49,713 INFO __main__ - GRPO ITERATION 11/60
+2026-04-26 04:24:49,713 INFO __main__ - ======================================================================
+2026-04-26 04:24:49,734 INFO __main__ - LR this iteration: 4.98e-06 | T=0.732 | MATH ratio=30%
+
Iter 11 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:24:54,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:54,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:54,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:54,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:55,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:55,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:55,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:24:55,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:24:55,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:55,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='300' gold='300' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 0%| | 0/20 [00:05, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 11 GRPO groups: 5%|5 | 1/20 [00:05<01:48, 5.69s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:24:58,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:58,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:58,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.34(prox=0.34) + 0.40×proc(0.277[fin=0.10,mean=0.54]) + 0.10×fmt(1.000) | pred='1.05' gold='21' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:24:59,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:24:59,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.926 = 0.50×1.00(exact) + 0.40×proc(0.815[fin=0.98,mean=0.57]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:24:59,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:24:59,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 5%|5 | 1/20 [00:11<01:48, 5.69s/q, loss=-0.0009, mean_r=0.935, skip=1]
Iter 11 GRPO groups: 10%|# | 2/20 [00:11<01:42, 5.71s/q, loss=-0.0009, mean_r=0.935, skip=1]2026-04-26 04:25:06,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:06,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:07,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:07,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='88' gold='88' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 10%|# | 2/20 [00:17<01:42, 5.71s/q, loss=0var, mean_r=0.999, skip=2]
Iter 11 GRPO groups: 15%|#5 | 3/20 [00:17<01:40, 5.89s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 04:25:16,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:16,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:25:16,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.85(prox=0.85) + 0.40×proc(0.666[fin=0.61,mean=0.75]) + 0.10×fmt(1.000) | pred='444' gold='445' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:17,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.97,mean=0.73]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:25:17,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:17,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:25:17,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:17,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:25:17,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:17,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='445' gold='445' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 11 GRPO groups: 15%|#5 | 3/20 [00:29<01:40, 5.89s/q, loss=0.0002, mean_r=0.968, skip=2]
Iter 11 GRPO groups: 20%|## | 4/20 [00:29<02:12, 8.27s/q, loss=0.0002, mean_r=0.968, skip=2]2026-04-26 04:25:27,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.620 = 0.50×0.33(prox=0.33) + 0.40×proc(0.884[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='0' gold='10000' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:25:27,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:27,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='$10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:27,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:25:27,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.907[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:25:28,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,238 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:28,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='10000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 11 GRPO groups: 20%|## | 4/20 [00:40<02:12, 8.27s/q, loss=-0.0001, mean_r=0.958, skip=2]
Iter 11 GRPO groups: 25%|##5 | 5/20 [00:40<02:17, 9.15s/q, loss=-0.0001, mean_r=0.958, skip=2]2026-04-26 04:25:38,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$32.00' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.08(prox=0.08) + 0.40×proc(0.792[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='212.8' gold='32' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:25:38,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:38,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:25:39,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 25%|##5 | 5/20 [00:50<02:17, 9.15s/q, loss=-0.0008, mean_r=0.945, skip=2]
Iter 11 GRPO groups: 30%|### | 6/20 [00:50<02:15, 9.65s/q, loss=-0.0008, mean_r=0.945, skip=2]2026-04-26 04:25:44,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:44,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:45,505 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:25:45,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 30%|### | 6/20 [00:55<02:15, 9.65s/q, loss=0var, mean_r=0.993, skip=3]
Iter 11 GRPO groups: 35%|###5 | 7/20 [00:55<01:46, 8.18s/q, loss=0var, mean_r=0.993, skip=3]2026-04-26 04:25:48,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:48,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.787 = 0.50×0.85(prox=0.85) + 0.40×proc(0.655[fin=0.83,mean=0.39]) + 0.10×fmt(1.000) | pred='-6.67' gold='-7' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:25:49,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.396 = 0.50×0.00(prox=0.00) + 0.40×proc(0.741[fin=0.92,mean=0.47]) + 0.10×fmt(1.000) | pred='-6 2/3' gold='-7' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:25:49,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:25:49,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 11 GRPO groups: 35%|###5 | 7/20 [01:01<01:46, 8.18s/q, loss=-0.0023, mean_r=0.911, skip=3]
Iter 11 GRPO groups: 40%|#### | 8/20 [01:01<01:26, 7.24s/q, loss=-0.0023, mean_r=0.911, skip=3]2026-04-26 04:25:53,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,685 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:25:53,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 40%|#### | 8/20 [01:04<01:26, 7.24s/q, loss=0var, mean_r=0.998, skip=4]
Iter 11 GRPO groups: 45%|####5 | 9/20 [01:04<01:04, 5.90s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 04:26:01,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:01,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:26:01,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,155 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:26:02,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:02,497 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:02,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 11 GRPO groups: 45%|####5 | 9/20 [01:12<01:04, 5.90s/q, loss=0var, mean_r=0.998, skip=5]
Iter 11 GRPO groups: 50%|##### | 10/20 [01:12<01:07, 6.78s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:26:07,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,317 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:07,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=0.96,mean=0.94]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:07,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:07,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:07,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:07,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:07,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:07,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 50%|##### | 10/20 [01:18<01:07, 6.78s/q, loss=0var, mean_r=0.991, skip=6]
Iter 11 GRPO groups: 55%|#####5 | 11/20 [01:18<00:57, 6.35s/q, loss=0var, mean_r=0.991, skip=6]2026-04-26 04:26:12,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:12,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:13,266 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 55%|#####5 | 11/20 [01:23<00:57, 6.35s/q, loss=0var, mean_r=0.999, skip=7]
Iter 11 GRPO groups: 60%|###### | 12/20 [01:23<00:48, 6.03s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 04:26:20,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:20,614 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=0.97,mean=0.92]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:20,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.93,mean=0.96]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:20,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:21,077 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.687 = 0.50×0.43(prox=0.43) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='18' gold='54' | step_acc=86% lccp=0% (chain=0/7 ok_count=6) n_steps=7
+2026-04-26 04:26:21,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=0.98,mean=0.91]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 04:26:21,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:21,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 60%|###### | 12/20 [01:33<00:48, 6.03s/q, loss=-0.0003, mean_r=0.957, skip=7]
Iter 11 GRPO groups: 65%|######5 | 13/20 [01:33<00:49, 7.09s/q, loss=-0.0003, mean_r=0.957, skip=7]2026-04-26 04:26:27,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:27,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:27,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:27,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:27,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:28,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:28,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 11 GRPO groups: 65%|######5 | 13/20 [01:38<00:49, 7.09s/q, loss=0var, mean_r=0.995, skip=8]
Iter 11 GRPO groups: 70%|####### | 14/20 [01:38<00:39, 6.55s/q, loss=0var, mean_r=0.995, skip=8]2026-04-26 04:26:35,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.518 = 0.50×0.22(prox=0.22) + 0.40×proc(0.586[fin=0.57,mean=0.60]) + 0.10×fmt(1.000) | pred='16.75' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.821 = 0.50×0.71(prox=0.71) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:35,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:35,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:35,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.807 = 0.50×0.71(prox=0.71) + 0.40×proc(0.886[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.802 = 0.50×0.71(prox=0.71) + 0.40×proc(0.872[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.850[fin=0.94,mean=0.72]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:26:35,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.808 = 0.50×0.71(prox=0.71) + 0.40×proc(0.889[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='4.75' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:26:35,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.50(prox=0.50) + 0.40×proc(0.443[fin=0.38,mean=0.53]) + 0.10×fmt(1.000) | pred='9' gold='6' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 11 GRPO groups: 70%|####### | 14/20 [01:47<00:39, 6.55s/q, loss=-0.0002, mean_r=0.819, skip=8]
Iter 11 GRPO groups: 75%|#######5 | 15/20 [01:47<00:36, 7.35s/q, loss=-0.0002, mean_r=0.819, skip=8]2026-04-26 04:26:42,108 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:42,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:42,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:26:42,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:42,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:26:42,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:26:42,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 11 GRPO groups: 75%|#######5 | 15/20 [01:53<00:36, 7.35s/q, loss=0var, mean_r=1.000, skip=9]
Iter 11 GRPO groups: 80%|######## | 16/20 [01:53<00:27, 6.81s/q, loss=0var, mean_r=1.000, skip=9]2026-04-26 04:26:48,087 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.877[fin=1.00,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:26:48,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:48,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:48,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 80%|######## | 16/20 [01:59<00:27, 6.81s/q, loss=0var, mean_r=0.991, skip=10]
Iter 11 GRPO groups: 85%|########5 | 17/20 [01:59<00:19, 6.56s/q, loss=0var, mean_r=0.991, skip=10]2026-04-26 04:26:54,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:54,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:26:54,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:26:54,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:54,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:26:55,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:26:55,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='11' gold='11' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 85%|########5 | 17/20 [02:05<00:19, 6.56s/q, loss=0var, mean_r=0.992, skip=11]
Iter 11 GRPO groups: 90%|######### | 18/20 [02:05<00:12, 6.48s/q, loss=0var, mean_r=0.992, skip=11]2026-04-26 04:27:00,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:00,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:00,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='110.00' gold='110' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 11 GRPO groups: 90%|######### | 18/20 [02:11<00:12, 6.48s/q, loss=0var, mean_r=0.996, skip=12]
Iter 11 GRPO groups: 95%|#########5| 19/20 [02:11<00:06, 6.28s/q, loss=0var, mean_r=0.996, skip=12]2026-04-26 04:27:09,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:27:09,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.784[fin=0.92,mean=0.58]) + 0.10×fmt(1.000) | pred='81' gold='486' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:27:09,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:09,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:09,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:10,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:10,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:10,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='486' gold='486' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 11 GRPO groups: 95%|#########5| 19/20 [02:21<00:06, 6.28s/q, loss=-0.0002, mean_r=0.953, skip=12]
Iter 11 GRPO groups: 100%|##########| 20/20 [02:21<00:00, 7.61s/q, loss=-0.0002, mean_r=0.953, skip=12]
Iter 11 GRPO groups: 100%|##########| 20/20 [02:21<00:00, 7.10s/q, loss=-0.0002, mean_r=0.953, skip=12]
+2026-04-26 04:27:11,680 INFO __main__ - Iter 11 | loss=-0.0006 | reward mean=0.970 std=0.098 | gt_match=93.0% | grounded_acc=98.5% | step_acc=96.6% | lccp=92.2% | batch_acc=98.5% | phase=GROUNDED_ONLY sp_ratio=0% | groups=8 skipped=12(0var=12) | lr=4.96e-06 | 142.0s
+2026-04-26 04:27:11,680 WARNING __main__ - STARVATION: 60% of groups skipped (zero variance). grounded_acc=98.5% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:27:11,681 INFO __main__ - ======================================================================
+2026-04-26 04:27:11,681 INFO __main__ - GRPO ITERATION 12/60
+2026-04-26 04:27:11,681 INFO __main__ - ======================================================================
+2026-04-26 04:27:11,701 INFO __main__ - LR this iteration: 4.96e-06 | T=0.725 | MATH ratio=30%
+
Iter 12 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:27:14,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,281 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:15,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 0%| | 0/20 [00:03, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 12 GRPO groups: 5%|5 | 1/20 [00:03<01:15, 3.99s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 04:27:21,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,460 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,543 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,627 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:27:21,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:21,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:27:21,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=0.94,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:22,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:22,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 5%|5 | 1/20 [00:10<01:15, 3.99s/q, loss=0var, mean_r=0.983, skip=2]
Iter 12 GRPO groups: 10%|# | 2/20 [00:10<01:37, 5.43s/q, loss=0var, mean_r=0.983, skip=2]2026-04-26 04:27:27,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.614 = 0.50×0.44(prox=0.44) + 0.40×proc(0.737[fin=0.92,mean=0.47]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:27:27,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.932[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:27,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.918[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:27:27,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.952[fin=0.98,mean=0.90]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:27,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.876[fin=0.96,mean=0.74]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:27:28,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.823[fin=0.97,mean=0.60]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:27:28,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.896[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:27:28,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:28,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.940[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='1040000' gold='2880000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:28,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×0.85(prox=0.85) + 0.40×proc(0.836[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='3120000' gold='2880000' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+
Iter 12 GRPO groups: 10%|# | 2/20 [00:18<01:37, 5.43s/q, loss=0.0017, mean_r=0.587, skip=2]
Iter 12 GRPO groups: 15%|#5 | 3/20 [00:18<01:49, 6.47s/q, loss=0.0017, mean_r=0.587, skip=2]2026-04-26 04:27:37,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.515 = 0.50×0.02(prox=0.02) + 0.40×proc(0.851[fin=0.89,mean=0.79]) + 0.10×fmt(1.000) | pred='900' gold='30' | step_acc=89% lccp=44% (chain=4/9 ok_count=8) n_steps=9
+2026-04-26 04:27:37,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 04:27:37,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:27:37,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:38,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.448 = 0.50×0.02(prox=0.02) + 0.40×proc(0.795[fin=0.89,mean=0.65]) + 0.10×fmt(1.000) | pred='900' gold='30' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 04:27:38,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:38,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:38,252 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:27:38,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:27:38,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.440 = 0.50×0.20(prox=0.20) + 0.40×proc(0.411[fin=0.29,mean=0.59]) + 0.10×fmt(1.000) | pred='90' gold='30' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 12 GRPO groups: 15%|#5 | 3/20 [00:28<01:49, 6.47s/q, loss=-0.0004, mean_r=0.837, skip=2]
Iter 12 GRPO groups: 20%|## | 4/20 [00:28<02:05, 7.87s/q, loss=-0.0004, mean_r=0.837, skip=2]2026-04-26 04:27:48,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.85(prox=0.85) + 0.40×proc(0.516[fin=0.44,mean=0.63]) + 0.10×fmt(1.000) | pred='11' gold='12' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:27:48,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:48,628 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:48,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.90,mean=0.86]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:48,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=0.94,mean=0.84]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:27:48,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:48,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.75(prox=0.75) + 0.40×proc(0.790[fin=0.87,mean=0.67]) + 0.10×fmt(1.000) | pred='10' gold='12' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:27:49,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:27:49,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:27:49,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+
Iter 12 GRPO groups: 20%|## | 4/20 [00:38<02:05, 7.87s/q, loss=0.0006, mean_r=0.937, skip=2]
Iter 12 GRPO groups: 25%|##5 | 5/20 [00:38<02:13, 8.92s/q, loss=0.0006, mean_r=0.937, skip=2]2026-04-26 04:27:53,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:53,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:27:54,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 25%|##5 | 5/20 [00:42<02:13, 8.92s/q, loss=0var, mean_r=1.000, skip=3]
Iter 12 GRPO groups: 30%|### | 6/20 [00:42<01:40, 7.15s/q, loss=0var, mean_r=1.000, skip=3]2026-04-26 04:28:05,090 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.00(prox=0.00) + 0.40×proc(0.909[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='$1175' gold='1100' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 04:28:05,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.490 = 0.50×0.00(prox=0.00) + 0.40×proc(0.834[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='$2400' gold='1100' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 04:28:05,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.00(prox=0.00) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='$900' gold='1100' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:28:05,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1100' gold='1100' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 04:28:05,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='1175' gold='1100' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:28:05,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.76(prox=0.76) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='1275' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.814 = 0.50×0.76(prox=0.76) + 0.40×proc(0.836[fin=0.96,mean=0.64]) + 0.10×fmt(1.000) | pred='1275' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.13(prox=0.13) + 0.40×proc(0.529[fin=0.46,mean=0.64]) + 0.10×fmt(1.000) | pred='4700' gold='1100' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:28:05,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1100' gold='1100' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:28:05,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.334 = 0.50×0.00(prox=0.00) + 0.40×proc(0.397[fin=0.21,mean=0.67]) + 0.10×fmt(1.000) | pred='$5500' gold='1100' | step_acc=62% lccp=50% (chain=4/8 ok_count=5) n_steps=8
+
Iter 12 GRPO groups: 30%|### | 6/20 [00:55<01:40, 7.15s/q, loss=-0.0003, mean_r=0.688, skip=3]
Iter 12 GRPO groups: 35%|###5 | 7/20 [00:55<01:57, 9.07s/q, loss=-0.0003, mean_r=0.688, skip=3]2026-04-26 04:28:10,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.542 = 0.50×0.60(prox=0.60) + 0.40×proc(0.354[fin=0.44,mean=0.23]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:28:10,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×0.43(prox=0.43) + 0.40×proc(0.826[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:10,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:28:10,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.18(prox=0.18) + 0.40×proc(0.635[fin=0.80,mean=0.39]) + 0.10×fmt(1.000) | pred='60' gold='18' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:28:10,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.47(prox=0.47) + 0.40×proc(0.391[fin=0.52,mean=0.21]) + 0.10×fmt(1.000) | pred='8' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.426 = 0.50×0.27(prox=0.27) + 0.40×proc(0.348[fin=0.29,mean=0.44]) + 0.10×fmt(1.000) | pred='42' gold='18' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.505 = 0.50×0.33(prox=0.33) + 0.40×proc(0.596[fin=0.79,mean=0.31]) + 0.10×fmt(1.000) | pred='0' gold='18' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:11,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.60(prox=0.60) + 0.40×proc(0.148[fin=0.10,mean=0.21]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:28:11,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.60(prox=0.60) + 0.40×proc(0.875[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:28:11,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.14(prox=0.14) + 0.40×proc(0.401[fin=0.42,mean=0.37]) + 0.10×fmt(1.000) | pred='72' gold='18' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+
Iter 12 GRPO groups: 35%|###5 | 7/20 [01:01<01:57, 9.07s/q, loss=0.0018, mean_r=0.557, skip=3]
Iter 12 GRPO groups: 40%|#### | 8/20 [01:01<01:34, 7.91s/q, loss=0.0018, mean_r=0.557, skip=3]2026-04-26 04:28:32,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.683 = 0.50×0.60(prox=0.60) + 0.40×proc(0.707[fin=0.84,mean=0.51]) + 0.10×fmt(1.000) | pred='5.2' gold='7.8000' | step_acc=50% lccp=0% (chain=0/12 ok_count=6) n_steps=12
+2026-04-26 04:28:32,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(0.650) | pred='7.8' gold='7.8000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:28:32,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.316 = 0.50×0.24(prox=0.24) + 0.40×proc(0.143[fin=0.11,mean=0.19]) + 0.10×fmt(1.000) | pred='20.0' gold='7.8000' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:28:32,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.22(prox=0.22) + 0.40×proc(0.800[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='21.7' gold='7.8000' | step_acc=75% lccp=25% (chain=2/8 ok_count=6) n_steps=8
+2026-04-26 04:28:32,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.721 = 0.50×0.60(prox=0.60) + 0.40×proc(0.803[fin=0.98,mean=0.53]) + 0.10×fmt(1.000) | pred='10.4' gold='7.8000' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:28:33,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='7.8' gold='7.8000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:28:33,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='7.8' gold='7.8000' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:28:33,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.43(prox=0.43) + 0.40×proc(0.184[fin=0.11,mean=0.29]) + 0.10×fmt(1.000) | pred='13.0' gold='7.8000' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:28:33,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.43(prox=0.43) + 0.40×proc(0.525[fin=0.68,mean=0.29]) + 0.10×fmt(1.000) | pred='13.0' gold='7.8000' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:28:33,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.330 = 0.50×0.27(prox=0.27) + 0.40×proc(0.241[fin=0.32,mean=0.13]) + 0.10×fmt(1.000) | pred='18.5' gold='7.8000' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+
Iter 12 GRPO groups: 40%|#### | 8/20 [01:23<01:34, 7.91s/q, loss=-0.0002, mean_r=0.646, skip=3]
Iter 12 GRPO groups: 45%|####5 | 9/20 [01:23<02:15, 12.36s/q, loss=-0.0002, mean_r=0.646, skip=3]2026-04-26 04:28:37,444 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:37,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:38,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:38,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='450' gold='450' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 45%|####5 | 9/20 [01:26<02:15, 12.36s/q, loss=0var, mean_r=0.997, skip=4]
Iter 12 GRPO groups: 50%|##### | 10/20 [01:26<01:35, 9.53s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 04:28:53,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.899[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:28:53,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.835[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=0% (chain=0/10 ok_count=5) n_steps=10
+2026-04-26 04:28:53,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.304 = 0.50×0.00(prox=0.00) + 0.40×proc(0.463[fin=0.38,mean=0.59]) + 0.10×fmt(1.000) | pred='3/2' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:28:53,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.869 = 0.50×1.00(exact) + 0.40×proc(0.672[fin=0.78,mean=0.51]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:28:53,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:28:53,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:28:53,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:28:53,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.943 = 0.50×1.00(exact) + 0.40×proc(0.858[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:28:54,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:28:54,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 12 GRPO groups: 50%|##### | 10/20 [01:44<01:35, 9.53s/q, loss=-0.0003, mean_r=0.899, skip=4]
Iter 12 GRPO groups: 55%|#####5 | 11/20 [01:44<01:47, 11.99s/q, loss=-0.0003, mean_r=0.899, skip=4]2026-04-26 04:28:58,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,213 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,372 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:28:58,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:28:58,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 55%|#####5 | 11/20 [01:47<01:47, 11.99s/q, loss=0var, mean_r=0.996, skip=5]
Iter 12 GRPO groups: 60%|###### | 12/20 [01:47<01:14, 9.30s/q, loss=0var, mean_r=0.996, skip=5]2026-04-26 04:29:08,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,729 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.931 = 0.50×1.00(exact) + 0.40×proc(0.828[fin=0.91,mean=0.70]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:29:08,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:08,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:08,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:09,082 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-7' gold='-7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 12 GRPO groups: 60%|###### | 12/20 [01:58<01:14, 9.30s/q, loss=-0.0017, mean_r=0.947, skip=5]
Iter 12 GRPO groups: 65%|######5 | 13/20 [01:58<01:10, 10.02s/q, loss=-0.0017, mean_r=0.947, skip=5]2026-04-26 04:29:16,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.20(prox=0.20) + 0.40×proc(0.357[fin=0.16,mean=0.66]) + 0.10×fmt(1.000) | pred='-9' gold='9' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 04:29:16,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:16,290 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.60(prox=0.60) + 0.40×proc(0.738[fin=0.82,mean=0.62]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.60(prox=0.60) + 0.40×proc(0.773[fin=0.89,mean=0.60]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.60(prox=0.60) + 0.40×proc(0.792[fin=0.93,mean=0.59]) + 0.10×fmt(1.000) | pred='6' gold='9' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:29:16,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.734[fin=0.87,mean=0.53]) + 0.10×fmt(1.000) | pred='19.5' gold='9' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 04:29:16,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:29:16,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.860[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='3' gold='9' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:29:16,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:16,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+
Iter 12 GRPO groups: 65%|######5 | 13/20 [02:06<01:10, 10.02s/q, loss=-0.0007, mean_r=0.761, skip=5]
Iter 12 GRPO groups: 70%|####### | 14/20 [02:06<00:56, 9.34s/q, loss=-0.0007, mean_r=0.761, skip=5]2026-04-26 04:29:22,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.307 = 0.50×0.33(prox=0.33) + 0.40×proc(0.101[fin=0.10,mean=0.10]) + 0.10×fmt(1.000) | pred='0' gold='90' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:29:22,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:22,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.273 = 0.50×0.36(prox=0.36) + 0.40×proc(0.069[fin=0.07,mean=0.06]) + 0.10×fmt(0.650) | pred='10' gold='90' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:29:22,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:22,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:29:22,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:23,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 12 GRPO groups: 70%|####### | 14/20 [02:12<00:56, 9.34s/q, loss=0.0038, mean_r=0.853, skip=5]
Iter 12 GRPO groups: 75%|#######5 | 15/20 [02:12<00:41, 8.38s/q, loss=0.0038, mean_r=0.853, skip=5]2026-04-26 04:29:42,580 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.433 = 0.50×0.37(prox=0.37) + 0.40×proc(0.189[fin=0.02,mean=0.44]) + 0.10×fmt(1.000) | pred='9' gold='69' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:29:42,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.638 = 0.50×0.83(prox=0.83) + 0.40×proc(0.307[fin=0.17,mean=0.51]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:29:42,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.736 = 0.50×0.83(prox=0.83) + 0.40×proc(0.550[fin=0.62,mean=0.44]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:29:42,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:42,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:29:43,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:29:43,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:29:43,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.83(prox=0.83) + 0.40×proc(0.600[fin=0.59,mean=0.61]) + 0.10×fmt(1.000) | pred='76' gold='69' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:29:43,284 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='69' gold='69' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:43,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.825 = 0.50×0.85(prox=0.85) + 0.40×proc(0.750[fin=0.85,mean=0.60]) + 0.10×fmt(1.000) | pred='68' gold='69' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 12 GRPO groups: 75%|#######5 | 15/20 [02:33<00:41, 8.38s/q, loss=-0.0016, mean_r=0.830, skip=5]
Iter 12 GRPO groups: 80%|######## | 16/20 [02:33<00:47, 11.99s/q, loss=-0.0016, mean_r=0.830, skip=5]2026-04-26 04:29:47,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:47,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:29:48,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 12 GRPO groups: 80%|######## | 16/20 [02:36<00:47, 11.99s/q, loss=0var, mean_r=0.994, skip=6]
Iter 12 GRPO groups: 85%|########5 | 17/20 [02:36<00:28, 9.42s/q, loss=0var, mean_r=0.994, skip=6]2026-04-26 04:29:52,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:29:52,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:52,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:29:52,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:52,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.719 = 0.50×0.64(prox=0.64) + 0.40×proc(0.747[fin=0.92,mean=0.48]) + 0.10×fmt(1.000) | pred='64' gold='50' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:29:53,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:29:53,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.905[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:29:53,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 85%|########5 | 17/20 [02:43<00:28, 9.42s/q, loss=-0.0008, mean_r=0.956, skip=6]
Iter 12 GRPO groups: 90%|######### | 18/20 [02:43<00:17, 8.52s/q, loss=-0.0008, mean_r=0.956, skip=6]2026-04-26 04:30:00,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.85(prox=0.85) + 0.40×proc(0.872[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='108' gold='112' | step_acc=75% lccp=12% (chain=1/8 ok_count=6) n_steps=8
+2026-04-26 04:30:00,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.865 = 0.50×1.00(exact) + 0.40×proc(0.661[fin=0.82,mean=0.42]) + 0.10×fmt(1.000) | pred='112' gold='112' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:30:01,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.52(prox=0.52) + 0.40×proc(0.197[fin=0.12,mean=0.32]) + 0.10×fmt(1.000) | pred='60' gold='112' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:30:01,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.790 = 0.50×0.61(prox=0.61) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='76' gold='112' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:30:01,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.681 = 0.50×0.49(prox=0.49) + 0.40×proc(0.839[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='54' gold='112' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:30:01,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.842[fin=0.99,mean=0.62]) + 0.10×fmt(1.000) | pred='102' gold='112' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:30:01,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.47(prox=0.47) + 0.40×proc(0.196[fin=0.06,mean=0.40]) + 0.10×fmt(1.000) | pred='48' gold='112' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:30:01,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.730[fin=0.70,mean=0.77]) + 0.10×fmt(1.000) | pred='48' gold='112' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 04:30:01,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.87,mean=0.55]) + 0.10×fmt(1.000) | pred='108' gold='112' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:30:01,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.797 = 0.50×0.78(prox=0.78) + 0.40×proc(0.771[fin=0.98,mean=0.45]) + 0.10×fmt(1.000) | pred='96' gold='112' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+
Iter 12 GRPO groups: 90%|######### | 18/20 [02:51<00:17, 8.52s/q, loss=0.0001, mean_r=0.709, skip=6]
Iter 12 GRPO groups: 95%|#########5| 19/20 [02:51<00:08, 8.49s/q, loss=0.0001, mean_r=0.709, skip=6]2026-04-26 04:30:07,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.912[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='192' gold='96' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:30:07,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.779 = 0.50×0.75(prox=0.75) + 0.40×proc(0.759[fin=0.81,mean=0.68]) + 0.10×fmt(1.000) | pred='80' gold='96' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:30:07,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.709 = 0.50×0.55(prox=0.55) + 0.40×proc(0.842[fin=0.97,mean=0.65]) + 0.10×fmt(1.000) | pred='136' gold='96' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:30:07,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:30:07,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.634 = 0.50×0.50(prox=0.50) + 0.40×proc(0.709[fin=0.77,mean=0.61]) + 0.10×fmt(1.000) | pred='48' gold='96' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:30:07,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:30:07,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.594 = 0.50×0.50(prox=0.50) + 0.40×proc(0.611[fin=0.75,mean=0.41]) + 0.10×fmt(1.000) | pred='48' gold='96' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:30:08,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.849 = 0.50×0.75(prox=0.75) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='112' gold='96' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:30:08,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:30:08,209 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='96' gold='96' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 12 GRPO groups: 95%|#########5| 19/20 [02:57<00:08, 8.49s/q, loss=-0.0008, mean_r=0.809, skip=6]
Iter 12 GRPO groups: 100%|##########| 20/20 [02:57<00:00, 7.89s/q, loss=-0.0008, mean_r=0.809, skip=6]
Iter 12 GRPO groups: 100%|##########| 20/20 [02:57<00:00, 8.90s/q, loss=-0.0008, mean_r=0.809, skip=6]
+2026-04-26 04:30:09,636 INFO __main__ - PHASE → SELFPLAY_RAMP at iter 12 (gt_match=0.65 grounded_acc=0.90 step_acc=0.82) — shadow extraction active; chain scoring deferred until calibration passes (corr≥0.70, success_rate≥0.80)
+2026-04-26 04:30:09,636 INFO __main__ - Iter 12 | loss=0.0001 | reward mean=0.849 std=0.213 | gt_match=65.0% | grounded_acc=90.0% | step_acc=82.1% | lccp=68.7% | batch_acc=90.0% | phase=SELFPLAY_RAMP sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.93e-06 | 178.0s
+2026-04-26 04:30:09,637 INFO __main__ - ======================================================================
+2026-04-26 04:30:09,638 INFO __main__ - GRPO ITERATION 13/60
+2026-04-26 04:30:09,638 INFO __main__ - ======================================================================
+2026-04-26 04:30:09,659 INFO __main__ - LR this iteration: 4.93e-06 | T=0.719 | MATH ratio=30%
+
Iter 13 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:30:16,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:16,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 04:30:30,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,712 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:30,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:30:44,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.60(prox=0.60) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(1.000) | pred='96' gold='144' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+
Iter 13 GRPO groups: 0%| | 0/20 [00:46, ?q/s, loss=0.0004, mean_r=0.974, skip=0]
Iter 13 GRPO groups: 5%|5 | 1/20 [00:46<14:40, 46.34s/q, loss=0.0004, mean_r=0.974, skip=0]2026-04-26 04:31:02,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.60(prox=0.60) + 0.40×proc(0.827[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:31:03,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.928[fin=0.95,mean=0.90]) + 0.10×fmt(1.000) | pred='50' gold='25' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:31:03,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:03,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.953[fin=0.98,mean=0.91]) + 0.10×fmt(1.000) | pred='50' gold='25' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 04:31:17,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.701 = 0.50×0.57(prox=0.57) + 0.40×proc(0.797[fin=0.94,mean=0.59]) + 0.10×fmt(1.000) | pred='15.3846' gold='25' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:31:17,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.863 = 0.50×0.82(prox=0.82) + 0.40×proc(0.884[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='22.22' gold='25' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:31:17,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:31:17,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.734 = 0.50×0.60(prox=0.60) + 0.40×proc(0.834[fin=0.98,mean=0.61]) + 0.10×fmt(1.000) | pred='16.67' gold='25' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:31:25,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.60(prox=0.60) + 0.40×proc(0.819[fin=0.87,mean=0.74]) + 0.10×fmt(1.000) | pred='33.33' gold='25' | step_acc=78% lccp=22% (chain=2/9 ok_count=7) n_steps=9
+2026-04-26 04:31:26,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 13 GRPO groups: 5%|5 | 1/20 [01:17<14:40, 46.34s/q, loss=-0.0017, mean_r=0.783, skip=0]
Iter 13 GRPO groups: 10%|# | 2/20 [01:17<11:17, 37.62s/q, loss=-0.0017, mean_r=0.783, skip=0]2026-04-26 04:31:30,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:30,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:31:35,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:35,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,277 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:31:38,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:31:38,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 10%|# | 2/20 [01:33<11:17, 37.62s/q, loss=0var, mean_r=0.998, skip=1]
Iter 13 GRPO groups: 15%|#5 | 3/20 [01:33<07:50, 27.67s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 04:31:49,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,318 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:31:49,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:00,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:14,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:32:14,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 15%|#5 | 3/20 [02:05<07:50, 27.67s/q, loss=0var, mean_r=1.000, skip=2]
Iter 13 GRPO groups: 20%|## | 4/20 [02:05<07:46, 29.15s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 04:32:21,029 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.37(prox=0.37) + 0.40×proc(0.258[fin=0.03,mean=0.60]) + 0.10×fmt(1.000) | pred='300' gold='2100' | step_acc=75% lccp=75% (chain=3/4 ok_count=3) n_steps=4
+2026-04-26 04:32:21,115 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:32,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.39(prox=0.39) + 0.40×proc(0.432[fin=0.45,mean=0.40]) + 0.10×fmt(1.000) | pred='455' gold='2100' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:32:35,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,524 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:35,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 13 GRPO groups: 20%|## | 4/20 [02:37<07:46, 29.15s/q, loss=-0.0001, mean_r=0.896, skip=2]
Iter 13 GRPO groups: 25%|##5 | 5/20 [02:37<07:36, 30.41s/q, loss=-0.0001, mean_r=0.896, skip=2]2026-04-26 04:32:51,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:51,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.864 = 0.50×1.00(exact) + 0.40×proc(0.747[fin=0.93,mean=0.48]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:32:51,739 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:51,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.802 = 0.50×1.00(exact) + 0.40×proc(0.506[fin=0.68,mean=0.25]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:32:54,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:32:54,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:32:54,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:32:54,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:32:59,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:32:59,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3.5' gold='3.5000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 25%|##5 | 5/20 [02:51<07:36, 30.41s/q, loss=0.0022, mean_r=0.956, skip=2]
Iter 13 GRPO groups: 30%|### | 6/20 [02:51<05:45, 24.69s/q, loss=0.0022, mean_r=0.956, skip=2]2026-04-26 04:33:03,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:03,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:07,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:08,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.897 = 0.50×1.00(exact) + 0.40×proc(0.829[fin=0.97,mean=0.62]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:33:08,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:08,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:33:13,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:33:13,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+
Iter 13 GRPO groups: 30%|### | 6/20 [03:08<05:45, 24.69s/q, loss=0.0062, mean_r=0.973, skip=2]
Iter 13 GRPO groups: 35%|###5 | 7/20 [03:08<04:47, 22.14s/q, loss=0.0062, mean_r=0.973, skip=2]2026-04-26 04:33:22,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.353 = 0.50×0.11(prox=0.11) + 0.40×proc(0.279[fin=0.08,mean=0.58]) + 0.10×fmt(1.000) | pred='-21' gold='7' | step_acc=57% lccp=57% (chain=4/7 ok_count=4) n_steps=7
+2026-04-26 04:33:22,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:33:22,167 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='3' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:22,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:32,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:33:32,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.794[fin=0.79,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 04:33:32,709 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:33:32,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:33:41,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:33:41,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 35%|###5 | 7/20 [03:32<04:47, 22.14s/q, loss=-0.0011, mean_r=0.878, skip=2]
Iter 13 GRPO groups: 40%|#### | 8/20 [03:32<04:35, 22.97s/q, loss=-0.0011, mean_r=0.878, skip=2]2026-04-26 04:33:48,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:33:48,452 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.201 = 0.50×0.17(prox=0.17) + 0.40×proc(0.041[fin=0.05,mean=0.03]) + 0.10×fmt(1.000) | pred='1000' gold='290' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:33:53,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.45(prox=0.45) + 0.40×proc(0.281[fin=0.10,mean=0.55]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:33:53,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.78(prox=0.78) + 0.40×proc(0.670[fin=0.84,mean=0.41]) + 0.10×fmt(1.000) | pred='250' gold='290' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:33:53,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.880 = 0.50×0.85(prox=0.85) + 0.40×proc(0.887[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='310' gold='290' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:33:53,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:34:04,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.405 = 0.50×0.45(prox=0.45) + 0.40×proc(0.205[fin=0.02,mean=0.48]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:34:04,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.854 = 0.50×1.00(exact) + 0.40×proc(0.635[fin=0.82,mean=0.36]) + 0.10×fmt(1.000) | pred='290' gold='290' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:34:04,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.45(prox=0.45) + 0.40×proc(0.291[fin=0.12,mean=0.54]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:34:04,940 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.549 = 0.50×0.45(prox=0.45) + 0.40×proc(0.378[fin=0.24,mean=0.58]) + 0.10×fmt(1.000) | pred='110' gold='290' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 13 GRPO groups: 40%|#### | 8/20 [04:03<04:35, 22.97s/q, loss=0.0010, mean_r=0.662, skip=2]
Iter 13 GRPO groups: 45%|####5 | 9/20 [04:03<04:37, 25.26s/q, loss=0.0010, mean_r=0.662, skip=2]2026-04-26 04:34:18,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:34:19,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.674 = 0.50×0.50(prox=0.50) + 0.40×proc(0.811[fin=0.91,mean=0.66]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:34:19,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:19,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:24,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:34:24,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.298 = 0.50×0.09(prox=0.09) + 0.40×proc(0.236[fin=0.09,mean=0.46]) + 0.10×fmt(1.000) | pred='12.5' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:34:33,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:33,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.744[fin=0.91,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 13 GRPO groups: 45%|####5 | 9/20 [04:25<04:37, 25.26s/q, loss=-0.0004, mean_r=0.845, skip=2]
Iter 13 GRPO groups: 50%|##### | 10/20 [04:25<04:03, 24.36s/q, loss=-0.0004, mean_r=0.845, skip=2]2026-04-26 04:34:39,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:39,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,889 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:44,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:45,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:56,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:56,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:34:56,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:34:56,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 50%|##### | 10/20 [04:54<04:03, 24.36s/q, loss=0var, mean_r=0.999, skip=3]
Iter 13 GRPO groups: 55%|#####5 | 11/20 [04:54<03:51, 25.73s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:35:11,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:11,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.788[fin=0.85,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 04:35:22,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.892 = 0.50×1.00(exact) + 0.40×proc(0.731[fin=0.86,mean=0.54]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:35:22,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:22,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:22,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:35:33,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.78,mean=0.75]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:35:33,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.951[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 13 GRPO groups: 55%|#####5 | 11/20 [05:24<03:51, 25.73s/q, loss=-0.0002, mean_r=0.968, skip=3]
Iter 13 GRPO groups: 60%|###### | 12/20 [05:24<03:37, 27.18s/q, loss=-0.0002, mean_r=0.968, skip=3]2026-04-26 04:35:41,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:41,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.40(prox=0.40) + 0.40×proc(0.278[fin=0.27,mean=0.29]) + 0.10×fmt(1.000) | pred='4' gold='16' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 04:35:50,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:35:50,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:50,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:50,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:35:55,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:35:55,770 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 60%|###### | 12/20 [05:51<03:37, 27.18s/q, loss=-0.0029, mean_r=0.936, skip=3]
Iter 13 GRPO groups: 65%|######5 | 13/20 [05:51<03:09, 27.10s/q, loss=-0.0029, mean_r=0.936, skip=3]2026-04-26 04:36:05,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,716 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:05,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:12,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:13,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:20,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:36:20,957 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 65%|######5 | 13/20 [06:11<03:09, 27.10s/q, loss=0var, mean_r=0.999, skip=4]
Iter 13 GRPO groups: 70%|####### | 14/20 [06:11<02:28, 24.78s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 04:36:27,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:27,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:39,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:52,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.34(prox=0.34) + 0.40×proc(0.269[fin=0.06,mean=0.58]) + 0.10×fmt(1.000) | pred='1080' gold='43200' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:36:53,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.670 = 0.50×0.50(prox=0.50) + 0.40×proc(0.799[fin=0.99,mean=0.51]) + 0.10×fmt(1.000) | pred='21600' gold='43200' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:36:53,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:36:53,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='43200' gold='43200' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 70%|####### | 14/20 [06:56<02:28, 24.78s/q, loss=0.0011, mean_r=0.903, skip=4]
Iter 13 GRPO groups: 75%|#######5 | 15/20 [06:56<02:34, 30.96s/q, loss=0.0011, mean_r=0.903, skip=4]2026-04-26 04:37:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:37:17,702 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:37:17,786 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.52(prox=0.52) + 0.40×proc(0.933[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='80' gold='55' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 04:37:17,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.489 = 0.50×0.26(prox=0.26) + 0.40×proc(0.652[fin=0.66,mean=0.64]) + 0.10×fmt(1.000) | pred='135' gold='55' | step_acc=62% lccp=0% (chain=0/8 ok_count=5) n_steps=8
+2026-04-26 04:37:32,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:37:32,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.42(prox=0.42) + 0.40×proc(0.835[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='17.5' gold='55' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 04:37:32,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:37:32,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.817 = 0.50×0.65(prox=0.65) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='40' gold='55' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:37:39,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.624 = 0.50×0.38(prox=0.38) + 0.40×proc(0.836[fin=0.96,mean=0.65]) + 0.10×fmt(1.000) | pred='100' gold='55' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:37:39,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 13 GRPO groups: 75%|#######5 | 15/20 [07:31<02:34, 30.96s/q, loss=-0.0005, mean_r=0.817, skip=4]
Iter 13 GRPO groups: 80%|######## | 16/20 [07:31<02:08, 32.06s/q, loss=-0.0005, mean_r=0.817, skip=4]2026-04-26 04:37:46,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.72(prox=0.72) + 0.40×proc(0.769[fin=0.93,mean=0.52]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:37:46,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.72(prox=0.72) + 0.40×proc(0.823[fin=0.98,mean=0.59]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:37:53,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.936[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:37:53,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×1.00(exact) + 0.40×proc(0.710[fin=0.88,mean=0.46]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:37:53,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×1.00(exact) + 0.40×proc(0.799[fin=0.98,mean=0.53]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:37:53,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:38:02,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.451 = 0.50×0.50(prox=0.50) + 0.40×proc(0.246[fin=0.05,mean=0.55]) + 0.10×fmt(1.000) | pred='186' gold='366' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:38:03,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.726 = 0.50×0.72(prox=0.72) + 0.40×proc(0.667[fin=0.80,mean=0.46]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:38:03,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.72(prox=0.72) + 0.40×proc(0.834[fin=0.99,mean=0.60]) + 0.10×fmt(1.000) | pred='294' gold='366' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:38:03,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='366' gold='366' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 80%|######## | 16/20 [08:03<02:08, 32.06s/q, loss=0.0003, mean_r=0.828, skip=4]
Iter 13 GRPO groups: 85%|########5 | 17/20 [08:03<01:36, 32.20s/q, loss=0.0003, mean_r=0.828, skip=4]2026-04-26 04:38:16,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:16,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:23,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:30,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:30,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='89' gold='89' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 13 GRPO groups: 85%|########5 | 17/20 [08:21<01:36, 32.20s/q, loss=0var, mean_r=0.999, skip=5]
Iter 13 GRPO groups: 90%|######### | 18/20 [08:21<00:55, 27.74s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 04:38:35,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:35,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,341 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:40,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:38:45,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:38:45,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2.4' gold='2.4000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 13 GRPO groups: 90%|######### | 18/20 [08:41<00:55, 27.74s/q, loss=0var, mean_r=0.998, skip=6]
Iter 13 GRPO groups: 95%|#########5| 19/20 [08:41<00:25, 25.46s/q, loss=0var, mean_r=0.998, skip=6]2026-04-26 04:39:04,295 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.60(prox=0.60) + 0.40×proc(0.408[fin=0.38,mean=0.45]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:39:04,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.731 = 0.50×0.60(prox=0.60) + 0.40×proc(0.828[fin=0.89,mean=0.73]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 04:39:04,498 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.347 = 0.50×0.00(prox=0.00) + 0.40×proc(0.646[fin=0.76,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='18' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 04:39:04,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.661 = 0.50×0.60(prox=0.60) + 0.40×proc(0.654[fin=0.72,mean=0.55]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:39:13,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.496 = 0.50×0.60(prox=0.60) + 0.40×proc(0.241[fin=0.07,mean=0.50]) + 0.10×fmt(1.000) | pred='24' gold='18' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:39:14,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 04:39:14,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.309 = 0.50×0.33(prox=0.33) + 0.40×proc(0.107[fin=0.06,mean=0.18]) + 0.10×fmt(1.000) | pred='0' gold='18' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:39:14,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.666 = 0.50×0.60(prox=0.60) + 0.40×proc(0.666[fin=0.86,mean=0.38]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:39:24,225 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.343 = 0.50×0.43(prox=0.43) + 0.40×proc(0.072[fin=0.04,mean=0.11]) + 0.10×fmt(1.000) | pred='6' gold='18' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:39:24,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.565 = 0.50×0.60(prox=0.60) + 0.40×proc(0.412[fin=0.39,mean=0.45]) + 0.10×fmt(1.000) | pred='12' gold='18' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 13 GRPO groups: 95%|#########5| 19/20 [09:16<00:25, 25.46s/q, loss=-0.0005, mean_r=0.564, skip=6]
Iter 13 GRPO groups: 100%|##########| 20/20 [09:16<00:00, 28.30s/q, loss=-0.0005, mean_r=0.564, skip=6]
Iter 13 GRPO groups: 100%|##########| 20/20 [09:16<00:00, 27.81s/q, loss=-0.0005, mean_r=0.564, skip=6]
+2026-04-26 04:39:26,156 INFO __main__ - Iter 13 | loss=0.0003 | reward mean=0.899 std=0.186 | gt_match=78.0% | grounded_acc=93.0% | step_acc=87.1% | lccp=78.9% | batch_acc=93.0% | phase=SELFPLAY_RAMP sp_ratio=0% | groups=14 skipped=6(0var=6) | lr=4.90e-06 | 556.2s
+2026-04-26 04:39:26,157 INFO __main__ - ======================================================================
+2026-04-26 04:39:26,157 INFO __main__ - GRPO ITERATION 14/60
+2026-04-26 04:39:26,157 INFO __main__ - ======================================================================
+2026-04-26 04:39:26,178 INFO __main__ - LR this iteration: 4.90e-06 | T=0.712 | MATH ratio=30%
+
Iter 14 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:39:30,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.902[fin=0.95,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='4' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:39:30,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:36,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:36,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.25(prox=0.25) + 0.40×proc(0.903[fin=0.95,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='4' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:39:37,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:37,097 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,317 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,394 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.507 = 0.50×0.09(prox=0.09) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='24' gold='4' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:39:45,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:45,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 0%| | 0/20 [00:27, ?q/s, loss=0.0000, mean_r=0.860, skip=0]
Iter 14 GRPO groups: 5%|5 | 1/20 [00:27<08:33, 27.02s/q, loss=0.0000, mean_r=0.860, skip=0]2026-04-26 04:39:57,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×0.85(prox=0.85) + 0.40×proc(0.864[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='104' gold='108' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:39:57,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.210 = 0.50×0.02(prox=0.02) + 0.40×proc(0.247[fin=0.31,mean=0.15]) + 0.10×fmt(1.000) | pred='2592' gold='108' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 04:39:57,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:39:57,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.311 = 0.50×0.01(prox=0.01) + 0.40×proc(0.516[fin=0.66,mean=0.31]) + 0.10×fmt(1.000) | pred='6480' gold='108' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:40:05,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:40:05,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:40:06,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.953[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:40:06,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.193 = 0.50×0.02(prox=0.02) + 0.40×proc(0.207[fin=0.14,mean=0.30]) + 0.10×fmt(1.000) | pred='2592' gold='108' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 04:40:13,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.13(prox=0.13) + 0.40×proc(0.904[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='464' gold='108' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:40:13,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='108' gold='108' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 5%|5 | 1/20 [00:48<08:33, 27.02s/q, loss=-0.0000, mean_r=0.706, skip=0]
Iter 14 GRPO groups: 10%|# | 2/20 [00:48<07:08, 23.78s/q, loss=-0.0000, mean_r=0.706, skip=0]2026-04-26 04:40:20,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:20,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:40:25,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×0.85(prox=0.85) + 0.40×proc(0.619[fin=0.75,mean=0.42]) + 0.10×fmt(0.650) | pred='65' gold='64' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:40:25,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.707 = 0.50×0.85(prox=0.85) + 0.40×proc(0.455[fin=0.57,mean=0.29]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:40:25,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.633 = 0.50×0.52(prox=0.52) + 0.40×proc(0.774[fin=0.92,mean=0.56]) + 0.10×fmt(0.650) | pred='34' gold='64' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:40:25,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.87,mean=0.56]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:40:35,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:40:35,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×1.00(exact) + 0.40×proc(0.503[fin=0.50,mean=0.51]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:40:35,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.752 = 0.50×0.85(prox=0.85) + 0.40×proc(0.567[fin=0.53,mean=0.63]) + 0.10×fmt(1.000) | pred='65' gold='64' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 04:40:35,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.622 = 0.50×0.85(prox=0.85) + 0.40×proc(0.331[fin=0.29,mean=0.40]) + 0.10×fmt(0.650) | pred='65' gold='64' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+
Iter 14 GRPO groups: 10%|# | 2/20 [01:14<07:08, 23.78s/q, loss=0.0051, mean_r=0.800, skip=0]
Iter 14 GRPO groups: 15%|#5 | 3/20 [01:14<07:00, 24.72s/q, loss=0.0051, mean_r=0.800, skip=0]2026-04-26 04:40:45,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:45,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:40:57,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:09,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:09,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='51' gold='51' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 14 GRPO groups: 15%|#5 | 3/20 [01:43<07:00, 24.72s/q, loss=0var, mean_r=0.997, skip=1]
Iter 14 GRPO groups: 20%|## | 4/20 [01:43<07:04, 26.51s/q, loss=0var, mean_r=0.997, skip=1]2026-04-26 04:41:15,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:15,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:19,890 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:41:19,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.694[fin=0.84,mean=0.48]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:41:20,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.603 = 0.50×0.33(prox=0.33) + 0.40×proc(0.841[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:41:20,135 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:25,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:25,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:41:25,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:41:25,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 20%|## | 4/20 [02:07<07:04, 26.51s/q, loss=0var, mean_r=0.555, skip=2]
Iter 14 GRPO groups: 25%|##5 | 5/20 [02:07<06:22, 25.52s/q, loss=0var, mean_r=0.555, skip=2]2026-04-26 04:41:44,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:44,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.678 = 0.50×0.50(prox=0.50) + 0.40×proc(0.820[fin=0.97,mean=0.59]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=62% lccp=0% (chain=0/8 ok_count=5) n_steps=8
+2026-04-26 04:41:44,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:44,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:41:55,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:41:55,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:42:11,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:42:11,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 14 GRPO groups: 25%|##5 | 5/20 [02:46<06:22, 25.52s/q, loss=-0.0002, mean_r=0.966, skip=2]
Iter 14 GRPO groups: 30%|### | 6/20 [02:46<07:01, 30.12s/q, loss=-0.0002, mean_r=0.966, skip=2]2026-04-26 04:42:16,080 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.726 = 0.50×0.50(prox=0.50) + 0.40×proc(0.940[fin=0.97,mean=0.90]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:16,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.96,mean=0.76]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:42:23,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.50(prox=0.50) + 0.40×proc(0.945[fin=0.94,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:23,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:23,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.50(prox=0.50) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:42:23,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,729 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,811 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:30,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.835 = 0.50×0.73(prox=0.73) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='6.5' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:42:30,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 30%|### | 6/20 [03:11<07:01, 30.12s/q, loss=0.0027, mean_r=0.870, skip=2]
Iter 14 GRPO groups: 35%|###5 | 7/20 [03:11<06:10, 28.47s/q, loss=0.0027, mean_r=0.870, skip=2]2026-04-26 04:42:42,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:42,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:42,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.830 = 0.50×0.75(prox=0.75) + 0.40×proc(0.888[fin=0.97,mean=0.77]) + 0.10×fmt(1.000) | pred='42' gold='36' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:42:42,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:50,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:42:59,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.653 = 0.50×0.85(prox=0.85) + 0.40×proc(0.320[fin=0.17,mean=0.55]) + 0.10×fmt(1.000) | pred='39' gold='36' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:42:59,251 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 14 GRPO groups: 35%|###5 | 7/20 [03:34<06:10, 28.47s/q, loss=-0.0005, mean_r=0.947, skip=2]
Iter 14 GRPO groups: 40%|#### | 8/20 [03:34<05:22, 26.83s/q, loss=-0.0005, mean_r=0.947, skip=2]2026-04-26 04:43:04,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:04,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=0.96,mean=0.88]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:11,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.617 = 0.50×0.67(prox=0.67) + 0.40×proc(0.460[fin=0.44,mean=0.49]) + 0.10×fmt(1.000) | pred='5' gold='4' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 04:43:16,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:16,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 40%|#### | 8/20 [03:58<05:22, 26.83s/q, loss=-0.0002, mean_r=0.957, skip=2]
Iter 14 GRPO groups: 45%|####5 | 9/20 [03:58<04:44, 25.87s/q, loss=-0.0002, mean_r=0.957, skip=2]2026-04-26 04:43:30,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.908[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='96' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:43:30,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.19(prox=0.19) + 0.40×proc(0.728[fin=0.90,mean=0.47]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 04:43:31,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.547 = 0.50×0.19(prox=0.19) + 0.40×proc(0.809[fin=0.96,mean=0.58]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:43:31,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='96' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:43:39,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.349 = 0.50×0.18(prox=0.18) + 0.40×proc(0.299[fin=0.25,mean=0.37]) + 0.10×fmt(1.000) | pred='102.67' gold='32' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:43:39,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.522 = 0.50×0.19(prox=0.19) + 0.40×proc(0.730[fin=0.88,mean=0.50]) + 0.10×fmt(1.000) | pred='102' gold='32' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:43:39,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:43:40,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.18(prox=0.18) + 0.40×proc(0.800[fin=0.96,mean=0.56]) + 0.10×fmt(1.000) | pred='106' gold='32' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 04:43:49,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:43:49,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.764 = 0.50×0.80(prox=0.80) + 0.40×proc(0.660[fin=0.67,mean=0.64]) + 0.10×fmt(1.000) | pred='36' gold='32' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 14 GRPO groups: 45%|####5 | 9/20 [04:25<04:44, 25.87s/q, loss=0.0000, mean_r=0.633, skip=2]
Iter 14 GRPO groups: 50%|##### | 10/20 [04:25<04:20, 26.09s/q, loss=0.0000, mean_r=0.633, skip=2]2026-04-26 04:43:54,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:43:54,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:01,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,376 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:08,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 14 GRPO groups: 50%|##### | 10/20 [04:49<04:20, 26.09s/q, loss=0var, mean_r=0.999, skip=3]
Iter 14 GRPO groups: 55%|#####5 | 11/20 [04:49<03:48, 25.43s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:44:15,308 INFO src.rl.curriculum_manager - Topic probabilities (rollout 0): [('basic_arithmetic', '0.042'), ('single_step_word_problems', '0.042'), ('fractions', '0.042'), ('percentages', '0.042'), ('ratios', '0.042')]
+2026-04-26 04:44:22,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.958 + mod=+0.080, cap=1.00) | Q=0.90 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,600 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:23,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,009 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.984 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=5
+2026-04-26 04:44:24,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,423 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.990 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 04:44:24,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:24,822 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 04:44:28,488 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.773 = clip(base=0.693 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:28,686 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.636 novelty=0.70 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:28,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.646 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 04:44:29,673 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.646 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:29,868 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.645 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 04:44:30,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.932 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 04:44:30,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 14 GRPO groups: 55%|#####5 | 11/20 [05:05<03:48, 25.43s/q, loss=0.0024, mean_r=0.905, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 60%|###### | 12/20 [05:05<03:02, 22.78s/q, loss=0.0024, mean_r=0.905, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:44:48,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.904[fin=0.94,mean=0.85]) + 0.10×fmt(1.000) | pred='9' gold='5' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 04:44:48,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.691 = 0.50×0.45(prox=0.45) + 0.40×proc(0.909[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='5' | step_acc=88% lccp=0% (chain=0/8 ok_count=7) n_steps=8
+2026-04-26 04:44:48,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.26(prox=0.26) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:44:48,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:44:59,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.880[fin=0.97,mean=0.75]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:44:59,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.98,mean=0.87]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:45:00,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.889[fin=0.97,mean=0.77]) + 0.10×fmt(1.000) | pred='15' gold='5' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:45:00,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.640[fin=0.64,mean=0.64]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:45:05,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.756[fin=0.80,mean=0.69]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:45:05,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.856[fin=0.91,mean=0.77]) + 0.10×fmt(1.000) | pred='2' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 14 GRPO groups: 60%|###### | 12/20 [05:41<03:02, 22.78s/q, loss=-0.0009, mean_r=0.691, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 65%|######5 | 13/20 [05:41<03:05, 26.56s/q, loss=-0.0009, mean_r=0.691, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:45:14,695 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:14,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:30,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×1.00(exact) + 0.40×proc(0.585[fin=0.41,mean=0.84]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=83% lccp=83% (chain=5/6 ok_count=5) n_steps=6
+2026-04-26 04:45:30,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:30,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:45:30,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:45:40,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 04:45:40,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:45:40,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:45:40,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 14 GRPO groups: 65%|######5 | 13/20 [06:30<03:05, 26.56s/q, loss=0.0012, mean_r=0.982, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 70%|####### | 14/20 [06:30<03:20, 33.48s/q, loss=0.0012, mean_r=0.982, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:46:06,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:06,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.34(prox=0.34) + 0.40×proc(0.126[fin=0.13,mean=0.12]) + 0.10×fmt(0.650) | pred='20' gold='1024' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:46:06,439 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.255 = 0.50×0.00(prox=0.00) + 0.40×proc(0.293[fin=0.24,mean=0.38]) + 0.10×fmt(1.000) | pred='3628800' gold='1024' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:46:06,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:12,602 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:12,678 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 04:46:12,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.268 = 0.50×0.34(prox=0.34) + 0.40×proc(0.086[fin=0.09,mean=0.08]) + 0.10×fmt(0.650) | pred='20' gold='1024' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 04:46:12,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 04:46:24,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:25,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(0.650) | pred='1024' gold='1024' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 14 GRPO groups: 70%|####### | 14/20 [07:00<03:20, 33.48s/q, loss=-0.0004, mean_r=0.756, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 75%|#######5 | 15/20 [07:00<02:41, 32.36s/q, loss=-0.0004, mean_r=0.756, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:46:33,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:33,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:46:43,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:43,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:43,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:43,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:46:47,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×1.00(exact) + 0.40×proc(0.807[fin=0.94,mean=0.61]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:46:48,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×1.00(exact) + 0.40×proc(0.845[fin=0.97,mean=0.65]) + 0.10×fmt(0.650) | pred='3' gold='3' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:46:48,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:46:48,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 14 GRPO groups: 75%|#######5 | 15/20 [07:39<02:41, 32.36s/q, loss=0.0042, mean_r=0.971, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 80%|######## | 16/20 [07:39<02:17, 34.40s/q, loss=0.0042, mean_r=0.971, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:47:09,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:47:09,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:47:09,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.789 = 0.50×0.70(prox=0.70) + 0.40×proc(0.847[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='25.5' gold='21' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 04:47:09,919 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.26(prox=0.26) + 0.40×proc(0.469[fin=0.47,mean=0.47]) + 0.10×fmt(1.000) | pred='51' gold='21' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:47:20,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.26(prox=0.26) + 0.40×proc(0.776[fin=0.89,mean=0.60]) + 0.10×fmt(1.000) | pred='51' gold='21' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:47:20,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:47:20,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 04:47:21,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.925 = 0.50×1.00(exact) + 0.40×proc(0.813[fin=0.98,mean=0.56]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:47:25,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:47:25,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.962[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='21' gold='21' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 80%|######## | 16/20 [08:01<02:17, 34.40s/q, loss=-0.0018, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 85%|########5 | 17/20 [08:01<01:31, 30.61s/q, loss=-0.0018, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:47:32,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.859[fin=0.99,mean=0.66]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:47:32,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.513 = 0.50×0.18(prox=0.18) + 0.40×proc(0.813[fin=1.00,mean=0.53]) + 0.10×fmt(1.000) | pred='20' gold='6' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:47:38,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:38,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 04:47:38,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.531 = 0.50×0.14(prox=0.14) + 0.40×proc(0.813[fin=0.95,mean=0.60]) + 0.10×fmt(1.000) | pred='25' gold='6' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 04:47:38,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:47:55,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.520 = 0.50×0.18(prox=0.18) + 0.40×proc(0.829[fin=0.98,mean=0.60]) + 0.10×fmt(1.000) | pred='20' gold='6' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:47:55,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:55,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:47:55,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 14 GRPO groups: 85%|########5 | 17/20 [08:35<01:31, 30.61s/q, loss=-0.0023, mean_r=0.848, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 90%|######### | 18/20 [08:35<01:03, 31.69s/q, loss=-0.0023, mean_r=0.848, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:48:07,966 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:48:08,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.320 = 0.50×0.00(prox=0.00) + 0.40×proc(0.363[fin=0.25,mean=0.53]) + 0.10×fmt(1.000) | pred='No solution' gold='4' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 04:48:08,143 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:48:08,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:48:16,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.890[fin=0.95,mean=0.81]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:48:17,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:48:17,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:48:17,223 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=0.99,mean=0.93]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:48:22,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.687 = 0.50×0.67(prox=0.67) + 0.40×proc(0.721[fin=0.86,mean=0.51]) + 0.10×fmt(0.650) | pred='5' gold='4' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 04:48:22,330 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.671 = 0.50×0.67(prox=0.67) + 0.40×proc(0.682[fin=0.80,mean=0.51]) + 0.10×fmt(0.650) | pred='5' gold='4' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+
Iter 14 GRPO groups: 90%|######### | 18/20 [08:57<01:03, 31.69s/q, loss=-0.0004, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 95%|#########5| 19/20 [08:57<00:28, 28.82s/q, loss=-0.0004, mean_r=0.861, q_acc=100%, q_rew=0.763, skip=3]2026-04-26 04:48:29,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:29,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.477 = 0.50×0.33(prox=0.33) + 0.40×proc(0.375[fin=0.27,mean=0.53]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 04:48:36,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:36,644 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:43,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.50(prox=0.50) + 0.40×proc(0.483[fin=0.44,mean=0.55]) + 0.10×fmt(1.000) | pred='3' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 14 GRPO groups: 95%|#########5| 19/20 [09:28<00:28, 28.82s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 100%|##########| 20/20 [09:28<00:00, 29.40s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
Iter 14 GRPO groups: 100%|##########| 20/20 [09:28<00:00, 28.42s/q, loss=-0.0023, mean_r=0.900, q_acc=100%, q_rew=0.763, skip=3]
+2026-04-26 04:48:54,559 INFO __main__ - Iter 14 | loss=0.0005 | reward mean=0.856 std=0.208 | gt_match=67.4% | grounded_acc=94.7% | step_acc=85.8% | lccp=74.8% | batch_acc=95.2% | phase=SELFPLAY_RAMP sp_ratio=4% | groups=18 skipped=3(0var=3) | lr=4.85e-06 | 568.4s
+2026-04-26 04:48:54,559 INFO __main__ - Question generation: 1/1 valid (100%) | q_reward=0.763 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.89 clarity=1.00 novelty=0.43 solvability=1.00
+2026-04-26 04:48:54,560 INFO __main__ - ======================================================================
+2026-04-26 04:48:54,561 INFO __main__ - GRPO ITERATION 15/60
+2026-04-26 04:48:54,561 INFO __main__ - ======================================================================
+2026-04-26 04:48:54,581 INFO __main__ - LR this iteration: 4.85e-06 | T=0.705 | MATH ratio=30%
+
Iter 15 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 04:48:57,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:48:57,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:48:57,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 04:48:58,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,915 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:02,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:03,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:08,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:08,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.914[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 15 GRPO groups: 0%| | 0/20 [00:13, ?q/s, loss=0var, mean_r=0.982, skip=1]
Iter 15 GRPO groups: 5%|5 | 1/20 [00:13<04:18, 13.60s/q, loss=0var, mean_r=0.982, skip=1]2026-04-26 04:49:14,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.854 = 0.50×0.78(prox=0.78) + 0.40×proc(0.912[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='96' gold='84' | step_acc=88% lccp=38% (chain=3/8 ok_count=7) n_steps=8
+2026-04-26 04:49:14,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.408 = 0.50×0.43(prox=0.43) + 0.40×proc(0.234[fin=0.03,mean=0.54]) + 0.10×fmt(1.000) | pred='28' gold='84' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 04:49:28,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.78(prox=0.78) + 0.40×proc(0.471[fin=0.39,mean=0.60]) + 0.10×fmt(1.000) | pred='72' gold='84' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:49:28,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.528 = 0.50×0.49(prox=0.49) + 0.40×proc(0.460[fin=0.43,mean=0.51]) + 0.10×fmt(1.000) | pred='40' gold='84' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 04:49:28,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.447 = 0.50×0.46(prox=0.46) + 0.40×proc(0.298[fin=0.10,mean=0.59]) + 0.10×fmt(1.000) | pred='34' gold='84' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:49:28,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×0.85(prox=0.85) + 0.40×proc(0.976[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='80' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:49:37,600 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.482 = 0.50×0.45(prox=0.45) + 0.40×proc(0.235[fin=0.01,mean=0.56]) + 0.10×fmt(1.000) | pred='32' gold='84' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 04:49:37,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:49:37,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.900 = 0.50×0.85(prox=0.85) + 0.40×proc(0.937[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='80' gold='84' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:49:37,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.732[fin=0.70,mean=0.78]) + 0.10×fmt(1.000) | pred='36' gold='84' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+
Iter 15 GRPO groups: 5%|5 | 1/20 [00:57<04:18, 13.60s/q, loss=0.0000, mean_r=0.676, skip=1]
Iter 15 GRPO groups: 10%|# | 2/20 [00:57<09:26, 31.50s/q, loss=0.0000, mean_r=0.676, skip=1]2026-04-26 04:49:56,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:56,292 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:49:56,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:49:56,443 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,710 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:00,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:50:08,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:50:08,527 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 10%|# | 2/20 [01:13<09:26, 31.50s/q, loss=0var, mean_r=0.998, skip=2]
Iter 15 GRPO groups: 15%|#5 | 3/20 [01:13<06:57, 24.57s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 04:50:11,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:11,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,175 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:17,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.613 = 0.50×0.50(prox=0.50) + 0.40×proc(0.657[fin=0.71,mean=0.58]) + 0.10×fmt(1.000) | pred='35' gold='70' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 04:50:22,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:50:22,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 15%|#5 | 3/20 [01:33<06:57, 24.57s/q, loss=-0.0025, mean_r=0.960, skip=2]
Iter 15 GRPO groups: 20%|## | 4/20 [01:33<06:03, 22.69s/q, loss=-0.0025, mean_r=0.960, skip=2]2026-04-26 04:50:28,353 INFO src.rl.curriculum_manager - Topic probabilities (rollout 20): [('basic_arithmetic', '0.043'), ('single_step_word_problems', '0.043'), ('fractions', '0.043'), ('percentages', '0.043'), ('ratios', '0.043')]
+2026-04-26 04:50:33,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.79 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:33,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:33,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:34,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:35,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:35,223 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:40,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:40,950 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,512 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,699 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:41,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,251 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 04:50:42,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 15 GRPO groups: 20%|## | 4/20 [01:49<06:03, 22.69s/q, loss=-0.0002, mean_r=0.968, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 25%|##5 | 5/20 [01:49<05:02, 20.18s/q, loss=-0.0002, mean_r=0.968, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:50:48,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.425 = 0.50×0.15(prox=0.15) + 0.40×proc(0.621[fin=0.68,mean=0.54]) + 0.10×fmt(1.000) | pred='30' gold='8' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:50:49,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:50:49,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.404 = 0.50×0.50(prox=0.50) + 0.40×proc(0.134[fin=0.14,mean=0.13]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 04:50:49,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:51:07,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.93,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:51:07,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:51:07,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:51:07,963 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:51:20,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.442 = 0.50×0.50(prox=0.50) + 0.40×proc(0.229[fin=0.06,mean=0.49]) + 0.10×fmt(1.000) | pred='4' gold='8' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 04:51:20,881 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 15 GRPO groups: 25%|##5 | 5/20 [02:27<05:02, 20.18s/q, loss=0.0002, mean_r=0.821, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 30%|### | 6/20 [02:27<06:08, 26.33s/q, loss=0.0002, mean_r=0.821, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:51:25,554 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:25,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.707 = 0.50×0.50(prox=0.50) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='24' gold='48' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:51:32,367 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:51:32,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.50(prox=0.50) + 0.40×proc(0.919[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='24' gold='48' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:51:32,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:32,624 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 04:51:40,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:40,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 30%|### | 6/20 [02:53<06:08, 26.33s/q, loss=-0.0012, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=2]
Iter 15 GRPO groups: 35%|###5 | 7/20 [02:53<05:40, 26.16s/q, loss=-0.0012, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=2]2026-04-26 04:51:52,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,257 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:52,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:51:59,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:52:06,338 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:52:06,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='33' gold='33' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 35%|###5 | 7/20 [03:11<05:40, 26.16s/q, loss=0var, mean_r=0.999, skip=3]
Iter 15 GRPO groups: 40%|#### | 8/20 [03:11<04:43, 23.65s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 04:52:40,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.585 = 0.50×0.33(prox=0.33) + 0.40×proc(0.795[fin=0.95,mean=0.57]) + 0.10×fmt(1.000) | pred='348' gold='174' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 04:52:40,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.306 = 0.50×0.00(prox=0.00) + 0.40×proc(0.514[fin=0.54,mean=0.47]) + 0.10×fmt(1.000) | pred='1953/2' gold='174' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 04:52:51,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.00(prox=0.00) + 0.40×proc(0.862[fin=1.00,mean=0.66]) + 0.10×fmt(0.700) | pred='' gold='174' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 04:52:51,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:52:51,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.648 = 0.50×0.50(prox=0.50) + 0.40×proc(0.744[fin=0.92,mean=0.48]) + 0.10×fmt(1.000) | pred='87' gold='174' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 04:52:51,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.907 = 0.50×1.00(exact) + 0.40×proc(0.767[fin=0.96,mean=0.47]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 04:53:07,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.20(prox=0.20) + 0.40×proc(0.677[fin=0.73,mean=0.59]) + 0.10×fmt(1.000) | pred='522' gold='174' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 04:53:07,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='174' gold='174' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:53:07,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.588 = 0.50×0.47(prox=0.47) + 0.40×proc(0.628[fin=0.62,mean=0.63]) + 0.10×fmt(1.000) | pred='77.3' gold='174' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+
Iter 15 GRPO groups: 40%|#### | 8/20 [04:14<04:43, 23.65s/q, loss=-0.0008, mean_r=0.652, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 45%|####5 | 9/20 [04:14<06:35, 35.94s/q, loss=-0.0008, mean_r=0.652, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:53:14,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:22,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:22,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:22,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:22,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.98,mean=0.74]) + 0.10×fmt(1.000) | pred='-24' gold='-16' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 04:53:32,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.660 = 0.50×0.50(prox=0.50) + 0.40×proc(0.774[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='-24' gold='-16' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:53:32,150 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:32,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:32,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:41,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-16' gold='-16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 15 GRPO groups: 45%|####5 | 9/20 [04:48<06:35, 35.94s/q, loss=0.0010, mean_r=0.935, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 50%|##### | 10/20 [04:48<05:51, 35.12s/q, loss=0.0010, mean_r=0.935, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:53:47,744 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:53:47,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.66(prox=0.66) + 0.40×proc(0.910[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='49' gold='39' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 04:53:47,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:59,406 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:53:59,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:53:59,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:53:59,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:54:07,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:54:07,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:54:07,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.545 = 0.50×0.71(prox=0.71) + 0.40×proc(0.227[fin=0.18,mean=0.29]) + 0.10×fmt(1.000) | pred='47' gold='39' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 15 GRPO groups: 50%|##### | 10/20 [05:14<05:51, 35.12s/q, loss=-0.0003, mean_r=0.933, q_acc=100%, q_rew=0.722, skip=3]
Iter 15 GRPO groups: 55%|#####5 | 11/20 [05:14<04:52, 32.50s/q, loss=-0.0003, mean_r=0.933, q_acc=100%, q_rew=0.722, skip=3]2026-04-26 04:54:16,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:27,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 04:54:27,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.951 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 04:54:27,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:54:27,827 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:54:42,476 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 04:54:42,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:42,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:54:42,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 04:54:57,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 15 GRPO groups: 55%|#####5 | 11/20 [06:03<04:52, 32.50s/q, loss=0var, mean_r=0.991, skip=4]
Iter 15 GRPO groups: 60%|###### | 12/20 [06:03<04:58, 37.37s/q, loss=0var, mean_r=0.991, skip=4]2026-04-26 04:55:05,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:06,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:06,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:11,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:55:11,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:11,388 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:11,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:15,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:16,020 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:16,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 60%|###### | 12/20 [06:21<04:58, 37.37s/q, loss=0var, mean_r=0.998, skip=5]
Iter 15 GRPO groups: 65%|######5 | 13/20 [06:21<03:41, 31.61s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 04:55:18,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,908 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:22,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:23,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:28,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:33,128 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 65%|######5 | 13/20 [06:38<03:41, 31.61s/q, loss=0var, mean_r=0.999, skip=6]
Iter 15 GRPO groups: 70%|####### | 14/20 [06:38<02:43, 27.21s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 04:55:36,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:36,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.568 = 0.50×0.36(prox=0.36) + 0.40×proc(0.724[fin=0.95,mean=0.38]) + 0.10×fmt(1.000) | pred='1' gold='10' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 04:55:36,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:43,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:55:43,509 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:55:50,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:50,211 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:55:50,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 70%|####### | 14/20 [06:57<02:43, 27.21s/q, loss=0.0029, mean_r=0.956, q_acc=100%, q_rew=0.722, skip=6]
Iter 15 GRPO groups: 75%|#######5 | 15/20 [06:57<02:03, 24.62s/q, loss=0.0029, mean_r=0.956, q_acc=100%, q_rew=0.722, skip=6]2026-04-26 04:55:55,520 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,184 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.384 = 0.50×0.20(prox=0.20) + 0.40×proc(0.366[fin=0.34,mean=0.40]) + 0.10×fmt(1.000) | pred='60' gold='20' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 04:56:02,260 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:02,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:08,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:56:08,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 04:56:08,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:09,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:56:11,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 75%|#######5 | 15/20 [07:18<02:03, 24.62s/q, loss=0.0013, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=6]
Iter 15 GRPO groups: 80%|######## | 16/20 [07:18<01:34, 23.65s/q, loss=0.0013, mean_r=0.938, q_acc=100%, q_rew=0.722, skip=6]2026-04-26 04:56:18,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:18,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:18,581 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,743 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:28,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,157 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:56:42,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 15 GRPO groups: 80%|######## | 16/20 [07:47<01:34, 23.65s/q, loss=0var, mean_r=0.998, skip=7]
Iter 15 GRPO groups: 85%|########5 | 17/20 [07:47<01:15, 25.31s/q, loss=0var, mean_r=0.998, skip=7]2026-04-26 04:56:45,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:51,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:56:56,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:02,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 15 GRPO groups: 85%|########5 | 17/20 [08:07<01:15, 25.31s/q, loss=0var, mean_r=0.998, skip=8]
Iter 15 GRPO groups: 90%|######### | 18/20 [08:07<00:47, 23.67s/q, loss=0var, mean_r=0.998, skip=8]2026-04-26 04:57:08,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:08,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:08,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:15,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:22,809 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:22,896 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 04:57:22,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 15 GRPO groups: 90%|######### | 18/20 [08:28<00:47, 23.67s/q, loss=0var, mean_r=0.998, skip=9]
Iter 15 GRPO groups: 95%|#########5| 19/20 [08:28<00:22, 22.81s/q, loss=0var, mean_r=0.998, skip=9]2026-04-26 04:57:29,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 04:57:38,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:38,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:38,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.212 = 0.50×0.08(prox=0.08) + 0.40×proc(0.178[fin=0.09,mean=0.31]) + 0.10×fmt(1.000) | pred='53' gold='8' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 04:57:38,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:47,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 04:57:47,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.797[fin=0.97,mean=0.53]) + 0.10×fmt(1.000) | pred='13' gold='8' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 04:57:48,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 04:57:48,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.467 = 0.50×0.10(prox=0.10) + 0.40×proc(0.727[fin=0.91,mean=0.45]) + 0.10×fmt(1.000) | pred='43' gold='8' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 04:58:02,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.384 = 0.50×0.14(prox=0.14) + 0.40×proc(0.387[fin=0.34,mean=0.45]) + 0.10×fmt(1.000) | pred='-17' gold='8' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+
Iter 15 GRPO groups: 95%|#########5| 19/20 [09:10<00:22, 22.81s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
Iter 15 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 28.48s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
Iter 15 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 27.51s/q, loss=0.0000, mean_r=0.754, q_acc=100%, q_rew=0.722, skip=9]
+2026-04-26 04:58:04,705 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=4/10000)
+2026-04-26 04:58:04,705 INFO __main__ - Iter 15 | loss=0.0000 | reward mean=0.928 std=0.167 | gt_match=83.6% | grounded_acc=93.1% | step_acc=91.7% | lccp=83.7% | batch_acc=93.8% | phase=SELFPLAY_RAMP sp_ratio=7% | groups=12 skipped=9(0var=9) | lr=4.80e-06 | 550.1s
+2026-04-26 04:58:04,706 WARNING __main__ - STARVATION: 43% of groups skipped (zero variance). grounded_acc=93.1% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 04:58:04,706 INFO __main__ - Question generation: 1/1 valid (100%) | q_reward=0.722 | q_acc=100.0% (>0.5 quality) | topic=0.35 diff=0.95 clarity=1.00 novelty=0.46 solvability=1.00
+2026-04-26 04:58:04,706 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:36, 2.26s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:34, 3.88s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<08:09, 3.33s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<07:07, 2.93s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:59, 2.48s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:29, 3.54s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:23<08:23, 3.52s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:25<07:34, 3.20s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:38, 3.25s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:52, 3.81s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:13, 3.55s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:09, 3.11s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:44, 2.95s/q, correct=10/13, lccp=85.6%, score=0.926, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:49, 3.45s/q, correct=11/14, lccp=86.7%, score=0.932, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:48<07:11, 3.19s/q, correct=12/15, lccp=87.6%, score=0.936, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:51<06:39, 2.98s/q, correct=12/16, lccp=88.3%, score=0.912, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:15, 3.28s/q, correct=13/17, lccp=89.0%, score=0.917, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:01<08:52, 4.03s/q, correct=13/18, lccp=84.8%, score=0.906, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:52, 3.61s/q, correct=14/19, lccp=85.6%, score=0.911, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:07<08:00, 3.69s/q, correct=15/20, lccp=86.3%, score=0.915, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:10<07:14, 3.37s/q, correct=16/21, lccp=86.9%, score=0.919, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:12<06:45, 3.17s/q, correct=17/22, lccp=84.5%, score=0.914, step_acc=92.0%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:20, 3.47s/q, correct=18/23, lccp=85.2%, score=0.918, step_acc=92.3%]
GSM8K eval: 16%|#6 | 24/150 [01:19<06:42, 3.20s/q, correct=18/24, lccp=82.7%, score=0.901, step_acc=89.5%]
GSM8K eval: 17%|#6 | 25/150 [01:22<06:21, 3.06s/q, correct=18/25, lccp=80.4%, score=0.897, step_acc=88.9%]
GSM8K eval: 17%|#7 | 26/150 [01:26<07:08, 3.46s/q, correct=19/26, lccp=81.1%, score=0.901, step_acc=89.4%]
GSM8K eval: 18%|#8 | 27/150 [01:29<06:42, 3.27s/q, correct=19/27, lccp=81.8%, score=0.896, step_acc=89.8%]
GSM8K eval: 19%|#8 | 28/150 [01:31<06:00, 2.96s/q, correct=20/28, lccp=82.5%, score=0.900, step_acc=90.1%]
GSM8K eval: 19%|#9 | 29/150 [01:34<05:50, 2.90s/q, correct=21/29, lccp=83.1%, score=0.903, step_acc=90.5%]
GSM8K eval: 20%|## | 30/150 [01:38<06:21, 3.18s/q, correct=22/30, lccp=83.6%, score=0.906, step_acc=90.8%]
GSM8K eval: 21%|## | 31/150 [01:40<05:57, 3.00s/q, correct=23/31, lccp=84.2%, score=0.909, step_acc=91.1%]
GSM8K eval: 21%|##1 | 32/150 [01:42<05:10, 2.63s/q, correct=24/32, lccp=84.7%, score=0.911, step_acc=91.4%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:12, 2.67s/q, correct=25/33, lccp=85.1%, score=0.914, step_acc=91.6%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:48, 2.49s/q, correct=26/34, lccp=85.6%, score=0.916, step_acc=91.9%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:49, 2.52s/q, correct=27/35, lccp=86.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 24%|##4 | 36/150 [01:53<05:17, 2.79s/q, correct=28/36, lccp=86.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:48, 2.55s/q, correct=29/37, lccp=86.7%, score=0.922, step_acc=92.5%]
GSM8K eval: 25%|##5 | 38/150 [01:58<05:03, 2.71s/q, correct=30/38, lccp=87.1%, score=0.924, step_acc=92.7%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:10, 3.33s/q, correct=31/39, lccp=87.4%, score=0.926, step_acc=92.9%]
GSM8K eval: 27%|##6 | 40/150 [02:09<07:37, 4.16s/q, correct=32/40, lccp=87.7%, score=0.928, step_acc=93.1%]
GSM8K eval: 27%|##7 | 41/150 [02:12<06:53, 3.79s/q, correct=32/41, lccp=88.0%, score=0.927, step_acc=93.3%]
GSM8K eval: 28%|##8 | 42/150 [02:17<07:36, 4.23s/q, correct=33/42, lccp=86.7%, score=0.928, step_acc=93.0%]
GSM8K eval: 29%|##8 | 43/150 [02:19<06:29, 3.64s/q, correct=34/43, lccp=87.0%, score=0.930, step_acc=93.2%]
GSM8K eval: 29%|##9 | 44/150 [02:26<07:44, 4.38s/q, correct=35/44, lccp=87.3%, score=0.931, step_acc=93.3%]
GSM8K eval: 30%|### | 45/150 [02:29<07:01, 4.02s/q, correct=36/45, lccp=87.6%, score=0.933, step_acc=93.5%]
GSM8K eval: 31%|### | 46/150 [02:34<07:23, 4.27s/q, correct=36/46, lccp=85.7%, score=0.928, step_acc=93.4%]
GSM8K eval: 31%|###1 | 47/150 [02:37<06:44, 3.92s/q, correct=37/47, lccp=86.0%, score=0.929, step_acc=93.5%]
GSM8K eval: 32%|###2 | 48/150 [02:39<05:35, 3.29s/q, correct=38/48, lccp=86.3%, score=0.931, step_acc=93.7%]
GSM8K eval: 33%|###2 | 49/150 [02:42<05:41, 3.38s/q, correct=39/49, lccp=85.2%, score=0.932, step_acc=93.4%]
GSM8K eval: 33%|###3 | 50/150 [02:45<05:31, 3.32s/q, correct=39/50, lccp=84.5%, score=0.923, step_acc=92.6%]
GSM8K eval: 34%|###4 | 51/150 [02:47<04:32, 2.75s/q, correct=40/51, lccp=84.8%, score=0.925, step_acc=92.7%]
GSM8K eval: 35%|###4 | 52/150 [02:51<05:15, 3.22s/q, correct=40/52, lccp=83.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 35%|###5 | 53/150 [02:56<05:55, 3.67s/q, correct=40/53, lccp=82.8%, score=0.916, step_acc=91.9%]
GSM8K eval: 36%|###6 | 54/150 [02:59<05:41, 3.55s/q, correct=41/54, lccp=83.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 37%|###6 | 55/150 [03:03<05:37, 3.55s/q, correct=42/55, lccp=83.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 37%|###7 | 56/150 [03:06<05:38, 3.60s/q, correct=43/56, lccp=83.7%, score=0.920, step_acc=92.4%]
GSM8K eval: 38%|###8 | 57/150 [03:09<04:58, 3.21s/q, correct=44/57, lccp=84.0%, score=0.922, step_acc=92.5%]
GSM8K eval: 39%|###8 | 58/150 [03:13<05:20, 3.49s/q, correct=45/58, lccp=84.2%, score=0.923, step_acc=92.6%]
GSM8K eval: 39%|###9 | 59/150 [03:17<05:26, 3.59s/q, correct=45/59, lccp=82.8%, score=0.916, step_acc=92.1%]
GSM8K eval: 40%|#### | 60/150 [03:22<06:00, 4.00s/q, correct=46/60, lccp=83.1%, score=0.918, step_acc=92.2%]
GSM8K eval: 41%|#### | 61/150 [03:25<05:35, 3.77s/q, correct=47/61, lccp=83.4%, score=0.919, step_acc=92.3%]
GSM8K eval: 41%|####1 | 62/150 [03:28<05:14, 3.57s/q, correct=48/62, lccp=83.6%, score=0.920, step_acc=92.5%]
GSM8K eval: 42%|####2 | 63/150 [03:31<05:06, 3.52s/q, correct=48/63, lccp=83.4%, score=0.914, step_acc=92.0%]
GSM8K eval: 43%|####2 | 64/150 [03:34<04:44, 3.30s/q, correct=49/64, lccp=83.6%, score=0.916, step_acc=92.2%]
GSM8K eval: 43%|####3 | 65/150 [03:37<04:27, 3.15s/q, correct=50/65, lccp=83.9%, score=0.917, step_acc=92.3%]
GSM8K eval: 44%|####4 | 66/150 [03:39<03:54, 2.79s/q, correct=51/66, lccp=84.1%, score=0.918, step_acc=92.4%]
GSM8K eval: 45%|####4 | 67/150 [03:41<03:38, 2.64s/q, correct=52/67, lccp=84.4%, score=0.919, step_acc=92.5%]
GSM8K eval: 45%|####5 | 68/150 [03:44<03:37, 2.65s/q, correct=53/68, lccp=84.6%, score=0.921, step_acc=92.6%]
GSM8K eval: 46%|####6 | 69/150 [03:45<03:07, 2.32s/q, correct=54/69, lccp=84.8%, score=0.922, step_acc=92.7%]
GSM8K eval: 47%|####6 | 70/150 [03:48<03:20, 2.51s/q, correct=55/70, lccp=83.6%, score=0.922, step_acc=92.6%]
GSM8K eval: 47%|####7 | 71/150 [03:51<03:32, 2.69s/q, correct=56/71, lccp=82.4%, score=0.923, step_acc=92.4%]
GSM8K eval: 48%|####8 | 72/150 [03:53<03:01, 2.32s/q, correct=57/72, lccp=82.7%, score=0.924, step_acc=92.5%]
GSM8K eval: 49%|####8 | 73/150 [03:55<02:44, 2.13s/q, correct=58/73, lccp=82.9%, score=0.925, step_acc=92.6%]
GSM8K eval: 49%|####9 | 74/150 [03:58<03:14, 2.56s/q, correct=59/74, lccp=83.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 50%|##### | 75/150 [04:00<02:52, 2.30s/q, correct=60/75, lccp=83.4%, score=0.927, step_acc=92.8%]
GSM8K eval: 51%|##### | 76/150 [04:06<04:25, 3.58s/q, correct=60/76, lccp=83.4%, score=0.922, step_acc=92.7%]
GSM8K eval: 51%|#####1 | 77/150 [04:10<04:29, 3.69s/q, correct=61/77, lccp=83.6%, score=0.923, step_acc=92.8%]
GSM8K eval: 52%|#####2 | 78/150 [04:13<03:58, 3.32s/q, correct=62/78, lccp=83.8%, score=0.924, step_acc=92.9%]
GSM8K eval: 53%|#####2 | 79/150 [04:16<03:50, 3.25s/q, correct=62/79, lccp=83.0%, score=0.918, step_acc=92.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:19<03:44, 3.21s/q, correct=63/80, lccp=83.2%, score=0.919, step_acc=92.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:21<03:24, 2.96s/q, correct=64/81, lccp=83.4%, score=0.920, step_acc=92.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:24<03:23, 2.99s/q, correct=65/82, lccp=83.6%, score=0.921, step_acc=92.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:27<03:17, 2.95s/q, correct=66/83, lccp=83.8%, score=0.922, step_acc=92.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:30<03:08, 2.85s/q, correct=67/84, lccp=84.0%, score=0.923, step_acc=92.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:34<03:23, 3.13s/q, correct=68/85, lccp=84.2%, score=0.924, step_acc=92.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:37<03:26, 3.22s/q, correct=69/86, lccp=84.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:43<04:07, 3.93s/q, correct=70/87, lccp=84.6%, score=0.926, step_acc=92.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:45<03:25, 3.32s/q, correct=71/88, lccp=84.7%, score=0.926, step_acc=93.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:47<03:11, 3.15s/q, correct=72/89, lccp=84.9%, score=0.927, step_acc=93.0%]
GSM8K eval: 60%|###### | 90/150 [04:50<02:55, 2.92s/q, correct=73/90, lccp=85.1%, score=0.928, step_acc=93.1%]
GSM8K eval: 61%|###### | 91/150 [04:54<03:17, 3.35s/q, correct=74/91, lccp=85.2%, score=0.929, step_acc=93.2%]
GSM8K eval: 61%|######1 | 92/150 [04:57<03:10, 3.28s/q, correct=75/92, lccp=85.4%, score=0.929, step_acc=93.3%]
GSM8K eval: 62%|######2 | 93/150 [05:05<04:21, 4.58s/q, correct=76/93, lccp=85.6%, score=0.930, step_acc=93.3%]
GSM8K eval: 63%|######2 | 94/150 [05:08<03:45, 4.03s/q, correct=77/94, lccp=84.6%, score=0.930, step_acc=92.7%]
GSM8K eval: 63%|######3 | 95/150 [05:12<03:52, 4.22s/q, correct=77/95, lccp=83.8%, score=0.927, step_acc=91.9%]
GSM8K eval: 64%|######4 | 96/150 [05:17<04:01, 4.47s/q, correct=78/96, lccp=83.9%, score=0.928, step_acc=92.0%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:28, 3.94s/q, correct=78/97, lccp=83.6%, score=0.926, step_acc=91.8%]
GSM8K eval: 65%|######5 | 98/150 [05:24<03:30, 4.04s/q, correct=78/98, lccp=83.2%, score=0.922, step_acc=91.6%]
GSM8K eval: 66%|######6 | 99/150 [05:27<03:01, 3.56s/q, correct=79/99, lccp=83.3%, score=0.923, step_acc=91.7%]
GSM8K eval: 67%|######6 | 100/150 [05:29<02:33, 3.07s/q, correct=80/100, lccp=82.5%, score=0.923, step_acc=91.5%]
GSM8K eval: 67%|######7 | 101/150 [05:33<02:43, 3.33s/q, correct=80/101, lccp=82.7%, score=0.919, step_acc=91.5%]
GSM8K eval: 68%|######8 | 102/150 [05:34<02:13, 2.79s/q, correct=81/102, lccp=82.8%, score=0.920, step_acc=91.6%]
GSM8K eval: 69%|######8 | 103/150 [05:36<02:01, 2.59s/q, correct=82/103, lccp=83.0%, score=0.921, step_acc=91.7%]
GSM8K eval: 69%|######9 | 104/150 [05:41<02:29, 3.26s/q, correct=83/104, lccp=83.2%, score=0.921, step_acc=91.8%]
GSM8K eval: 70%|####### | 105/150 [05:44<02:17, 3.06s/q, correct=84/105, lccp=83.3%, score=0.922, step_acc=91.9%]
GSM8K eval: 71%|####### | 106/150 [05:45<01:54, 2.61s/q, correct=85/106, lccp=83.5%, score=0.923, step_acc=91.9%]
GSM8K eval: 71%|#######1 | 107/150 [05:47<01:38, 2.30s/q, correct=86/107, lccp=83.6%, score=0.923, step_acc=92.0%]
GSM8K eval: 72%|#######2 | 108/150 [05:49<01:41, 2.43s/q, correct=87/108, lccp=83.8%, score=0.924, step_acc=92.1%]
GSM8K eval: 73%|#######2 | 109/150 [05:55<02:12, 3.22s/q, correct=87/109, lccp=83.3%, score=0.923, step_acc=92.0%]
GSM8K eval: 73%|#######3 | 110/150 [05:57<01:58, 2.97s/q, correct=88/110, lccp=82.8%, score=0.923, step_acc=91.9%]
GSM8K eval: 74%|#######4 | 111/150 [05:59<01:41, 2.60s/q, correct=89/111, lccp=83.0%, score=0.924, step_acc=91.9%]
GSM8K eval: 75%|#######4 | 112/150 [06:04<02:10, 3.42s/q, correct=89/112, lccp=83.1%, score=0.923, step_acc=92.0%]
GSM8K eval: 75%|#######5 | 113/150 [06:06<01:48, 2.94s/q, correct=90/113, lccp=83.3%, score=0.924, step_acc=92.1%]
GSM8K eval: 76%|#######6 | 114/150 [06:11<02:10, 3.63s/q, correct=91/114, lccp=82.8%, score=0.924, step_acc=92.0%]
GSM8K eval: 77%|#######6 | 115/150 [06:14<01:59, 3.42s/q, correct=92/115, lccp=82.9%, score=0.925, step_acc=92.1%]
GSM8K eval: 77%|#######7 | 116/150 [06:17<01:52, 3.30s/q, correct=93/116, lccp=83.1%, score=0.925, step_acc=92.2%]
GSM8K eval: 78%|#######8 | 117/150 [06:23<02:16, 4.13s/q, correct=94/117, lccp=83.2%, score=0.926, step_acc=92.2%]
GSM8K eval: 79%|#######8 | 118/150 [06:28<02:16, 4.27s/q, correct=94/118, lccp=82.5%, score=0.924, step_acc=92.1%]
GSM8K eval: 79%|#######9 | 119/150 [06:31<02:06, 4.09s/q, correct=94/119, lccp=82.7%, score=0.922, step_acc=92.2%]
GSM8K eval: 80%|######## | 120/150 [06:34<01:51, 3.72s/q, correct=95/120, lccp=82.8%, score=0.923, step_acc=92.3%]
GSM8K eval: 81%|######## | 121/150 [06:37<01:43, 3.56s/q, correct=96/121, lccp=82.9%, score=0.923, step_acc=92.3%]
GSM8K eval: 81%|########1 | 122/150 [06:40<01:35, 3.42s/q, correct=97/122, lccp=83.1%, score=0.924, step_acc=92.4%]
GSM8K eval: 82%|########2 | 123/150 [06:44<01:31, 3.40s/q, correct=97/123, lccp=82.7%, score=0.924, step_acc=92.3%]
GSM8K eval: 83%|########2 | 124/150 [06:46<01:19, 3.04s/q, correct=98/124, lccp=82.9%, score=0.924, step_acc=92.4%]
GSM8K eval: 83%|########3 | 125/150 [06:48<01:08, 2.75s/q, correct=99/125, lccp=83.0%, score=0.925, step_acc=92.4%]
GSM8K eval: 84%|########4 | 126/150 [06:51<01:06, 2.76s/q, correct=100/126, lccp=83.1%, score=0.926, step_acc=92.5%]
GSM8K eval: 85%|########4 | 127/150 [06:55<01:15, 3.27s/q, correct=101/127, lccp=83.3%, score=0.926, step_acc=92.5%]
GSM8K eval: 85%|########5 | 128/150 [06:58<01:10, 3.19s/q, correct=102/128, lccp=83.4%, score=0.927, step_acc=92.6%]
GSM8K eval: 86%|########6 | 129/150 [07:02<01:08, 3.26s/q, correct=103/129, lccp=83.5%, score=0.927, step_acc=92.7%]
GSM8K eval: 87%|########6 | 130/150 [07:04<00:56, 2.83s/q, correct=104/130, lccp=83.7%, score=0.928, step_acc=92.7%]
GSM8K eval: 87%|########7 | 131/150 [07:08<01:03, 3.37s/q, correct=105/131, lccp=83.8%, score=0.928, step_acc=92.8%]
GSM8K eval: 88%|########8 | 132/150 [07:10<00:51, 2.84s/q, correct=106/132, lccp=83.9%, score=0.929, step_acc=92.8%]
GSM8K eval: 89%|########8 | 133/150 [07:13<00:48, 2.84s/q, correct=107/133, lccp=84.0%, score=0.929, step_acc=92.9%]
GSM8K eval: 89%|########9 | 134/150 [07:17<00:53, 3.32s/q, correct=108/134, lccp=84.2%, score=0.930, step_acc=92.9%]
GSM8K eval: 90%|######### | 135/150 [07:20<00:47, 3.19s/q, correct=109/135, lccp=84.3%, score=0.930, step_acc=93.0%]
GSM8K eval: 91%|######### | 136/150 [07:24<00:50, 3.58s/q, correct=109/136, lccp=83.9%, score=0.929, step_acc=92.8%]
GSM8K eval: 91%|#########1| 137/150 [07:31<00:59, 4.60s/q, correct=110/137, lccp=84.0%, score=0.930, step_acc=92.8%]
GSM8K eval: 92%|#########2| 138/150 [07:35<00:53, 4.43s/q, correct=111/138, lccp=84.1%, score=0.930, step_acc=92.9%]
GSM8K eval: 93%|#########2| 139/150 [07:39<00:45, 4.14s/q, correct=112/139, lccp=84.2%, score=0.931, step_acc=93.0%]
GSM8K eval: 93%|#########3| 140/150 [07:43<00:42, 4.20s/q, correct=112/140, lccp=84.1%, score=0.927, step_acc=92.8%]
GSM8K eval: 94%|#########3| 141/150 [07:47<00:36, 4.10s/q, correct=113/141, lccp=84.2%, score=0.928, step_acc=92.8%]
GSM8K eval: 95%|#########4| 142/150 [07:52<00:33, 4.22s/q, correct=114/142, lccp=84.3%, score=0.928, step_acc=92.9%]
GSM8K eval: 95%|#########5| 143/150 [07:54<00:25, 3.64s/q, correct=115/143, lccp=84.5%, score=0.929, step_acc=92.9%]
GSM8K eval: 96%|#########6| 144/150 [07:56<00:19, 3.25s/q, correct=116/144, lccp=84.6%, score=0.929, step_acc=93.0%]
GSM8K eval: 97%|#########6| 145/150 [07:59<00:16, 3.21s/q, correct=116/145, lccp=84.0%, score=0.926, step_acc=92.4%]
GSM8K eval: 97%|#########7| 146/150 [08:02<00:12, 3.14s/q, correct=117/146, lccp=84.1%, score=0.926, step_acc=92.5%]
GSM8K eval: 98%|#########8| 147/150 [08:06<00:10, 3.34s/q, correct=118/147, lccp=84.2%, score=0.927, step_acc=92.5%]
GSM8K eval: 99%|#########8| 148/150 [08:10<00:06, 3.43s/q, correct=119/148, lccp=84.3%, score=0.927, step_acc=92.6%]
GSM8K eval: 99%|#########9| 149/150 [08:13<00:03, 3.47s/q, correct=120/149, lccp=84.4%, score=0.928, step_acc=92.6%]
GSM8K eval: 100%|##########| 150/150 [08:18<00:00, 3.88s/q, correct=120/150, lccp=84.2%, score=0.926, step_acc=92.4%]
GSM8K eval: 100%|##########| 150/150 [08:18<00:00, 3.32s/q, correct=120/150, lccp=84.2%, score=0.926, step_acc=92.4%]
+2026-04-26 05:06:23,420 INFO __main__ - Training Score [iter 15]: 0.9262 (best=0.9199) | n=150
+2026-04-26 05:06:23,420 INFO __main__ - Components : 0.50×correct(80.0%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 05:06:23,420 INFO __main__ - Process score : prm_mean=0.907 prm_final=0.940 → weighted=0.927
+2026-04-26 05:06:23,420 INFO __main__ - Step accuracy : 92.4% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 05:06:23,420 INFO __main__ - Chain integrity (LCCP): 84.2% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 05:06:23,420 INFO __main__ - (debug) final-answer accuracy: 80.0%
+2026-04-26 05:06:26,384 INFO __main__ - New best saved → checkpoints/grpo/grpo_20260426_032827/best_policy (combined 0.9262 > 0.9199)
+2026-04-26 05:06:28,581 INFO __main__ - ======================================================================
+2026-04-26 05:06:28,582 INFO __main__ - GRPO ITERATION 16/60
+2026-04-26 05:06:28,582 INFO __main__ - ======================================================================
+2026-04-26 05:06:28,602 INFO __main__ - LR this iteration: 4.80e-06 | T=0.698 | MATH ratio=30%
+
Iter 16 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:06:33,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.918[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:33,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.426 = 0.50×0.33(prox=0.33) + 0.40×proc(0.211[fin=0.01,mean=0.51]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:06:33,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.427 = 0.50×0.33(prox=0.33) + 0.40×proc(0.214[fin=0.01,mean=0.51]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 05:06:42,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:42,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:43,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:43,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.427 = 0.50×0.33(prox=0.33) + 0.40×proc(0.214[fin=0.02,mean=0.50]) + 0.10×fmt(1.000) | pred='30' gold='15' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:06:51,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:06:51,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:06:51,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 0%| | 0/20 [00:24, ?q/s, loss=0.0015, mean_r=0.822, skip=0]
Iter 16 GRPO groups: 5%|5 | 1/20 [00:24<07:49, 24.74s/q, loss=0.0015, mean_r=0.822, skip=0]2026-04-26 05:06:57,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:03,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:03,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:04,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:04,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:13,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:13,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:14,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:14,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:20,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='190' gold='190' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 5%|5 | 1/20 [00:52<07:49, 24.74s/q, loss=0var, mean_r=0.999, skip=1]
Iter 16 GRPO groups: 10%|# | 2/20 [00:52<07:54, 26.38s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:07:28,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:07:28,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.80(prox=0.80) + 0.40×proc(0.620[fin=0.64,mean=0.59]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:07:28,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:07:33,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:33,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.41(prox=0.41) + 0.40×proc(0.469[fin=0.56,mean=0.33]) + 0.10×fmt(1.000) | pred='36' gold='128' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:07:34,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.569 = 0.50×0.80(prox=0.80) + 0.40×proc(0.171[fin=0.09,mean=0.29]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:07:34,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.573 = 0.50×0.80(prox=0.80) + 0.40×proc(0.182[fin=0.23,mean=0.11]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+:1: SyntaxWarning: 'int' object is not callable; perhaps you missed a comma?
+2026-04-26 05:07:38,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:38,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.80(prox=0.80) + 0.40×proc(0.230[fin=0.15,mean=0.36]) + 0.10×fmt(1.000) | pred='144' gold='128' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:07:39,048 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 10%|# | 2/20 [01:11<07:54, 26.38s/q, loss=0.0003, mean_r=0.794, skip=1]
Iter 16 GRPO groups: 15%|#5 | 3/20 [01:11<06:36, 23.33s/q, loss=0.0003, mean_r=0.794, skip=1]2026-04-26 05:07:44,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:51,730 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:07:59,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:08:06,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 15%|#5 | 3/20 [01:37<06:36, 23.33s/q, loss=0var, mean_r=1.000, skip=2]
Iter 16 GRPO groups: 20%|## | 4/20 [01:37<06:27, 24.23s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 05:08:06,187 INFO src.rl.curriculum_manager - Topic probabilities (rollout 40): [('algebra', '0.264'), ('basic_arithmetic', '0.033'), ('single_step_word_problems', '0.033'), ('fractions', '0.033'), ('percentages', '0.033')]
+2026-04-26 05:08:10,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.978 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,227 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.987 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.980 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:11,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.985 = clip(base=0.905 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,387 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:12,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:16,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:16,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,120 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,312 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,705 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:17,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:08:18,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 16 GRPO groups: 20%|## | 4/20 [01:51<06:27, 24.23s/q, loss=0.0000, mean_r=0.985, q_acc=100%, q_rew=0.777, skip=2]
Iter 16 GRPO groups: 25%|##5 | 5/20 [01:51<05:07, 20.53s/q, loss=0.0000, mean_r=0.985, q_acc=100%, q_rew=0.777, skip=2]2026-04-26 05:08:27,954 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:28,044 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:28,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,717 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:39,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:51,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:08:51,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=0.99,mean=0.87]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:08:51,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='144' gold='144' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 16 GRPO groups: 25%|##5 | 5/20 [02:23<05:07, 20.53s/q, loss=0var, mean_r=0.997, skip=3]
Iter 16 GRPO groups: 30%|### | 6/20 [02:23<05:41, 24.36s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 05:08:55,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:04,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:12,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:12,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:13,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:13,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:09:22,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='54' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 30%|### | 6/20 [02:53<05:41, 24.36s/q, loss=0var, mean_r=0.998, skip=4]
Iter 16 GRPO groups: 35%|###5 | 7/20 [02:53<05:41, 26.27s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 05:09:56,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.623[fin=0.68,mean=0.54]) + 0.10×fmt(1.000) | pred='2' gold='6' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:09:56,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:09:56,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.616 = 0.50×0.71(prox=0.71) + 0.40×proc(0.397[fin=0.43,mean=0.35]) + 0.10×fmt(1.000) | pred='4.8' gold='6' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 05:10:10,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×0.60(prox=0.60) + 0.40×proc(0.795[fin=0.97,mean=0.54]) + 0.10×fmt(1.000) | pred='8' gold='6' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:10:10,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.00(prox=0.00) + 0.40×proc(0.614[fin=0.59,mean=0.64]) + 0.10×fmt(1.000) | pred='8/3' gold='6' | step_acc=73% lccp=18% (chain=2/11 ok_count=8) n_steps=11
+2026-04-26 05:10:10,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:10:10,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.33(prox=0.33) + 0.40×proc(0.394[fin=0.41,mean=0.36]) + 0.10×fmt(1.000) | pred='12' gold='6' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:10:20,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:10:20,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 35%|###5 | 7/20 [03:53<05:41, 26.27s/q, loss=-0.0006, mean_r=0.744, q_acc=100%, q_rew=0.777, skip=4]
Iter 16 GRPO groups: 40%|#### | 8/20 [03:53<07:24, 37.01s/q, loss=-0.0006, mean_r=0.744, q_acc=100%, q_rew=0.777, skip=4]2026-04-26 05:10:22,173 INFO src.rl.curriculum_manager - Topic probabilities (rollout 60): [('basic_arithmetic', '0.044'), ('single_step_word_problems', '0.044'), ('fractions', '0.044'), ('percentages', '0.044'), ('ratios', '0.044')]
+2026-04-26 05:10:33,950 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.983 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:10:34,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.691 novelty=0.72 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.73)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:10:34,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.943 novelty=0.72 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:34,548 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:10:34,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.842 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:10:34,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.883 = clip(base=0.803 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.806 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:10:35,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.972 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:10:35,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:35,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.987 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:10:35,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.888 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(0.62) | steps=8
+2026-04-26 05:10:47,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.743 = clip(base=0.663 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.592 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=10
+2026-04-26 05:10:47,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.747 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:10:47,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.757 = clip(base=0.677 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.595 novelty=0.73 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=7
+2026-04-26 05:10:48,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.616 novelty=0.73 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.50)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:10:48,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.647 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=5
+2026-04-26 05:10:48,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.590 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=6
+2026-04-26 05:10:48,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.748 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:10:49,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.872 = clip(base=0.792 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.760 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:10:49,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.587 novelty=0.73 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:10:49,450 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.593 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+
Iter 16 GRPO groups: 40%|#### | 8/20 [04:22<07:24, 37.01s/q, loss=-0.0001, mean_r=0.866, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 45%|####5 | 9/20 [04:22<06:19, 34.49s/q, loss=-0.0001, mean_r=0.866, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:10:54,870 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:10:54,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:03,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:04,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:04,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.37(prox=0.37) + 0.40×proc(0.473[fin=0.52,mean=0.41]) + 0.10×fmt(1.000) | pred='154' gold='84' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:11:04,235 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:11,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:12,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='84' gold='84' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 45%|####5 | 9/20 [04:53<06:19, 34.49s/q, loss=-0.0002, mean_r=0.951, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 50%|##### | 10/20 [04:53<05:35, 33.55s/q, loss=-0.0002, mean_r=0.951, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:11:28,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:11:28,475 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:28,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:28,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:33,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:11:34,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:37,141 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.934 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:11:37,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='165' gold='165' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 16 GRPO groups: 50%|##### | 10/20 [05:10<05:35, 33.55s/q, loss=0.0014, mean_r=0.971, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 55%|#####5 | 11/20 [05:10<04:13, 28.22s/q, loss=0.0014, mean_r=0.971, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:11:43,397 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.50(prox=0.50) + 0.40×proc(0.435[fin=0.45,mean=0.41]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:11:43,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:47,945 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:11:48,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:48,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,672 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:11:55,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:11:55,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 16 GRPO groups: 55%|#####5 | 11/20 [05:35<04:13, 28.22s/q, loss=-0.0002, mean_r=0.944, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 60%|###### | 12/20 [05:35<03:38, 27.32s/q, loss=-0.0002, mean_r=0.944, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:12:12,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:12,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:12:12,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:12,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,152 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.05(prox=0.05) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='154' gold='14' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 05:12:29,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:12:29,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:12:43,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:12:43,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 16 GRPO groups: 60%|###### | 12/20 [06:16<03:38, 27.32s/q, loss=0.0012, mean_r=0.954, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 65%|######5 | 13/20 [06:16<03:40, 31.44s/q, loss=0.0012, mean_r=0.954, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:12:49,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:12:49,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:13:00,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:13:00,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:13:00,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.29(prox=0.29) + 0.40×proc(0.754[fin=0.96,mean=0.44]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:00,726 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.546 = 0.50×0.29(prox=0.29) + 0.40×proc(0.758[fin=0.96,mean=0.45]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:05,960 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-4' gold='-4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:13:06,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.544 = 0.50×0.29(prox=0.29) + 0.40×proc(0.753[fin=0.97,mean=0.43]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:06,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.552 = 0.50×0.29(prox=0.29) + 0.40×proc(0.773[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:13:06,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.551 = 0.50×0.29(prox=0.29) + 0.40×proc(0.770[fin=0.97,mean=0.47]) + 0.10×fmt(1.000) | pred='1' gold='-4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 16 GRPO groups: 65%|######5 | 13/20 [06:43<03:40, 31.44s/q, loss=0.0009, mean_r=0.772, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 70%|####### | 14/20 [06:43<03:01, 30.31s/q, loss=0.0009, mean_r=0.772, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:13:25,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.890 = 0.50×1.00(exact) + 0.40×proc(0.726[fin=0.68,mean=0.80]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=83% lccp=33% (chain=4/12 ok_count=10) n_steps=12
+2026-04-26 05:13:25,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.526 = 0.50×0.09(prox=0.09) + 0.40×proc(0.825[fin=0.97,mean=0.61]) + 0.10×fmt(1.000) | pred='-4' gold='1' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:13:25,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.101 = 0.50×0.00(prox=0.00) + 0.40×proc(0.077[fin=0.02,mean=0.16]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 05:13:25,604 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.499 = 0.50×0.33(prox=0.33) + 0.40×proc(0.505[fin=0.54,mean=0.45]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:13:36,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:13:36,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:13:36,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:13:36,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:13:58,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.405 = 0.50×0.00(prox=0.00) + 0.40×proc(0.687[fin=0.71,mean=0.65]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:13:58,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+
Iter 16 GRPO groups: 70%|####### | 14/20 [07:31<03:01, 30.31s/q, loss=0.0016, mean_r=0.742, q_acc=100%, q_rew=0.788, skip=4]
Iter 16 GRPO groups: 75%|#######5 | 15/20 [07:31<02:57, 35.58s/q, loss=0.0016, mean_r=0.742, q_acc=100%, q_rew=0.788, skip=4]2026-04-26 05:14:02,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:02,300 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:07,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,669 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:14:11,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='12' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 16 GRPO groups: 75%|#######5 | 15/20 [07:47<02:57, 35.58s/q, loss=0var, mean_r=0.999, skip=5]
Iter 16 GRPO groups: 80%|######## | 16/20 [07:47<01:58, 29.71s/q, loss=0var, mean_r=0.999, skip=5]2026-04-26 05:14:20,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:20,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.639 = 0.50×0.50(prox=0.50) + 0.40×proc(0.722[fin=0.79,mean=0.61]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:20,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.385 = 0.50×0.25(prox=0.25) + 0.40×proc(0.249[fin=0.06,mean=0.54]) + 0.10×fmt(1.000) | pred='-1' gold='2' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:14:20,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×1.00(exact) + 0.40×proc(0.739[fin=0.82,mean=0.62]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:25,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.569 = 0.50×0.50(prox=0.50) + 0.40×proc(0.546[fin=0.57,mean=0.52]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:14:25,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.943 = 0.50×1.00(exact) + 0.40×proc(0.857[fin=0.97,mean=0.68]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:14:25,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.836[fin=0.91,mean=0.73]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:14:25,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:31,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=0.99,mean=0.88]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:14:31,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.370 = 0.50×0.25(prox=0.25) + 0.40×proc(0.237[fin=0.08,mean=0.47]) + 0.10×fmt(1.000) | pred='5' gold='2' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 16 GRPO groups: 80%|######## | 16/20 [08:04<01:58, 29.71s/q, loss=-0.0014, mean_r=0.769, q_acc=100%, q_rew=0.788, skip=5]
Iter 16 GRPO groups: 85%|########5 | 17/20 [08:04<01:17, 25.82s/q, loss=-0.0014, mean_r=0.769, q_acc=100%, q_rew=0.788, skip=5]2026-04-26 05:14:38,352 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:38,434 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:48,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:49,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:59,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:14:59,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.916[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 05:15:00,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:15:00,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 85%|########5 | 17/20 [08:40<01:17, 25.82s/q, loss=0var, mean_r=0.994, skip=6]
Iter 16 GRPO groups: 90%|######### | 18/20 [08:40<00:57, 28.82s/q, loss=0var, mean_r=0.994, skip=6]2026-04-26 05:15:12,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:15:12,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:15:12,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:12,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.642 = 0.50×0.47(prox=0.47) + 0.40×proc(0.851[fin=1.00,mean=0.63]) + 0.10×fmt(0.650) | pred='4' gold='9' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:15:18,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:18,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:23,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:15:23,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 16 GRPO groups: 90%|######### | 18/20 [08:56<00:57, 28.82s/q, loss=0.0005, mean_r=0.952, q_acc=100%, q_rew=0.788, skip=6]
Iter 16 GRPO groups: 95%|#########5| 19/20 [08:56<00:24, 24.89s/q, loss=0.0005, mean_r=0.952, q_acc=100%, q_rew=0.788, skip=6]2026-04-26 05:15:30,343 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:30,427 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:41,539 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:15:41,625 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:15:53,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='18' gold='18' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 16 GRPO groups: 95%|#########5| 19/20 [09:35<00:24, 24.89s/q, loss=0var, mean_r=0.999, skip=7]
Iter 16 GRPO groups: 100%|##########| 20/20 [09:35<00:00, 29.24s/q, loss=0var, mean_r=0.999, skip=7]
Iter 16 GRPO groups: 100%|##########| 20/20 [09:35<00:00, 28.77s/q, loss=0var, mean_r=0.999, skip=7]
+2026-04-26 05:16:04,112 INFO __main__ - Iter 16 | loss=0.0003 | reward mean=0.915 std=0.173 | gt_match=83.2% | grounded_acc=93.9% | step_acc=89.6% | lccp=84.4% | batch_acc=95.0% | phase=SELFPLAY_RAMP sp_ratio=11% | groups=15 skipped=7(0var=7) | lr=4.74e-06 | 575.5s
+2026-04-26 05:16:04,112 WARNING __main__ - STARVATION: 32% of groups skipped (zero variance). grounded_acc=93.9% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 05:16:04,112 INFO __main__ - Question generation: 2/2 valid (100%) | q_reward=0.788 | q_acc=100.0% (>0.5 quality) | topic=0.88 diff=0.58 clarity=1.00 novelty=0.45 solvability=0.96
+2026-04-26 05:16:04,114 INFO __main__ - ======================================================================
+2026-04-26 05:16:04,114 INFO __main__ - GRPO ITERATION 17/60
+2026-04-26 05:16:04,114 INFO __main__ - ======================================================================
+2026-04-26 05:16:04,135 INFO __main__ - LR this iteration: 4.74e-06 | T=0.692 | MATH ratio=30%
+
Iter 17 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:16:07,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:07,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,449 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:16:14,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:14,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:21,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:21,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 17 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 17 GRPO groups: 5%|5 | 1/20 [00:17<05:37, 17.76s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:16:27,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:27,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:36,526 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:36,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:36,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.351 = 0.50×0.08(prox=0.08) + 0.40×proc(0.457[fin=0.54,mean=0.34]) + 0.10×fmt(1.000) | pred='252' gold='36' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:16:36,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.207 = 0.50×0.00(prox=0.00) + 0.40×proc(0.175[fin=0.04,mean=0.38]) + 0.10×fmt(1.000) | pred='4/9' gold='36' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:16:44,648 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:16:44,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:16:44,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:16:44,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 5%|5 | 1/20 [00:51<05:37, 17.76s/q, loss=0.0001, mean_r=0.853, skip=1]
Iter 17 GRPO groups: 10%|# | 2/20 [00:51<08:12, 27.37s/q, loss=0.0001, mean_r=0.853, skip=1]2026-04-26 05:17:03,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.922 = 0.50×1.00(exact) + 0.40×proc(0.805[fin=0.97,mean=0.56]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:17:03,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.669 = 0.50×0.45(prox=0.45) + 0.40×proc(0.854[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='32' gold='20' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:17:03,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.693 = 0.50×1.00(exact) + 0.40×proc(0.232[fin=0.04,mean=0.53]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:17:03,837 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.825 = 0.50×0.85(prox=0.85) + 0.40×proc(0.751[fin=0.93,mean=0.48]) + 0.10×fmt(1.000) | pred='21' gold='20' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:17:12,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.787[fin=0.95,mean=0.55]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 05:17:12,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.705 = 0.50×0.77(prox=0.77) + 0.40×proc(0.551[fin=0.59,mean=0.50]) + 0.10×fmt(1.000) | pred='17' gold='20' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:17:12,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:17:12,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.71(prox=0.71) + 0.40×proc(0.548[fin=0.43,mean=0.73]) + 0.10×fmt(1.000) | pred='16' gold='20' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 05:17:20,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.794[fin=0.91,mean=0.63]) + 0.10×fmt(1.000) | pred='32' gold='20' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:17:20,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.921[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 17 GRPO groups: 10%|# | 2/20 [01:17<08:12, 27.37s/q, loss=-0.0004, mean_r=0.792, skip=1]
Iter 17 GRPO groups: 15%|#5 | 3/20 [01:17<07:34, 26.75s/q, loss=-0.0004, mean_r=0.792, skip=1]2026-04-26 05:17:21,999 INFO src.rl.curriculum_manager - Topic probabilities (rollout 80): [('basic_arithmetic', '0.045'), ('single_step_word_problems', '0.045'), ('fractions', '0.045'), ('percentages', '0.045'), ('ratios', '0.045')]
+2026-04-26 05:17:27,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.944 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.969 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:27,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.956 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,190 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.954 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,390 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:28,784 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.979 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:28,983 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.977 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:29,192 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.989 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:17:29,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.973 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:29,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.960 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,238 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.988 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:17:35,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:35,841 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,052 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.944 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,453 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,651 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:17:36,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 17 GRPO groups: 15%|#5 | 3/20 [01:32<07:34, 26.75s/q, loss=-0.0000, mean_r=1.000, q_acc=100%, q_rew=0.857, skip=2]
Iter 17 GRPO groups: 20%|## | 4/20 [01:32<05:54, 22.16s/q, loss=-0.0000, mean_r=1.000, q_acc=100%, q_rew=0.857, skip=2]2026-04-26 05:17:42,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:43,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:17:53,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:17:53,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:04,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='55' gold='55' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 20%|## | 4/20 [02:11<05:54, 22.16s/q, loss=0var, mean_r=0.999, skip=3]
Iter 17 GRPO groups: 25%|##5 | 5/20 [02:11<07:02, 28.14s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 05:18:20,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.897[fin=1.00,mean=0.75]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:20,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.61(prox=0.61) + 0.40×proc(0.420[fin=0.52,mean=0.27]) + 0.10×fmt(0.650) | pred='1466' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:20,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:20,787 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.325 = 0.50×0.17(prox=0.17) + 0.40×proc(0.349[fin=0.36,mean=0.33]) + 0.10×fmt(1.000) | pred='7375' gold='2149' | step_acc=0% lccp=0% (chain=0/5 ok_count=0) n_steps=5
+2026-04-26 05:18:32,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.927 = 0.50×1.00(exact) + 0.40×proc(0.904[fin=1.00,mean=0.77]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:32,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=1.00,mean=0.72]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:18:32,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.947 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:32,553 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:37,924 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:18:38,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='2149' gold='2149' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 17 GRPO groups: 25%|##5 | 5/20 [02:35<07:02, 28.14s/q, loss=0.0005, mean_r=0.840, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 30%|### | 6/20 [02:35<06:12, 26.59s/q, loss=0.0005, mean_r=0.840, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:18:43,086 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:18:43,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:18:52,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.85(prox=0.85) + 0.40×proc(0.851[fin=0.99,mean=0.64]) + 0.10×fmt(1.000) | pred='31' gold='29' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:19:01,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:01,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='29' gold='29' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 30%|### | 6/20 [03:08<06:12, 26.59s/q, loss=-0.0006, mean_r=0.983, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 35%|###5 | 7/20 [03:08<06:12, 28.63s/q, loss=-0.0006, mean_r=0.983, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:19:17,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:17,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:18,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:18,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:27,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:27,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.865[fin=0.98,mean=0.69]) + 0.10×fmt(1.000) | pred='24' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:19:28,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:28,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:38,869 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:19:38,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 35%|###5 | 7/20 [03:36<06:12, 28.63s/q, loss=0.0007, mean_r=0.954, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 40%|#### | 8/20 [03:36<05:41, 28.46s/q, loss=0.0007, mean_r=0.954, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:20:13,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.540 = 0.50×0.14(prox=0.14) + 0.40×proc(0.828[fin=0.96,mean=0.63]) + 0.10×fmt(1.000) | pred='-2' gold='1' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:20:14,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.672[fin=0.57,mean=0.83]) + 0.10×fmt(1.000) | pred='2' gold='1' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:20:28,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.350 = 0.50×0.00(prox=0.00) + 0.40×proc(0.363[fin=0.05,mean=0.83]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=90% lccp=90% (chain=9/10 ok_count=9) n_steps=10
+2026-04-26 05:20:28,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.201 = 0.50×0.00(prox=0.00) + 0.40×proc(0.275[fin=0.15,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:20:28,407 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.394 = 0.50×0.33(prox=0.33) + 0.40×proc(0.224[fin=0.15,mean=0.33]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:20:28,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.248 = 0.50×0.00(prox=0.00) + 0.40×proc(0.445[fin=0.41,mean=0.50]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:20:39,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.342 = 0.50×0.00(prox=0.00) + 0.40×proc(0.359[fin=0.05,mean=0.82]) + 0.10×fmt(0.700) | pred='' gold='1' | step_acc=86% lccp=86% (chain=6/7 ok_count=6) n_steps=7
+2026-04-26 05:20:40,034 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.826[fin=0.91,mean=0.70]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 05:20:40,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 17 GRPO groups: 40%|#### | 8/20 [04:37<05:41, 28.46s/q, loss=0.0006, mean_r=0.506, q_acc=100%, q_rew=0.857, skip=3]
Iter 17 GRPO groups: 45%|####5 | 9/20 [04:37<07:05, 38.69s/q, loss=0.0006, mean_r=0.506, q_acc=100%, q_rew=0.857, skip=3]2026-04-26 05:20:41,584 INFO src.rl.curriculum_manager - Topic probabilities (rollout 100): [('statistics', '0.256'), ('basic_arithmetic', '0.034'), ('single_step_word_problems', '0.034'), ('fractions', '0.034'), ('percentages', '0.034')]
+2026-04-26 05:20:49,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.89 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:49,800 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:49,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.598 = clip(base=0.518 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.371 novelty=0.74 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=6
+2026-04-26 05:20:50,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,400 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:50,790 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:58,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:20:58,698 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.837 = clip(base=0.757 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.806 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:20:58,890 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,084 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,470 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:20:59,662 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:20:59,858 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:00,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:00,248 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 17 GRPO groups: 45%|####5 | 9/20 [04:57<07:05, 38.69s/q, loss=-0.0021, mean_r=0.955, q_acc=100%, q_rew=0.812, skip=3]
Iter 17 GRPO groups: 50%|##### | 10/20 [04:57<05:30, 33.01s/q, loss=-0.0021, mean_r=0.955, q_acc=100%, q_rew=0.812, skip=3]2026-04-26 05:21:05,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.676 = 0.50×0.73(prox=0.73) + 0.40×proc(0.523[fin=0.61,mean=0.40]) + 0.10×fmt(1.000) | pred='36' gold='44' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:21:12,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:12,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.654 = 0.50×0.73(prox=0.73) + 0.40×proc(0.469[fin=0.57,mean=0.31]) + 0.10×fmt(1.000) | pred='36' gold='44' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:21:19,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:19,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:21:26,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.534 = 0.50×0.27(prox=0.27) + 0.40×proc(0.750[fin=0.99,mean=0.39]) + 0.10×fmt(1.000) | pred='104' gold='44' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 17 GRPO groups: 50%|##### | 10/20 [05:23<05:30, 33.01s/q, loss=-0.0012, mean_r=0.884, q_acc=100%, q_rew=0.812, skip=3]
Iter 17 GRPO groups: 55%|#####5 | 11/20 [05:23<04:36, 30.77s/q, loss=-0.0012, mean_r=0.884, q_acc=100%, q_rew=0.812, skip=3]2026-04-26 05:21:27,565 INFO src.rl.curriculum_manager - Topic probabilities (rollout 120): [('statistics', '0.251'), ('basic_arithmetic', '0.035'), ('single_step_word_problems', '0.035'), ('fractions', '0.035'), ('percentages', '0.035')]
+2026-04-26 05:21:34,078 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:34,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,479 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,675 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:35,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:21:39,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.671 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:39,446 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.667 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.62)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:39,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:39,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.988 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.689 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:21:40,767 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.973 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:21:40,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.686 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.00) | steps=2
+
Iter 17 GRPO groups: 55%|#####5 | 11/20 [05:38<04:36, 30.77s/q, loss=-0.0001, mean_r=0.947, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 60%|###### | 12/20 [05:38<03:27, 25.98s/q, loss=-0.0001, mean_r=0.947, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:21:56,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.30(prox=0.30) + 0.40×proc(0.634[fin=0.67,mean=0.58]) + 0.10×fmt(1.000) | pred='-2324' gold='12834' | step_acc=56% lccp=22% (chain=2/9 ok_count=5) n_steps=9
+2026-04-26 05:21:56,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.901[fin=0.97,mean=0.79]) + 0.10×fmt(1.000) | pred='534' gold='12834' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 05:21:56,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.68(prox=0.68) + 0.40×proc(0.238[fin=0.20,mean=0.29]) + 0.10×fmt(1.000) | pred='9830' gold='12834' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 05:22:07,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.30(prox=0.30) + 0.40×proc(0.751[fin=0.81,mean=0.66]) + 0.10×fmt(1.000) | pred='-1950' gold='12834' | step_acc=57% lccp=43% (chain=3/7 ok_count=4) n_steps=7
+2026-04-26 05:22:07,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.04(prox=0.04) + 0.40×proc(0.716[fin=0.87,mean=0.49]) + 0.10×fmt(1.000) | pred='160910' gold='12834' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 05:22:08,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.443 = 0.50×0.29(prox=0.29) + 0.40×proc(0.443[fin=0.38,mean=0.55]) + 0.10×fmt(1.000) | pred='-3000' gold='12834' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:22:08,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.643[fin=0.62,mean=0.68]) + 0.10×fmt(1.000) | pred='534' gold='12834' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:22:23,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.745[fin=0.93,mean=0.47]) + 0.10×fmt(1.000) | pred='4763' gold='12834' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:22:23,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.30(prox=0.30) + 0.40×proc(0.487[fin=0.47,mean=0.52]) + 0.10×fmt(1.000) | pred='-1962' gold='12834' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:22:23,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.599 = 0.50×0.34(prox=0.34) + 0.40×proc(0.828[fin=0.99,mean=0.58]) + 0.10×fmt(1.000) | pred='25500' gold='12834' | step_acc=50% lccp=0% (chain=0/8 ok_count=4) n_steps=8
+
Iter 17 GRPO groups: 60%|###### | 12/20 [06:20<03:27, 25.98s/q, loss=0.0001, mean_r=0.521, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 65%|######5 | 13/20 [06:20<03:36, 30.87s/q, loss=0.0001, mean_r=0.521, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:22:30,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.662 = 0.50×0.64(prox=0.64) + 0.40×proc(0.609[fin=0.59,mean=0.63]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 05:22:39,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.64(prox=0.64) + 0.40×proc(0.905[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:22:39,118 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.746 = 0.50×0.64(prox=0.64) + 0.40×proc(0.818[fin=0.91,mean=0.69]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:22:39,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:39,289 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.621 = 0.50×0.64(prox=0.64) + 0.40×proc(0.508[fin=0.50,mean=0.51]) + 0.10×fmt(1.000) | pred='240' gold='336' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:22:45,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:22:45,494 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:45,576 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:45,660 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:22:55,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='336' gold='336' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 65%|######5 | 13/20 [06:53<03:36, 30.87s/q, loss=0.0006, mean_r=0.879, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 70%|####### | 14/20 [06:53<03:08, 31.35s/q, loss=0.0006, mean_r=0.879, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:23:03,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:03,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:03,578 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:13,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:13,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.72(prox=0.72) + 0.40×proc(0.823[fin=0.97,mean=0.60]) + 0.10×fmt(1.000) | pred='446.5' gold='551' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:23:13,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:13,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,020 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:23:25,190 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='551' gold='551' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 17 GRPO groups: 70%|####### | 14/20 [07:22<03:08, 31.35s/q, loss=-0.0006, mean_r=0.978, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 75%|#######5 | 15/20 [07:22<02:33, 30.77s/q, loss=-0.0006, mean_r=0.978, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:23:32,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:23:40,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:40,402 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.913[fin=1.00,mean=0.79]) + 0.10×fmt(0.650) | pred='210' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:40,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.368 = 0.50×0.33(prox=0.33) + 0.40×proc(0.255[fin=0.16,mean=0.40]) + 0.10×fmt(1.000) | pred='210' gold='105' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:23:40,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.319 = 0.50×0.00(prox=0.00) + 0.40×proc(0.453[fin=0.49,mean=0.40]) + 0.10×fmt(1.000) | pred='6,537,021,8400' gold='105' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:23:47,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:23:47,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:23:47,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:23:47,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:23:53,994 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='105' gold='105' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 17 GRPO groups: 75%|#######5 | 15/20 [07:51<02:33, 30.77s/q, loss=0.0003, mean_r=0.811, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 80%|######## | 16/20 [07:51<02:00, 30.19s/q, loss=0.0003, mean_r=0.811, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:24:02,609 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:02,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:24:02,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:12,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.43(prox=0.43) + 0.40×proc(0.431[fin=0.46,mean=0.38]) + 0.10×fmt(1.000) | pred='1' gold='3' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:24:12,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:12,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.880[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='12' gold='3' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:12,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:20,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:20,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:20,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 80%|######## | 16/20 [08:17<02:00, 30.19s/q, loss=0.0022, mean_r=0.904, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 85%|########5 | 17/20 [08:17<01:27, 29.13s/q, loss=0.0022, mean_r=0.904, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:24:28,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.64(prox=0.64) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:24:37,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.832[fin=0.99,mean=0.59]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:24:37,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.54(prox=0.54) + 0.40×proc(0.925[fin=0.99,mean=0.83]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:37,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.745 = 0.50×0.54(prox=0.54) + 0.40×proc(0.938[fin=0.98,mean=0.87]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:37,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.54(prox=0.54) + 0.40×proc(0.950[fin=0.98,mean=0.90]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:24:44,926 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:24:45,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.619 = 0.50×0.54(prox=0.54) + 0.40×proc(0.623[fin=0.66,mean=0.57]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:24:45,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.681 = 0.50×0.54(prox=0.54) + 0.40×proc(0.780[fin=0.79,mean=0.77]) + 0.10×fmt(1.000) | pred='10' gold='7' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:24:45,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.617 = 0.50×0.64(prox=0.64) + 0.40×proc(0.498[fin=0.50,mean=0.50]) + 0.10×fmt(1.000) | pred='5' gold='7' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:24:55,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 17 GRPO groups: 85%|########5 | 17/20 [08:52<01:27, 29.13s/q, loss=-0.0002, mean_r=0.790, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 90%|######### | 18/20 [08:52<01:01, 30.77s/q, loss=-0.0002, mean_r=0.790, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:25:30,287 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:30,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:30,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:37,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.973[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:25:37,568 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:25:37,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:25:37,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:25:47,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.440 = 0.50×0.33(prox=0.33) + 0.40×proc(0.339[fin=0.34,mean=0.34]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+
Iter 17 GRPO groups: 90%|######### | 18/20 [09:44<01:01, 30.77s/q, loss=0.0001, mean_r=0.924, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 95%|#########5| 19/20 [09:44<00:37, 37.03s/q, loss=0.0001, mean_r=0.924, q_acc=100%, q_rew=0.798, skip=3]2026-04-26 05:25:53,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:25:53,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:25:53,724 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,599 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:05,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.755 = 0.50×0.56(prox=0.56) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='300' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,485 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:18,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='500' gold='500' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 17 GRPO groups: 95%|#########5| 19/20 [10:15<00:37, 37.03s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 100%|##########| 20/20 [10:15<00:00, 35.46s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
Iter 17 GRPO groups: 100%|##########| 20/20 [10:15<00:00, 30.80s/q, loss=-0.0006, mean_r=0.974, q_acc=100%, q_rew=0.798, skip=3]
+2026-04-26 05:26:20,134 INFO __main__ - Iter 17 | loss=-0.0001 | reward mean=0.888 std=0.195 | gt_match=70.1% | grounded_acc=91.6% | step_acc=85.6% | lccp=76.8% | batch_acc=93.8% | phase=SELFPLAY_RAMP sp_ratio=14% | groups=20 skipped=3(0var=3) | lr=4.68e-06 | 616.0s
+2026-04-26 05:26:20,134 INFO __main__ - Question generation: 3/3 valid (100%) | q_reward=0.798 | q_acc=100.0% (>0.5 quality) | topic=0.69 diff=0.89 clarity=1.00 novelty=0.46 solvability=1.00
+2026-04-26 05:26:20,136 INFO __main__ - ======================================================================
+2026-04-26 05:26:20,136 INFO __main__ - GRPO ITERATION 18/60
+2026-04-26 05:26:20,136 INFO __main__ - ======================================================================
+2026-04-26 05:26:20,157 INFO __main__ - LR this iteration: 4.68e-06 | T=0.685 | MATH ratio=30%
+
Iter 18 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:26:23,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,831 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,912 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:27,993 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:28,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,172 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:33,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:38,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 0%| | 0/20 [00:18, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 18 GRPO groups: 5%|5 | 1/20 [00:18<05:50, 18.43s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 05:26:43,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:26:43,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:43,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:55,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:26:55,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:26:55,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:26:55,396 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:02,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:02,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.861[fin=0.97,mean=0.69]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:27:02,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 5%|5 | 1/20 [00:42<05:50, 18.43s/q, loss=0var, mean_r=0.993, skip=2]
Iter 18 GRPO groups: 10%|# | 2/20 [00:42<06:31, 21.77s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 05:27:07,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:09,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:09,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.885[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:27:10,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.491 = 0.50×0.05(prox=0.05) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='340' gold='34' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:27:10,131 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:25,298 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:25,380 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:25,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:27:25,547 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:38,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='34' gold='34' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 18 GRPO groups: 10%|# | 2/20 [01:19<06:31, 21.77s/q, loss=-0.0006, mean_r=0.944, skip=2]
Iter 18 GRPO groups: 15%|#5 | 3/20 [01:19<08:08, 28.73s/q, loss=-0.0006, mean_r=0.944, skip=2]2026-04-26 05:27:47,637 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:47,722 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.989[fin=0.99,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:47,813 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.414 = 0.50×0.36(prox=0.36) + 0.40×proc(0.427[fin=0.52,mean=0.28]) + 0.10×fmt(0.650) | pred='10000' gold='100002' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:27:54,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:27:54,496 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.822[fin=0.93,mean=0.66]) + 0.10×fmt(1.000) | pred='10100' gold='100002' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 05:27:54,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.989[fin=0.99,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:27:54,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.996[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.973[fin=0.97,mean=0.98]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.988[fin=0.98,mean=0.99]) + 0.10×fmt(1.000) | pred='10000' gold='100002' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:28:03,866 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.465 = 0.50×0.34(prox=0.34) + 0.40×proc(0.493[fin=0.57,mean=0.37]) + 0.10×fmt(1.000) | pred='1111' gold='100002' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+
Iter 18 GRPO groups: 15%|#5 | 3/20 [01:45<08:08, 28.73s/q, loss=-0.0001, mean_r=0.528, skip=2]
Iter 18 GRPO groups: 20%|## | 4/20 [01:45<07:19, 27.48s/q, loss=-0.0001, mean_r=0.528, skip=2]2026-04-26 05:28:05,289 INFO src.rl.curriculum_manager - Topic probabilities (rollout 140): [('basic_arithmetic', '0.050'), ('fractions', '0.050'), ('percentages', '0.050'), ('ratios', '0.050'), ('money_problems', '0.050')]
+2026-04-26 05:28:10,605 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.952 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:10,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.989 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:28:11,397 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.984 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.984 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:11,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:12,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.993 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:28:12,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:18,337 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.985 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:18,540 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.751 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.76)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:28:18,737 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.835 = clip(base=0.755 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.728 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:28:18,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.993 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:28:19,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.717 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.69)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:28:19,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.628 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.53)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:28:19,530 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.938 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:28:19,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.943 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:28:19,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.637 novelty=0.70 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.51)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:28:20,117 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.957 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+
Iter 18 GRPO groups: 20%|## | 4/20 [02:01<07:19, 27.48s/q, loss=-0.0005, mean_r=0.948, q_acc=100%, q_rew=0.818, skip=3]
Iter 18 GRPO groups: 25%|##5 | 5/20 [02:01<05:52, 23.51s/q, loss=-0.0005, mean_r=0.948, q_acc=100%, q_rew=0.818, skip=3]2026-04-26 05:28:27,100 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.429 = 0.50×0.35(prox=0.35) + 0.40×proc(0.237[fin=0.11,mean=0.43]) + 0.10×fmt(1.000) | pred='468' gold='7020' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 05:28:37,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.737 = 0.50×0.60(prox=0.60) + 0.40×proc(0.844[fin=0.96,mean=0.67]) + 0.10×fmt(1.000) | pred='4680' gold='7020' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:28:37,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.765 = 0.50×0.60(prox=0.60) + 0.40×proc(0.912[fin=0.97,mean=0.82]) + 0.10×fmt(1.000) | pred='4680' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:37,675 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7020' gold='7020' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:37,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.56(prox=0.56) + 0.40×proc(0.930[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='4212' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7020' gold='7020' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:45,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.622[fin=0.61,mean=0.63]) + 0.10×fmt(1.000) | pred='14058' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.633 = 0.50×0.60(prox=0.60) + 0.40×proc(0.587[fin=0.56,mean=0.62]) + 0.10×fmt(1.000) | pred='4644' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:45,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.727[fin=0.76,mean=0.67]) + 0.10×fmt(1.000) | pred='14052' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:28:53,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.871[fin=0.95,mean=0.75]) + 0.10×fmt(1.000) | pred='14040' gold='7020' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+
Iter 18 GRPO groups: 25%|##5 | 5/20 [02:34<05:52, 23.51s/q, loss=0.0007, mean_r=0.696, q_acc=100%, q_rew=0.818, skip=3]
Iter 18 GRPO groups: 30%|### | 6/20 [02:34<06:15, 26.83s/q, loss=0.0007, mean_r=0.696, q_acc=100%, q_rew=0.818, skip=3]2026-04-26 05:28:58,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:28:58,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:28:58,364 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:29:05,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:29:05,392 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:05,474 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:05,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,681 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:29:13,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='1170' gold='1170' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 18 GRPO groups: 30%|### | 6/20 [02:53<06:15, 26.83s/q, loss=0var, mean_r=0.986, skip=4]
Iter 18 GRPO groups: 35%|###5 | 7/20 [02:53<05:14, 24.19s/q, loss=0var, mean_r=0.986, skip=4]2026-04-26 05:29:19,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.750[fin=0.85,mean=0.60]) + 0.10×fmt(1.000) | pred='490' gold='250' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:29:29,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.745[fin=0.85,mean=0.59]) + 0.10×fmt(1.000) | pred='490' gold='250' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:29:29,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:29,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:29,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:29:38,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×0.68(prox=0.68) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='310' gold='250' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:29:38,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:38,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:29:38,997 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.663 = 0.50×0.68(prox=0.68) + 0.40×proc(0.564[fin=0.58,mean=0.54]) + 0.10×fmt(1.000) | pred='310' gold='250' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 05:29:49,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 18 GRPO groups: 35%|###5 | 7/20 [03:31<05:14, 24.19s/q, loss=-0.0006, mean_r=0.855, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 40%|#### | 8/20 [03:31<05:41, 28.43s/q, loss=-0.0006, mean_r=0.855, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:30:25,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:30:25,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:30:25,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:30:37,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 05:30:37,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×0.80(prox=0.80) + 0.40×proc(0.771[fin=0.97,mean=0.48]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:30:37,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.939 = 0.50×1.00(exact) + 0.40×proc(0.847[fin=0.99,mean=0.63]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:30:37,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.848 = 0.50×0.80(prox=0.80) + 0.40×proc(0.869[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:30:42,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.599 = 0.50×0.67(prox=0.67) + 0.40×proc(0.414[fin=0.54,mean=0.23]) + 0.10×fmt(1.000) | pred='6' gold='8' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:30:42,651 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.80(prox=0.80) + 0.40×proc(0.914[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='7' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+
Iter 18 GRPO groups: 40%|#### | 8/20 [04:23<05:41, 28.43s/q, loss=0.0006, mean_r=0.892, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 45%|####5 | 9/20 [04:23<06:36, 36.05s/q, loss=0.0006, mean_r=0.892, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:30:50,410 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.677 = 0.50×0.85(prox=0.85) + 0.40×proc(0.380[fin=0.33,mean=0.46]) + 0.10×fmt(1.000) | pred='1.83' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:30:50,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,245 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.849[fin=0.91,mean=0.75]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.80(prox=0.80) + 0.40×proc(0.728[fin=0.80,mean=0.61]) + 0.10×fmt(1.000) | pred='2.25' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:31:00,421 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:31:00,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.917 = 0.50×1.00(exact) + 0.40×proc(0.793[fin=0.93,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:31:09,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.815 = 0.50×0.85(prox=0.85) + 0.40×proc(0.725[fin=0.82,mean=0.59]) + 0.10×fmt(1.000) | pred='1.83' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 05:31:09,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=0.99,mean=0.90]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:10,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.813 = 0.50×0.80(prox=0.80) + 0.40×proc(0.783[fin=0.90,mean=0.60]) + 0.10×fmt(1.000) | pred='2.25' gold='2' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:31:10,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 18 GRPO groups: 45%|####5 | 9/20 [04:59<06:36, 36.05s/q, loss=0.0001, mean_r=0.891, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 50%|##### | 10/20 [04:59<05:59, 35.97s/q, loss=0.0001, mean_r=0.891, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:31:24,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.268 = 0.50×0.00(prox=0.00) + 0.40×proc(0.420[fin=0.50,mean=0.30]) + 0.10×fmt(1.000) | pred='2/3' gold='8' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:31:24,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.417 = 0.50×0.29(prox=0.29) + 0.40×proc(0.374[fin=0.29,mean=0.49]) + 0.10×fmt(1.000) | pred='18' gold='8' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:31:24,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.104 = 0.50×0.00(prox=0.00) + 0.40×proc(0.097[fin=0.10,mean=0.10]) + 0.10×fmt(0.650) | pred='2/3' gold='8' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 05:31:24,556 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.629 = 0.50×0.40(prox=0.40) + 0.40×proc(0.822[fin=0.98,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:29,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.590 = 0.50×0.40(prox=0.40) + 0.40×proc(0.725[fin=0.91,mean=0.44]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:29,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:29,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.514 = 0.50×0.29(prox=0.29) + 0.40×proc(0.678[fin=0.80,mean=0.50]) + 0.10×fmt(1.000) | pred='18' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:31:30,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:35,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:35,399 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.474 = 0.50×0.50(prox=0.50) + 0.40×proc(0.310[fin=0.21,mean=0.47]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 18 GRPO groups: 50%|##### | 10/20 [05:16<05:59, 35.97s/q, loss=0.0038, mean_r=0.595, q_acc=100%, q_rew=0.818, skip=4]
Iter 18 GRPO groups: 55%|#####5 | 11/20 [05:16<04:31, 30.17s/q, loss=0.0038, mean_r=0.595, q_acc=100%, q_rew=0.818, skip=4]2026-04-26 05:31:36,894 INFO src.rl.curriculum_manager - Topic probabilities (rollout 160): [('basic_arithmetic', '0.052'), ('fractions', '0.052'), ('percentages', '0.052'), ('money_problems', '0.052'), ('time_distance', '0.052')]
+2026-04-26 05:31:44,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,656 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:44,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.988 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,246 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,442 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:45,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:31:45,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:51,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.988 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,389 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:51,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:51,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.691 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:31:51,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.714 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:31:52,161 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:52,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:31:52,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.972 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:31:52,737 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.943 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+
Iter 18 GRPO groups: 55%|#####5 | 11/20 [05:34<04:31, 30.17s/q, loss=0.0005, mean_r=0.939, q_acc=100%, q_rew=0.762, skip=4]
Iter 18 GRPO groups: 60%|###### | 12/20 [05:34<03:30, 26.32s/q, loss=0.0005, mean_r=0.939, q_acc=100%, q_rew=0.762, skip=4]2026-04-26 05:31:57,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:31:57,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,773 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:02,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:32:07,654 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:32:07,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 18 GRPO groups: 60%|###### | 12/20 [05:52<03:30, 26.32s/q, loss=0var, mean_r=0.994, skip=5]
Iter 18 GRPO groups: 65%|######5 | 13/20 [05:52<02:46, 23.74s/q, loss=0var, mean_r=0.994, skip=5]2026-04-26 05:32:21,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×0.85(prox=0.85) + 0.40×proc(0.908[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='2.125' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:21,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.809 = 0.50×0.85(prox=0.85) + 0.40×proc(0.709[fin=0.69,mean=0.74]) + 0.10×fmt(1.000) | pred='2.06' gold='2' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 05:32:21,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.899 = 0.50×0.85(prox=0.85) + 0.40×proc(0.935[fin=0.97,mean=0.89]) + 0.10×fmt(1.000) | pred='1.9375' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:32:22,001 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.884[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:32:35,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.777 = 0.50×0.67(prox=0.67) + 0.40×proc(0.858[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:32:35,214 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×0.85(prox=0.85) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='1.94' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:32:35,307 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:35,402 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.722 = 0.50×0.62(prox=0.62) + 0.40×proc(0.786[fin=0.96,mean=0.52]) + 0.10×fmt(1.000) | pred='1.375' gold='2' | step_acc=57% lccp=29% (chain=2/7 ok_count=4) n_steps=7
+2026-04-26 05:32:50,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.847 = 0.50×0.85(prox=0.85) + 0.40×proc(0.805[fin=0.94,mean=0.61]) + 0.10×fmt(1.000) | pred='1.9375' gold='2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:32:50,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.750 = 0.50×0.67(prox=0.67) + 0.40×proc(0.791[fin=1.00,mean=0.48]) + 0.10×fmt(1.000) | pred='2.5' gold='2' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+
Iter 18 GRPO groups: 65%|######5 | 13/20 [06:31<02:46, 23.74s/q, loss=-0.0010, mean_r=0.851, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 70%|####### | 14/20 [06:31<02:51, 28.51s/q, loss=-0.0010, mean_r=0.851, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:32:59,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:32:59,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:08,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:33:08,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.50(prox=0.50) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:33:08,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.914 = 0.50×1.00(exact) + 0.40×proc(0.784[fin=0.88,mean=0.64]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:33:09,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:33:15,980 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.679 = 0.50×0.50(prox=0.50) + 0.40×proc(0.822[fin=0.83,mean=0.81]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:33:16,064 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.638 = 0.50×0.50(prox=0.50) + 0.40×proc(0.720[fin=0.74,mean=0.69]) + 0.10×fmt(1.000) | pred='12' gold='8' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:33:16,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.357 = 0.50×0.11(prox=0.11) + 0.40×proc(0.506[fin=0.64,mean=0.30]) + 0.10×fmt(1.000) | pred='40.5' gold='8' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:33:16,233 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.80(prox=0.80) + 0.40×proc(0.737[fin=0.90,mean=0.50]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+
Iter 18 GRPO groups: 70%|####### | 14/20 [07:10<02:51, 28.51s/q, loss=0.0004, mean_r=0.810, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 75%|#######5 | 15/20 [07:10<02:37, 31.55s/q, loss=0.0004, mean_r=0.810, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:33:36,545 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:36,629 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:36,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.783 = 0.50×0.60(prox=0.60) + 0.40×proc(0.956[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='26' gold='39' | step_acc=86% lccp=71% (chain=5/7 ok_count=6) n_steps=7
+2026-04-26 05:33:36,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.779 = 0.50×0.60(prox=0.60) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='26' gold='39' | step_acc=88% lccp=62% (chain=5/8 ok_count=7) n_steps=8
+2026-04-26 05:33:51,067 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:51,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:33:51,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:33:51,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.76(prox=0.76) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='45' gold='39' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:33:58,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:33:58,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='39' gold='39' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 18 GRPO groups: 75%|#######5 | 15/20 [07:39<02:37, 31.55s/q, loss=0.0012, mean_r=0.942, q_acc=100%, q_rew=0.762, skip=5]
Iter 18 GRPO groups: 80%|######## | 16/20 [07:39<02:03, 30.99s/q, loss=0.0012, mean_r=0.942, q_acc=100%, q_rew=0.762, skip=5]2026-04-26 05:34:00,029 INFO src.rl.curriculum_manager - Topic probabilities (rollout 180): [('basic_arithmetic', '0.054'), ('percentages', '0.054'), ('money_problems', '0.054'), ('time_distance', '0.054'), ('multi_step_reasoning', '0.054')]
+2026-04-26 05:34:06,357 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.955 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.978 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:06,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.982 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:06,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.180 novelty=0.73 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.12)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:06,917 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.956 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:07,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.979 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:07,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.502 = clip(base=0.422 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.253 novelty=0.73 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:07,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:34:07,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.981 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:07,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:34:08,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.436 = clip(base=0.356 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.140 novelty=0.73 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.09)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.944 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:13,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.834 = clip(base=0.754 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.709 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.972 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:13,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.705 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:13,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.828 = clip(base=0.748 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.700 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.711 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,360 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.675 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.676 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.672 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:14,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.720 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=3
+
Iter 18 GRPO groups: 80%|######## | 16/20 [07:56<02:03, 30.99s/q, loss=0.0005, mean_r=0.850, q_acc=100%, q_rew=0.782, skip=5]
Iter 18 GRPO groups: 85%|########5 | 17/20 [07:56<01:19, 26.66s/q, loss=0.0005, mean_r=0.850, q_acc=100%, q_rew=0.782, skip=5]2026-04-26 05:34:16,633 INFO src.rl.curriculum_manager - Topic probabilities (rollout 200): [('basic_arithmetic', '0.056'), ('money_problems', '0.056'), ('time_distance', '0.056'), ('multi_step_reasoning', '0.056'), ('mixed_operations', '0.056')]
+2026-04-26 05:34:20,864 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.674 novelty=0.65 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:34:21,046 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:21,237 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.800 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:34:21,425 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.891 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:21,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:21,803 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.623 = clip(base=0.543 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.532 novelty=0.65 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:34:21,990 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.718 = clip(base=0.638 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.660 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.61)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:34:22,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.934 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:22,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.934 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:22,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.713 = clip(base=0.633 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.650 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:34:26,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.968 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:26,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.941 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:26,918 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.976 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.960 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:27,299 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.980 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.895 = clip(base=0.815 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.950 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:27,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:27,872 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.980 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:34:28,060 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.987 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:34:28,250 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.977 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+
Iter 18 GRPO groups: 85%|########5 | 17/20 [08:09<01:19, 26.66s/q, loss=-0.0021, mean_r=0.858, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 90%|######### | 18/20 [08:09<00:45, 22.65s/q, loss=-0.0021, mean_r=0.858, q_acc=100%, q_rew=0.739, skip=5]2026-04-26 05:34:35,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:35,503 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:44,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:34:44,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:44,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:34:44,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:53,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:34:53,265 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:34:53,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.592 = 0.50×0.70(prox=0.70) + 0.40×proc(0.356[fin=0.49,mean=0.15]) + 0.10×fmt(1.000) | pred='255' gold='210' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:34:53,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='210' gold='210' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 18 GRPO groups: 90%|######### | 18/20 [08:41<00:45, 22.65s/q, loss=0.0004, mean_r=0.953, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 95%|#########5| 19/20 [08:41<00:25, 25.29s/q, loss=0.0004, mean_r=0.953, q_acc=100%, q_rew=0.739, skip=5]2026-04-26 05:35:07,842 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.519 = 0.50×0.46(prox=0.46) + 0.40×proc(0.281[fin=0.12,mean=0.53]) + 0.10×fmt(1.000) | pred='79' gold='50' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:35:07,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.583 = 0.50×0.64(prox=0.64) + 0.40×proc(0.407[fin=0.36,mean=0.48]) + 0.10×fmt(1.000) | pred='36' gold='50' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:35:08,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.566 = 0.50×0.71(prox=0.71) + 0.40×proc(0.271[fin=0.05,mean=0.60]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:35:08,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.751 = 0.50×0.67(prox=0.67) + 0.40×proc(0.794[fin=0.89,mean=0.65]) + 0.10×fmt(1.000) | pred='62.5' gold='50' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 05:35:19,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.903 = 0.50×0.85(prox=0.85) + 0.40×proc(0.946[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:35:19,931 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.595 = 0.50×0.53(prox=0.53) + 0.40×proc(0.572[fin=0.54,mean=0.63]) + 0.10×fmt(1.000) | pred='72' gold='50' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:35:20,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×0.83(prox=0.83) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='55' gold='50' | step_acc=86% lccp=43% (chain=3/7 ok_count=6) n_steps=7
+2026-04-26 05:35:20,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.568 = 0.50×0.71(prox=0.71) + 0.40×proc(0.278[fin=0.06,mean=0.61]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:35:29,135 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.831 = 0.50×0.85(prox=0.85) + 0.40×proc(0.764[fin=0.95,mean=0.49]) + 0.10×fmt(1.000) | pred='50.5' gold='50' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 05:35:29,222 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.560 = 0.50×0.71(prox=0.71) + 0.40×proc(0.258[fin=0.08,mean=0.52]) + 0.10×fmt(1.000) | pred='60' gold='50' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 18 GRPO groups: 95%|#########5| 19/20 [09:10<00:25, 25.29s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 26.50s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
Iter 18 GRPO groups: 100%|##########| 20/20 [09:10<00:00, 27.53s/q, loss=-0.0003, mean_r=0.677, q_acc=100%, q_rew=0.739, skip=5]
+2026-04-26 05:35:30,710 INFO __main__ - Iter 18 | loss=0.0001 | reward mean=0.866 std=0.178 | gt_match=59.1% | grounded_acc=94.3% | step_acc=83.1% | lccp=69.2% | batch_acc=95.4% | phase=SELFPLAY_RAMP sp_ratio=18% | groups=19 skipped=5(0var=5) | lr=4.60e-06 | 550.6s
+2026-04-26 05:35:30,710 INFO __main__ - Question generation: 4/4 valid (100%) | q_reward=0.739 | q_acc=100.0% (>0.5 quality) | topic=0.64 diff=0.63 clarity=1.00 novelty=0.45 solvability=0.98
+2026-04-26 05:35:30,711 INFO __main__ - ======================================================================
+2026-04-26 05:35:30,712 INFO __main__ - GRPO ITERATION 19/60
+2026-04-26 05:35:30,712 INFO __main__ - ======================================================================
+2026-04-26 05:35:30,733 INFO __main__ - LR this iteration: 4.60e-06 | T=0.678 | MATH ratio=32%
+
Iter 19 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:35:34,965 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:35,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,226 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,391 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:43,473 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,192 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:35:50,275 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:35:50,358 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 0%| | 0/20 [00:27, ?q/s, loss=0var, mean_r=0.998, skip=1]
Iter 19 GRPO groups: 5%|5 | 1/20 [00:27<08:43, 27.53s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 05:36:03,766 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:03,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.825[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:03,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:04,017 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:36:13,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:13,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:36:13,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.735 = 0.50×0.71(prox=0.71) + 0.40×proc(0.694[fin=0.84,mean=0.48]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:13,470 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×0.71(prox=0.71) + 0.40×proc(0.860[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:36:23,801 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.725 = 0.50×0.71(prox=0.71) + 0.40×proc(0.671[fin=0.81,mean=0.46]) + 0.10×fmt(1.000) | pred='60' gold='75' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:36:23,885 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='75' gold='75' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 19 GRPO groups: 5%|5 | 1/20 [00:54<08:43, 27.53s/q, loss=0.0011, mean_r=0.919, skip=1]
Iter 19 GRPO groups: 10%|# | 2/20 [00:54<08:11, 27.29s/q, loss=0.0011, mean_r=0.919, skip=1]2026-04-26 05:36:25,389 INFO src.rl.curriculum_manager - Topic probabilities (rollout 220): [('statistics', '0.233'), ('money_problems', '0.044'), ('time_distance', '0.044'), ('multi_step_reasoning', '0.044'), ('mixed_operations', '0.044')]
+2026-04-26 05:36:33,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.987 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.965 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,635 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:33,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:36:34,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.986 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:34,245 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.963 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:34,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:34,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.787 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:36:34,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.980 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:35,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:36:39,976 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,398 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.519 = clip(base=0.439 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.341 novelty=0.72 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.40)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:36:40,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:40,810 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,430 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,638 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:36:41,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 19 GRPO groups: 10%|# | 2/20 [01:12<08:11, 27.29s/q, loss=-0.0009, mean_r=0.931, q_acc=100%, q_rew=0.706, skip=1]
Iter 19 GRPO groups: 15%|#5 | 3/20 [01:12<06:32, 23.10s/q, loss=-0.0009, mean_r=0.931, q_acc=100%, q_rew=0.706, skip=1]2026-04-26 05:36:46,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:46,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:51,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:51,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:51,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:51,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:55,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.314 = 0.50×0.00(prox=0.00) + 0.40×proc(0.623[fin=0.77,mean=0.40]) + 0.10×fmt(0.650) | pred='$\\sqrt{\\pi}$' gold='2' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:36:55,715 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:36:55,797 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:36:55,880 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 15%|#5 | 3/20 [01:28<06:32, 23.10s/q, loss=-0.0018, mean_r=0.914, q_acc=100%, q_rew=0.706, skip=1]
Iter 19 GRPO groups: 20%|## | 4/20 [01:28<05:23, 20.22s/q, loss=-0.0018, mean_r=0.914, q_acc=100%, q_rew=0.706, skip=1]2026-04-26 05:37:02,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:03,052 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:03,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:03,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:07,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:08,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:08,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:08,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:14,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:14,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='80' gold='80' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 20%|## | 4/20 [01:43<05:23, 20.22s/q, loss=0var, mean_r=0.999, skip=2]
Iter 19 GRPO groups: 25%|##5 | 5/20 [01:43<04:35, 18.38s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 05:37:14,419 INFO src.rl.curriculum_manager - Topic probabilities (rollout 240): [('money_problems', '0.063'), ('time_distance', '0.063'), ('multi_step_reasoning', '0.063'), ('mixed_operations', '0.063'), ('comparison_problems', '0.063')]
+2026-04-26 05:37:24,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.592 novelty=0.73 | sol=0.45*prm_final(0.44)+0.35*prm_mean(0.75)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:24,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.740 = clip(base=0.660 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.640 novelty=0.73 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:24,533 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.855 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:24,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.753 novelty=0.73 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.81)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:24,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.836 = clip(base=0.756 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.795 novelty=0.73 | sol=0.45*prm_final(0.67)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:25,191 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.669 = clip(base=0.589 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.524 novelty=0.73 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.72)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,417 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.747 = clip(base=0.667 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.654 novelty=0.73 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.584 novelty=0.73 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.75)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:25,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.848 = clip(base=0.768 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.818 novelty=0.73 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:37:26,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.698 = clip(base=0.618 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.563 novelty=0.73 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:37:32,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.704 = clip(base=0.624 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.540 novelty=0.75 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.55)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:32,991 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.643 = clip(base=0.563 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.478 novelty=0.75 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.51)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.205 novelty=0.75 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.33)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.696 novelty=0.75 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:33,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.953 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:37:33,811 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.742 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:34,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.968 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:37:34,214 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.715 novelty=0.75 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:37:34,414 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.661 = clip(base=0.581 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.512 novelty=0.75 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.61)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:37:34,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.733 novelty=0.75 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=3
+
Iter 19 GRPO groups: 25%|##5 | 5/20 [02:05<04:35, 18.38s/q, loss=0.0009, mean_r=0.759, q_acc=100%, q_rew=0.702, skip=2]
Iter 19 GRPO groups: 30%|### | 6/20 [02:05<04:33, 19.57s/q, loss=0.0009, mean_r=0.759, q_acc=100%, q_rew=0.702, skip=2]2026-04-26 05:37:39,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:37:39,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.435 = 0.50×0.50(prox=0.50) + 0.40×proc(0.213[fin=0.22,mean=0.21]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=0% lccp=0% (chain=0/4 ok_count=0) n_steps=4
+2026-04-26 05:37:43,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:43,661 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:43,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.734[fin=0.95,mean=0.41]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:43,812 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:37:50,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.50(prox=0.50) + 0.40×proc(0.269[fin=0.29,mean=0.23]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:37:50,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.98,mean=0.45]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:50,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.906 = 0.50×1.00(exact) + 0.40×proc(0.765[fin=0.96,mean=0.48]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:37:50,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 30%|### | 6/20 [02:28<04:33, 19.57s/q, loss=-0.0019, mean_r=0.860, q_acc=100%, q_rew=0.702, skip=2]
Iter 19 GRPO groups: 35%|###5 | 7/20 [02:28<04:28, 20.63s/q, loss=-0.0019, mean_r=0.860, q_acc=100%, q_rew=0.702, skip=2]2026-04-26 05:38:03,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:03,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:03,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:03,385 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:08,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:38:08,373 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:08,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:08,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:15,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:38:15,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 19 GRPO groups: 35%|###5 | 7/20 [02:44<04:28, 20.63s/q, loss=0var, mean_r=0.989, skip=3]
Iter 19 GRPO groups: 40%|#### | 8/20 [02:44<03:52, 19.34s/q, loss=0var, mean_r=0.989, skip=3]2026-04-26 05:38:20,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.845 = 0.50×0.85(prox=0.85) + 0.40×proc(0.799[fin=0.96,mean=0.55]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:38:20,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×0.85(prox=0.85) + 0.40×proc(0.558[fin=0.65,mean=0.42]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:27,953 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.888[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 05:38:28,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.803 = 0.50×0.85(prox=0.85) + 0.40×proc(0.694[fin=0.82,mean=0.50]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:28,120 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:38:28,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.85(prox=0.85) + 0.40×proc(0.841[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:38:38,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.85(prox=0.85) + 0.40×proc(0.760[fin=0.91,mean=0.53]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:38:38,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×0.85(prox=0.85) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='5194' gold='5217' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:38:38,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.629 = 0.50×0.85(prox=0.85) + 0.40×proc(0.261[fin=0.27,mean=0.24]) + 0.10×fmt(1.000) | pred='5192' gold='5217' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:38:38,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5217' gold='5217' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 40%|#### | 8/20 [03:14<03:52, 19.34s/q, loss=-0.0002, mean_r=0.856, q_acc=100%, q_rew=0.702, skip=3]
Iter 19 GRPO groups: 45%|####5 | 9/20 [03:14<04:07, 22.51s/q, loss=-0.0002, mean_r=0.856, q_acc=100%, q_rew=0.702, skip=3]2026-04-26 05:38:51,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:38:51,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:38:51,531 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:38:51,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:05,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.473 = 0.50×0.50(prox=0.50) + 0.40×proc(0.305[fin=0.16,mean=0.52]) + 0.10×fmt(1.000) | pred='98' gold='195' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:39:05,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:05,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.955 = 0.50×1.00(exact) + 0.40×proc(0.887[fin=0.93,mean=0.82]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 05:39:06,028 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.80(prox=0.80) + 0.40×proc(0.806[fin=0.99,mean=0.53]) + 0.10×fmt(1.000) | pred='171' gold='195' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 05:39:16,659 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:16,745 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 45%|####5 | 9/20 [03:47<04:07, 22.51s/q, loss=0.0012, mean_r=0.923, q_acc=100%, q_rew=0.702, skip=3]
Iter 19 GRPO groups: 50%|##### | 10/20 [03:47<04:17, 25.76s/q, loss=0.0012, mean_r=0.923, q_acc=100%, q_rew=0.702, skip=3]2026-04-26 05:39:21,830 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:21,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:31,488 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:39:31,570 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,123 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,205 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:39:41,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 19 GRPO groups: 50%|##### | 10/20 [04:15<04:17, 25.76s/q, loss=0var, mean_r=0.997, skip=4]
Iter 19 GRPO groups: 55%|#####5 | 11/20 [04:15<03:56, 26.31s/q, loss=0var, mean_r=0.997, skip=4]2026-04-26 05:39:51,424 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:51,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:39:51,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.511 = 0.50×0.45(prox=0.45) + 0.40×proc(0.384[fin=0.38,mean=0.39]) + 0.10×fmt(1.000) | pred='240' gold='600' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+2026-04-26 05:39:51,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:40:05,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:40:05,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 05:40:05,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.543 = 0.50×0.37(prox=0.37) + 0.40×proc(0.645[fin=0.84,mean=0.36]) + 0.10×fmt(1.000) | pred='90' gold='600' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:40:05,838 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.589 = 0.50×0.37(prox=0.37) + 0.40×proc(0.766[fin=0.97,mean=0.46]) + 0.10×fmt(1.000) | pred='80' gold='600' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:40:13,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.819 = 0.50×0.75(prox=0.75) + 0.40×proc(0.860[fin=0.96,mean=0.71]) + 0.10×fmt(1.000) | pred='700' gold='600' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 05:40:13,510 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='600' gold='600' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 19 GRPO groups: 55%|#####5 | 11/20 [04:44<03:56, 26.31s/q, loss=0.0010, mean_r=0.839, q_acc=100%, q_rew=0.702, skip=4]
Iter 19 GRPO groups: 60%|###### | 12/20 [04:44<03:37, 27.19s/q, loss=0.0010, mean_r=0.839, q_acc=100%, q_rew=0.702, skip=4]2026-04-26 05:40:14,961 INFO src.rl.curriculum_manager - Topic probabilities (rollout 260): [('money_problems', '0.067'), ('time_distance', '0.067'), ('multi_step_reasoning', '0.067'), ('mixed_operations', '0.067'), ('comparison_problems', '0.067')]
+2026-04-26 05:40:20,811 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.946 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,583 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:21,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,167 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,359 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:22,555 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:29,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.960 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:29,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.512 = clip(base=0.432 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.300 novelty=0.68 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.55)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:40:30,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,374 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.670 = clip(base=0.590 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.603 novelty=0.68 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.48)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:40:30,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:30,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.399 = clip(base=0.319 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.188 novelty=0.68 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.13)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:40:31,176 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:31,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:31,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+
Iter 19 GRPO groups: 60%|###### | 12/20 [05:02<03:37, 27.19s/q, loss=0.0000, mean_r=0.914, q_acc=100%, q_rew=0.716, skip=5]
Iter 19 GRPO groups: 65%|######5 | 13/20 [05:02<02:51, 24.47s/q, loss=0.0000, mean_r=0.914, q_acc=100%, q_rew=0.716, skip=5]2026-04-26 05:40:33,195 INFO src.rl.curriculum_manager - Topic probabilities (rollout 280): [('money_problems', '0.072'), ('time_distance', '0.072'), ('multi_step_reasoning', '0.072'), ('comparison_problems', '0.072'), ('optimization_problems', '0.072')]
+2026-04-26 05:40:42,725 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.976 + mod=+0.080, cap=1.00) | Q=0.94 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:42,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.973 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:43,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:43,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.985 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:43,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:43,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.849 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:40:43,909 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.986 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:44,105 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.990 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:44,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.979 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:44,491 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.813 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.79)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:40:50,249 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.970 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:40:50,446 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.983 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:50,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.992 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:40:50,844 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.911 novelty=0.71 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:40:51,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.478 novelty=0.71 | sol=0.45*prm_final(0.63)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:40:51,236 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.406 = clip(base=0.326 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.159 novelty=0.71 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:40:51,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.617 = clip(base=0.537 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.414 novelty=0.71 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.69)+0.20*lccp(0.80) | steps=5
+2026-04-26 05:40:51,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.990 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:51,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.829 = clip(base=0.749 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.781 novelty=0.71 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:40:52,021 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 19 GRPO groups: 65%|######5 | 13/20 [05:22<02:51, 24.47s/q, loss=0.0008, mean_r=0.901, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 70%|####### | 14/20 [05:22<02:19, 23.26s/q, loss=0.0008, mean_r=0.901, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:41:27,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.842[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='521' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:27,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.350 = 0.50×0.34(prox=0.34) + 0.40×proc(0.136[fin=0.10,mean=0.20]) + 0.10×fmt(1.000) | pred='156' gold='5067' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:41:39,374 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.470 = 0.50×0.35(prox=0.35) + 0.40×proc(0.306[fin=0.17,mean=0.51]) + 0.10×fmt(1.000) | pred='260' gold='5067' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:41:39,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.251 = 0.50×0.10(prox=0.10) + 0.40×proc(0.156[fin=0.08,mean=0.27]) + 0.10×fmt(1.000) | pred='-17240' gold='5067' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 05:41:39,551 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='506' gold='5067' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:41:39,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.868[fin=0.95,mean=0.74]) + 0.10×fmt(1.000) | pred='501' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:49,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.797[fin=0.90,mean=0.64]) + 0.10×fmt(1.000) | pred='410' gold='5067' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 05:41:49,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.49(prox=0.49) + 0.40×proc(0.305[fin=0.14,mean=0.56]) + 0.10×fmt(1.000) | pred='2407' gold='5067' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 05:41:50,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.39(prox=0.39) + 0.40×proc(0.655[fin=0.59,mean=0.75]) + 0.10×fmt(1.000) | pred='1106' gold='5067' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+
Iter 19 GRPO groups: 70%|####### | 14/20 [06:20<02:19, 23.26s/q, loss=0.0007, mean_r=0.486, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 75%|#######5 | 15/20 [06:20<02:48, 33.64s/q, loss=0.0007, mean_r=0.486, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:42:24,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.37(prox=0.37) + 0.40×proc(0.690[fin=0.75,mean=0.61]) + 0.10×fmt(1.000) | pred='11' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:42:33,589 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.589 = 0.50×0.60(prox=0.60) + 0.40×proc(0.472[fin=0.39,mean=0.60]) + 0.10×fmt(1.000) | pred='8' gold='6' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:42:33,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.700 = 0.50×0.75(prox=0.75) + 0.40×proc(0.563[fin=0.53,mean=0.62]) + 0.10×fmt(1.000) | pred='7' gold='6' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 05:42:33,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=0.95,mean=0.88]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:42:33,857 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.894 = 0.50×1.00(exact) + 0.40×proc(0.736[fin=0.64,mean=0.88]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=91% lccp=82% (chain=9/11 ok_count=10) n_steps=11
+2026-04-26 05:42:56,112 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.810[fin=0.78,mean=0.85]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 05:42:56,196 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.724 = 0.50×0.75(prox=0.75) + 0.40×proc(0.710[fin=0.87,mean=0.47]) + 0.10×fmt(0.650) | pred='5' gold='6' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 05:42:56,282 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.965[fin=0.98,mean=0.94]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=89% lccp=78% (chain=7/9 ok_count=8) n_steps=9
+
Iter 19 GRPO groups: 75%|#######5 | 15/20 [07:26<02:48, 33.64s/q, loss=0.0024, mean_r=0.792, q_acc=100%, q_rew=0.733, skip=5]
Iter 19 GRPO groups: 80%|######## | 16/20 [07:26<02:53, 43.44s/q, loss=0.0024, mean_r=0.792, q_acc=100%, q_rew=0.733, skip=5]2026-04-26 05:42:59,768 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,803 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:04,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:09,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:09,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:10,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:10,142 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:14,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='52' gold='52' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 80%|######## | 16/20 [07:43<02:53, 43.44s/q, loss=0var, mean_r=0.999, skip=6]
Iter 19 GRPO groups: 85%|########5 | 17/20 [07:43<01:46, 35.47s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 05:43:17,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:17,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:43:17,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,573 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:43:22,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:22,826 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:28,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:28,990 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:29,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='828' gold='828' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 85%|########5 | 17/20 [07:58<01:46, 35.47s/q, loss=0var, mean_r=0.987, skip=7]
Iter 19 GRPO groups: 90%|######### | 18/20 [07:58<00:58, 29.20s/q, loss=0var, mean_r=0.987, skip=7]2026-04-26 05:43:30,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,309 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:36,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:41,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:43:47,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 19 GRPO groups: 90%|######### | 18/20 [08:16<00:58, 29.20s/q, loss=0var, mean_r=0.998, skip=8]
Iter 19 GRPO groups: 95%|#########5| 19/20 [08:16<00:25, 25.91s/q, loss=0var, mean_r=0.998, skip=8]2026-04-26 05:43:55,590 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.587 = 0.50×0.36(prox=0.36) + 0.40×proc(0.764[fin=1.00,mean=0.42]) + 0.10×fmt(1.000) | pred='7.5' gold='60' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 05:43:55,674 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:43:55,759 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.46(prox=0.46) + 0.40×proc(0.512[fin=0.67,mean=0.27]) + 0.10×fmt(1.000) | pred='25' gold='60' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 05:44:01,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 05:44:01,632 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.288 = 0.50×0.00(prox=0.00) + 0.40×proc(0.471[fin=0.42,mean=0.55]) + 0.10×fmt(1.000) | pred='233.33%' gold='60' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 05:44:01,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.664 = 0.50×0.53(prox=0.53) + 0.40×proc(0.749[fin=0.95,mean=0.45]) + 0.10×fmt(1.000) | pred='33.33' gold='60' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 05:44:01,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:44:11,452 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.659 = 0.50×0.47(prox=0.47) + 0.40×proc(0.805[fin=0.97,mean=0.55]) + 0.10×fmt(1.000) | pred='26.67' gold='60' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 05:44:11,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:44:11,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='60' gold='60' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 19 GRPO groups: 95%|#########5| 19/20 [08:42<00:25, 25.91s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
Iter 19 GRPO groups: 100%|##########| 20/20 [08:42<00:00, 25.88s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
Iter 19 GRPO groups: 100%|##########| 20/20 [08:42<00:00, 26.12s/q, loss=-0.0018, mean_r=0.765, q_acc=100%, q_rew=0.733, skip=8]
+2026-04-26 05:44:13,142 INFO __main__ - Iter 19 | loss=0.0002 | reward mean=0.891 std=0.173 | gt_match=76.4% | grounded_acc=94.9% | step_acc=85.1% | lccp=75.7% | batch_acc=95.4% | phase=SELFPLAY_RAMP sp_ratio=21% | groups=16 skipped=8(0var=8) | lr=4.52e-06 | 522.4s
+2026-04-26 05:44:13,143 WARNING __main__ - STARVATION: 33% of groups skipped (zero variance). grounded_acc=94.9% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 05:44:13,143 INFO __main__ - Question generation: 4/4 valid (100%) | q_reward=0.733 | q_acc=100.0% (>0.5 quality) | topic=0.48 diff=0.85 clarity=1.00 novelty=0.46 solvability=0.97
+2026-04-26 05:44:13,144 INFO __main__ - ======================================================================
+2026-04-26 05:44:13,144 INFO __main__ - GRPO ITERATION 20/60
+2026-04-26 05:44:13,144 INFO __main__ - ======================================================================
+2026-04-26 05:44:13,165 INFO __main__ - LR this iteration: 4.52e-06 | T=0.671 | MATH ratio=34%
+
Iter 20 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 05:44:25,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.496 = 0.50×0.00(prox=0.00) + 0.40×proc(0.830[fin=0.89,mean=0.74]) + 0.10×fmt(1.000) | pred='$95,000' gold='250000' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:44:41,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=71% lccp=43% (chain=3/7 ok_count=5) n_steps=7
+2026-04-26 05:44:41,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.863 = 0.50×1.00(exact) + 0.40×proc(0.657[fin=0.57,mean=0.79]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:44:41,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.962[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='25000' gold='250000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:44:41,463 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.901[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='5000' gold='250000' | step_acc=83% lccp=33% (chain=2/6 ok_count=5) n_steps=6
+2026-04-26 05:45:04,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.455 = 0.50×0.33(prox=0.33) + 0.40×proc(0.289[fin=0.11,mean=0.55]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:05,006 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250000' gold='250000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:45:05,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.33(prox=0.33) + 0.40×proc(0.255[fin=0.07,mean=0.53]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:05,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.451 = 0.50×0.33(prox=0.33) + 0.40×proc(0.278[fin=0.12,mean=0.52]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 05:45:20,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.445 = 0.50×0.33(prox=0.33) + 0.40×proc(0.264[fin=0.09,mean=0.53]) + 0.10×fmt(1.000) | pred='-5000' gold='250000' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+
Iter 20 GRPO groups: 0%| | 0/20 [01:09, ?q/s, loss=0.0002, mean_r=0.621, skip=0]
Iter 20 GRPO groups: 5%|5 | 1/20 [01:09<21:57, 69.33s/q, loss=0.0002, mean_r=0.621, skip=0]2026-04-26 05:45:25,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:25,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:25,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,083 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:30,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:34,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:45:35,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:45:35,130 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.949[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='220' gold='220' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 5%|5 | 1/20 [01:21<21:57, 69.33s/q, loss=0var, mean_r=0.994, skip=1]
Iter 20 GRPO groups: 10%|# | 2/20 [01:21<10:47, 35.98s/q, loss=0var, mean_r=0.994, skip=1]2026-04-26 05:46:09,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:14,066 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:46:14,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:14,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.942[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:14,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,622 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:46:16,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 10%|# | 2/20 [02:07<10:47, 35.98s/q, loss=0var, mean_r=0.987, skip=2]
Iter 20 GRPO groups: 15%|#5 | 3/20 [02:07<11:28, 40.49s/q, loss=0var, mean_r=0.987, skip=2]2026-04-26 05:46:20,990 INFO src.rl.curriculum_manager - Topic probabilities (rollout 300): [('money_problems', '0.078'), ('time_distance', '0.078'), ('multi_step_reasoning', '0.078'), ('comparison_problems', '0.078'), ('optimization_problems', '0.078')]
+2026-04-26 05:46:26,362 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:26,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,582 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:27,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:28,184 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:46:32,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,262 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.988 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:32,865 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,671 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:46:33,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.988 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+
Iter 20 GRPO groups: 15%|#5 | 3/20 [02:22<11:28, 40.49s/q, loss=0.0004, mean_r=1.000, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 20%|## | 4/20 [02:22<08:05, 30.37s/q, loss=0.0004, mean_r=1.000, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:46:43,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:43,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:44,071 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:44,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:46:54,692 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:46:54,774 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.929[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 05:46:54,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:46:54,951 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.878 = 0.50×1.00(exact) + 0.40×proc(0.695[fin=0.84,mean=0.47]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:46:57,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.454 = 0.50×0.00(prox=0.00) + 0.40×proc(0.790[fin=0.87,mean=0.67]) + 0.10×fmt(1.000) | pred='41/2' gold='17' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:46:57,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 20%|## | 4/20 [02:46<08:05, 30.37s/q, loss=-0.0006, mean_r=0.930, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 25%|##5 | 5/20 [02:46<06:58, 27.91s/q, loss=-0.0006, mean_r=0.930, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:47:05,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:06,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 05:47:22,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:47:22,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:22,634 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.837[fin=0.88,mean=0.78]) + 0.10×fmt(1.000) | pred='215' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:22,719 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:47:31,614 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:47:31,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:47:31,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:47:31,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 25%|##5 | 5/20 [03:28<06:58, 27.91s/q, loss=0.0008, mean_r=0.953, q_acc=100%, q_rew=0.819, skip=3]
Iter 20 GRPO groups: 30%|### | 6/20 [03:28<07:38, 32.76s/q, loss=0.0008, mean_r=0.953, q_acc=100%, q_rew=0.819, skip=3]2026-04-26 05:47:41,562 INFO src.rl.curriculum_manager - Topic probabilities (rollout 320): [('probability', '0.111'), ('statistics', '0.111'), ('money_problems', '0.063'), ('time_distance', '0.063'), ('comparison_problems', '0.063')]
+2026-04-26 05:47:49,520 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.700 = clip(base=0.620 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.519 novelty=0.74 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.53)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:49,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.527 = clip(base=0.447 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.332 novelty=0.74 | sol=0.45*prm_final(0.26)+0.35*prm_mean(0.42)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:49,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.783 = clip(base=0.703 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.718 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:47:50,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.792 = clip(base=0.712 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.731 novelty=0.74 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:47:50,399 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.957 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:50,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.764 = clip(base=0.684 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.666 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.62)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:47:50,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.957 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.941 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,265 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:47:51,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.991 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:47:58,392 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:58,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:58,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,055 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:47:59,272 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,485 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:47:59,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.960 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:47:59,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:48:00,128 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.973 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:48:00,365 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.954 novelty=0.71 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+
Iter 20 GRPO groups: 30%|### | 6/20 [03:48<07:38, 32.76s/q, loss=0.0005, mean_r=0.889, q_acc=100%, q_rew=0.760, skip=3]
Iter 20 GRPO groups: 35%|###5 | 7/20 [03:48<06:13, 28.76s/q, loss=0.0005, mean_r=0.889, q_acc=100%, q_rew=0.760, skip=3]2026-04-26 05:48:05,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,605 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:05,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,296 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,466 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:12,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:19,817 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:48:19,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='23' gold='23' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 35%|###5 | 7/20 [04:06<06:13, 28.76s/q, loss=0var, mean_r=0.998, skip=4]
Iter 20 GRPO groups: 40%|#### | 8/20 [04:06<05:03, 25.28s/q, loss=0var, mean_r=0.998, skip=4]2026-04-26 05:48:21,472 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:21,550 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:26,643 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,615 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 05:48:31,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 40%|#### | 8/20 [04:23<05:03, 25.28s/q, loss=0var, mean_r=0.997, skip=5]
Iter 20 GRPO groups: 45%|####5 | 9/20 [04:23<04:08, 22.61s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 05:48:43,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:43,757 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:48:55,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:49:06,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:49:06,271 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 20 GRPO groups: 45%|####5 | 9/20 [04:53<04:08, 22.61s/q, loss=0var, mean_r=0.999, skip=6]
Iter 20 GRPO groups: 50%|##### | 10/20 [04:53<04:07, 24.77s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 05:49:17,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.321 = 0.50×0.23(prox=0.23) + 0.40×proc(0.201[fin=0.13,mean=0.30]) + 0.10×fmt(1.000) | pred='64' gold='24' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 05:49:17,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.323 = 0.50×0.00(prox=0.00) + 0.40×proc(0.511[fin=0.53,mean=0.48]) + 0.10×fmt(1.000) | pred='64-32*sqrt(2)' gold='24' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+2026-04-26 05:49:33,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.348 = 0.50×0.00(prox=0.00) + 0.40×proc(0.546[fin=0.51,mean=0.60]) + 0.10×fmt(1.000) | pred='64-16*sqrt(2)' gold='24' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 05:49:33,700 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.60(prox=0.60) + 0.40×proc(0.489[fin=0.46,mean=0.54]) + 0.10×fmt(1.000) | pred='32' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:49:33,804 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.416 = 0.50×0.28(prox=0.28) + 0.40×proc(0.252[fin=0.14,mean=0.42]) + 0.10×fmt(1.000) | pred='54.848' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 05:49:33,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.742 = 0.50×0.75(prox=0.75) + 0.40×proc(0.669[fin=0.82,mean=0.44]) + 0.10×fmt(1.000) | pred='28' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 05:49:41,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:49:41,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.895[fin=0.98,mean=0.77]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=75% lccp=38% (chain=3/8 ok_count=6) n_steps=8
+2026-04-26 05:49:41,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.571 = 0.50×0.43(prox=0.43) + 0.40×proc(0.641[fin=0.84,mean=0.34]) + 0.10×fmt(1.000) | pred='40' gold='24' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 05:49:41,755 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.495 = 0.50×0.60(prox=0.60) + 0.40×proc(0.239[fin=0.04,mean=0.53]) + 0.10×fmt(1.000) | pred='32' gold='24' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+
Iter 20 GRPO groups: 50%|##### | 10/20 [05:37<04:07, 24.77s/q, loss=-0.0004, mean_r=0.573, q_acc=100%, q_rew=0.760, skip=6]
Iter 20 GRPO groups: 55%|#####5 | 11/20 [05:37<04:35, 30.65s/q, loss=-0.0004, mean_r=0.573, q_acc=100%, q_rew=0.760, skip=6]2026-04-26 05:49:57,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:49:57,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:18,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:50:18,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:19,056 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:19,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:50:32,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:50:32,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 20 GRPO groups: 55%|#####5 | 11/20 [06:19<04:35, 30.65s/q, loss=0var, mean_r=0.999, skip=7]
Iter 20 GRPO groups: 60%|###### | 12/20 [06:19<04:32, 34.09s/q, loss=0var, mean_r=0.999, skip=7]2026-04-26 05:50:32,200 INFO src.rl.curriculum_manager - Topic probabilities (rollout 340): [('probability', '0.110'), ('statistics', '0.110'), ('money_problems', '0.069'), ('time_distance', '0.069'), ('comparison_problems', '0.069')]
+2026-04-26 05:50:44,778 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:50:44,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.986 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:45,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.960 novelty=0.74 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:45,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.918 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:50:45,585 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.589 = clip(base=0.509 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.347 novelty=0.74 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 05:50:45,792 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.877 = clip(base=0.797 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.839 novelty=0.74 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:46,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:50:46,208 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.753 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.33) | steps=6
+2026-04-26 05:50:46,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.905 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:50:46,602 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.564 = clip(base=0.484 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.354 novelty=0.74 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.45)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.864 = clip(base=0.784 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.758 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.845 = clip(base=0.765 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.758 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:51,816 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.849 = clip(base=0.769 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.763 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.75)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.854 = clip(base=0.774 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.759 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,290 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.866 = clip(base=0.786 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.777 novelty=0.79 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.80)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.756 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,704 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.377 novelty=0.79 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.45)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:52,904 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.853 = clip(base=0.773 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.756 novelty=0.79 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:53,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.708 novelty=0.79 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:50:53,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.858 = clip(base=0.778 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.778 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+
Iter 20 GRPO groups: 60%|###### | 12/20 [06:41<04:32, 34.09s/q, loss=-0.0001, mean_r=0.844, q_acc=100%, q_rew=0.763, skip=7]
Iter 20 GRPO groups: 65%|######5 | 13/20 [06:41<03:34, 30.68s/q, loss=-0.0001, mean_r=0.844, q_acc=100%, q_rew=0.763, skip=7]2026-04-26 05:50:55,048 INFO src.rl.curriculum_manager - Topic probabilities (rollout 360): [('money_problems', '0.094'), ('time_distance', '0.094'), ('comparison_problems', '0.094'), ('sets', '0.094'), ('combinatorics', '0.094')]
+2026-04-26 05:51:05,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.947 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:05,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:05,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.785 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.72)+0.20*lccp(0.50) | steps=6
+2026-04-26 05:51:05,798 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:06,018 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.989 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:06,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.980 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:06,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 05:51:07,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.994 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:15,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.966 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.996 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:16,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.976 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 05:51:16,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.959 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 05:51:16,495 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.736 = clip(base=0.656 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.547 novelty=0.80 | sol=0.45*prm_final(0.35)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 05:51:16,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.946 novelty=0.80 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 05:51:16,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.353 novelty=0.80 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 05:51:17,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.979 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:51:17,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.652 novelty=0.80 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:51:17,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.774 novelty=0.80 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.76)+0.20*lccp(0.33) | steps=3
+2026-04-26 05:51:17,657 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.847 = clip(base=0.767 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.729 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.00) | steps=4
+
Iter 20 GRPO groups: 65%|######5 | 13/20 [07:06<03:34, 30.68s/q, loss=-0.0016, mean_r=0.936, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 70%|####### | 14/20 [07:06<02:52, 28.76s/q, loss=-0.0016, mean_r=0.936, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:51:25,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:25,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:35,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:51:35,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:35,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:35,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.459 = 0.50×0.00(prox=0.00) + 0.40×proc(0.748[fin=0.85,mean=0.59]) + 0.10×fmt(1.000) | pred='2^119' gold='0' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 05:51:47,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:48,016 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:51:48,094 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:51:48,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 20 GRPO groups: 70%|####### | 14/20 [07:44<02:52, 28.76s/q, loss=-0.0009, mean_r=0.944, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 75%|#######5 | 15/20 [07:44<02:38, 31.76s/q, loss=-0.0009, mean_r=0.944, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:52:03,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:03,986 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.766 = 0.50×0.60(prox=0.60) + 0.40×proc(0.916[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='22400' gold='16800' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 05:52:04,070 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:04,154 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.926[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:11,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:19,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:19,844 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16800' gold='16800' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 20 GRPO groups: 75%|#######5 | 15/20 [08:08<02:38, 31.76s/q, loss=-0.0005, mean_r=0.972, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 80%|######## | 16/20 [08:08<01:56, 29.19s/q, loss=-0.0005, mean_r=0.972, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:52:26,021 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:26,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,149 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:34,236 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.824 = 0.50×0.85(prox=0.85) + 0.40×proc(0.749[fin=0.98,mean=0.40]) + 0.10×fmt(1.000) | pred='105' gold='115' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 05:52:34,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,664 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:52:42,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='115' gold='115' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 20 GRPO groups: 80%|######## | 16/20 [08:39<01:56, 29.19s/q, loss=0.0003, mean_r=0.982, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 85%|########5 | 17/20 [08:39<01:29, 29.90s/q, loss=0.0003, mean_r=0.982, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:53:00,748 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:00,840 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 05:53:00,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:01,026 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 05:53:10,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 05:53:10,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:11,014 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.00(prox=0.00) + 0.40×proc(0.822[fin=0.84,mean=0.80]) + 0.10×fmt(0.700) | pred='' gold='10' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 05:53:11,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 05:53:19,843 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 05:53:19,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 20 GRPO groups: 85%|########5 | 17/20 [09:08<01:29, 29.90s/q, loss=-0.0010, mean_r=0.890, q_acc=100%, q_rew=0.783, skip=7]
Iter 20 GRPO groups: 90%|######### | 18/20 [09:08<00:58, 29.50s/q, loss=-0.0010, mean_r=0.890, q_acc=100%, q_rew=0.783, skip=7]2026-04-26 05:53:21,405 INFO src.rl.curriculum_manager - Topic probabilities (rollout 380): [('money_problems', '0.094'), ('time_distance', '0.094'), ('comparison_problems', '0.094'), ('sets', '0.094'), ('combinatorics', '0.094')]
+2026-04-26 05:53:33,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.371 = clip(base=0.291 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.055 novelty=0.77 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.07)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:33,573 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.617 = clip(base=0.537 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.508 novelty=0.77 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.47)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:53:33,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:33,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.395 = clip(base=0.315 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.138 novelty=0.77 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.09)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:34,230 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.652 = clip(base=0.572 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.576 novelty=0.77 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:53:34,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.716 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.68)+0.20*lccp(0.17) | steps=6
+2026-04-26 05:53:34,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.768 = clip(base=0.688 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.709 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.57)+0.20*lccp(0.40) | steps=5
+2026-04-26 05:53:34,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.447 = clip(base=0.367 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.227 novelty=0.77 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.36)+0.20*lccp(0.00) | steps=5
+2026-04-26 05:53:35,141 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.740 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.76)+0.20*lccp(0.14) | steps=7
+2026-04-26 05:53:35,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.213 novelty=0.77 | sol=0.45*prm_final(0.31)+0.35*prm_mean(0.21)+0.20*lccp(0.00) | steps=3
+2026-04-26 05:53:51,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.958 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.994 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 05:53:52,186 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.590 = clip(base=0.510 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.362 novelty=0.83 | sol=0.45*prm_final(0.39)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 05:53:52,421 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.939 novelty=0.83 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:52,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.533 = clip(base=0.533 + mod=-0.000, cap=1.00) | Q=0.77 sol=0.373 novelty=0.83 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.31)+0.20*lccp(0.00) | steps=2
+2026-04-26 05:53:52,845 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.961 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 05:53:53,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.845 novelty=0.83 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.81)+0.20*lccp(0.70) | steps=10
+2026-04-26 05:53:53,289 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.609 = clip(base=0.529 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.382 novelty=0.83 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 05:53:53,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.992 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:53,715 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.994 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 05:53:53,934 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.83 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=11
+
Iter 20 GRPO groups: 90%|######### | 18/20 [09:42<00:58, 29.50s/q, loss=0.0039, mean_r=0.740, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 95%|#########5| 19/20 [09:42<00:30, 30.92s/q, loss=0.0039, mean_r=0.740, q_acc=100%, q_rew=0.770, skip=7]2026-04-26 05:54:01,477 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.791 = 0.50×0.59(prox=0.59) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:01,563 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:14,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:14,258 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.363 = 0.50×0.01(prox=0.01) + 0.40×proc(0.567[fin=0.64,mean=0.45]) + 0.10×fmt(1.000) | pred='1630' gold='40' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 05:54:14,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.772 = 0.50×0.59(prox=0.59) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 05:54:14,426 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.862[fin=0.99,mean=0.67]) + 0.10×fmt(1.000) | pred='162' gold='40' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 05:54:22,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.782 = 0.50×0.59(prox=0.59) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:22,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.792 = 0.50×0.59(prox=0.59) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='54' gold='40' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 05:54:23,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.913[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 05:54:23,109 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 20 GRPO groups: 95%|#########5| 19/20 [10:19<00:30, 30.92s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 100%|##########| 20/20 [10:19<00:00, 32.86s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
Iter 20 GRPO groups: 100%|##########| 20/20 [10:19<00:00, 30.99s/q, loss=0.0004, mean_r=0.757, q_acc=100%, q_rew=0.770, skip=7]
+2026-04-26 05:54:33,032 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=40/10000)
+2026-04-26 05:54:33,032 INFO __main__ - Iter 20 | loss=0.0002 | reward mean=0.896 std=0.178 | gt_match=79.9% | grounded_acc=90.6% | step_acc=92.3% | lccp=86.2% | batch_acc=92.8% | phase=SELFPLAY_RAMP sp_ratio=25% | groups=18 skipped=7(0var=7) | lr=4.43e-06 | 619.9s
+2026-04-26 05:54:33,032 INFO __main__ - Question generation: 5/5 valid (100%) | q_reward=0.770 | q_acc=100.0% (>0.5 quality) | topic=0.72 diff=0.70 clarity=1.00 novelty=0.47 solvability=0.94
+2026-04-26 05:54:33,032 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:02<05:36, 2.26s/q, correct=1/1, lccp=100.0%, score=0.999, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:07<09:33, 3.88s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:09<08:06, 3.31s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:12<07:05, 2.91s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:13<05:56, 2.46s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:19<08:25, 3.51s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:22<08:19, 3.49s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:25<07:28, 3.16s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:28<07:27, 3.18s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:33<08:42, 3.73s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:36<08:03, 3.48s/q, correct=8/11, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:38<07:01, 3.06s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:41<06:39, 2.91s/q, correct=10/13, lccp=85.6%, score=0.927, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:45<07:45, 3.42s/q, correct=11/14, lccp=86.7%, score=0.932, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:48<07:07, 3.17s/q, correct=12/15, lccp=87.6%, score=0.937, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:50<06:38, 2.98s/q, correct=12/16, lccp=88.3%, score=0.913, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:54<07:19, 3.30s/q, correct=13/17, lccp=89.0%, score=0.918, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<08:59, 4.09s/q, correct=13/18, lccp=84.8%, score=0.906, step_acc=92.0%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:57, 3.65s/q, correct=14/19, lccp=85.6%, score=0.911, step_acc=92.5%]
GSM8K eval: 13%|#3 | 20/150 [01:07<07:59, 3.69s/q, correct=15/20, lccp=86.3%, score=0.915, step_acc=92.8%]
GSM8K eval: 14%|#4 | 21/150 [01:09<07:16, 3.38s/q, correct=16/21, lccp=86.9%, score=0.919, step_acc=93.2%]
GSM8K eval: 15%|#4 | 22/150 [01:15<08:25, 3.95s/q, correct=17/22, lccp=84.9%, score=0.921, step_acc=91.5%]
GSM8K eval: 15%|#5 | 23/150 [01:19<08:30, 4.02s/q, correct=18/23, lccp=85.6%, score=0.924, step_acc=91.9%]
GSM8K eval: 16%|#6 | 24/150 [01:21<07:32, 3.59s/q, correct=18/24, lccp=83.1%, score=0.907, step_acc=89.1%]
GSM8K eval: 17%|#6 | 25/150 [01:24<06:57, 3.34s/q, correct=18/25, lccp=80.7%, score=0.903, step_acc=88.6%]
GSM8K eval: 17%|#7 | 26/150 [01:29<07:32, 3.65s/q, correct=19/26, lccp=81.5%, score=0.907, step_acc=89.0%]
GSM8K eval: 18%|#8 | 27/150 [01:31<07:00, 3.42s/q, correct=19/27, lccp=82.2%, score=0.901, step_acc=89.4%]
GSM8K eval: 19%|#8 | 28/150 [01:34<06:11, 3.04s/q, correct=20/28, lccp=82.8%, score=0.905, step_acc=89.8%]
GSM8K eval: 19%|#9 | 29/150 [01:36<05:58, 2.96s/q, correct=21/29, lccp=83.4%, score=0.908, step_acc=90.1%]
GSM8K eval: 20%|## | 30/150 [01:40<06:26, 3.22s/q, correct=22/30, lccp=84.0%, score=0.911, step_acc=90.5%]
GSM8K eval: 21%|## | 31/150 [01:43<05:58, 3.01s/q, correct=23/31, lccp=84.5%, score=0.914, step_acc=90.8%]
GSM8K eval: 21%|##1 | 32/150 [01:44<05:09, 2.62s/q, correct=24/32, lccp=85.0%, score=0.916, step_acc=91.1%]
GSM8K eval: 22%|##2 | 33/150 [01:47<05:11, 2.66s/q, correct=25/33, lccp=85.4%, score=0.918, step_acc=91.3%]
GSM8K eval: 23%|##2 | 34/150 [01:49<04:46, 2.47s/q, correct=26/34, lccp=85.8%, score=0.921, step_acc=91.6%]
GSM8K eval: 23%|##3 | 35/150 [01:52<04:47, 2.50s/q, correct=27/35, lccp=86.2%, score=0.923, step_acc=91.8%]
GSM8K eval: 24%|##4 | 36/150 [01:55<05:17, 2.78s/q, correct=28/36, lccp=86.6%, score=0.925, step_acc=92.1%]
GSM8K eval: 25%|##4 | 37/150 [01:57<04:48, 2.55s/q, correct=29/37, lccp=87.0%, score=0.926, step_acc=92.3%]
GSM8K eval: 25%|##5 | 38/150 [02:00<05:01, 2.69s/q, correct=30/38, lccp=87.3%, score=0.928, step_acc=92.5%]
GSM8K eval: 26%|##6 | 39/150 [02:05<06:10, 3.34s/q, correct=31/39, lccp=87.7%, score=0.930, step_acc=92.7%]
GSM8K eval: 27%|##6 | 40/150 [02:11<07:42, 4.20s/q, correct=32/40, lccp=88.0%, score=0.931, step_acc=92.8%]
GSM8K eval: 27%|##7 | 41/150 [02:14<06:57, 3.83s/q, correct=32/41, lccp=88.3%, score=0.931, step_acc=93.0%]
GSM8K eval: 28%|##8 | 42/150 [02:19<07:38, 4.25s/q, correct=33/42, lccp=87.0%, score=0.932, step_acc=92.8%]
GSM8K eval: 29%|##8 | 43/150 [02:22<06:31, 3.66s/q, correct=34/43, lccp=87.3%, score=0.933, step_acc=93.0%]
GSM8K eval: 29%|##9 | 44/150 [02:28<07:54, 4.48s/q, correct=35/44, lccp=87.5%, score=0.935, step_acc=93.1%]
GSM8K eval: 30%|### | 45/150 [02:31<07:07, 4.07s/q, correct=36/45, lccp=87.8%, score=0.936, step_acc=93.3%]
GSM8K eval: 31%|### | 46/150 [02:36<07:27, 4.31s/q, correct=36/46, lccp=85.9%, score=0.931, step_acc=93.2%]
GSM8K eval: 31%|###1 | 47/150 [02:39<06:45, 3.93s/q, correct=37/47, lccp=86.2%, score=0.933, step_acc=93.3%]
GSM8K eval: 32%|###2 | 48/150 [02:41<05:35, 3.29s/q, correct=38/48, lccp=86.5%, score=0.934, step_acc=93.5%]
GSM8K eval: 33%|###2 | 49/150 [02:48<07:16, 4.32s/q, correct=38/49, lccp=85.3%, score=0.920, step_acc=92.1%]
GSM8K eval: 33%|###3 | 50/150 [02:51<06:38, 3.99s/q, correct=38/50, lccp=84.6%, score=0.912, step_acc=91.3%]
GSM8K eval: 34%|###4 | 51/150 [02:52<05:18, 3.22s/q, correct=39/51, lccp=84.9%, score=0.913, step_acc=91.5%]
GSM8K eval: 35%|###4 | 52/150 [02:57<05:46, 3.53s/q, correct=39/52, lccp=83.3%, score=0.913, step_acc=91.3%]
GSM8K eval: 35%|###5 | 53/150 [03:01<06:18, 3.90s/q, correct=39/53, lccp=82.8%, score=0.905, step_acc=90.7%]
GSM8K eval: 36%|###6 | 54/150 [03:04<05:29, 3.44s/q, correct=40/54, lccp=83.2%, score=0.907, step_acc=90.9%]
GSM8K eval: 37%|###6 | 55/150 [03:07<05:30, 3.48s/q, correct=41/55, lccp=83.5%, score=0.909, step_acc=91.1%]
GSM8K eval: 37%|###7 | 56/150 [03:11<05:29, 3.51s/q, correct=42/56, lccp=83.8%, score=0.910, step_acc=91.2%]
GSM8K eval: 38%|###8 | 57/150 [03:13<04:52, 3.15s/q, correct=43/57, lccp=84.0%, score=0.912, step_acc=91.4%]
GSM8K eval: 39%|###8 | 58/150 [03:17<05:16, 3.44s/q, correct=44/58, lccp=84.3%, score=0.913, step_acc=91.5%]
GSM8K eval: 39%|###9 | 59/150 [03:22<05:50, 3.86s/q, correct=44/59, lccp=82.9%, score=0.911, step_acc=91.1%]
GSM8K eval: 40%|#### | 60/150 [03:27<06:17, 4.19s/q, correct=45/60, lccp=83.2%, score=0.912, step_acc=91.2%]
GSM8K eval: 41%|#### | 61/150 [03:29<05:22, 3.63s/q, correct=46/61, lccp=83.5%, score=0.914, step_acc=91.4%]
GSM8K eval: 41%|####1 | 62/150 [03:33<05:06, 3.48s/q, correct=47/62, lccp=83.7%, score=0.915, step_acc=91.5%]
GSM8K eval: 42%|####2 | 63/150 [03:36<04:59, 3.44s/q, correct=47/63, lccp=83.4%, score=0.909, step_acc=91.1%]
GSM8K eval: 43%|####2 | 64/150 [03:39<04:39, 3.25s/q, correct=48/64, lccp=83.7%, score=0.910, step_acc=91.3%]
GSM8K eval: 43%|####3 | 65/150 [03:41<04:24, 3.11s/q, correct=49/65, lccp=84.0%, score=0.912, step_acc=91.4%]
GSM8K eval: 44%|####4 | 66/150 [03:43<03:52, 2.76s/q, correct=50/66, lccp=84.2%, score=0.913, step_acc=91.5%]
GSM8K eval: 45%|####4 | 67/150 [03:46<03:37, 2.62s/q, correct=51/67, lccp=84.4%, score=0.914, step_acc=91.7%]
GSM8K eval: 45%|####5 | 68/150 [03:48<03:35, 2.63s/q, correct=52/68, lccp=84.7%, score=0.916, step_acc=91.8%]
GSM8K eval: 46%|####6 | 69/150 [03:50<03:05, 2.29s/q, correct=53/69, lccp=84.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 47%|####6 | 70/150 [03:53<03:21, 2.51s/q, correct=54/70, lccp=83.7%, score=0.918, step_acc=91.7%]
GSM8K eval: 47%|####7 | 71/150 [03:56<03:33, 2.70s/q, correct=55/71, lccp=82.5%, score=0.918, step_acc=91.6%]
GSM8K eval: 48%|####8 | 72/150 [03:58<03:01, 2.33s/q, correct=56/72, lccp=82.7%, score=0.919, step_acc=91.7%]
GSM8K eval: 49%|####8 | 73/150 [03:59<02:45, 2.15s/q, correct=57/73, lccp=83.0%, score=0.921, step_acc=91.8%]
GSM8K eval: 49%|####9 | 74/150 [04:03<03:15, 2.57s/q, correct=58/74, lccp=83.2%, score=0.922, step_acc=91.9%]
GSM8K eval: 50%|##### | 75/150 [04:05<02:53, 2.31s/q, correct=59/75, lccp=83.4%, score=0.923, step_acc=92.0%]
GSM8K eval: 51%|##### | 76/150 [04:11<04:26, 3.61s/q, correct=59/76, lccp=83.5%, score=0.918, step_acc=92.0%]
GSM8K eval: 51%|#####1 | 77/150 [04:15<04:30, 3.70s/q, correct=60/77, lccp=83.7%, score=0.919, step_acc=92.1%]
GSM8K eval: 52%|#####2 | 78/150 [04:18<03:59, 3.33s/q, correct=61/78, lccp=83.9%, score=0.920, step_acc=92.2%]
GSM8K eval: 53%|#####2 | 79/150 [04:20<03:47, 3.20s/q, correct=62/79, lccp=83.6%, score=0.919, step_acc=92.0%]
GSM8K eval: 53%|#####3 | 80/150 [04:23<03:40, 3.15s/q, correct=63/80, lccp=83.8%, score=0.920, step_acc=92.1%]
GSM8K eval: 54%|#####4 | 81/150 [04:26<03:22, 2.93s/q, correct=64/81, lccp=84.0%, score=0.920, step_acc=92.2%]
GSM8K eval: 55%|#####4 | 82/150 [04:29<03:19, 2.93s/q, correct=65/82, lccp=84.2%, score=0.921, step_acc=92.3%]
GSM8K eval: 55%|#####5 | 83/150 [04:32<03:13, 2.88s/q, correct=66/83, lccp=84.4%, score=0.922, step_acc=92.4%]
GSM8K eval: 56%|#####6 | 84/150 [04:34<03:04, 2.80s/q, correct=67/84, lccp=84.6%, score=0.923, step_acc=92.5%]
GSM8K eval: 57%|#####6 | 85/150 [04:38<03:22, 3.11s/q, correct=68/85, lccp=84.8%, score=0.924, step_acc=92.6%]
GSM8K eval: 57%|#####7 | 86/150 [04:42<03:26, 3.23s/q, correct=69/86, lccp=84.9%, score=0.925, step_acc=92.7%]
GSM8K eval: 58%|#####8 | 87/150 [04:47<04:08, 3.94s/q, correct=70/87, lccp=85.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:49<03:26, 3.33s/q, correct=71/88, lccp=85.3%, score=0.927, step_acc=92.8%]
GSM8K eval: 59%|#####9 | 89/150 [04:52<03:13, 3.17s/q, correct=72/89, lccp=85.4%, score=0.927, step_acc=92.9%]
GSM8K eval: 60%|###### | 90/150 [04:54<02:55, 2.93s/q, correct=73/90, lccp=85.6%, score=0.928, step_acc=93.0%]
GSM8K eval: 61%|###### | 91/150 [04:59<03:18, 3.36s/q, correct=74/91, lccp=85.8%, score=0.929, step_acc=93.1%]
GSM8K eval: 61%|######1 | 92/150 [05:02<03:11, 3.30s/q, correct=75/92, lccp=85.9%, score=0.930, step_acc=93.1%]
GSM8K eval: 62%|######2 | 93/150 [05:09<04:19, 4.55s/q, correct=76/93, lccp=86.1%, score=0.930, step_acc=93.2%]
GSM8K eval: 63%|######2 | 94/150 [05:12<03:44, 4.01s/q, correct=76/94, lccp=85.2%, score=0.927, step_acc=92.6%]
GSM8K eval: 63%|######3 | 95/150 [05:17<04:01, 4.39s/q, correct=77/95, lccp=84.3%, score=0.927, step_acc=92.1%]
GSM8K eval: 64%|######4 | 96/150 [05:20<03:37, 4.02s/q, correct=77/96, lccp=83.7%, score=0.922, step_acc=91.5%]
GSM8K eval: 65%|######4 | 97/150 [05:23<03:10, 3.60s/q, correct=77/97, lccp=83.4%, score=0.919, step_acc=91.3%]
GSM8K eval: 65%|######5 | 98/150 [05:27<03:17, 3.80s/q, correct=77/98, lccp=83.0%, score=0.916, step_acc=91.3%]
GSM8K eval: 66%|######6 | 99/150 [05:30<02:51, 3.36s/q, correct=78/99, lccp=83.1%, score=0.917, step_acc=91.4%]
GSM8K eval: 67%|######6 | 100/150 [05:31<02:25, 2.91s/q, correct=79/100, lccp=82.3%, score=0.917, step_acc=91.1%]
GSM8K eval: 67%|######7 | 101/150 [05:34<02:22, 2.92s/q, correct=79/101, lccp=82.0%, score=0.913, step_acc=91.0%]
GSM8K eval: 68%|######8 | 102/150 [05:36<01:59, 2.49s/q, correct=80/102, lccp=82.2%, score=0.914, step_acc=91.1%]
GSM8K eval: 69%|######8 | 103/150 [05:38<01:50, 2.36s/q, correct=81/103, lccp=82.3%, score=0.915, step_acc=91.1%]
GSM8K eval: 69%|######9 | 104/150 [05:43<02:20, 3.06s/q, correct=82/104, lccp=82.5%, score=0.915, step_acc=91.2%]
GSM8K eval: 70%|####### | 105/150 [05:45<02:10, 2.90s/q, correct=83/105, lccp=82.7%, score=0.916, step_acc=91.3%]
GSM8K eval: 71%|####### | 106/150 [05:47<01:49, 2.48s/q, correct=84/106, lccp=82.8%, score=0.917, step_acc=91.4%]
GSM8K eval: 71%|#######1 | 107/150 [05:48<01:33, 2.18s/q, correct=85/107, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 72%|#######2 | 108/150 [05:51<01:39, 2.37s/q, correct=86/108, lccp=83.2%, score=0.918, step_acc=91.6%]
GSM8K eval: 73%|#######2 | 109/150 [05:56<02:09, 3.15s/q, correct=86/109, lccp=82.7%, score=0.917, step_acc=91.5%]
GSM8K eval: 73%|#######3 | 110/150 [05:58<01:55, 2.90s/q, correct=87/110, lccp=82.2%, score=0.917, step_acc=91.3%]
GSM8K eval: 74%|#######4 | 111/150 [06:00<01:38, 2.53s/q, correct=88/111, lccp=82.3%, score=0.918, step_acc=91.4%]
GSM8K eval: 75%|#######4 | 112/150 [06:05<02:05, 3.30s/q, correct=88/112, lccp=82.5%, score=0.918, step_acc=91.5%]
GSM8K eval: 75%|#######5 | 113/150 [06:07<01:45, 2.84s/q, correct=89/113, lccp=82.6%, score=0.918, step_acc=91.6%]
GSM8K eval: 76%|#######6 | 114/150 [06:12<02:07, 3.53s/q, correct=90/114, lccp=82.2%, score=0.919, step_acc=91.5%]
GSM8K eval: 77%|#######6 | 115/150 [06:15<01:56, 3.33s/q, correct=91/115, lccp=82.3%, score=0.920, step_acc=91.6%]
GSM8K eval: 77%|#######7 | 116/150 [06:18<01:49, 3.21s/q, correct=92/116, lccp=82.5%, score=0.920, step_acc=91.7%]
GSM8K eval: 78%|#######8 | 117/150 [06:24<02:12, 4.01s/q, correct=93/117, lccp=82.6%, score=0.921, step_acc=91.7%]
GSM8K eval: 79%|#######8 | 118/150 [06:28<02:12, 4.13s/q, correct=93/118, lccp=81.9%, score=0.918, step_acc=91.7%]
GSM8K eval: 79%|#######9 | 119/150 [06:32<02:02, 3.96s/q, correct=93/119, lccp=82.1%, score=0.917, step_acc=91.7%]
GSM8K eval: 80%|######## | 120/150 [06:34<01:48, 3.60s/q, correct=94/120, lccp=82.2%, score=0.918, step_acc=91.8%]
GSM8K eval: 81%|######## | 121/150 [06:37<01:40, 3.45s/q, correct=95/121, lccp=82.4%, score=0.918, step_acc=91.9%]
GSM8K eval: 81%|########1 | 122/150 [06:40<01:32, 3.32s/q, correct=96/122, lccp=82.5%, score=0.919, step_acc=91.9%]
GSM8K eval: 82%|########2 | 123/150 [06:44<01:30, 3.35s/q, correct=97/123, lccp=82.7%, score=0.920, step_acc=92.0%]
GSM8K eval: 83%|########2 | 124/150 [06:46<01:18, 3.00s/q, correct=98/124, lccp=82.8%, score=0.920, step_acc=92.1%]
GSM8K eval: 83%|########3 | 125/150 [06:48<01:07, 2.71s/q, correct=99/125, lccp=82.9%, score=0.921, step_acc=92.1%]
GSM8K eval: 84%|########4 | 126/150 [06:51<01:05, 2.75s/q, correct=100/126, lccp=83.1%, score=0.921, step_acc=92.2%]
GSM8K eval: 85%|########4 | 127/150 [06:55<01:14, 3.25s/q, correct=101/127, lccp=83.2%, score=0.922, step_acc=92.2%]
GSM8K eval: 85%|########5 | 128/150 [06:58<01:09, 3.16s/q, correct=102/128, lccp=83.3%, score=0.923, step_acc=92.3%]
GSM8K eval: 86%|########6 | 129/150 [07:02<01:08, 3.28s/q, correct=103/129, lccp=83.5%, score=0.923, step_acc=92.4%]
GSM8K eval: 87%|########6 | 130/150 [07:04<00:56, 2.84s/q, correct=104/130, lccp=83.6%, score=0.924, step_acc=92.4%]
GSM8K eval: 87%|########7 | 131/150 [07:08<01:03, 3.34s/q, correct=105/131, lccp=83.7%, score=0.924, step_acc=92.5%]
GSM8K eval: 88%|########8 | 132/150 [07:10<00:50, 2.81s/q, correct=106/132, lccp=83.8%, score=0.925, step_acc=92.5%]
GSM8K eval: 89%|########8 | 133/150 [07:13<00:47, 2.82s/q, correct=107/133, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 89%|########9 | 134/150 [07:17<00:52, 3.28s/q, correct=108/134, lccp=84.1%, score=0.926, step_acc=92.7%]
GSM8K eval: 90%|######### | 135/150 [07:20<00:48, 3.21s/q, correct=109/135, lccp=84.2%, score=0.927, step_acc=92.7%]
GSM8K eval: 91%|######### | 136/150 [07:24<00:49, 3.56s/q, correct=109/136, lccp=83.8%, score=0.926, step_acc=92.5%]
GSM8K eval: 91%|#########1| 137/150 [07:31<00:58, 4.51s/q, correct=110/137, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 92%|#########2| 138/150 [07:35<00:51, 4.32s/q, correct=111/138, lccp=84.1%, score=0.927, step_acc=92.6%]
GSM8K eval: 93%|#########2| 139/150 [07:38<00:44, 4.06s/q, correct=112/139, lccp=84.2%, score=0.927, step_acc=92.7%]
GSM8K eval: 93%|#########3| 140/150 [07:43<00:41, 4.12s/q, correct=112/140, lccp=84.1%, score=0.924, step_acc=92.5%]
GSM8K eval: 94%|#########3| 141/150 [07:47<00:36, 4.05s/q, correct=113/141, lccp=84.2%, score=0.924, step_acc=92.5%]
GSM8K eval: 95%|#########4| 142/150 [07:51<00:32, 4.05s/q, correct=114/142, lccp=84.3%, score=0.925, step_acc=92.6%]
GSM8K eval: 95%|#########5| 143/150 [07:53<00:24, 3.53s/q, correct=115/143, lccp=84.4%, score=0.925, step_acc=92.6%]
GSM8K eval: 96%|#########6| 144/150 [07:55<00:19, 3.17s/q, correct=116/144, lccp=84.5%, score=0.926, step_acc=92.7%]
GSM8K eval: 97%|#########6| 145/150 [08:00<00:18, 3.79s/q, correct=116/145, lccp=84.0%, score=0.923, step_acc=92.6%]
GSM8K eval: 97%|#########7| 146/150 [08:03<00:14, 3.53s/q, correct=117/146, lccp=84.1%, score=0.923, step_acc=92.6%]
GSM8K eval: 98%|#########8| 147/150 [08:07<00:10, 3.57s/q, correct=118/147, lccp=84.2%, score=0.924, step_acc=92.7%]
GSM8K eval: 99%|#########8| 148/150 [08:11<00:07, 3.58s/q, correct=119/148, lccp=84.3%, score=0.924, step_acc=92.7%]
GSM8K eval: 99%|#########9| 149/150 [08:14<00:03, 3.53s/q, correct=120/149, lccp=84.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 100%|##########| 150/150 [08:19<00:00, 3.90s/q, correct=120/150, lccp=84.3%, score=0.923, step_acc=92.5%]
GSM8K eval: 100%|##########| 150/150 [08:19<00:00, 3.33s/q, correct=120/150, lccp=84.3%, score=0.923, step_acc=92.5%]
+2026-04-26 06:02:52,390 INFO __main__ - Training Score [iter 20]: 0.9234 (best=0.9262) | n=150
+2026-04-26 06:02:52,390 INFO __main__ - Components : 0.50×correct(80.0%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 06:02:52,391 INFO __main__ - Process score : prm_mean=0.906 prm_final=0.935 → weighted=0.923
+2026-04-26 06:02:52,391 INFO __main__ - Step accuracy : 92.5% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 06:02:52,391 INFO __main__ - Chain integrity (LCCP): 84.3% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 06:02:52,391 INFO __main__ - (debug) final-answer accuracy: 80.0%
+2026-04-26 06:02:54,581 INFO __main__ - ======================================================================
+2026-04-26 06:02:54,581 INFO __main__ - GRPO ITERATION 21/60
+2026-04-26 06:02:54,581 INFO __main__ - ======================================================================
+2026-04-26 06:02:54,601 INFO __main__ - LR this iteration: 4.43e-06 | T=0.664 | MATH ratio=36%
+
Iter 21 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:02:58,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,482 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:02:58,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 06:03:04,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:04,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:05,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:05,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:11,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:03:11,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='63' gold='63' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0var, mean_r=0.999, skip=1]
Iter 21 GRPO groups: 5%|5 | 1/20 [00:17<05:28, 17.30s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 06:03:13,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:13,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:19,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:03:19,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.882[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:19,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:20,041 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.43(prox=0.43) + 0.40×proc(0.138[fin=0.04,mean=0.28]) + 0.10×fmt(1.000) | pred='400' gold='240' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 06:03:24,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.892[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,814 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:03:24,889 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='240' gold='240' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+
Iter 21 GRPO groups: 5%|5 | 1/20 [00:37<05:28, 17.30s/q, loss=-0.0007, mean_r=0.904, skip=1]
Iter 21 GRPO groups: 10%|# | 2/20 [00:37<05:38, 18.79s/q, loss=-0.0007, mean_r=0.904, skip=1]2026-04-26 06:03:41,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.801 = 0.50×0.70(prox=0.70) + 0.40×proc(0.882[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='82800' gold='106000' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 06:03:41,760 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.847[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='11600' gold='106000' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 06:03:41,853 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.920 = 0.50×0.85(prox=0.85) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='100000' gold='106000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:03:41,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:03:53,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.35(prox=0.35) + 0.40×proc(0.294[fin=0.02,mean=0.71]) + 0.10×fmt(1.000) | pred='8245' gold='106000' | step_acc=71% lccp=71% (chain=5/7 ok_count=5) n_steps=7
+2026-04-26 06:03:53,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='8245' gold='106000' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:03:53,541 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.900[fin=0.95,mean=0.82]) + 0.10×fmt(1.000) | pred='11200' gold='106000' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 06:03:53,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:04:09,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:04:10,040 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='106000' gold='106000' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 21 GRPO groups: 10%|# | 2/20 [01:17<05:38, 18.79s/q, loss=0.0000, mean_r=0.786, skip=1]
Iter 21 GRPO groups: 15%|#5 | 3/20 [01:17<08:03, 28.42s/q, loss=0.0000, mean_r=0.786, skip=1]2026-04-26 06:04:11,611 INFO src.rl.curriculum_manager - Topic probabilities (rollout 400): [('money_problems', '0.106'), ('time_distance', '0.106'), ('comparison_problems', '0.106'), ('sets', '0.106'), ('combinatorics', '0.106')]
+2026-04-26 06:04:22,995 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.92 sol=0.997 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:04:23,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.603 = clip(base=0.523 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.308 novelty=0.85 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.62)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:04:23,378 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.784 novelty=0.85 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.84)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:04:23,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.452 novelty=0.85 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.75)+0.20*lccp(0.80) | steps=5
+2026-04-26 06:04:23,760 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:04:23,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.609 = clip(base=0.529 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.336 novelty=0.85 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.53)+0.20*lccp(0.67) | steps=3
+2026-04-26 06:04:24,135 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.992 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:04:24,322 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.85 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:04:24,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.787 novelty=0.85 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:04:24,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.630 = clip(base=0.550 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.509 novelty=0.85 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.86)+0.20*lccp(0.86) | steps=7
+2026-04-26 06:04:31,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.474 = clip(base=0.394 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.220 novelty=0.69 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:04:31,490 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.360 = clip(base=0.280 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.057 novelty=0.69 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:31,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.655 = clip(base=0.575 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.505 novelty=0.69 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.54)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:04:31,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.408 = clip(base=0.328 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.126 novelty=0.69 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.34)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:04:32,079 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.377 = clip(base=0.297 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.111 novelty=0.69 | sol=0.45*prm_final(0.16)+0.35*prm_mean(0.11)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:04:32,288 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.726 = clip(base=0.646 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.601 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:04:32,479 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.361 = clip(base=0.281 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.059 novelty=0.69 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:32,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.926 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:04:32,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.775 = clip(base=0.695 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.720 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:04:33,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.381 = clip(base=0.301 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.093 novelty=0.69 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.16)+0.20*lccp(0.00) | steps=3
+
Iter 21 GRPO groups: 15%|#5 | 3/20 [01:40<08:03, 28.42s/q, loss=0.0013, mean_r=0.584, q_acc=100%, q_rew=0.742, skip=1]
Iter 21 GRPO groups: 20%|## | 4/20 [01:40<07:01, 26.37s/q, loss=0.0013, mean_r=0.584, q_acc=100%, q_rew=0.742, skip=1]2026-04-26 06:04:39,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.430 = 0.50×0.33(prox=0.33) + 0.40×proc(0.409[fin=0.43,mean=0.38]) + 0.10×fmt(1.000) | pred='10' gold='5' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:04:39,464 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.464 = 0.50×0.56(prox=0.56) + 0.40×proc(0.216[fin=0.08,mean=0.42]) + 0.10×fmt(1.000) | pred='7' gold='5' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 06:04:48,778 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.876[fin=0.99,mean=0.70]) + 0.10×fmt(0.650) | pred='5' gold='5' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:04:48,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.321 = 0.50×0.20(prox=0.20) + 0.40×proc(0.303[fin=0.29,mean=0.33]) + 0.10×fmt(1.000) | pred='15' gold='5' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:04:48,941 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:04:49,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:04:54,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.896[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:04:54,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.439 = 0.50×0.29(prox=0.29) + 0.40×proc(0.479[fin=0.56,mean=0.36]) + 0.10×fmt(1.000) | pred='11' gold='5' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:04:54,720 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.770 = 0.50×1.00(exact) + 0.40×proc(0.512[fin=0.60,mean=0.38]) + 0.10×fmt(0.650) | pred='5' gold='5' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:04:54,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.915 = 0.50×1.00(exact) + 0.40×proc(0.789[fin=0.91,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+
Iter 21 GRPO groups: 20%|## | 4/20 [02:10<07:01, 26.37s/q, loss=-0.0026, mean_r=0.717, q_acc=100%, q_rew=0.742, skip=1]
Iter 21 GRPO groups: 25%|##5 | 5/20 [02:10<06:55, 27.73s/q, loss=-0.0026, mean_r=0.717, q_acc=100%, q_rew=0.742, skip=1]2026-04-26 06:05:07,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:07,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:08,022 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:08,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,882 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:14,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:15,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:21,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:05:21,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 25%|##5 | 5/20 [02:27<06:55, 27.73s/q, loss=0var, mean_r=0.993, skip=2]
Iter 21 GRPO groups: 30%|### | 6/20 [02:27<05:36, 24.01s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 06:05:25,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:25,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.332 = 0.50×0.40(prox=0.40) + 0.40×proc(0.080[fin=0.08,mean=0.07]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:29,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:29,798 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:29,876 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.507[fin=0.46,mean=0.58]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:05:29,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.352 = 0.50×0.40(prox=0.40) + 0.40×proc(0.131[fin=0.15,mean=0.10]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:35,116 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:35,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.352 = 0.50×0.40(prox=0.40) + 0.40×proc(0.129[fin=0.15,mean=0.09]) + 0.10×fmt(1.000) | pred='2' gold='8' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:05:35,285 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:05:35,368 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 30%|### | 6/20 [02:47<05:36, 24.01s/q, loss=0.0004, mean_r=0.756, q_acc=100%, q_rew=0.742, skip=2]
Iter 21 GRPO groups: 35%|###5 | 7/20 [02:47<04:56, 22.82s/q, loss=0.0004, mean_r=0.756, q_acc=100%, q_rew=0.742, skip=2]2026-04-26 06:05:46,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,631 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,713 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:46,795 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,536 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,618 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:05:55,701 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:06:04,873 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:06:04,959 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 21 GRPO groups: 35%|###5 | 7/20 [03:10<04:56, 22.82s/q, loss=0var, mean_r=0.997, skip=3]
Iter 21 GRPO groups: 40%|#### | 8/20 [03:10<04:33, 22.82s/q, loss=0var, mean_r=0.997, skip=3]2026-04-26 06:06:04,960 INFO src.rl.curriculum_manager - Topic probabilities (rollout 420): [('money_problems', '0.106'), ('time_distance', '0.106'), ('comparison_problems', '0.106'), ('sets', '0.106'), ('combinatorics', '0.106')]
+2026-04-26 06:06:07,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:07,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,325 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,679 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:08,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:09,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:12,726 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:12,904 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:06:13,084 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:06:13,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,802 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:13,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:14,157 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:06:14,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+
Iter 21 GRPO groups: 40%|#### | 8/20 [03:21<04:33, 22.82s/q, loss=0.0015, mean_r=0.930, q_acc=100%, q_rew=0.693, skip=3]
Iter 21 GRPO groups: 45%|####5 | 9/20 [03:21<03:30, 19.15s/q, loss=0.0015, mean_r=0.930, q_acc=100%, q_rew=0.693, skip=3]2026-04-26 06:06:23,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.31(prox=0.31) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10160' gold='4830' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:06:23,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.85(prox=0.85) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='5150' gold='4830' | step_acc=78% lccp=44% (chain=4/9 ok_count=7) n_steps=9
+2026-04-26 06:06:41,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.877 = 0.50×0.85(prox=0.85) + 0.40×proc(0.881[fin=0.95,mean=0.78]) + 0.10×fmt(1.000) | pred='4665' gold='4830' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:06:42,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.807[fin=0.91,mean=0.66]) + 0.10×fmt(1.000) | pred='430' gold='4830' | step_acc=50% lccp=38% (chain=3/8 ok_count=4) n_steps=8
+2026-04-26 06:06:42,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.680[fin=0.59,mean=0.81]) + 0.10×fmt(1.000) | pred='-4830' gold='4830' | step_acc=90% lccp=80% (chain=8/10 ok_count=9) n_steps=10
+2026-04-26 06:06:42,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.733 = 0.50×0.78(prox=0.78) + 0.40×proc(0.613[fin=0.70,mean=0.49]) + 0.10×fmt(1.000) | pred='4130' gold='4830' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 06:06:57,815 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.902[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='1264' gold='4830' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 06:06:57,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.667 = 0.50×0.77(prox=0.77) + 0.40×proc(0.452[fin=0.48,mean=0.41]) + 0.10×fmt(1.000) | pred='4120' gold='4830' | step_acc=33% lccp=11% (chain=1/9 ok_count=3) n_steps=9
+2026-04-26 06:06:57,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.723[fin=0.77,mean=0.65]) + 0.10×fmt(1.000) | pred='900' gold='4830' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 06:06:58,074 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.36(prox=0.36) + 0.40×proc(0.912[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='485' gold='4830' | step_acc=89% lccp=56% (chain=5/9 ok_count=8) n_steps=9
+
Iter 21 GRPO groups: 45%|####5 | 9/20 [04:23<03:30, 19.15s/q, loss=-0.0005, mean_r=0.646, q_acc=100%, q_rew=0.693, skip=3]
Iter 21 GRPO groups: 50%|##### | 10/20 [04:23<05:25, 32.52s/q, loss=-0.0005, mean_r=0.646, q_acc=100%, q_rew=0.693, skip=3]2026-04-26 06:07:18,497 INFO src.rl.curriculum_manager - Topic probabilities (rollout 440): [('geometry', '0.092'), ('statistics', '0.092'), ('money_problems', '0.090'), ('time_distance', '0.090'), ('sets', '0.090')]
+2026-04-26 06:07:24,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.827 = clip(base=0.747 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.717 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.77)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:07:24,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.934 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.707 = clip(base=0.627 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.567 novelty=0.69 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:07:25,218 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.985 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:07:25,411 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:25,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:26,159 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:26,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,261 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.988 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:33,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:07:33,858 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.715 = clip(base=0.635 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.650 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.51)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:07:34,057 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.718 = clip(base=0.638 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.655 novelty=0.69 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.53)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:07:34,260 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.830 = clip(base=0.750 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.826 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:07:34,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:34,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:34,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:07:35,080 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.950 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+
Iter 21 GRPO groups: 50%|##### | 10/20 [04:42<05:25, 32.52s/q, loss=0.0017, mean_r=0.893, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 55%|#####5 | 11/20 [04:42<04:13, 28.18s/q, loss=0.0017, mean_r=0.893, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:07:42,736 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.706 = 0.50×1.00(exact) + 0.40×proc(0.264[fin=0.21,mean=0.34]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:07:42,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×1.00(exact) + 0.40×proc(0.295[fin=0.26,mean=0.35]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:07:42,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.37(prox=0.37) + 0.40×proc(0.283[fin=0.01,mean=0.69]) + 0.10×fmt(1.000) | pred='9' gold='56' | step_acc=67% lccp=67% (chain=4/6 ok_count=4) n_steps=6
+2026-04-26 06:07:42,989 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×1.00(exact) + 0.40×proc(0.111[fin=0.14,mean=0.06]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:07:47,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×1.00(exact) + 0.40×proc(0.115[fin=0.13,mean=0.09]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:07:47,549 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.860 = 0.50×1.00(exact) + 0.40×proc(0.649[fin=0.54,mean=0.82]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:07:47,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.663 = 0.50×0.43(prox=0.43) + 0.40×proc(0.870[fin=0.96,mean=0.73]) + 0.10×fmt(1.000) | pred='19' gold='56' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 06:07:47,718 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.636 = 0.50×1.00(exact) + 0.40×proc(0.177[fin=0.08,mean=0.33]) + 0.10×fmt(0.650) | pred='56' gold='56' | step_acc=50% lccp=50% (chain=1/2 ok_count=1) n_steps=2
+2026-04-26 06:07:52,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.826 = 0.50×1.00(exact) + 0.40×proc(0.566[fin=0.65,mean=0.44]) + 0.10×fmt(1.000) | pred='56' gold='56' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:07:52,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.380 = 0.50×0.33(prox=0.33) + 0.40×proc(0.236[fin=0.12,mean=0.41]) + 0.10×fmt(1.000) | pred='0' gold='56' | step_acc=50% lccp=12% (chain=1/8 ok_count=4) n_steps=8
+
Iter 21 GRPO groups: 55%|#####5 | 11/20 [04:59<04:13, 28.18s/q, loss=-0.0022, mean_r=0.658, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 60%|###### | 12/20 [04:59<03:19, 24.90s/q, loss=-0.0022, mean_r=0.658, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:08:00,415 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.812 = 0.50×0.64(prox=0.64) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:00,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.803 = 0.50×0.64(prox=0.64) + 0.40×proc(0.962[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,431 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,516 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.804 = 0.50×0.64(prox=0.64) + 0.40×proc(0.964[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-1.43' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:09,611 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='$-1 \\frac{3}{7}$' gold='-2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:08:09,696 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.835 = 0.50×0.77(prox=0.77) + 0.40×proc(0.875[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='-1.7' gold='-2' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:08:18,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:18,761 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-2' gold='-2' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 60%|###### | 12/20 [05:31<03:19, 24.90s/q, loss=-0.0023, mean_r=0.875, q_acc=100%, q_rew=0.684, skip=3]
Iter 21 GRPO groups: 65%|######5 | 13/20 [05:31<03:09, 27.02s/q, loss=-0.0023, mean_r=0.875, q_acc=100%, q_rew=0.684, skip=3]2026-04-26 06:08:26,116 INFO src.rl.curriculum_manager - Topic probabilities (rollout 460): [('money_problems', '0.121'), ('time_distance', '0.121'), ('sets', '0.121'), ('combinatorics', '0.121'), ('sequences', '0.121')]
+2026-04-26 06:08:36,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:08:36,564 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.661 = clip(base=0.581 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.450 novelty=0.72 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.62)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:36,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.645 = clip(base=0.565 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.443 novelty=0.72 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 06:08:36,972 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:37,190 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.640 = clip(base=0.560 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.438 novelty=0.72 | sol=0.45*prm_final(0.23)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:08:37,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.643 = clip(base=0.563 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.442 novelty=0.72 | sol=0.45*prm_final(0.19)+0.35*prm_mean(0.67)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:08:37,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:37,810 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.789 novelty=0.72 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:38,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:08:38,236 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:08:51,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.718 novelty=0.75 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.80)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:51,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.595 novelty=0.75 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.57)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:08:51,727 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.878 novelty=0.75 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:51,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.499 novelty=0.75 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:08:52,140 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.990 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:08:52,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.392 novelty=0.75 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:08:52,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.491 = clip(base=0.411 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.253 novelty=0.75 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.27)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:08:52,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.732 novelty=0.75 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.84)+0.20*lccp(0.80) | steps=10
+2026-04-26 06:08:52,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.776 = clip(base=0.696 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.638 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.79)+0.20*lccp(0.71) | steps=7
+2026-04-26 06:08:53,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.690 novelty=0.75 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.76)+0.20*lccp(0.60) | steps=5
+
Iter 21 GRPO groups: 65%|######5 | 13/20 [06:00<03:09, 27.02s/q, loss=0.0009, mean_r=0.804, q_acc=100%, q_rew=0.704, skip=3]
Iter 21 GRPO groups: 70%|####### | 14/20 [06:00<02:45, 27.55s/q, loss=0.0009, mean_r=0.804, q_acc=100%, q_rew=0.704, skip=3]2026-04-26 06:08:58,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.844[fin=0.98,mean=0.63]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:08:58,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:08:58,575 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.838[fin=0.97,mean=0.64]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 06:08:58,652 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.910[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:09:04,939 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:09:05,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:09:05,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:09:05,169 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:09:10,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.85(prox=0.85) + 0.40×proc(0.761[fin=0.98,mean=0.43]) + 0.10×fmt(1.000) | pred='113' gold='107' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:09:10,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.945[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='107' gold='107' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 70%|####### | 14/20 [06:17<02:45, 27.55s/q, loss=-0.0007, mean_r=0.955, q_acc=100%, q_rew=0.704, skip=3]
Iter 21 GRPO groups: 75%|#######5 | 15/20 [06:17<02:02, 24.42s/q, loss=-0.0007, mean_r=0.955, q_acc=100%, q_rew=0.704, skip=3]2026-04-26 06:09:12,068 INFO src.rl.curriculum_manager - Topic probabilities (rollout 480): [('money_problems', '0.121'), ('time_distance', '0.121'), ('sets', '0.121'), ('combinatorics', '0.121'), ('sequences', '0.121')]
+2026-04-26 06:09:23,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:23,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:24,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:09:24,447 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.989 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:24,671 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.611 = clip(base=0.531 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.459 novelty=0.76 | sol=0.45*prm_final(0.26)+0.35*prm_mean(0.64)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:24,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:25,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.847 = clip(base=0.767 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.839 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.88)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:25,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:09:25,583 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:25,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:09:36,768 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.641 = clip(base=0.561 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.486 novelty=0.72 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.55)+0.20*lccp(0.12) | steps=8
+2026-04-26 06:09:36,994 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.675 novelty=0.72 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.76)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:09:37,229 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.742 novelty=0.72 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:09:37,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.672 = clip(base=0.592 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.578 novelty=0.72 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.53)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:37,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.808 = clip(base=0.728 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.779 novelty=0.72 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.75)+0.20*lccp(0.50) | steps=8
+2026-04-26 06:09:37,923 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.685 = clip(base=0.605 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.572 novelty=0.72 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.71)+0.20*lccp(0.78) | steps=9
+2026-04-26 06:09:38,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.773 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:38,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.661 novelty=0.72 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.67)+0.20*lccp(0.43) | steps=7
+2026-04-26 06:09:38,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.816 = clip(base=0.736 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.799 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.25) | steps=8
+2026-04-26 06:09:38,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.487 novelty=0.72 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.73)+0.20*lccp(0.71) | steps=7
+
Iter 21 GRPO groups: 75%|#######5 | 15/20 [06:46<02:02, 24.42s/q, loss=0.0008, mean_r=0.817, q_acc=100%, q_rew=0.695, skip=3]
Iter 21 GRPO groups: 80%|######## | 16/20 [06:46<01:42, 25.67s/q, loss=0.0008, mean_r=0.817, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 06:09:40,646 INFO src.rl.curriculum_manager - Topic probabilities (rollout 500): [('money_problems', '0.143'), ('time_distance', '0.143'), ('sets', '0.143'), ('combinatorics', '0.143'), ('sequences', '0.143')]
+2026-04-26 06:09:49,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.451 = clip(base=0.371 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.159 novelty=0.68 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.39)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:09:50,060 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.408 novelty=0.68 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:09:50,269 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.633 = clip(base=0.553 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.446 novelty=0.68 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.56)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:50,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.502 = clip(base=0.422 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.281 novelty=0.68 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.41)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:09:50,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.686 = clip(base=0.606 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.535 novelty=0.68 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.55)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:09:50,881 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:09:51,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.964 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:09:51,293 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.769 novelty=0.68 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.86)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:09:51,493 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.968 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:09:51,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.984 = clip(base=0.904 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.989 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:09:55,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,515 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:55,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,242 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.991 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,610 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,795 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:09:56,982 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.985 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+
Iter 21 GRPO groups: 80%|######## | 16/20 [07:04<01:42, 25.67s/q, loss=-0.0001, mean_r=0.858, q_acc=100%, q_rew=0.697, skip=3]
Iter 21 GRPO groups: 85%|########5 | 17/20 [07:04<01:10, 23.37s/q, loss=-0.0001, mean_r=0.858, q_acc=100%, q_rew=0.697, skip=3]2026-04-26 06:10:00,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:00,897 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,448 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,529 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:06,684 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,670 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 21 GRPO groups: 85%|########5 | 17/20 [07:22<01:10, 23.37s/q, loss=0var, mean_r=0.999, skip=4]
Iter 21 GRPO groups: 90%|######### | 18/20 [07:22<00:43, 21.90s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 06:10:22,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.458 = 0.50×0.50(prox=0.50) + 0.40×proc(0.269[fin=0.20,mean=0.38]) + 0.10×fmt(1.000) | pred='15' gold='30' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:10:22,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×0.36(prox=0.36) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='3' gold='30' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:10:22,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:10:23,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.399 = 0.50×0.00(prox=0.00) + 0.40×proc(0.748[fin=0.94,mean=0.46]) + 0.10×fmt(1.000) | pred='3 2/3' gold='30' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:10:31,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:10:31,697 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:31,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:31,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:36,638 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.908 = 0.50×1.00(exact) + 0.40×proc(0.770[fin=0.96,mean=0.48]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:10:36,721 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 21 GRPO groups: 90%|######### | 18/20 [07:43<00:43, 21.90s/q, loss=0.0007, mean_r=0.841, q_acc=100%, q_rew=0.697, skip=4]
Iter 21 GRPO groups: 95%|#########5| 19/20 [07:43<00:21, 21.65s/q, loss=0.0007, mean_r=0.841, q_acc=100%, q_rew=0.697, skip=4]2026-04-26 06:10:42,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:42,173 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:48,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:10:48,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:10:48,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:10:48,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:11:00,038 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:11:00,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:00,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:11:00,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='128' gold='128' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 21 GRPO groups: 95%|#########5| 19/20 [08:10<00:21, 21.65s/q, loss=0var, mean_r=0.989, skip=5]
Iter 21 GRPO groups: 100%|##########| 20/20 [08:10<00:00, 23.18s/q, loss=0var, mean_r=0.989, skip=5]
Iter 21 GRPO groups: 100%|##########| 20/20 [08:10<00:00, 24.52s/q, loss=0var, mean_r=0.989, skip=5]
+2026-04-26 06:11:04,949 INFO __main__ - Iter 21 | loss=0.0002 | reward mean=0.842 std=0.188 | gt_match=73.6% | grounded_acc=91.4% | step_acc=82.0% | lccp=69.3% | batch_acc=92.3% | phase=SELFPLAY_RAMP sp_ratio=29% | groups=21 skipped=5(0var=5) | lr=4.34e-06 | 490.4s
+2026-04-26 06:11:04,950 INFO __main__ - Question generation: 6/6 valid (100%) | q_reward=0.697 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.48 clarity=1.00 novelty=0.45 solvability=0.96
+2026-04-26 06:11:04,951 INFO __main__ - ======================================================================
+2026-04-26 06:11:04,951 INFO __main__ - GRPO ITERATION 22/60
+2026-04-26 06:11:04,951 INFO __main__ - ======================================================================
+2026-04-26 06:11:04,971 INFO __main__ - LR this iteration: 4.34e-06 | T=0.658 | MATH ratio=38%
+
Iter 22 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:11:04,973 INFO src.rl.curriculum_manager - Topic probabilities (rollout 520): [('money_problems', '0.107'), ('time_distance', '0.107'), ('sets', '0.107'), ('combinatorics', '0.107'), ('sequences', '0.107')]
+2026-04-26 06:11:12,213 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.968 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.960 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.981 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:12,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.823 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:11:13,079 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.700 = clip(base=0.620 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.627 novelty=0.70 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:11:13,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:13,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:14,167 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.981 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:49,889 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.951 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:11:50,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.736 = clip(base=0.656 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.654 novelty=0.76 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.70)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:11:50,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.710 = clip(base=0.630 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.634 novelty=0.76 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.62)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:11:50,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.439 = clip(base=0.359 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.229 novelty=0.76 | sol=0.45*prm_final(0.21)+0.35*prm_mean(0.24)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:11:50,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.689 = clip(base=0.609 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.594 novelty=0.76 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.52)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:11:50,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.684 = clip(base=0.604 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.580 novelty=0.76 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.55)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:11:51,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.867 = clip(base=0.787 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.915 novelty=0.76 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:11:51,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.730 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.78)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:11:51,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.702 = clip(base=0.622 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.604 novelty=0.76 | sol=0.45*prm_final(0.74)+0.35*prm_mean(0.68)+0.20*lccp(0.17) | steps=6
+
Iter 22 GRPO groups: 0%| | 0/20 [00:48, ?q/s, loss=-0.0004, mean_r=0.802, q_acc=100%, q_rew=0.608, skip=0]
Iter 22 GRPO groups: 5%|5 | 1/20 [00:48<15:20, 48.43s/q, loss=-0.0004, mean_r=0.802, q_acc=100%, q_rew=0.608, skip=0]2026-04-26 06:11:58,337 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:11:58,586 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,176 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:03,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:03,423 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:08,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:08,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='36' gold='36' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 22 GRPO groups: 5%|5 | 1/20 [01:03<15:20, 48.43s/q, loss=0var, mean_r=1.000, skip=1]
Iter 22 GRPO groups: 10%|# | 2/20 [01:03<08:38, 28.82s/q, loss=0var, mean_r=1.000, skip=1]2026-04-26 06:12:42,264 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=0.98,mean=0.82]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:12:42,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.686 = 0.50×0.60(prox=0.60) + 0.40×proc(0.714[fin=0.87,mean=0.48]) + 0.10×fmt(1.000) | pred='1000' gold='751' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 06:12:54,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:54,753 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:12:54,847 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:12:54,943 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:13:19,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(0.650) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:13:19,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='751' gold='751' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:19,203 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.874 = 0.50×0.85(prox=0.85) + 0.40×proc(0.873[fin=0.91,mean=0.82]) + 0.10×fmt(1.000) | pred='754' gold='751' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+
Iter 22 GRPO groups: 10%|# | 2/20 [02:16<08:38, 28.82s/q, loss=-0.0009, mean_r=0.941, q_acc=100%, q_rew=0.608, skip=1]
Iter 22 GRPO groups: 15%|#5 | 3/20 [02:16<13:49, 48.81s/q, loss=-0.0009, mean_r=0.941, q_acc=100%, q_rew=0.608, skip=1]2026-04-26 06:13:25,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:32,864 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:32,949 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:33,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:33,119 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:40,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:13:50,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='7200' gold='7200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 15%|#5 | 3/20 [02:45<13:49, 48.81s/q, loss=0var, mean_r=0.996, skip=2]
Iter 22 GRPO groups: 20%|## | 4/20 [02:45<10:57, 41.11s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 06:13:55,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.963 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:55,357 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:55,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:55,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.991 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:55,938 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.687 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:13:56,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:13:56,708 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:13:56,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.841 = clip(base=0.761 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.689 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:02,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:02,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:14:02,539 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:02,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:14:02,933 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:03,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.660 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.976 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:14:03,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.832 = clip(base=0.752 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.689 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:14:03,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 22 GRPO groups: 20%|## | 4/20 [03:00<10:57, 41.11s/q, loss=0.0053, mean_r=0.932, q_acc=100%, q_rew=0.735, skip=2]
Iter 22 GRPO groups: 25%|##5 | 5/20 [03:00<07:56, 31.76s/q, loss=0.0053, mean_r=0.932, q_acc=100%, q_rew=0.735, skip=2]2026-04-26 06:14:12,491 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:12,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.955[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:14:12,657 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:22,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:22,883 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.58(prox=0.58) + 0.40×proc(0.887[fin=1.00,mean=0.72]) + 0.10×fmt(1.000) | pred='30' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:22,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:23,051 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.834 = 0.50×0.73(prox=0.73) + 0.40×proc(0.918[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='18' gold='22' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:14:31,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:31,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:31,412 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 25%|##5 | 5/20 [03:27<07:56, 31.76s/q, loss=0.0003, mean_r=0.953, q_acc=100%, q_rew=0.735, skip=2]
Iter 22 GRPO groups: 30%|### | 6/20 [03:27<07:03, 30.25s/q, loss=0.0003, mean_r=0.953, q_acc=100%, q_rew=0.735, skip=2]2026-04-26 06:14:37,683 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:48,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:14:48,998 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:49,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:49,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,299 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:14:52,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:14:52,457 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:14:57,430 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(0.650) | pred='-10' gold='-10' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 22 GRPO groups: 30%|### | 6/20 [03:52<07:03, 30.25s/q, loss=0var, mean_r=0.977, skip=3]
Iter 22 GRPO groups: 35%|###5 | 7/20 [03:52<06:08, 28.38s/q, loss=0var, mean_r=0.977, skip=3]2026-04-26 06:15:02,323 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:02,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:02,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:15:05,199 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:15:05,283 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:15:05,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.976[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:15:05,443 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:15:14,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:14,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:15:14,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(0.650) | pred='-1' gold='-1' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 22 GRPO groups: 35%|###5 | 7/20 [04:10<06:08, 28.38s/q, loss=0.0028, mean_r=0.984, q_acc=100%, q_rew=0.735, skip=3]
Iter 22 GRPO groups: 40%|#### | 8/20 [04:10<05:01, 25.16s/q, loss=0.0028, mean_r=0.984, q_acc=100%, q_rew=0.735, skip=3]2026-04-26 06:15:18,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:22,907 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:22,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:23,076 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:23,161 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,694 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,776 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,859 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:28,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:15:34,015 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 22 GRPO groups: 40%|#### | 8/20 [04:29<05:01, 25.16s/q, loss=0var, mean_r=1.000, skip=4]
Iter 22 GRPO groups: 45%|####5 | 9/20 [04:29<04:13, 23.02s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 06:15:50,875 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:51,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:51,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:51,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.986 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:51,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.992 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 06:15:52,523 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:15:52,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.989 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=9
+2026-04-26 06:16:02,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:16:02,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.665 = clip(base=0.585 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.514 novelty=0.72 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.60)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:16:02,507 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.475 = clip(base=0.395 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.269 novelty=0.72 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.48)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:16:02,721 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.715 novelty=0.72 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:16:02,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.973 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:16:03,168 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:16:03,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:16:03,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.811 = clip(base=0.731 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.763 novelty=0.72 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:16:03,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:16:04,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.994 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+
Iter 22 GRPO groups: 45%|####5 | 9/20 [05:00<04:13, 23.02s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.722, skip=4]
Iter 22 GRPO groups: 50%|##### | 10/20 [05:00<04:17, 25.72s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.722, skip=4]2026-04-26 06:16:12,010 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:12,096 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:12,182 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.296 = 0.50×0.10(prox=0.10) + 0.40×proc(0.371[fin=0.39,mean=0.35]) + 0.10×fmt(1.000) | pred='86' gold='15' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 06:16:19,333 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,417 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:19,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:27,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.685 = 0.50×0.56(prox=0.56) + 0.40×proc(0.769[fin=0.89,mean=0.58]) + 0.10×fmt(1.000) | pred='21' gold='15' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:16:27,523 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:16:27,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 50%|##### | 10/20 [05:24<04:17, 25.72s/q, loss=-0.0005, mean_r=0.896, q_acc=100%, q_rew=0.722, skip=4]
Iter 22 GRPO groups: 55%|#####5 | 11/20 [05:24<03:44, 24.97s/q, loss=-0.0005, mean_r=0.896, q_acc=100%, q_rew=0.722, skip=4]2026-04-26 06:16:33,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,037 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,122 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,207 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:40,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:48,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:16:56,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='475' gold='475' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 55%|#####5 | 11/20 [05:51<03:44, 24.97s/q, loss=0var, mean_r=0.994, skip=5]
Iter 22 GRPO groups: 60%|###### | 12/20 [05:51<03:25, 25.72s/q, loss=0var, mean_r=0.994, skip=5]2026-04-26 06:17:01,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.988 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:01,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.987 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:01,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,320 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.993 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.986 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:02,908 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:03,108 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:03,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.993 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,367 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.952 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,565 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.957 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:07,963 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.759 = clip(base=0.679 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.734 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:17:08,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.975 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.882 = clip(base=0.802 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.938 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.957 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:08,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:17:09,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 22 GRPO groups: 60%|###### | 12/20 [06:05<03:25, 25.72s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.692, skip=5]
Iter 22 GRPO groups: 65%|######5 | 13/20 [06:05<02:36, 22.29s/q, loss=-0.0004, mean_r=0.904, q_acc=100%, q_rew=0.692, skip=5]2026-04-26 06:17:18,999 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:17:19,093 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.950[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:17:19,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:17:31,911 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:31,996 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:17:32,091 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:32,185 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.27(prox=0.27) + 0.40×proc(0.897[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='35' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 06:17:43,598 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:17:43,682 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.916[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 06:17:43,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 22 GRPO groups: 65%|######5 | 13/20 [06:40<02:36, 22.29s/q, loss=0.0001, mean_r=0.861, q_acc=100%, q_rew=0.692, skip=5]
Iter 22 GRPO groups: 70%|####### | 14/20 [06:40<02:35, 25.93s/q, loss=0.0001, mean_r=0.861, q_acc=100%, q_rew=0.692, skip=5]2026-04-26 06:18:07,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.637 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.54)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:07,707 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.890 = clip(base=0.810 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.942 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:07,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.843 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=6
+2026-04-26 06:18:08,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.941 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:08,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.674 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:08,563 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.772 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.43) | steps=7
+2026-04-26 06:18:08,767 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.736 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:18:08,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.773 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:09,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.778 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:09,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.745 = clip(base=0.665 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.680 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:18:22,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.940 novelty=0.82 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:23,096 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.976 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:18:23,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.829 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:18:23,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.393 novelty=0.82 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:23,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.987 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:18:23,932 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.848 = clip(base=0.768 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.751 novelty=0.82 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.78)+0.20*lccp(0.50) | steps=6
+2026-04-26 06:18:24,146 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.769 novelty=0.82 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.81)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:18:24,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.839 = clip(base=0.759 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.717 novelty=0.82 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.63)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:24,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.724 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.65)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:18:24,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.982 novelty=0.82 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+
Iter 22 GRPO groups: 70%|####### | 14/20 [07:21<02:35, 25.93s/q, loss=0.0007, mean_r=0.839, q_acc=100%, q_rew=0.696, skip=5]
Iter 22 GRPO groups: 75%|#######5 | 15/20 [07:21<02:32, 30.56s/q, loss=0.0007, mean_r=0.839, q_acc=100%, q_rew=0.696, skip=5]2026-04-26 06:18:29,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,534 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,617 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:36,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:43,942 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:44,194 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:18:50,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='90' gold='90' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 22 GRPO groups: 75%|#######5 | 15/20 [07:45<02:32, 30.56s/q, loss=0var, mean_r=0.999, skip=6]
Iter 22 GRPO groups: 80%|######## | 16/20 [07:45<01:54, 28.54s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 06:18:55,559 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.800 = 0.50×1.00(exact) + 0.40×proc(0.500[fin=0.42,mean=0.62]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:18:55,646 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:18:55,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,810 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,902 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:09,985 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:10,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,540 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.974[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:19,707 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='66' gold='66' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 22 GRPO groups: 80%|######## | 16/20 [08:16<01:54, 28.54s/q, loss=0.0001, mean_r=0.976, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 85%|########5 | 17/20 [08:16<01:27, 29.22s/q, loss=0.0001, mean_r=0.976, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:19:27,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:38,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:38,764 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:19:38,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.631[fin=0.60,mean=0.68]) + 0.10×fmt(1.000) | pred='120' gold='480' | step_acc=71% lccp=57% (chain=4/7 ok_count=5) n_steps=7
+2026-04-26 06:19:38,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:56,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:56,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:19:56,921 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:19:57,005 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='480' gold='480' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:20:06,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.739 = 0.50×0.50(prox=0.50) + 0.40×proc(0.973[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='720' gold='480' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 22 GRPO groups: 85%|########5 | 17/20 [09:03<01:27, 29.22s/q, loss=-0.0003, mean_r=0.929, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 90%|######### | 18/20 [09:03<01:09, 34.51s/q, loss=-0.0003, mean_r=0.929, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:20:42,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=10/10 ok_count=10) n_steps=10
+2026-04-26 06:20:42,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.723 = 0.50×0.55(prox=0.55) + 0.40×proc(0.870[fin=0.90,mean=0.82]) + 0.10×fmt(1.000) | pred='29' gold='49' | step_acc=82% lccp=64% (chain=7/11 ok_count=9) n_steps=11
+2026-04-26 06:20:42,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=0.98,mean=0.98]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:20:44,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 06:20:45,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.956 = 0.50×1.00(exact) + 0.40×proc(0.891[fin=1.00,mean=0.73]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=75% lccp=33% (chain=4/12 ok_count=9) n_steps=12
+2026-04-26 06:20:45,163 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='49' gold='49' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:20:45,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.365 = 0.50×0.30(prox=0.30) + 0.40×proc(0.282[fin=0.13,mean=0.51]) + 0.10×fmt(1.000) | pred='105' gold='49' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+
Iter 22 GRPO groups: 90%|######### | 18/20 [09:55<01:09, 34.51s/q, loss=0.0000, mean_r=0.862, q_acc=100%, q_rew=0.696, skip=6]
Iter 22 GRPO groups: 95%|#########5| 19/20 [09:55<00:39, 39.79s/q, loss=0.0000, mean_r=0.862, q_acc=100%, q_rew=0.696, skip=6]2026-04-26 06:21:06,268 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.726 = clip(base=0.646 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.600 novelty=0.70 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:06,468 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.794 = clip(base=0.714 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.698 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:06,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.640 novelty=0.70 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.52)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:06,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.752 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:07,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.654 novelty=0.70 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.641 novelty=0.70 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.60)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.589 = clip(base=0.509 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.394 novelty=0.70 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.34)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:21:07,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.795 = clip(base=0.715 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.712 novelty=0.70 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:21:07,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.697 = clip(base=0.617 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.546 novelty=0.70 | sol=0.45*prm_final(0.76)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:21:08,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.513 novelty=0.70 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.36)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:21:13,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:21:13,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.979 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:13,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.979 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:13,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.961 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:14,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:14,946 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:15,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 22 GRPO groups: 95%|#########5| 19/20 [10:11<00:39, 39.79s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
Iter 22 GRPO groups: 100%|##########| 20/20 [10:11<00:00, 32.87s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
Iter 22 GRPO groups: 100%|##########| 20/20 [10:11<00:00, 30.59s/q, loss=0.0002, mean_r=0.848, q_acc=100%, q_rew=0.700, skip=6]
+2026-04-26 06:21:16,825 INFO __main__ - Iter 22 | loss=0.0006 | reward mean=0.918 std=0.124 | gt_match=90.4% | grounded_acc=98.5% | step_acc=96.5% | lccp=92.8% | batch_acc=98.4% | phase=SELFPLAY_RAMP sp_ratio=32% | groups=20 skipped=6(0var=6) | lr=4.24e-06 | 611.9s
+2026-04-26 06:21:16,825 INFO __main__ - Question generation: 6/6 valid (100%) | q_reward=0.700 | q_acc=100.0% (>0.5 quality) | topic=0.62 diff=0.39 clarity=1.00 novelty=0.46 solvability=0.98
+2026-04-26 06:21:16,826 INFO __main__ - ======================================================================
+2026-04-26 06:21:16,826 INFO __main__ - GRPO ITERATION 23/60
+2026-04-26 06:21:16,826 INFO __main__ - ======================================================================
+2026-04-26 06:21:16,847 INFO __main__ - LR this iteration: 4.24e-06 | T=0.651 | MATH ratio=40%
+
Iter 23 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:21:21,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,389 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,779 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:21,977 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,178 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,374 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:22,938 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=1.000 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:28,794 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:28,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,751 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:29,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,135 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:30,510 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 0%| | 0/20 [00:15, ?q/s, loss=0.0001, mean_r=0.967, q_acc=100%, q_rew=0.722, skip=0]
Iter 23 GRPO groups: 5%|5 | 1/20 [00:15<04:51, 15.36s/q, loss=0.0001, mean_r=0.967, q_acc=100%, q_rew=0.722, skip=0]2026-04-26 06:21:41,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.878 novelty=0.73 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:41,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.936 novelty=0.73 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:41,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.679 = clip(base=0.599 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.469 novelty=0.73 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.54)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:21:41,939 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.836 novelty=0.73 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.76)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:42,155 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.639 novelty=0.73 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:21:42,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:42,594 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.803 = clip(base=0.723 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.664 novelty=0.73 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.55)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:42,801 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.873 novelty=0.73 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:43,008 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.613 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.60)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:21:43,222 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.729 = clip(base=0.649 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.540 novelty=0.73 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:21:49,104 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.934 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:49,301 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.963 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:49,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.961 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:21:49,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.988 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:49,894 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,096 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.965 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,295 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.976 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:21:50,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.932 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:21:50,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 23 GRPO groups: 5%|5 | 1/20 [00:35<04:51, 15.36s/q, loss=-0.0003, mean_r=0.903, q_acc=100%, q_rew=0.748, skip=0]
Iter 23 GRPO groups: 10%|# | 2/20 [00:35<05:29, 18.32s/q, loss=-0.0003, mean_r=0.903, q_acc=100%, q_rew=0.748, skip=0]2026-04-26 06:21:57,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:21:57,278 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:21:57,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:21:57,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:22:04,899 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:22:04,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:05,050 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:05,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:22:11,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:22:11,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 10%|# | 2/20 [00:54<05:29, 18.32s/q, loss=0var, mean_r=0.986, skip=1]
Iter 23 GRPO groups: 15%|#5 | 3/20 [00:54<05:16, 18.64s/q, loss=0var, mean_r=0.986, skip=1]2026-04-26 06:22:16,848 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:22:16,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.35(prox=0.35) + 0.40×proc(0.599[fin=0.57,mean=0.64]) + 0.10×fmt(1.000) | pred='1.25' gold='20' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 06:22:28,976 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.930[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=88% lccp=50% (chain=4/8 ok_count=7) n_steps=8
+2026-04-26 06:22:29,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.811 = 0.50×0.71(prox=0.71) + 0.40×proc(0.884[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='16' gold='20' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:22:29,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.843 = 0.50×0.71(prox=0.71) + 0.40×proc(0.966[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='24' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:22:29,234 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.40(prox=0.40) + 0.40×proc(0.768[fin=0.91,mean=0.56]) + 0.10×fmt(1.000) | pred='5' gold='20' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 06:22:41,174 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:22:41,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.43(prox=0.43) + 0.40×proc(0.680[fin=0.88,mean=0.38]) + 0.10×fmt(1.000) | pred='6.67' gold='20' | step_acc=29% lccp=14% (chain=1/7 ok_count=2) n_steps=7
+2026-04-26 06:22:41,346 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.837[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='40' gold='20' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 06:22:41,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.718 = 0.50×0.76(prox=0.76) + 0.40×proc(0.599[fin=0.69,mean=0.46]) + 0.10×fmt(1.000) | pred='16.8' gold='20' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+
Iter 23 GRPO groups: 15%|#5 | 3/20 [01:38<05:16, 18.64s/q, loss=0.0001, mean_r=0.754, q_acc=100%, q_rew=0.748, skip=1]
Iter 23 GRPO groups: 20%|## | 4/20 [01:38<07:37, 28.58s/q, loss=0.0001, mean_r=0.754, q_acc=100%, q_rew=0.748, skip=1]2026-04-26 06:23:29,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.44(prox=0.44) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='13' gold='8' | step_acc=100% lccp=100% (chain=16/16 ok_count=16) n_steps=16
+2026-04-26 06:23:29,107 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.855 = 0.50×0.80(prox=0.80) + 0.40×proc(0.888[fin=0.88,mean=0.91]) + 0.10×fmt(1.000) | pred='9' gold='8' | step_acc=91% lccp=82% (chain=9/11 ok_count=10) n_steps=11
+
Iter 23 GRPO groups: 20%|## | 4/20 [02:12<07:37, 28.58s/q, loss=-0.0013, mean_r=0.703, q_acc=100%, q_rew=0.748, skip=1]
Iter 23 GRPO groups: 25%|##5 | 5/20 [02:12<07:38, 30.54s/q, loss=-0.0013, mean_r=0.703, q_acc=100%, q_rew=0.748, skip=1]2026-04-26 06:23:35,502 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:35,701 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:35,900 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,104 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.648 = clip(base=0.568 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.509 novelty=0.66 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.57)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:23:36,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:36,924 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.602 = clip(base=0.522 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.491 novelty=0.66 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:23:37,127 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:37,328 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:23:44,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:44,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.963 novelty=0.66 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:44,781 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:23:44,984 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.945 novelty=0.66 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.962 novelty=0.66 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.934 = clip(base=0.854 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.960 novelty=0.66 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.66 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.993 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:45,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.857 novelty=0.66 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:23:46,187 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.66 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 25%|##5 | 5/20 [02:31<07:38, 30.54s/q, loss=-0.0004, mean_r=0.917, q_acc=100%, q_rew=0.731, skip=1]
Iter 23 GRPO groups: 30%|### | 6/20 [02:31<06:11, 26.52s/q, loss=-0.0004, mean_r=0.917, q_acc=100%, q_rew=0.731, skip=1]2026-04-26 06:23:52,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:52,522 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:23:57,506 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,261 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,342 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:04,425 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='228' gold='228' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 30%|### | 6/20 [02:54<06:11, 26.52s/q, loss=0var, mean_r=0.999, skip=2]
Iter 23 GRPO groups: 35%|###5 | 7/20 [02:54<05:29, 25.37s/q, loss=0var, mean_r=0.999, skip=2]2026-04-26 06:24:14,244 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:14,484 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,569 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,733 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:19,816 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:24,221 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:24,301 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 35%|###5 | 7/20 [03:07<05:29, 25.37s/q, loss=0var, mean_r=0.998, skip=3]
Iter 23 GRPO groups: 40%|#### | 8/20 [03:07<04:17, 21.47s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 06:24:28,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.942 = 0.50×1.00(exact) + 0.40×proc(0.855[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:24:28,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,243 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,321 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:33,404 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,267 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:24:38,344 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:38,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='48' gold='48' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 40%|#### | 8/20 [03:26<04:17, 21.47s/q, loss=0var, mean_r=0.980, skip=4]
Iter 23 GRPO groups: 45%|####5 | 9/20 [03:26<03:47, 20.71s/q, loss=0var, mean_r=0.980, skip=4]2026-04-26 06:24:47,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.367 = 0.50×0.50(prox=0.50) + 0.40×proc(0.043[fin=0.03,mean=0.06]) + 0.10×fmt(1.000) | pred='2' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:24:47,934 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:24:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.923[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:24:48,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='4' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:24:50,841 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.217 = 0.50×0.00(prox=0.00) + 0.40×proc(0.380[fin=0.47,mean=0.24]) + 0.10×fmt(0.650) | pred='4/7' gold='4' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 06:24:50,935 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.281 = 0.50×0.00(prox=0.00) + 0.40×proc(0.358[fin=0.38,mean=0.32]) + 0.10×fmt(1.000) | pred='4/3' gold='4' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:24:51,031 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:24:51,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.143 = 0.50×0.00(prox=0.00) + 0.40×proc(0.182[fin=0.15,mean=0.24]) + 0.10×fmt(0.700) | pred='' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:25:01,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.212 = 0.50×0.00(prox=0.00) + 0.40×proc(0.279[fin=0.29,mean=0.26]) + 0.10×fmt(1.000) | pred='2/3' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:25:01,340 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 45%|####5 | 9/20 [03:45<03:47, 20.71s/q, loss=0.0001, mean_r=0.517, q_acc=100%, q_rew=0.731, skip=4]
Iter 23 GRPO groups: 50%|##### | 10/20 [03:45<03:23, 20.34s/q, loss=0.0001, mean_r=0.517, q_acc=100%, q_rew=0.731, skip=4]2026-04-26 06:25:09,211 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:09,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:09,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:09,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,230 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,434 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:25:10,851 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:11,056 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:25:14,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.958 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.976 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.977 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,470 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.958 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:15,676 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:15,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:16,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:16,266 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:25:16,458 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:25:16,652 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.988 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+
Iter 23 GRPO groups: 50%|##### | 10/20 [04:01<03:23, 20.34s/q, loss=0.0013, mean_r=0.956, q_acc=100%, q_rew=0.724, skip=4]
Iter 23 GRPO groups: 55%|#####5 | 11/20 [04:01<02:49, 18.85s/q, loss=0.0013, mean_r=0.956, q_acc=100%, q_rew=0.724, skip=4]2026-04-26 06:25:19,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:20,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,653 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:24,732 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,456 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:25:29,608 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 55%|#####5 | 11/20 [04:17<02:49, 18.85s/q, loss=0var, mean_r=0.997, skip=5]
Iter 23 GRPO groups: 60%|###### | 12/20 [04:17<02:23, 17.95s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 06:25:38,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.783 = 0.50×0.64(prox=0.64) + 0.40×proc(0.912[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:25:38,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.780 = 0.50×0.64(prox=0.64) + 0.40×proc(0.905[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 06:25:38,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.749 = 0.50×0.64(prox=0.64) + 0.40×proc(0.828[fin=0.93,mean=0.67]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:38,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.853 = 0.50×0.78(prox=0.78) + 0.40×proc(0.910[fin=0.98,mean=0.80]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:25:48,241 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.781 = 0.50×0.64(prox=0.64) + 0.40×proc(0.906[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='2700' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:48,324 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.861 = 0.50×0.78(prox=0.78) + 0.40×proc(0.931[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:25:48,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.78(prox=0.78) + 0.40×proc(0.833[fin=1.00,mean=0.59]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:25:48,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.853[fin=0.91,mean=0.77]) + 0.10×fmt(1.000) | pred='2100' gold='2100' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:26:01,562 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.880 = 0.50×0.78(prox=0.78) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:01,647 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.867 = 0.50×0.78(prox=0.78) + 0.40×proc(0.946[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='2400' gold='2100' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 60%|###### | 12/20 [04:46<02:23, 17.95s/q, loss=0.0008, mean_r=0.832, q_acc=100%, q_rew=0.724, skip=5]
Iter 23 GRPO groups: 65%|######5 | 13/20 [04:46<02:28, 21.28s/q, loss=0.0008, mean_r=0.832, q_acc=100%, q_rew=0.724, skip=5]2026-04-26 06:26:07,369 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:07,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:13,521 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:13,606 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:13,693 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:13,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:21,117 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:26:21,201 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:21,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:21,363 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 65%|######5 | 13/20 [05:13<02:28, 21.28s/q, loss=0var, mean_r=1.000, skip=6]
Iter 23 GRPO groups: 70%|####### | 14/20 [05:13<02:18, 23.02s/q, loss=0var, mean_r=1.000, skip=6]2026-04-26 06:26:33,759 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:33,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,352 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,734 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:34,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,121 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,315 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:35,524 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:39,581 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.790 = clip(base=0.710 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:26:39,772 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:39,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.992 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.989 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:26:40,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.772 = clip(base=0.692 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:26:40,539 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:40,922 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:41,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:26:41,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.772 = clip(base=0.692 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.699 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+
Iter 23 GRPO groups: 70%|####### | 14/20 [05:26<02:18, 23.02s/q, loss=0.0040, mean_r=0.926, q_acc=100%, q_rew=0.717, skip=6]
Iter 23 GRPO groups: 75%|#######5 | 15/20 [05:26<01:39, 19.94s/q, loss=0.0040, mean_r=0.926, q_acc=100%, q_rew=0.717, skip=6]2026-04-26 06:26:46,247 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:46,328 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:46,405 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:46,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,735 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:26:51,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:26:51,977 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:57,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:26:57,179 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='50' gold='50' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 23 GRPO groups: 75%|#######5 | 15/20 [05:40<01:39, 19.94s/q, loss=0var, mean_r=0.987, skip=7]
Iter 23 GRPO groups: 80%|######## | 16/20 [05:40<01:12, 18.21s/q, loss=0var, mean_r=0.987, skip=7]2026-04-26 06:27:05,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.968 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:27:05,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.716 = clip(base=0.636 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.583 novelty=0.71 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:27:05,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.724 novelty=0.71 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.72)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:27:06,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.849 = clip(base=0.769 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.809 novelty=0.71 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.80)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:27:06,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.409 novelty=0.71 | sol=0.45*prm_final(0.40)+0.35*prm_mean(0.43)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:27:06,515 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.677 = clip(base=0.597 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.519 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.55)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:27:06,721 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.611 = clip(base=0.531 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.485 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=6
+2026-04-26 06:27:06,925 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.729 = clip(base=0.649 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.600 novelty=0.71 | sol=0.45*prm_final(0.57)+0.35*prm_mean(0.63)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:27:07,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.839 = clip(base=0.759 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.791 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:27:07,335 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.649 novelty=0.71 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:27:14,793 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,007 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,658 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:15,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,122 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:27:16,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+
Iter 23 GRPO groups: 80%|######## | 16/20 [06:01<01:12, 18.21s/q, loss=-0.0001, mean_r=0.859, q_acc=100%, q_rew=0.716, skip=7]
Iter 23 GRPO groups: 85%|########5 | 17/20 [06:01<00:57, 19.13s/q, loss=-0.0001, mean_r=0.859, q_acc=100%, q_rew=0.716, skip=7]2026-04-26 06:27:22,060 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:22,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,565 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:28,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,218 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:27:35,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 23 GRPO groups: 85%|########5 | 17/20 [06:18<00:57, 19.13s/q, loss=0var, mean_r=0.999, skip=8]
Iter 23 GRPO groups: 90%|######### | 18/20 [06:18<00:36, 18.44s/q, loss=0var, mean_r=0.999, skip=8]2026-04-26 06:27:42,106 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.985 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:27:42,296 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.795 = clip(base=0.715 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.678 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:27:42,491 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.729 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:27:42,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.981 = clip(base=0.901 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.966 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:42,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.694 = clip(base=0.614 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.578 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:27:43,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.963 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,474 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.994 = clip(base=0.914 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:27:43,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.997 = clip(base=0.917 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.985 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:27:43,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.715 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.76)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:28:20,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:20,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:28:20,708 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.841 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:28:20,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.984 = clip(base=0.904 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.987 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,105 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.943 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:21,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.959 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:21,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:28:21,893 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.994 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 23 GRPO groups: 90%|######### | 18/20 [07:06<00:36, 18.44s/q, loss=0.0010, mean_r=0.944, q_acc=100%, q_rew=0.726, skip=8]
Iter 23 GRPO groups: 95%|#########5| 19/20 [07:06<00:27, 27.38s/q, loss=0.0010, mean_r=0.944, q_acc=100%, q_rew=0.726, skip=8]2026-04-26 06:28:27,656 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,738 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:27,903 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:36,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:36,155 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:28:36,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:36,318 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:41,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:28:41,136 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='195' gold='195' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 23 GRPO groups: 95%|#########5| 19/20 [07:24<00:27, 27.38s/q, loss=0var, mean_r=0.998, skip=9]
Iter 23 GRPO groups: 100%|##########| 20/20 [07:24<00:00, 24.45s/q, loss=0var, mean_r=0.998, skip=9]
Iter 23 GRPO groups: 100%|##########| 20/20 [07:24<00:00, 22.21s/q, loss=0var, mean_r=0.998, skip=9]
+2026-04-26 06:28:41,148 INFO __main__ - Iter 23 | loss=0.0006 | reward mean=0.921 std=0.147 | gt_match=80.3% | grounded_acc=95.1% | step_acc=90.8% | lccp=84.8% | batch_acc=97.7% | phase=SELFPLAY_RAMP sp_ratio=36% | groups=18 skipped=9(0var=9) | lr=4.14e-06 | 444.3s
+2026-04-26 06:28:41,149 WARNING __main__ - STARVATION: 33% of groups skipped (zero variance). grounded_acc=95.1% suggests curriculum is too easy (raise alpha). Consider adjusting --difficulty-alpha.
+2026-04-26 06:28:41,149 INFO __main__ - Question generation: 7/7 valid (100%) | q_reward=0.726 | q_acc=100.0% (>0.5 quality) | topic=0.56 diff=0.66 clarity=1.00 novelty=0.44 solvability=0.99
+2026-04-26 06:28:41,150 INFO __main__ - ======================================================================
+2026-04-26 06:28:41,150 INFO __main__ - GRPO ITERATION 24/60
+2026-04-26 06:28:41,150 INFO __main__ - ======================================================================
+2026-04-26 06:28:41,169 INFO __main__ - LR this iteration: 4.14e-06 | T=0.644 | MATH ratio=42%
+
Iter 24 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:28:45,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,755 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:45,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,321 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,506 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:46,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:47,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,449 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.964 novelty=0.71 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:52,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.880 = clip(base=0.800 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.952 novelty=0.71 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.945 novelty=0.71 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.974 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:53,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:28:54,027 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.974 novelty=0.71 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 0%| | 0/20 [00:14, ?q/s, loss=-0.0001, mean_r=0.899, q_acc=100%, q_rew=0.575, skip=0]
Iter 24 GRPO groups: 5%|5 | 1/20 [00:14<04:35, 14.52s/q, loss=-0.0001, mean_r=0.899, q_acc=100%, q_rew=0.575, skip=0]2026-04-26 06:28:58,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:28:58,914 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.713 = 0.50×0.67(prox=0.67) + 0.40×proc(0.786[fin=0.97,mean=0.51]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:06,145 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:29:06,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.679 = 0.50×0.67(prox=0.67) + 0.40×proc(0.702[fin=0.87,mean=0.45]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:06,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:29:06,393 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.717 = 0.50×0.67(prox=0.67) + 0.40×proc(0.796[fin=0.98,mean=0.51]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:15,158 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.693 = 0.50×0.67(prox=0.67) + 0.40×proc(0.738[fin=0.91,mean=0.47]) + 0.10×fmt(0.650) | pred='12' gold='16' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:29:15,242 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:29:15,327 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.744 = 0.50×0.67(prox=0.67) + 0.40×proc(0.777[fin=0.98,mean=0.48]) + 0.10×fmt(1.000) | pred='12' gold='16' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:29:15,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 24 GRPO groups: 5%|5 | 1/20 [00:43<04:35, 14.52s/q, loss=0.0018, mean_r=0.850, q_acc=100%, q_rew=0.575, skip=0]
Iter 24 GRPO groups: 10%|# | 2/20 [00:43<06:57, 23.18s/q, loss=0.0018, mean_r=0.850, q_acc=100%, q_rew=0.575, skip=0]2026-04-26 06:29:31,262 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:31,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:29:31,428 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:31,514 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.948[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.965 = 0.50×1.00(exact) + 0.40×proc(0.912[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=88% lccp=12% (chain=1/8 ok_count=7) n_steps=8
+2026-04-26 06:29:38,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:38,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:29:46,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:29:46,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='130' gold='130' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 10%|# | 2/20 [01:05<06:57, 23.18s/q, loss=0var, mean_r=0.991, skip=1]
Iter 24 GRPO groups: 15%|#5 | 3/20 [01:05<06:21, 22.44s/q, loss=0var, mean_r=0.991, skip=1]2026-04-26 06:29:54,343 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.981 novelty=0.61 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:29:54,541 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.733 = clip(base=0.653 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.713 novelty=0.61 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:29:54,741 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.390 = clip(base=0.310 + mod=+0.080, cap=1.00) | Q=0.50 sol=0.184 novelty=0.61 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.31)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:54,945 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.509 = clip(base=0.429 + mod=+0.080, cap=1.00) | Q=0.49 sol=0.386 novelty=0.61 | sol=0.45*prm_final(0.41)+0.35*prm_mean(0.38)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:55,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.698 = clip(base=0.618 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.649 novelty=0.61 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.72)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:29:55,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.412 = clip(base=0.332 + mod=+0.080, cap=1.00) | Q=0.49 sol=0.224 novelty=0.61 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:29:55,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.985 novelty=0.61 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:29:55,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.636 = clip(base=0.556 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.531 novelty=0.61 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.64)+0.20*lccp(0.57) | steps=7
+2026-04-26 06:29:55,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.667 = clip(base=0.587 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.603 novelty=0.61 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.63)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:29:56,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.973 novelty=0.61 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:02,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.957 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:02,646 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.934 = clip(base=0.854 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.969 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:02,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.978 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.962 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:03,255 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:03,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:03,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.792 = clip(base=0.712 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.732 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:30:04,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:04,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+
Iter 24 GRPO groups: 15%|#5 | 3/20 [01:24<06:21, 22.44s/q, loss=0.0005, mean_r=0.801, q_acc=100%, q_rew=0.597, skip=1]
Iter 24 GRPO groups: 20%|## | 4/20 [01:24<05:40, 21.27s/q, loss=0.0005, mean_r=0.801, q_acc=100%, q_rew=0.597, skip=1]2026-04-26 06:30:15,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:15,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:15,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.66 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.989 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:16,771 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=1.000 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:16,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:30:17,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:30:17,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.975 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:30:23,279 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.691 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.725 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,688 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.754 = clip(base=0.674 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.689 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.68)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:23,895 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.742 = clip(base=0.662 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.668 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.713 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,313 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.702 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.768 = clip(base=0.688 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.708 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.724 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:24,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.722 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:30:25,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.788 = clip(base=0.708 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.726 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.00) | steps=3
+
Iter 24 GRPO groups: 20%|## | 4/20 [01:45<05:40, 21.27s/q, loss=0.0003, mean_r=0.861, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 25%|##5 | 5/20 [01:45<05:17, 21.14s/q, loss=0.0003, mean_r=0.861, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:30:35,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.386 = 0.50×0.00(prox=0.00) + 0.40×proc(0.714[fin=0.92,mean=0.41]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:30:35,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.449 = 0.50×0.00(prox=0.00) + 0.40×proc(0.873[fin=0.99,mean=0.70]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:30:47,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.380 = 0.50×0.47(prox=0.47) + 0.40×proc(0.109[fin=0.11,mean=0.10]) + 0.10×fmt(1.000) | pred='4' gold='9' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:30:47,650 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.844 = 0.50×0.82(prox=0.82) + 0.40×proc(0.837[fin=0.98,mean=0.62]) + 0.10×fmt(1.000) | pred='8' gold='9' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 06:30:47,742 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.117 = 0.50×0.00(prox=0.00) + 0.40×proc(0.117[fin=0.08,mean=0.18]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+2026-04-26 06:30:47,825 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:30:56,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.987[fin=0.99,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:30:56,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:30:56,623 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.365 = 0.50×0.00(prox=0.00) + 0.40×proc(0.663[fin=0.91,mean=0.30]) + 0.10×fmt(1.000) | pred='$4\\sqrt{2}$' gold='9' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:30:56,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='9' gold='9' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 25%|##5 | 5/20 [02:23<05:17, 21.14s/q, loss=0.0002, mean_r=0.602, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 30%|### | 6/20 [02:23<06:15, 26.84s/q, loss=0.0002, mean_r=0.602, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:31:09,543 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:09,627 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.558 = 0.50×0.34(prox=0.34) + 0.40×proc(0.716[fin=0.82,mean=0.56]) + 0.10×fmt(1.000) | pred='0.8' gold='20' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:31:09,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:09,791 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.928 = 0.50×1.00(exact) + 0.40×proc(0.820[fin=0.82,mean=0.83]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:31:16,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:16,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.924 = 0.50×1.00(exact) + 0.40×proc(0.809[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:16,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.631 = 0.50×0.38(prox=0.38) + 0.40×proc(0.847[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='4' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:16,288 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.572 = 0.50×0.38(prox=0.38) + 0.40×proc(0.699[fin=0.87,mean=0.44]) + 0.10×fmt(1.000) | pred='4' gold='20' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:31:23,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:31:23,335 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 30%|### | 6/20 [02:43<06:15, 26.84s/q, loss=-0.0022, mean_r=0.859, q_acc=100%, q_rew=0.623, skip=1]
Iter 24 GRPO groups: 35%|###5 | 7/20 [02:43<05:19, 24.61s/q, loss=-0.0022, mean_r=0.859, q_acc=100%, q_rew=0.623, skip=1]2026-04-26 06:31:31,924 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.954 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.991 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,576 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:32,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,440 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,663 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:31:33,879 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:43,891 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.982 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:44,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:31:44,322 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.971 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:31:44,546 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.392 = clip(base=0.312 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.173 novelty=0.77 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.27)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:31:44,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.791 novelty=0.77 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.75)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:31:44,986 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.682 = clip(base=0.602 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.617 novelty=0.77 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.58)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:31:45,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:31:45,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.651 = clip(base=0.571 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.564 novelty=0.77 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.69)+0.20*lccp(0.67) | steps=6
+2026-04-26 06:31:45,656 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.971 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:31:45,889 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.634 = clip(base=0.554 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.521 novelty=0.77 | sol=0.45*prm_final(0.48)+0.35*prm_mean(0.58)+0.20*lccp(0.50) | steps=4
+
Iter 24 GRPO groups: 35%|###5 | 7/20 [03:06<05:19, 24.61s/q, loss=0.0002, mean_r=0.892, q_acc=100%, q_rew=0.650, skip=2]
Iter 24 GRPO groups: 40%|#### | 8/20 [03:06<04:48, 24.02s/q, loss=0.0002, mean_r=0.892, q_acc=100%, q_rew=0.650, skip=2]2026-04-26 06:31:53,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:31:53,673 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:32:02,861 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:02,944 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:03,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:32:03,113 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:12,967 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:32:13,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:13,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:32:13,216 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 24 GRPO groups: 40%|#### | 8/20 [03:45<04:48, 24.02s/q, loss=-0.0007, mean_r=0.864, q_acc=100%, q_rew=0.650, skip=2]
Iter 24 GRPO groups: 45%|####5 | 9/20 [03:45<05:17, 28.82s/q, loss=-0.0007, mean_r=0.864, q_acc=100%, q_rew=0.650, skip=2]2026-04-26 06:32:36,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:36,436 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.964 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:36,821 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,010 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.453 = clip(base=0.373 + mod=+0.080, cap=1.00) | Q=0.50 sol=0.285 novelty=0.71 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.44)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:37,209 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,400 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:37,590 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:37,784 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:47,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.972 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:32:47,991 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.884 = clip(base=0.804 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:48,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.482 = clip(base=0.402 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.312 novelty=0.70 | sol=0.45*prm_final(0.10)+0.35*prm_mean(0.47)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:48,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.757 = clip(base=0.677 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.738 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:32:48,604 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.461 = clip(base=0.381 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.277 novelty=0.70 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.46)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:32:48,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.869 = clip(base=0.789 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.925 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:32:49,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.873 = clip(base=0.793 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.940 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:49,219 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.869 = clip(base=0.789 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.933 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:32:49,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.624 = clip(base=0.544 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.536 novelty=0.70 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.40)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:32:49,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.884 = clip(base=0.804 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.958 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+
Iter 24 GRPO groups: 45%|####5 | 9/20 [04:10<05:17, 28.82s/q, loss=-0.0018, mean_r=0.809, q_acc=100%, q_rew=0.634, skip=2]
Iter 24 GRPO groups: 50%|##### | 10/20 [04:10<04:34, 27.45s/q, loss=-0.0018, mean_r=0.809, q_acc=100%, q_rew=0.634, skip=2]2026-04-26 06:32:54,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:54,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:55,009 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:32:55,084 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,177 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.85(prox=0.85) + 0.40×proc(0.871[fin=0.98,mean=0.70]) + 0.10×fmt(1.000) | pred='242' gold='238' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:33:00,259 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:00,339 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.873 = 0.50×0.85(prox=0.85) + 0.40×proc(0.870[fin=0.98,mean=0.70]) + 0.10×fmt(1.000) | pred='242' gold='238' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:33:05,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:33:05,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='238' gold='238' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 50%|##### | 10/20 [04:25<04:34, 27.45s/q, loss=0.0010, mean_r=0.974, q_acc=100%, q_rew=0.634, skip=2]
Iter 24 GRPO groups: 55%|#####5 | 11/20 [04:25<03:34, 23.79s/q, loss=0.0010, mean_r=0.974, q_acc=100%, q_rew=0.634, skip=2]2026-04-26 06:33:10,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:10,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.986 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.973 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,271 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:11,463 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.376 novelty=0.69 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.27)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:33:11,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.693 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:33:11,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:12,042 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:33:12,241 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:12,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:33:18,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.594 = clip(base=0.514 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.432 novelty=0.63 | sol=0.45*prm_final(0.29)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.499 = clip(base=0.419 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.298 novelty=0.63 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,492 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.727 = clip(base=0.647 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.682 novelty=0.63 | sol=0.45*prm_final(0.75)+0.35*prm_mean(0.70)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,697 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.478 = clip(base=0.398 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.312 novelty=0.63 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.45)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:18,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.421 = clip(base=0.341 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.210 novelty=0.63 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.31)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:33:19,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.427 novelty=0.63 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.68)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:33:19,302 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.565 = clip(base=0.485 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.409 novelty=0.63 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.66)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:33:19,496 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.473 = clip(base=0.393 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.306 novelty=0.63 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.48)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:19,689 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.520 = clip(base=0.440 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.334 novelty=0.63 | sol=0.45*prm_final(0.10)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:33:19,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.553 = clip(base=0.473 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.397 novelty=0.63 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.66)+0.20*lccp(0.75) | steps=4
+
Iter 24 GRPO groups: 55%|#####5 | 11/20 [04:40<03:34, 23.79s/q, loss=-0.0001, mean_r=0.694, q_acc=100%, q_rew=0.625, skip=2]
Iter 24 GRPO groups: 60%|###### | 12/20 [04:40<02:49, 21.15s/q, loss=-0.0001, mean_r=0.694, q_acc=100%, q_rew=0.625, skip=2]2026-04-26 06:33:25,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:25,227 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,865 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:32,947 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:33,030 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:40,923 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,008 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,095 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:33:41,180 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='575' gold='575' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 60%|###### | 12/20 [05:06<02:49, 21.15s/q, loss=0var, mean_r=0.998, skip=3]
Iter 24 GRPO groups: 65%|######5 | 13/20 [05:06<02:38, 22.70s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 06:33:56,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.990 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:33:56,564 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:56,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.964 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:33:56,951 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:33:57,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.996 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:57,780 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:33:57,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:33:58,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.989 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:34:07,041 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:34:07,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,457 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:07,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:08,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.571 novelty=0.71 | sol=0.45*prm_final(0.56)+0.35*prm_mean(0.62)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:34:08,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.751 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:34:08,474 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:34:08,683 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:34:08,882 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 65%|######5 | 13/20 [05:29<02:38, 22.70s/q, loss=-0.0013, mean_r=0.940, q_acc=100%, q_rew=0.637, skip=3]
Iter 24 GRPO groups: 70%|####### | 14/20 [05:29<02:15, 22.61s/q, loss=-0.0013, mean_r=0.940, q_acc=100%, q_rew=0.637, skip=3]2026-04-26 06:34:15,822 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:15,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:34:15,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:34:16,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:28,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:34:28,513 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:34:28,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:34:28,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:34:41,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:34:41,181 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='7' gold='7' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 24 GRPO groups: 70%|####### | 14/20 [06:00<02:15, 22.61s/q, loss=0var, mean_r=0.999, skip=4]
Iter 24 GRPO groups: 75%|#######5 | 15/20 [06:00<02:05, 25.02s/q, loss=0var, mean_r=0.999, skip=4]2026-04-26 06:34:44,922 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:45,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,794 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:51,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:52,045 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,059 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,228 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:34:59,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='221' gold='221' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 24 GRPO groups: 75%|#######5 | 15/20 [06:24<02:05, 25.02s/q, loss=0var, mean_r=0.998, skip=5]
Iter 24 GRPO groups: 80%|######## | 16/20 [06:24<01:39, 24.99s/q, loss=0var, mean_r=0.998, skip=5]2026-04-26 06:35:12,347 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:35:12,429 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 06:35:12,512 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:35:12,596 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:27,925 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:28,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:28,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:28,178 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:35:36,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:35:36,505 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='400' gold='400' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+
Iter 24 GRPO groups: 80%|######## | 16/20 [06:55<01:39, 24.99s/q, loss=0var, mean_r=0.999, skip=6]
Iter 24 GRPO groups: 85%|########5 | 17/20 [06:55<01:19, 26.62s/q, loss=0var, mean_r=0.999, skip=6]2026-04-26 06:35:40,326 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:40,408 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,524 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,607 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.981 = 0.50×1.00(exact) + 0.40×proc(0.952[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:44,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,133 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='$64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,304 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:35:50,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='64' gold='64' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 24 GRPO groups: 85%|########5 | 17/20 [07:14<01:19, 26.62s/q, loss=0var, mean_r=0.995, skip=7]
Iter 24 GRPO groups: 90%|######### | 18/20 [07:14<00:48, 24.33s/q, loss=0var, mean_r=0.995, skip=7]2026-04-26 06:36:01,508 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:01,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.399 = 0.50×0.40(prox=0.40) + 0.40×proc(0.155[fin=0.08,mean=0.27]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:36:01,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.40(prox=0.40) + 0.40×proc(0.308[fin=0.27,mean=0.37]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=25% lccp=25% (chain=1/4 ok_count=1) n_steps=4
+2026-04-26 06:36:01,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.906[fin=1.00,mean=0.77]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 06:36:08,936 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,018 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40%' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:09,186 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='40' gold='40' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:36:17,381 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.412 = 0.50×0.40(prox=0.40) + 0.40×proc(0.154[fin=0.04,mean=0.33]) + 0.10×fmt(1.000) | pred='10' gold='40' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 06:36:17,465 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.881[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='60' gold='40' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+
Iter 24 GRPO groups: 90%|######### | 18/20 [07:37<00:48, 24.33s/q, loss=0.0003, mean_r=0.747, q_acc=100%, q_rew=0.637, skip=7]
Iter 24 GRPO groups: 95%|#########5| 19/20 [07:37<00:24, 24.05s/q, loss=0.0003, mean_r=0.747, q_acc=100%, q_rew=0.637, skip=7]2026-04-26 06:36:24,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.593 = clip(base=0.513 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.278 novelty=0.76 | sol=0.45*prm_final(0.00)+0.35*prm_mean(0.50)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:36:25,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:25,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.976 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:25,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:25,838 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.983 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:26,046 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.934 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.84 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:36:26,702 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.926 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.979 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:26,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.940 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:30,816 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.948 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,017 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,411 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:31,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:32,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 24 GRPO groups: 95%|#########5| 19/20 [07:51<00:24, 24.05s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
Iter 24 GRPO groups: 100%|##########| 20/20 [07:51<00:00, 21.01s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
Iter 24 GRPO groups: 100%|##########| 20/20 [07:51<00:00, 23.58s/q, loss=0.0000, mean_r=0.980, q_acc=100%, q_rew=0.662, skip=8]
+2026-04-26 06:36:32,850 INFO __main__ - Iter 24 | loss=-0.0002 | reward mean=0.880 std=0.173 | gt_match=79.2% | grounded_acc=93.3% | step_acc=89.9% | lccp=81.2% | batch_acc=93.6% | phase=SELFPLAY_RAMP sp_ratio=39% | groups=20 skipped=8(0var=8) | lr=4.03e-06 | 471.7s
+2026-04-26 06:36:32,851 INFO __main__ - Question generation: 8/8 valid (100%) | q_reward=0.662 | q_acc=100.0% (>0.5 quality) | topic=0.53 diff=0.31 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 06:36:32,852 INFO __main__ - ======================================================================
+2026-04-26 06:36:32,852 INFO __main__ - GRPO ITERATION 25/60
+2026-04-26 06:36:32,852 INFO __main__ - ======================================================================
+2026-04-26 06:36:32,871 INFO __main__ - LR this iteration: 4.03e-06 | T=0.637 | MATH ratio=44%
+
Iter 25 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:36:37,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,165 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,370 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,570 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:38,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,600 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:39,808 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:43,677 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:43,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,070 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,268 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:44,881 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,284 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:36:45,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 0%| | 0/20 [00:14, ?q/s, loss=-0.0001, mean_r=0.906, q_acc=100%, q_rew=0.567, skip=0]
Iter 25 GRPO groups: 5%|5 | 1/20 [00:14<04:31, 14.28s/q, loss=-0.0001, mean_r=0.906, q_acc=100%, q_rew=0.567, skip=0]2026-04-26 06:36:53,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.650 = clip(base=0.570 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.505 novelty=0.75 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.45)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:36:54,186 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.536 = clip(base=0.456 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.350 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.37)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:36:54,387 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.431 = clip(base=0.351 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.173 novelty=0.75 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.33)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:36:54,580 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.754 novelty=0.75 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.79)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:36:54,777 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.766 novelty=0.75 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.76)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:36:54,970 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.307 novelty=0.75 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.25)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:36:55,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.921 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:36:55,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.956 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:36:55,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.574 = clip(base=0.494 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.427 novelty=0.75 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:36:55,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.782 = clip(base=0.702 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.688 novelty=0.75 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.61)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:37:10,210 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.575 = clip(base=0.495 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.368 novelty=0.74 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:37:10,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.636 = clip(base=0.556 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.456 novelty=0.74 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.68)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:10,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.644 = clip(base=0.564 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.462 novelty=0.74 | sol=0.45*prm_final(0.56)+0.35*prm_mean(0.50)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:37:10,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.533 = clip(base=0.453 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.340 novelty=0.74 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.47)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:37:11,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.493 = clip(base=0.413 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.266 novelty=0.74 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.50)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:11,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.659 = clip(base=0.579 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.568 novelty=0.74 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:37:11,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.547 = clip(base=0.467 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.299 novelty=0.74 | sol=0.45*prm_final(0.20)+0.35*prm_mean(0.52)+0.20*lccp(0.14) | steps=7
+2026-04-26 06:37:11,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.535 novelty=0.74 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.54)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:37:11,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.608 novelty=0.74 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:37:12,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.671 = clip(base=0.591 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.543 novelty=0.74 | sol=0.45*prm_final(0.81)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=8
+
Iter 25 GRPO groups: 5%|5 | 1/20 [00:41<04:31, 14.28s/q, loss=-0.0009, mean_r=0.656, q_acc=100%, q_rew=0.618, skip=0]
Iter 25 GRPO groups: 10%|# | 2/20 [00:41<06:29, 21.66s/q, loss=-0.0009, mean_r=0.656, q_acc=100%, q_rew=0.618, skip=0]2026-04-26 06:37:19,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.761 = clip(base=0.681 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:19,702 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:19,887 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,075 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.994 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.776 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:37:20,459 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:20,840 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:21,027 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.701 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.73)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:37:21,218 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:37:26,454 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:26,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:26,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,063 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,664 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:27,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:37:28,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.998 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:37:28,262 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 10%|# | 2/20 [00:57<06:29, 21.66s/q, loss=0.0017, mean_r=0.891, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 15%|#5 | 3/20 [00:57<05:23, 19.06s/q, loss=0.0017, mean_r=0.891, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:37:35,253 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.748 = 0.50×1.00(exact) + 0.40×proc(0.371[fin=0.24,mean=0.56]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 06:37:35,336 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.918 = 0.50×1.00(exact) + 0.40×proc(0.795[fin=0.81,mean=0.77]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:37:43,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:37:43,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.851 = 0.50×1.00(exact) + 0.40×proc(0.628[fin=0.64,mean=0.61]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=67% lccp=50% (chain=3/6 ok_count=4) n_steps=6
+2026-04-26 06:37:44,055 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:37:44,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:37:56,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=83% lccp=50% (chain=3/6 ok_count=5) n_steps=6
+2026-04-26 06:37:56,905 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.518[fin=0.41,mean=0.68]) + 0.10×fmt(1.000) | pred='10' gold='5' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:37:56,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:37:57,072 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 25 GRPO groups: 15%|#5 | 3/20 [01:35<05:23, 19.06s/q, loss=0.0010, mean_r=0.901, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 20%|## | 4/20 [01:35<07:07, 26.75s/q, loss=0.0010, mean_r=0.901, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:38:19,867 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:19,964 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.993[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:20,058 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:38:20,153 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:27,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:27,969 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:38:28,053 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:38:28,139 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:38:33,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:33,125 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 25 GRPO groups: 20%|## | 4/20 [02:01<07:07, 26.75s/q, loss=-0.0005, mean_r=0.908, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 25%|##5 | 5/20 [02:01<06:37, 26.53s/q, loss=-0.0005, mean_r=0.908, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:38:42,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:38:42,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.862 = 0.50×0.82(prox=0.82) + 0.40×proc(0.875[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:38:53,868 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:38:53,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.870 = 0.50×0.82(prox=0.82) + 0.40×proc(0.895[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:38:54,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.820 = 0.50×0.82(prox=0.82) + 0.40×proc(0.771[fin=0.91,mean=0.56]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:38:54,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.869 = 0.50×0.82(prox=0.82) + 0.40×proc(0.892[fin=0.99,mean=0.74]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:39:07,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='14' gold='14' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:39:07,219 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.884 = 0.50×0.82(prox=0.82) + 0.40×proc(0.931[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=88% lccp=25% (chain=2/8 ok_count=7) n_steps=8
+2026-04-26 06:39:07,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.829 = 0.50×0.78(prox=0.78) + 0.40×proc(0.851[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='12' gold='14' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 06:39:07,398 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.883 = 0.50×0.82(prox=0.82) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='15.5' gold='14' | step_acc=89% lccp=22% (chain=2/9 ok_count=8) n_steps=9
+
Iter 25 GRPO groups: 25%|##5 | 5/20 [02:59<06:37, 26.53s/q, loss=0.0014, mean_r=0.900, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 30%|### | 6/20 [02:59<08:39, 37.08s/q, loss=0.0014, mean_r=0.900, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:39:38,820 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×0.67(prox=0.67) + 0.40×proc(0.912[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:38,913 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.539 = 0.50×0.43(prox=0.43) + 0.40×proc(0.375[fin=0.26,mean=0.54]) + 0.10×fmt(1.000) | pred='66.67' gold='200' | step_acc=50% lccp=50% (chain=3/6 ok_count=3) n_steps=6
+2026-04-26 06:39:38,998 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:39,092 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.770 = 0.50×0.67(prox=0.67) + 0.40×proc(0.841[fin=0.89,mean=0.76]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:49,110 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:49,204 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.790 = 0.50×0.67(prox=0.67) + 0.40×proc(0.892[fin=0.96,mean=0.79]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 06:39:49,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:49,377 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:39:57,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.536 = 0.50×0.67(prox=0.67) + 0.40×proc(0.257[fin=0.02,mean=0.61]) + 0.10×fmt(1.000) | pred='150' gold='200' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:39:57,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.794 = 0.50×0.67(prox=0.67) + 0.40×proc(0.902[fin=0.97,mean=0.80]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 25 GRPO groups: 30%|### | 6/20 [03:26<08:39, 37.08s/q, loss=-0.0004, mean_r=0.818, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 35%|###5 | 7/20 [03:26<07:20, 33.87s/q, loss=-0.0004, mean_r=0.818, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:40:10,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:10,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:14,916 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:15,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.919 = 0.50×1.00(exact) + 0.40×proc(0.798[fin=1.00,mean=0.50]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:40:15,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:15,025 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:21,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=0.90,mean=0.90]) + 0.10×fmt(0.300) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=1/1 ok_count=1) n_steps=1
+2026-04-26 06:40:21,856 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 06:40:21,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.930 = 0.50×1.00(exact) + 0.40×proc(0.825[fin=0.99,mean=0.57]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:40:21,975 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='2' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+
Iter 25 GRPO groups: 35%|###5 | 7/20 [03:55<07:20, 33.87s/q, loss=0.0004, mean_r=0.274, q_acc=100%, q_rew=0.617, skip=0]
Iter 25 GRPO groups: 40%|#### | 8/20 [03:55<06:27, 32.27s/q, loss=0.0004, mean_r=0.274, q_acc=100%, q_rew=0.617, skip=0]2026-04-26 06:40:34,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:34,916 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:35,122 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:35,524 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,729 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:40:35,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,333 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:36,531 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.993 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.744 novelty=0.75 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:40:43,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.989 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:43,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,086 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,294 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.991 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.984 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:40:44,920 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 40%|#### | 8/20 [04:13<06:27, 32.27s/q, loss=0.0003, mean_r=0.960, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 45%|####5 | 9/20 [04:13<05:07, 27.91s/q, loss=0.0003, mean_r=0.960, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:41:20,126 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:41:20,224 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.504[fin=0.29,mean=0.83]) + 0.10×fmt(1.000) | pred='4' gold='2' | step_acc=80% lccp=80% (chain=8/10 ok_count=8) n_steps=10
+2026-04-26 06:41:20,329 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.903[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:41:20,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=0.98,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:41:29,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:41:29,460 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=94% lccp=11% (chain=2/18 ok_count=17) n_steps=18
+2026-04-26 06:41:29,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.957 = 0.50×1.00(exact) + 0.40×proc(0.893[fin=1.00,mean=0.74]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:41:29,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.912 = 0.50×1.00(exact) + 0.40×proc(0.779[fin=0.82,mean=0.72]) + 0.10×fmt(1.000) | pred='2' gold='2' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:41:34,582 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.695 = 0.50×0.50(prox=0.50) + 0.40×proc(0.863[fin=0.87,mean=0.85]) + 0.10×fmt(1.000) | pred='1' gold='2' | step_acc=83% lccp=44% (chain=8/18 ok_count=15) n_steps=18
+
Iter 25 GRPO groups: 45%|####5 | 9/20 [05:03<05:07, 27.91s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 50%|##### | 10/20 [05:03<05:45, 34.52s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:41:49,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:41:49,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:41:49,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:05,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:06,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:06,103 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.301 = 0.50×0.00(prox=0.00) + 0.40×proc(0.576[fin=0.54,mean=0.64]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 06:42:06,187 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:42:18,445 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:42:18,538 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:42:18,633 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 25 GRPO groups: 50%|##### | 10/20 [05:47<05:45, 34.52s/q, loss=0.0003, mean_r=0.705, q_acc=100%, q_rew=0.644, skip=0]
Iter 25 GRPO groups: 55%|#####5 | 11/20 [05:47<05:37, 37.48s/q, loss=0.0003, mean_r=0.705, q_acc=100%, q_rew=0.644, skip=0]2026-04-26 06:42:27,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.648 novelty=0.73 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:27,437 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:27,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:27,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,062 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.650 novelty=0.73 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.57)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:28,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,485 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,692 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:28,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.735 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:29,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.445 = clip(base=0.365 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.226 novelty=0.73 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.38)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:42:34,145 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,362 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,593 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:34,805 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,012 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,222 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.689 = clip(base=0.609 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.577 novelty=0.66 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.57)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:42:35,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,636 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:35,843 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:42:36,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 55%|#####5 | 11/20 [06:04<05:37, 37.48s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.655, skip=0]
Iter 25 GRPO groups: 60%|###### | 12/20 [06:04<04:11, 31.46s/q, loss=-0.0008, mean_r=0.895, q_acc=100%, q_rew=0.655, skip=0]2026-04-26 06:42:43,036 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.982 = 0.50×1.00(exact) + 0.40×proc(0.954[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:42:52,146 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:42:52,230 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:42:52,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:42:52,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,454 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,537 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:01,620 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:08,655 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 25 GRPO groups: 60%|###### | 12/20 [06:35<04:11, 31.46s/q, loss=0var, mean_r=0.995, skip=1]
Iter 25 GRPO groups: 65%|######5 | 13/20 [06:35<03:38, 31.27s/q, loss=0var, mean_r=0.995, skip=1]2026-04-26 06:43:21,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.988 = 0.50×1.00(exact) + 0.40×proc(0.970[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:21,687 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.936 = 0.50×1.00(exact) + 0.40×proc(0.841[fin=0.99,mean=0.61]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:43:21,779 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:43:26,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 06:43:26,793 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.448 = 0.50×0.33(prox=0.33) + 0.40×proc(0.452[fin=0.61,mean=0.22]) + 0.10×fmt(1.000) | pred='-12' gold='-6' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:43:26,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.493 = 0.50×0.00(prox=0.00) + 0.40×proc(0.907[fin=0.99,mean=0.78]) + 0.10×fmt(1.000) | pred='-8/3' gold='-6' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:43:26,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.267 = 0.50×0.00(prox=0.00) + 0.40×proc(0.342[fin=0.38,mean=0.28]) + 0.10×fmt(1.000) | pred='$\\frac{4}{9}$' gold='-6' | step_acc=20% lccp=20% (chain=1/5 ok_count=1) n_steps=5
+:1: SyntaxWarning: 'int' object is not callable; perhaps you missed a comma?
+2026-04-26 06:43:34,796 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:43:34,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.831[fin=1.00,mean=0.58]) + 0.10×fmt(1.000) | pred='-6' gold='-6' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 06:43:34,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.969[fin=1.00,mean=0.92]) + 0.10×fmt(0.650) | pred='-6' gold='-6' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 25 GRPO groups: 65%|######5 | 13/20 [07:03<03:38, 31.27s/q, loss=0.0017, mean_r=0.799, q_acc=100%, q_rew=0.655, skip=1]
Iter 25 GRPO groups: 70%|####### | 14/20 [07:03<03:01, 30.25s/q, loss=0.0017, mean_r=0.799, q_acc=100%, q_rew=0.655, skip=1]2026-04-26 06:43:43,057 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.633 = clip(base=0.553 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.481 novelty=0.72 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.64)+0.20*lccp(0.75) | steps=4
+2026-04-26 06:43:43,260 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.835 = clip(base=0.755 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.875 novelty=0.72 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.969 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:43,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.665 = clip(base=0.585 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.601 novelty=0.72 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:43:44,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.948 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:44,307 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.637 = clip(base=0.557 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.528 novelty=0.72 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.56)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:43:44,512 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.315 = clip(base=0.235 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.024 novelty=0.72 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.04)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:43:44,725 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.663 = clip(base=0.583 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.599 novelty=0.72 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:43:44,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.773 = clip(base=0.693 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.756 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:43:49,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,040 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.953 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:50,430 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:50,627 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:43:50,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:43:51,016 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:51,217 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.858 = clip(base=0.778 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.909 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:43:51,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:43:51,610 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+
Iter 25 GRPO groups: 70%|####### | 14/20 [07:20<03:01, 30.25s/q, loss=0.0035, mean_r=0.811, q_acc=100%, q_rew=0.643, skip=1]
Iter 25 GRPO groups: 75%|#######5 | 15/20 [07:20<02:10, 26.20s/q, loss=0.0035, mean_r=0.811, q_acc=100%, q_rew=0.643, skip=1]2026-04-26 06:43:57,658 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.472 = 0.50×0.17(prox=0.17) + 0.40×proc(0.723[fin=0.91,mean=0.45]) + 0.10×fmt(1.000) | pred='14' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:44:04,699 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.484 = 0.50×0.40(prox=0.40) + 0.40×proc(0.460[fin=0.53,mean=0.36]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 06:44:04,785 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.505 = 0.50×0.67(prox=0.67) + 0.40×proc(0.179[fin=0.21,mean=0.13]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:44:04,879 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.683 = 0.50×0.67(prox=0.67) + 0.40×proc(0.625[fin=0.79,mean=0.38]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+2026-04-26 06:44:04,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.934[fin=0.99,mean=0.85]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:44:11,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.326 = 0.50×0.16(prox=0.16) + 0.40×proc(0.369[fin=0.42,mean=0.29]) + 0.10×fmt(1.000) | pred='14.67' gold='4' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:44:11,518 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.875[fin=0.96,mean=0.75]) + 0.10×fmt(1.000) | pred='3 1/3' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:44:11,612 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.740 = 0.50×0.85(prox=0.85) + 0.40×proc(0.537[fin=0.60,mean=0.45]) + 0.10×fmt(1.000) | pred='3.6666666666666665' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:44:11,704 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.410 = 0.50×0.17(prox=0.17) + 0.40×proc(0.566[fin=0.70,mean=0.37]) + 0.10×fmt(1.000) | pred='14' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 06:44:16,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.273 = 0.50×0.00(prox=0.00) + 0.40×proc(0.432[fin=0.50,mean=0.32]) + 0.10×fmt(1.000) | pred='3 2/3' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 25 GRPO groups: 75%|#######5 | 15/20 [07:45<02:10, 26.20s/q, loss=0.0012, mean_r=0.542, q_acc=100%, q_rew=0.643, skip=1]
Iter 25 GRPO groups: 80%|######## | 16/20 [07:45<01:43, 25.80s/q, loss=0.0012, mean_r=0.542, q_acc=100%, q_rew=0.643, skip=1]2026-04-26 06:44:26,314 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,517 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:26,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,138 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,757 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:27,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:28,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:44:34,427 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:34,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:34,831 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,034 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,238 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,444 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,644 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:35,848 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:36,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.500 = clip(base=0.420 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.341 novelty=0.71 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.26)+0.20*lccp(0.00) | steps=4
+2026-04-26 06:44:36,258 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 80%|######## | 16/20 [08:05<01:43, 25.80s/q, loss=-0.0013, mean_r=0.893, q_acc=100%, q_rew=0.635, skip=1]
Iter 25 GRPO groups: 85%|########5 | 17/20 [08:05<01:11, 23.97s/q, loss=-0.0013, mean_r=0.893, q_acc=100%, q_rew=0.635, skip=1]2026-04-26 06:44:42,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,183 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:43,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.986 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 06:44:44,172 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:44,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.712 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:44:44,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.802 = clip(base=0.722 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.693 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 06:44:44,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:49,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.996 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:49,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,727 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.968 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:50,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,107 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,301 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:51,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 85%|########5 | 17/20 [08:20<01:11, 23.97s/q, loss=0.0015, mean_r=0.953, q_acc=100%, q_rew=0.648, skip=1]
Iter 25 GRPO groups: 90%|######### | 18/20 [08:20<00:42, 21.37s/q, loss=0.0015, mean_r=0.953, q_acc=100%, q_rew=0.648, skip=1]2026-04-26 06:44:57,614 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:57,845 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,045 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,439 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:58,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,019 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:44:59,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,439 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:03,812 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,003 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.898 = clip(base=0.818 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.981 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,197 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,761 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:45:04,949 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 25 GRPO groups: 90%|######### | 18/20 [08:33<00:42, 21.37s/q, loss=-0.0013, mean_r=0.934, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 95%|#########5| 19/20 [08:33<00:18, 18.97s/q, loss=-0.0013, mean_r=0.934, q_acc=100%, q_rew=0.647, skip=1]2026-04-26 06:45:08,502 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:08,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:08,659 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:11,200 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.798 = 0.50×1.00(exact) + 0.40×proc(0.582[fin=0.72,mean=0.37]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:45:11,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:11,357 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:11,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:15,887 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:45:15,962 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:45:16,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 25 GRPO groups: 95%|#########5| 19/20 [08:44<00:18, 18.97s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 100%|##########| 20/20 [08:44<00:00, 16.54s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
Iter 25 GRPO groups: 100%|##########| 20/20 [08:44<00:00, 26.23s/q, loss=-0.0000, mean_r=0.962, q_acc=100%, q_rew=0.647, skip=1]
+2026-04-26 06:45:17,509 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=2% fallback=8% (cache_size=112/10000)
+2026-04-26 06:45:17,510 INFO __main__ - Iter 25 | loss=0.0003 | reward mean=0.845 std=0.209 | gt_match=60.6% | grounded_acc=85.3% | step_acc=78.1% | lccp=63.7% | batch_acc=92.7% | phase=SELFPLAY_RAMP sp_ratio=43% | groups=28 skipped=1(0var=1) | lr=3.91e-06 | 524.7s
+2026-04-26 06:45:17,510 INFO __main__ - Question generation: 9/9 valid (100%) | q_reward=0.647 | q_acc=100.0% (>0.5 quality) | topic=0.51 diff=0.26 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 06:45:17,510 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:03<08:34, 3.45s/q, correct=1/1, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:08<10:51, 4.40s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:11<08:49, 3.60s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:13<07:31, 3.09s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:15<06:15, 2.59s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:20<08:34, 3.58s/q, correct=4/6, lccp=75.6%, score=0.888, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:24<08:25, 3.54s/q, correct=5/7, lccp=79.0%, score=0.904, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:26<07:32, 3.19s/q, correct=6/8, lccp=81.7%, score=0.916, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:29<07:33, 3.22s/q, correct=7/9, lccp=83.7%, score=0.925, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<08:34, 3.67s/q, correct=7/10, lccp=79.3%, score=0.890, step_acc=86.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<07:59, 3.45s/q, correct=8/11, lccp=81.2%, score=0.900, step_acc=87.9%]
GSM8K eval: 8%|8 | 12/150 [00:39<06:57, 3.02s/q, correct=9/12, lccp=82.8%, score=0.908, step_acc=88.9%]
GSM8K eval: 9%|8 | 13/150 [00:42<06:36, 2.89s/q, correct=10/13, lccp=84.1%, score=0.912, step_acc=89.7%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:40, 3.38s/q, correct=11/14, lccp=85.2%, score=0.918, step_acc=90.5%]
GSM8K eval: 10%|# | 15/150 [00:49<07:03, 3.14s/q, correct=12/15, lccp=86.2%, score=0.924, step_acc=91.1%]
GSM8K eval: 11%|# | 16/150 [00:51<06:32, 2.93s/q, correct=12/16, lccp=87.1%, score=0.900, step_acc=91.7%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:09, 3.23s/q, correct=13/17, lccp=87.8%, score=0.906, step_acc=92.2%]
GSM8K eval: 12%|#2 | 18/150 [01:01<08:45, 3.98s/q, correct=13/18, lccp=83.7%, score=0.895, step_acc=89.8%]
GSM8K eval: 13%|#2 | 19/150 [01:03<07:47, 3.57s/q, correct=14/19, lccp=84.5%, score=0.901, step_acc=90.4%]
GSM8K eval: 13%|#3 | 20/150 [01:07<07:53, 3.64s/q, correct=15/20, lccp=85.3%, score=0.906, step_acc=90.8%]
GSM8K eval: 14%|#4 | 21/150 [01:10<07:08, 3.32s/q, correct=16/21, lccp=86.0%, score=0.910, step_acc=91.3%]
GSM8K eval: 15%|#4 | 22/150 [01:13<06:42, 3.14s/q, correct=17/22, lccp=83.6%, score=0.905, step_acc=90.2%]
GSM8K eval: 15%|#5 | 23/150 [01:17<07:21, 3.48s/q, correct=18/23, lccp=84.3%, score=0.909, step_acc=90.6%]
GSM8K eval: 16%|#6 | 24/150 [01:19<06:44, 3.21s/q, correct=18/24, lccp=81.8%, score=0.893, step_acc=87.8%]
GSM8K eval: 17%|#6 | 25/150 [01:22<06:23, 3.07s/q, correct=18/25, lccp=79.6%, score=0.890, step_acc=87.3%]
GSM8K eval: 17%|#7 | 26/150 [01:27<07:12, 3.48s/q, correct=19/26, lccp=80.4%, score=0.894, step_acc=87.8%]
GSM8K eval: 18%|#8 | 27/150 [01:29<06:44, 3.29s/q, correct=19/27, lccp=81.1%, score=0.889, step_acc=88.3%]
GSM8K eval: 19%|#8 | 28/150 [01:32<06:01, 2.96s/q, correct=20/28, lccp=81.8%, score=0.893, step_acc=88.7%]
GSM8K eval: 19%|#9 | 29/150 [01:34<05:51, 2.90s/q, correct=21/29, lccp=82.4%, score=0.896, step_acc=89.1%]
GSM8K eval: 20%|## | 30/150 [01:38<06:23, 3.19s/q, correct=22/30, lccp=83.0%, score=0.900, step_acc=89.4%]
GSM8K eval: 21%|## | 31/150 [01:41<05:57, 3.00s/q, correct=23/31, lccp=83.5%, score=0.903, step_acc=89.8%]
GSM8K eval: 21%|##1 | 32/150 [01:43<05:11, 2.64s/q, correct=24/32, lccp=84.0%, score=0.905, step_acc=90.1%]
GSM8K eval: 22%|##2 | 33/150 [01:45<05:13, 2.68s/q, correct=25/33, lccp=84.5%, score=0.908, step_acc=90.4%]
GSM8K eval: 23%|##2 | 34/150 [01:47<04:48, 2.49s/q, correct=26/34, lccp=85.0%, score=0.911, step_acc=90.7%]
GSM8K eval: 23%|##3 | 35/150 [01:50<04:50, 2.52s/q, correct=27/35, lccp=85.4%, score=0.913, step_acc=91.0%]
GSM8K eval: 24%|##4 | 36/150 [01:54<05:21, 2.82s/q, correct=28/36, lccp=85.8%, score=0.915, step_acc=91.2%]
GSM8K eval: 25%|##4 | 37/150 [01:55<04:49, 2.57s/q, correct=29/37, lccp=86.2%, score=0.917, step_acc=91.4%]
GSM8K eval: 25%|##5 | 38/150 [01:59<05:02, 2.71s/q, correct=30/38, lccp=86.6%, score=0.919, step_acc=91.7%]
GSM8K eval: 26%|##6 | 39/150 [02:03<06:13, 3.36s/q, correct=31/39, lccp=86.9%, score=0.921, step_acc=91.9%]
GSM8K eval: 27%|##6 | 40/150 [02:10<07:46, 4.24s/q, correct=32/40, lccp=87.2%, score=0.923, step_acc=92.1%]
GSM8K eval: 27%|##7 | 41/150 [02:13<07:01, 3.86s/q, correct=32/41, lccp=87.5%, score=0.922, step_acc=92.3%]
GSM8K eval: 28%|##8 | 42/150 [02:16<06:37, 3.68s/q, correct=32/42, lccp=87.8%, score=0.921, step_acc=92.5%]
GSM8K eval: 29%|##8 | 43/150 [02:18<05:48, 3.25s/q, correct=33/43, lccp=88.1%, score=0.923, step_acc=92.6%]
GSM8K eval: 29%|##9 | 44/150 [02:25<07:26, 4.21s/q, correct=34/44, lccp=88.4%, score=0.925, step_acc=92.8%]
GSM8K eval: 30%|### | 45/150 [02:28<06:50, 3.91s/q, correct=35/45, lccp=88.6%, score=0.926, step_acc=93.0%]
GSM8K eval: 31%|### | 46/150 [02:33<07:17, 4.21s/q, correct=35/46, lccp=86.7%, score=0.921, step_acc=92.9%]
GSM8K eval: 31%|###1 | 47/150 [02:36<06:39, 3.88s/q, correct=36/47, lccp=87.0%, score=0.923, step_acc=93.0%]
GSM8K eval: 32%|###2 | 48/150 [02:38<05:32, 3.26s/q, correct=37/48, lccp=87.3%, score=0.925, step_acc=93.2%]
GSM8K eval: 33%|###2 | 49/150 [02:41<05:39, 3.36s/q, correct=38/49, lccp=86.2%, score=0.926, step_acc=93.0%]
GSM8K eval: 33%|###3 | 50/150 [02:44<05:31, 3.31s/q, correct=38/50, lccp=85.5%, score=0.917, step_acc=92.1%]
GSM8K eval: 34%|###4 | 51/150 [02:46<04:31, 2.75s/q, correct=39/51, lccp=85.7%, score=0.919, step_acc=92.3%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:14, 3.21s/q, correct=39/52, lccp=84.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 35%|###5 | 53/150 [02:55<05:57, 3.68s/q, correct=39/53, lccp=83.6%, score=0.911, step_acc=91.5%]
GSM8K eval: 36%|###6 | 54/150 [02:58<05:42, 3.57s/q, correct=40/54, lccp=83.9%, score=0.912, step_acc=91.6%]
GSM8K eval: 37%|###6 | 55/150 [03:03<06:12, 3.92s/q, correct=41/55, lccp=84.2%, score=0.914, step_acc=91.8%]
GSM8K eval: 37%|###7 | 56/150 [03:07<05:58, 3.82s/q, correct=42/56, lccp=84.5%, score=0.915, step_acc=91.9%]
GSM8K eval: 38%|###8 | 57/150 [03:09<05:14, 3.38s/q, correct=43/57, lccp=84.8%, score=0.917, step_acc=92.1%]
GSM8K eval: 39%|###8 | 58/150 [03:13<05:32, 3.62s/q, correct=44/58, lccp=85.0%, score=0.918, step_acc=92.2%]
GSM8K eval: 39%|###9 | 59/150 [03:18<06:00, 3.96s/q, correct=44/59, lccp=83.6%, score=0.915, step_acc=91.8%]
GSM8K eval: 40%|#### | 60/150 [03:23<06:23, 4.27s/q, correct=45/60, lccp=83.9%, score=0.917, step_acc=91.9%]
GSM8K eval: 41%|#### | 61/150 [03:26<05:52, 3.96s/q, correct=46/61, lccp=84.1%, score=0.918, step_acc=92.1%]
GSM8K eval: 41%|####1 | 62/150 [03:29<05:26, 3.71s/q, correct=47/62, lccp=84.4%, score=0.919, step_acc=92.2%]
GSM8K eval: 42%|####2 | 63/150 [03:33<05:13, 3.60s/q, correct=47/63, lccp=84.1%, score=0.913, step_acc=91.8%]
GSM8K eval: 43%|####2 | 64/150 [03:35<04:49, 3.37s/q, correct=48/64, lccp=84.4%, score=0.915, step_acc=91.9%]
GSM8K eval: 43%|####3 | 65/150 [03:38<04:32, 3.20s/q, correct=49/65, lccp=84.6%, score=0.916, step_acc=92.0%]
GSM8K eval: 44%|####4 | 66/150 [03:40<03:58, 2.84s/q, correct=50/66, lccp=84.8%, score=0.917, step_acc=92.2%]
GSM8K eval: 45%|####4 | 67/150 [03:42<03:41, 2.67s/q, correct=51/67, lccp=85.1%, score=0.918, step_acc=92.3%]
GSM8K eval: 45%|####5 | 68/150 [03:45<03:40, 2.68s/q, correct=52/68, lccp=85.3%, score=0.920, step_acc=92.4%]
GSM8K eval: 46%|####6 | 69/150 [03:47<03:10, 2.35s/q, correct=53/69, lccp=85.5%, score=0.921, step_acc=92.5%]
GSM8K eval: 47%|####6 | 70/150 [03:50<03:23, 2.54s/q, correct=54/70, lccp=84.3%, score=0.921, step_acc=92.3%]
GSM8K eval: 47%|####7 | 71/150 [03:53<03:35, 2.73s/q, correct=55/71, lccp=83.1%, score=0.922, step_acc=92.1%]
GSM8K eval: 48%|####8 | 72/150 [03:54<03:03, 2.35s/q, correct=56/72, lccp=83.3%, score=0.923, step_acc=92.3%]
GSM8K eval: 49%|####8 | 73/150 [03:56<02:44, 2.14s/q, correct=57/73, lccp=83.6%, score=0.924, step_acc=92.4%]
GSM8K eval: 49%|####9 | 74/150 [04:00<03:14, 2.56s/q, correct=58/74, lccp=83.8%, score=0.925, step_acc=92.5%]
GSM8K eval: 50%|##### | 75/150 [04:01<02:53, 2.31s/q, correct=59/75, lccp=84.0%, score=0.926, step_acc=92.6%]
GSM8K eval: 51%|##### | 76/150 [04:08<04:26, 3.60s/q, correct=59/76, lccp=84.0%, score=0.921, step_acc=92.5%]
GSM8K eval: 51%|#####1 | 77/150 [04:12<04:29, 3.70s/q, correct=60/77, lccp=84.2%, score=0.922, step_acc=92.6%]
GSM8K eval: 52%|#####2 | 78/150 [04:14<04:00, 3.34s/q, correct=61/78, lccp=84.4%, score=0.923, step_acc=92.7%]
GSM8K eval: 53%|#####2 | 79/150 [04:17<03:50, 3.25s/q, correct=61/79, lccp=83.6%, score=0.917, step_acc=91.9%]
GSM8K eval: 53%|#####3 | 80/150 [04:20<03:42, 3.17s/q, correct=62/80, lccp=83.8%, score=0.918, step_acc=92.0%]
GSM8K eval: 54%|#####4 | 81/150 [04:23<03:22, 2.94s/q, correct=63/81, lccp=84.0%, score=0.919, step_acc=92.1%]
GSM8K eval: 55%|#####4 | 82/150 [04:26<03:19, 2.94s/q, correct=64/82, lccp=84.2%, score=0.920, step_acc=92.2%]
GSM8K eval: 55%|#####5 | 83/150 [04:29<03:14, 2.90s/q, correct=65/83, lccp=84.4%, score=0.921, step_acc=92.3%]
GSM8K eval: 56%|#####6 | 84/150 [04:31<03:06, 2.83s/q, correct=66/84, lccp=84.6%, score=0.922, step_acc=92.4%]
GSM8K eval: 57%|#####6 | 85/150 [04:35<03:22, 3.12s/q, correct=67/85, lccp=84.7%, score=0.923, step_acc=92.5%]
GSM8K eval: 57%|#####7 | 86/150 [04:38<03:26, 3.22s/q, correct=68/86, lccp=84.9%, score=0.924, step_acc=92.6%]
GSM8K eval: 58%|#####8 | 87/150 [04:44<04:10, 3.97s/q, correct=69/87, lccp=85.1%, score=0.925, step_acc=92.7%]
GSM8K eval: 59%|#####8 | 88/150 [04:46<03:27, 3.35s/q, correct=70/88, lccp=85.3%, score=0.926, step_acc=92.8%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:13, 3.16s/q, correct=71/89, lccp=85.4%, score=0.927, step_acc=92.8%]
GSM8K eval: 60%|###### | 90/150 [04:51<02:56, 2.94s/q, correct=72/90, lccp=85.6%, score=0.927, step_acc=92.9%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:21, 3.41s/q, correct=73/91, lccp=85.8%, score=0.928, step_acc=93.0%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:11, 3.30s/q, correct=74/92, lccp=85.9%, score=0.929, step_acc=93.1%]
GSM8K eval: 62%|######2 | 93/150 [05:06<04:22, 4.60s/q, correct=75/93, lccp=86.1%, score=0.929, step_acc=93.2%]
GSM8K eval: 63%|######2 | 94/150 [05:09<03:45, 4.03s/q, correct=75/94, lccp=85.1%, score=0.925, step_acc=92.2%]
GSM8K eval: 63%|######3 | 95/150 [05:15<04:04, 4.45s/q, correct=76/95, lccp=84.2%, score=0.925, step_acc=91.7%]
GSM8K eval: 64%|######4 | 96/150 [05:18<03:39, 4.07s/q, correct=76/96, lccp=83.7%, score=0.920, step_acc=91.1%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:14, 3.67s/q, correct=76/97, lccp=83.4%, score=0.918, step_acc=90.9%]
GSM8K eval: 65%|######5 | 98/150 [05:25<03:19, 3.84s/q, correct=76/98, lccp=83.0%, score=0.914, step_acc=90.7%]
GSM8K eval: 66%|######6 | 99/150 [05:27<02:52, 3.39s/q, correct=77/99, lccp=83.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 67%|######6 | 100/150 [05:29<02:28, 2.96s/q, correct=78/100, lccp=82.3%, score=0.915, step_acc=90.6%]
GSM8K eval: 67%|######7 | 101/150 [05:32<02:25, 2.96s/q, correct=78/101, lccp=82.0%, score=0.912, step_acc=90.4%]
GSM8K eval: 68%|######8 | 102/150 [05:33<02:00, 2.52s/q, correct=79/102, lccp=82.2%, score=0.912, step_acc=90.5%]
GSM8K eval: 69%|######8 | 103/150 [05:35<01:51, 2.37s/q, correct=80/103, lccp=82.3%, score=0.913, step_acc=90.6%]
GSM8K eval: 69%|######9 | 104/150 [05:40<02:22, 3.10s/q, correct=81/104, lccp=82.5%, score=0.914, step_acc=90.7%]
GSM8K eval: 70%|####### | 105/150 [05:43<02:12, 2.95s/q, correct=82/105, lccp=82.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 71%|####### | 106/150 [05:44<01:50, 2.52s/q, correct=83/106, lccp=82.8%, score=0.916, step_acc=90.9%]
GSM8K eval: 71%|#######1 | 107/150 [05:46<01:35, 2.23s/q, correct=84/107, lccp=83.0%, score=0.916, step_acc=91.0%]
GSM8K eval: 72%|#######2 | 108/150 [05:49<01:38, 2.35s/q, correct=85/108, lccp=83.1%, score=0.917, step_acc=91.1%]
GSM8K eval: 73%|#######2 | 109/150 [05:54<02:09, 3.16s/q, correct=85/109, lccp=82.7%, score=0.916, step_acc=91.0%]
GSM8K eval: 73%|#######3 | 110/150 [05:56<01:55, 2.89s/q, correct=86/110, lccp=82.8%, score=0.917, step_acc=91.1%]
GSM8K eval: 74%|#######4 | 111/150 [05:58<01:38, 2.53s/q, correct=87/111, lccp=83.0%, score=0.917, step_acc=91.2%]
GSM8K eval: 75%|#######4 | 112/150 [06:03<02:05, 3.31s/q, correct=87/112, lccp=83.2%, score=0.917, step_acc=91.2%]
GSM8K eval: 75%|#######5 | 113/150 [06:04<01:45, 2.86s/q, correct=88/113, lccp=83.3%, score=0.918, step_acc=91.3%]
GSM8K eval: 76%|#######6 | 114/150 [06:10<02:09, 3.59s/q, correct=89/114, lccp=82.8%, score=0.918, step_acc=91.3%]
GSM8K eval: 77%|#######6 | 115/150 [06:13<01:58, 3.39s/q, correct=90/115, lccp=83.0%, score=0.919, step_acc=91.3%]
GSM8K eval: 77%|#######7 | 116/150 [06:16<01:50, 3.24s/q, correct=91/116, lccp=83.1%, score=0.919, step_acc=91.4%]
GSM8K eval: 78%|#######8 | 117/150 [06:22<02:13, 4.06s/q, correct=92/117, lccp=83.3%, score=0.920, step_acc=91.5%]
GSM8K eval: 79%|#######8 | 118/150 [06:26<02:14, 4.19s/q, correct=92/118, lccp=82.6%, score=0.918, step_acc=91.4%]
GSM8K eval: 79%|#######9 | 119/150 [06:30<02:04, 4.01s/q, correct=92/119, lccp=82.7%, score=0.916, step_acc=91.5%]
GSM8K eval: 80%|######## | 120/150 [06:33<01:50, 3.68s/q, correct=93/120, lccp=82.8%, score=0.917, step_acc=91.6%]
GSM8K eval: 81%|######## | 121/150 [06:36<01:41, 3.50s/q, correct=94/121, lccp=83.0%, score=0.918, step_acc=91.6%]
GSM8K eval: 81%|########1 | 122/150 [06:39<01:34, 3.37s/q, correct=95/122, lccp=83.1%, score=0.918, step_acc=91.7%]
GSM8K eval: 82%|########2 | 123/150 [06:42<01:31, 3.39s/q, correct=96/123, lccp=83.3%, score=0.919, step_acc=91.8%]
GSM8K eval: 83%|########2 | 124/150 [06:44<01:19, 3.07s/q, correct=97/124, lccp=83.4%, score=0.920, step_acc=91.8%]
GSM8K eval: 83%|########3 | 125/150 [06:47<01:09, 2.76s/q, correct=98/125, lccp=83.5%, score=0.920, step_acc=91.9%]
GSM8K eval: 84%|########4 | 126/150 [06:49<01:06, 2.76s/q, correct=99/126, lccp=83.7%, score=0.921, step_acc=92.0%]
GSM8K eval: 85%|########4 | 127/150 [06:54<01:14, 3.26s/q, correct=100/127, lccp=83.8%, score=0.921, step_acc=92.0%]
GSM8K eval: 85%|########5 | 128/150 [06:57<01:09, 3.15s/q, correct=101/128, lccp=83.9%, score=0.922, step_acc=92.1%]
GSM8K eval: 86%|########6 | 129/150 [07:00<01:07, 3.21s/q, correct=102/129, lccp=84.0%, score=0.923, step_acc=92.1%]
GSM8K eval: 87%|########6 | 130/150 [07:02<00:55, 2.79s/q, correct=103/130, lccp=84.2%, score=0.923, step_acc=92.2%]
GSM8K eval: 87%|########7 | 131/150 [07:06<01:03, 3.34s/q, correct=104/131, lccp=84.3%, score=0.924, step_acc=92.3%]
GSM8K eval: 88%|########8 | 132/150 [07:08<00:50, 2.82s/q, correct=105/132, lccp=84.4%, score=0.924, step_acc=92.3%]
GSM8K eval: 89%|########8 | 133/150 [07:11<00:48, 2.85s/q, correct=106/133, lccp=84.5%, score=0.925, step_acc=92.4%]
GSM8K eval: 89%|########9 | 134/150 [07:15<00:53, 3.32s/q, correct=107/134, lccp=84.6%, score=0.925, step_acc=92.4%]
GSM8K eval: 90%|######### | 135/150 [07:18<00:48, 3.23s/q, correct=108/135, lccp=84.8%, score=0.926, step_acc=92.5%]
GSM8K eval: 91%|######### | 136/150 [07:23<00:50, 3.61s/q, correct=108/136, lccp=84.4%, score=0.925, step_acc=92.3%]
GSM8K eval: 91%|#########1| 137/150 [07:30<00:59, 4.56s/q, correct=109/137, lccp=84.5%, score=0.925, step_acc=92.4%]
GSM8K eval: 92%|#########2| 138/150 [07:34<00:52, 4.40s/q, correct=110/138, lccp=84.6%, score=0.926, step_acc=92.4%]
GSM8K eval: 93%|#########2| 139/150 [07:37<00:45, 4.12s/q, correct=111/139, lccp=84.7%, score=0.927, step_acc=92.5%]
GSM8K eval: 93%|#########3| 140/150 [07:41<00:41, 4.16s/q, correct=111/140, lccp=84.6%, score=0.923, step_acc=92.3%]
GSM8K eval: 94%|#########3| 141/150 [07:45<00:36, 4.07s/q, correct=112/141, lccp=84.7%, score=0.923, step_acc=92.3%]
GSM8K eval: 95%|#########4| 142/150 [07:50<00:33, 4.21s/q, correct=113/142, lccp=84.8%, score=0.924, step_acc=92.4%]
GSM8K eval: 95%|#########5| 143/150 [07:52<00:25, 3.63s/q, correct=114/143, lccp=84.9%, score=0.924, step_acc=92.4%]
GSM8K eval: 96%|#########6| 144/150 [07:54<00:19, 3.24s/q, correct=115/144, lccp=85.0%, score=0.925, step_acc=92.5%]
GSM8K eval: 97%|#########6| 145/150 [07:58<00:16, 3.22s/q, correct=115/145, lccp=84.4%, score=0.922, step_acc=92.0%]
GSM8K eval: 97%|#########7| 146/150 [08:00<00:12, 3.15s/q, correct=116/146, lccp=84.5%, score=0.922, step_acc=92.0%]
GSM8K eval: 98%|#########8| 147/150 [08:04<00:09, 3.31s/q, correct=117/147, lccp=84.6%, score=0.923, step_acc=92.1%]
GSM8K eval: 99%|#########8| 148/150 [08:08<00:06, 3.42s/q, correct=118/148, lccp=84.7%, score=0.923, step_acc=92.1%]
GSM8K eval: 99%|#########9| 149/150 [08:11<00:03, 3.45s/q, correct=119/149, lccp=84.8%, score=0.924, step_acc=92.2%]
GSM8K eval: 100%|##########| 150/150 [08:16<00:00, 3.87s/q, correct=119/150, lccp=84.7%, score=0.922, step_acc=92.0%]
GSM8K eval: 100%|##########| 150/150 [08:16<00:00, 3.31s/q, correct=119/150, lccp=84.7%, score=0.922, step_acc=92.0%]
+2026-04-26 06:53:34,257 INFO __main__ - Training Score [iter 25]: 0.9221 (best=0.9262) | n=150
+2026-04-26 06:53:34,257 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 06:53:34,257 INFO __main__ - Process score : prm_mean=0.903 prm_final=0.933 → weighted=0.921
+2026-04-26 06:53:34,257 INFO __main__ - Step accuracy : 92.0% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 06:53:34,258 INFO __main__ - Chain integrity (LCCP): 84.7% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 06:53:34,258 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 06:53:36,520 INFO __main__ - Pruned old checkpoint: iter_0005
+2026-04-26 06:53:36,527 INFO __main__ - ======================================================================
+2026-04-26 06:53:36,527 INFO __main__ - GRPO ITERATION 26/60
+2026-04-26 06:53:36,527 INFO __main__ - ======================================================================
+2026-04-26 06:53:36,548 INFO __main__ - LR this iteration: 3.91e-06 | T=0.631 | MATH ratio=46%
+
Iter 26 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 06:53:42,159 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.682 = 0.50×0.47(prox=0.47) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='110' gold='70' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
+ warnings.warn(
+/workspace/finetune_qwen/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:612: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
+ warnings.warn(
+2026-04-26 06:53:48,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.646 = 0.50×0.54(prox=0.54) + 0.40×proc(0.780[fin=0.97,mean=0.50]) + 0.10×fmt(0.650) | pred='100' gold='70' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 06:53:48,780 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.763 = 0.50×0.64(prox=0.64) + 0.40×proc(0.861[fin=0.97,mean=0.69]) + 0.10×fmt(1.000) | pred='50' gold='70' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:53:48,871 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.969 = 0.50×1.00(exact) + 0.40×proc(0.922[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:53:48,961 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.186 = 0.50×0.00(prox=0.00) + 0.40×proc(0.290[fin=0.32,mean=0.25]) + 0.10×fmt(0.700) | pred='' gold='70' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:53:57,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.954 = 0.50×1.00(exact) + 0.40×proc(0.886[fin=0.99,mean=0.73]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 06:53:57,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.767 = 0.50×0.78(prox=0.78) + 0.40×proc(0.696[fin=0.79,mean=0.55]) + 0.10×fmt(1.000) | pred='80' gold='70' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 06:53:57,893 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.911[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 06:53:57,987 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.732 = 0.50×0.54(prox=0.54) + 0.40×proc(0.906[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='100' gold='70' | step_acc=80% lccp=20% (chain=1/5 ok_count=4) n_steps=5
+2026-04-26 06:54:08,828 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.621 = 0.50×0.54(prox=0.54) + 0.40×proc(0.628[fin=0.80,mean=0.36]) + 0.10×fmt(1.000) | pred='100' gold='70' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 26 GRPO groups: 0%| | 0/20 [00:33, ?q/s, loss=0.0013, mean_r=0.728, skip=0]
Iter 26 GRPO groups: 5%|5 | 1/20 [00:33<10:43, 33.85s/q, loss=0.0013, mean_r=0.728, skip=0]2026-04-26 06:54:15,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:15,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:15,983 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:18,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:54:19,062 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:19,147 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:54:19,232 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:54:21,690 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:21,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:54:21,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='35' gold='35' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 5%|5 | 1/20 [00:45<10:43, 33.85s/q, loss=0var, mean_r=0.984, skip=1]
Iter 26 GRPO groups: 10%|# | 2/20 [00:45<06:12, 20.67s/q, loss=0var, mean_r=0.984, skip=1]2026-04-26 06:54:29,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.717 = clip(base=0.637 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.603 novelty=0.74 | sol=0.45*prm_final(0.72)+0.35*prm_mean(0.57)+0.20*lccp(0.40) | steps=5
+2026-04-26 06:54:29,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:30,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:30,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.425 = clip(base=0.345 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.215 novelty=0.74 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.44)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:54:30,572 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:30,772 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.483 = clip(base=0.403 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.303 novelty=0.74 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.41)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:54:30,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.931 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:54:31,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.992 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:31,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.990 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:54:31,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.74 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:36,639 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:36,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,039 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,429 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,629 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:37,827 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:54:38,022 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:38,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:54:38,426 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.784 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+
Iter 26 GRPO groups: 10%|# | 2/20 [01:03<06:12, 20.67s/q, loss=-0.0013, mean_r=0.880, q_acc=100%, q_rew=0.670, skip=1]
Iter 26 GRPO groups: 15%|#5 | 3/20 [01:03<05:33, 19.61s/q, loss=-0.0013, mean_r=0.880, q_acc=100%, q_rew=0.670, skip=1]2026-04-26 06:54:44,783 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,734 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,819 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:54:53,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,546 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:04,800 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:55:13,024 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='121' gold='121' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 26 GRPO groups: 15%|#5 | 3/20 [01:36<05:33, 19.61s/q, loss=0var, mean_r=0.998, skip=2]
Iter 26 GRPO groups: 20%|## | 4/20 [01:36<06:37, 24.83s/q, loss=0var, mean_r=0.998, skip=2]2026-04-26 06:55:18,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:18,700 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:18,892 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:19,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:20,059 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:20,254 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:25,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,244 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,438 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:25,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.993 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,021 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,224 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,428 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:26,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.362 = clip(base=0.344 + mod=+0.018, cap=1.00) | Q=0.64 sol=0.147 novelty=0.63 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.13)+0.20*lccp(0.00) | steps=2
+2026-04-26 06:55:26,812 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 20%|## | 4/20 [01:51<06:37, 24.83s/q, loss=0.0009, mean_r=0.923, q_acc=100%, q_rew=0.679, skip=2]
Iter 26 GRPO groups: 25%|##5 | 5/20 [01:51<05:21, 21.46s/q, loss=0.0009, mean_r=0.923, q_acc=100%, q_rew=0.679, skip=2]2026-04-26 06:55:40,159 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.964 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,358 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.968 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,553 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.990 = clip(base=0.910 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.984 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:40,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.720 = clip(base=0.640 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.508 novelty=0.80 | sol=0.45*prm_final(0.50)+0.35*prm_mean(0.52)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:40,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.601 novelty=0.80 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.59)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:55:41,147 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.960 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.741 novelty=0.80 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.65)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:41,556 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.972 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,762 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.969 novelty=0.80 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:41,961 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.960 novelty=0.80 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:55:46,010 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.982 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:46,984 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,368 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,558 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 06:55:47,757 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.988 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 25%|##5 | 5/20 [02:12<05:21, 21.46s/q, loss=-0.0003, mean_r=0.937, q_acc=100%, q_rew=0.703, skip=2]
Iter 26 GRPO groups: 30%|### | 6/20 [02:12<04:57, 21.28s/q, loss=-0.0003, mean_r=0.937, q_acc=100%, q_rew=0.703, skip=2]2026-04-26 06:55:58,228 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.401 novelty=0.65 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.63)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:58,451 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.862 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:58,668 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.601 = clip(base=0.521 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.423 novelty=0.65 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.52)+0.20*lccp(0.17) | steps=6
+2026-04-26 06:55:58,878 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.436 = clip(base=0.356 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.223 novelty=0.65 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.30)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:55:59,086 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.664 = clip(base=0.584 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.569 novelty=0.65 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.56)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:55:59,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.950 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:55:59,500 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.397 novelty=0.65 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.68)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:55:59,710 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.704 = clip(base=0.624 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.634 novelty=0.65 | sol=0.45*prm_final(0.64)+0.35*prm_mean(0.70)+0.20*lccp(0.50) | steps=4
+2026-04-26 06:55:59,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.438 = clip(base=0.358 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.222 novelty=0.65 | sol=0.45*prm_final(0.15)+0.35*prm_mean(0.30)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:56:00,125 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.448 novelty=0.65 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.49)+0.20*lccp(0.33) | steps=3
+2026-04-26 06:56:05,131 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,330 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:05,931 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,133 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.996 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,334 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,534 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:56:06,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 26 GRPO groups: 30%|### | 6/20 [02:32<04:57, 21.28s/q, loss=0.0003, mean_r=0.768, q_acc=100%, q_rew=0.674, skip=2]
Iter 26 GRPO groups: 35%|###5 | 7/20 [02:32<04:27, 20.61s/q, loss=0.0003, mean_r=0.768, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 06:56:14,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:15,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:15,144 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:20,193 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.317 = 0.50×0.09(prox=0.09) + 0.40×proc(0.242[fin=0.06,mean=0.51]) + 0.10×fmt(1.000) | pred='18' gold='3' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:56:20,269 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:20,345 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:20,422 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,379 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:25,455 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 35%|###5 | 7/20 [02:50<04:27, 20.61s/q, loss=-0.0005, mean_r=0.930, q_acc=100%, q_rew=0.674, skip=2]
Iter 26 GRPO groups: 40%|#### | 8/20 [02:50<03:58, 19.85s/q, loss=-0.0005, mean_r=0.930, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 06:56:32,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.76]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:56:40,511 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:40,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:40,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:40,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,595 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 06:56:50,762 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:56:50,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:56:58,069 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='5' gold='5' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 26 GRPO groups: 40%|#### | 8/20 [03:21<03:58, 19.85s/q, loss=0var, mean_r=0.991, skip=3]
Iter 26 GRPO groups: 45%|####5 | 9/20 [03:21<04:17, 23.39s/q, loss=0var, mean_r=0.991, skip=3]2026-04-26 06:57:01,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:01,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.953 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:01,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:57:07,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:07,991 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:08,073 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:08,150 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.962 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 06:57:12,486 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:57:12,564 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:12,641 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(0.650) | pred='-125' gold='-125' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+
Iter 26 GRPO groups: 45%|####5 | 9/20 [03:37<04:17, 23.39s/q, loss=0.0038, mean_r=0.980, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 50%|##### | 10/20 [03:37<03:31, 21.12s/q, loss=0.0038, mean_r=0.980, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:57:19,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.373 = 0.50×0.00(prox=0.00) + 0.40×proc(0.558[fin=0.64,mean=0.44]) + 0.10×fmt(1.000) | pred='11 1/7' gold='20' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 06:57:24,440 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:24,525 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:24,610 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.327 = 0.50×0.01(prox=0.01) + 0.40×proc(0.465[fin=0.53,mean=0.37]) + 0.10×fmt(1.000) | pred='1260' gold='20' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 06:57:24,691 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:27,973 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.500 = 0.50×0.54(prox=0.54) + 0.40×proc(0.327[fin=0.36,mean=0.27]) + 0.10×fmt(1.000) | pred='11.43' gold='20' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 06:57:28,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:28,138 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 06:57:28,220 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.560 = 0.50×0.20(prox=0.20) + 0.40×proc(0.899[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='60' gold='20' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 06:57:32,955 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.966[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 26 GRPO groups: 50%|##### | 10/20 [03:57<03:31, 21.12s/q, loss=-0.0002, mean_r=0.774, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 55%|#####5 | 11/20 [03:57<03:07, 20.87s/q, loss=-0.0002, mean_r=0.774, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:57:45,202 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.817 = 0.50×0.85(prox=0.85) + 0.40×proc(0.730[fin=0.74,mean=0.72]) + 0.10×fmt(1.000) | pred='44' gold='41' | step_acc=83% lccp=67% (chain=4/6 ok_count=5) n_steps=6
+2026-04-26 06:57:45,295 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:57:45,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 06:57:58,587 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.45(prox=0.45) + 0.40×proc(0.295[fin=0.04,mean=0.68]) + 0.10×fmt(1.000) | pred='16' gold='41' | step_acc=78% lccp=78% (chain=7/9 ok_count=7) n_steps=9
+2026-04-26 06:57:58,671 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.542 = 0.50×0.67(prox=0.67) + 0.40×proc(0.265[fin=0.04,mean=0.61]) + 0.10×fmt(1.000) | pred='31' gold='41' | step_acc=60% lccp=60% (chain=3/5 ok_count=3) n_steps=5
+2026-04-26 06:57:58,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 06:57:58,855 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.460 = 0.50×0.32(prox=0.32) + 0.40×proc(0.355[fin=0.14,mean=0.68]) + 0.10×fmt(1.000) | pred='84' gold='41' | step_acc=62% lccp=38% (chain=3/8 ok_count=5) n_steps=8
+2026-04-26 06:58:22,046 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 06:58:22,134 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.558 = 0.50×0.63(prox=0.63) + 0.40×proc(0.357[fin=0.27,mean=0.48]) + 0.10×fmt(1.000) | pred='29' gold='41' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 06:58:22,229 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 26 GRPO groups: 55%|#####5 | 11/20 [04:47<03:07, 20.87s/q, loss=0.0014, mean_r=0.791, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 60%|###### | 12/20 [04:47<03:56, 29.52s/q, loss=0.0014, mean_r=0.791, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:58:57,487 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.577 = 0.50×0.78(prox=0.78) + 0.40×proc(0.221[fin=0.03,mean=0.51]) + 0.10×fmt(1.000) | pred='32' gold='28' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 06:59:05,680 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,765 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,851 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.866 = 0.50×0.78(prox=0.78) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='24' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:05,937 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,499 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 06:59:13,593 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 06:59:13,676 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='28' gold='28' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 26 GRPO groups: 60%|###### | 12/20 [05:45<03:56, 29.52s/q, loss=-0.0000, mean_r=0.938, q_acc=100%, q_rew=0.674, skip=3]
Iter 26 GRPO groups: 65%|######5 | 13/20 [05:45<04:27, 38.15s/q, loss=-0.0000, mean_r=0.938, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 06:59:36,670 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.616 novelty=0.76 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.48)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:59:36,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.721 = clip(base=0.641 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.599 novelty=0.76 | sol=0.45*prm_final(0.85)+0.35*prm_mean(0.48)+0.20*lccp(0.25) | steps=4
+2026-04-26 06:59:37,090 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.947 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+2026-04-26 06:59:37,310 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.966 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:37,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.921 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 06:59:37,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.986 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:37,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.967 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=6
+2026-04-26 06:59:38,166 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.979 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:38,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.815 novelty=0.76 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 06:59:38,597 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.965 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=7
+2026-04-26 06:59:57,160 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.783 = clip(base=0.703 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.626 novelty=0.75 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.43)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:57,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.730 = clip(base=0.650 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.577 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.46)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:57,614 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.588 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.41)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:57,835 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.651 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:59:58,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.818 = clip(base=0.738 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.679 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:58,312 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.830 = clip(base=0.750 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.695 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=7
+2026-04-26 06:59:58,549 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.741 = clip(base=0.661 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.596 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:58,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.249 novelty=0.75 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.53)+0.20*lccp(0.00) | steps=5
+2026-04-26 06:59:58,994 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.799 = clip(base=0.719 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.643 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.20) | steps=5
+2026-04-26 06:59:59,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.719 novelty=0.75 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.58)+0.20*lccp(0.40) | steps=5
+
Iter 26 GRPO groups: 65%|######5 | 13/20 [06:24<04:27, 38.15s/q, loss=0.0012, mean_r=0.835, q_acc=100%, q_rew=0.695, skip=3]
Iter 26 GRPO groups: 70%|####### | 14/20 [06:24<03:50, 38.46s/q, loss=0.0012, mean_r=0.835, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 07:00:04,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:04,081 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:04,166 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.05(prox=0.05) + 0.40×proc(0.722[fin=0.88,mean=0.49]) + 0.10×fmt(1.000) | pred='443' gold='44' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:00:04,249 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.471 = 0.50×0.07(prox=0.07) + 0.40×proc(0.845[fin=0.98,mean=0.64]) + 0.10×fmt(1.000) | pred='355' gold='44' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:00:11,823 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:11,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:11,978 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:12,063 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:00:16,360 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.214 = 0.50×0.07(prox=0.07) + 0.40×proc(0.203[fin=0.15,mean=0.28]) + 0.10×fmt(1.000) | pred='353' gold='44' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 07:00:16,437 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.958[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='44' gold='44' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 26 GRPO groups: 70%|####### | 14/20 [06:41<03:50, 38.46s/q, loss=-0.0020, mean_r=0.804, q_acc=100%, q_rew=0.695, skip=3]
Iter 26 GRPO groups: 75%|#######5 | 15/20 [06:41<02:39, 31.98s/q, loss=-0.0020, mean_r=0.804, q_acc=100%, q_rew=0.695, skip=3]2026-04-26 07:00:23,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.866 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:23,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.868 = clip(base=0.788 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.858 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:23,961 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.962 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:24,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.845 = clip(base=0.765 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.826 novelty=0.62 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:00:24,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.814 novelty=0.62 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.76)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:24,567 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.967 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:24,761 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.748 novelty=0.62 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.71)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:24,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.961 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:25,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.952 novelty=0.62 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:25,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.937 novelty=0.62 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:00:33,707 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.669 = clip(base=0.589 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.519 novelty=0.68 | sol=0.45*prm_final(0.74)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:33,912 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.697 = clip(base=0.617 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.602 novelty=0.68 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.49)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:34,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.186 novelty=0.68 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.31)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:00:34,318 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.676 = clip(base=0.596 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.558 novelty=0.68 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:34,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.819 = clip(base=0.739 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.755 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.74)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:34,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.851 = clip(base=0.771 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.824 novelty=0.68 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:00:34,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.592 = clip(base=0.512 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.387 novelty=0.68 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.34)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:00:35,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.435 = clip(base=0.355 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.149 novelty=0.68 | sol=0.45*prm_final(0.17)+0.35*prm_mean(0.21)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:00:35,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.392 = clip(base=0.312 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.107 novelty=0.68 | sol=0.45*prm_final(0.11)+0.35*prm_mean(0.17)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:00:35,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.430 = clip(base=0.350 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.174 novelty=0.68 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.24)+0.20*lccp(0.33) | steps=3
+
Iter 26 GRPO groups: 75%|#######5 | 15/20 [07:00<02:39, 31.98s/q, loss=0.0016, mean_r=0.745, q_acc=100%, q_rew=0.692, skip=3]
Iter 26 GRPO groups: 80%|######## | 16/20 [07:00<01:52, 28.17s/q, loss=0.0016, mean_r=0.745, q_acc=100%, q_rew=0.692, skip=3]2026-04-26 07:00:44,289 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.424 = clip(base=0.344 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.199 novelty=0.66 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.33)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:00:44,483 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.736 novelty=0.66 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:00:44,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.501 novelty=0.66 | sol=0.45*prm_final(0.67)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:00:44,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.388 = clip(base=0.308 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.156 novelty=0.66 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.26)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.687 = clip(base=0.607 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.602 novelty=0.66 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.51)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,270 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.494 = clip(base=0.414 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.321 novelty=0.66 | sol=0.45*prm_final(0.21)+0.35*prm_mean(0.46)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:00:45,467 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.383 = clip(base=0.303 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.168 novelty=0.66 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.28)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:00:45,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.578 = clip(base=0.498 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.468 novelty=0.66 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.39)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:00:45,861 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.492 = clip(base=0.412 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.315 novelty=0.66 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.40)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:00:46,055 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.428 = clip(base=0.348 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.188 novelty=0.66 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.34)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:00:52,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.986 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:52,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.926 novelty=0.77 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:52,653 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:52,854 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:53,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.867 novelty=0.77 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:53,455 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.988 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:00:53,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:00:54,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 26 GRPO groups: 80%|######## | 16/20 [07:19<01:52, 28.17s/q, loss=0.0008, mean_r=0.738, q_acc=100%, q_rew=0.685, skip=3]
Iter 26 GRPO groups: 85%|########5 | 17/20 [07:19<01:15, 25.29s/q, loss=0.0008, mean_r=0.738, q_acc=100%, q_rew=0.685, skip=3]2026-04-26 07:01:06,608 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.670 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:01:06,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,018 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.969 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,229 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.798 = clip(base=0.718 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.773 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:01:07,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:07,871 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:08,081 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.737 = clip(base=0.657 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.675 novelty=0.71 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:01:08,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:01:08,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.655 = clip(base=0.575 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.527 novelty=0.71 | sol=0.45*prm_final(0.65)+0.35*prm_mean(0.55)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:01:12,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,568 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:13,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,376 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,574 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:14,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 85%|########5 | 17/20 [07:39<01:15, 25.29s/q, loss=-0.0003, mean_r=0.876, q_acc=100%, q_rew=0.673, skip=3]
Iter 26 GRPO groups: 90%|######### | 18/20 [07:39<00:47, 23.91s/q, loss=-0.0003, mean_r=0.876, q_acc=100%, q_rew=0.673, skip=3]2026-04-26 07:01:22,246 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.855 = 0.50×0.75(prox=0.75) + 0.40×proc(0.950[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='35' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:01:22,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.983 = 0.50×1.00(exact) + 0.40×proc(0.957[fin=1.00,mean=0.89]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:27,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:01:27,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:27,799 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:01:27,884 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:01:32,239 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.874[fin=1.00,mean=0.69]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:01:32,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.974 = 0.50×1.00(exact) + 0.40×proc(0.935[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+2026-04-26 07:01:32,409 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.941 = 0.50×1.00(exact) + 0.40×proc(0.852[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:01:32,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 26 GRPO groups: 90%|######### | 18/20 [08:12<00:47, 23.91s/q, loss=0.0015, mean_r=0.969, q_acc=100%, q_rew=0.673, skip=3]
Iter 26 GRPO groups: 95%|#########5| 19/20 [08:12<00:26, 26.62s/q, loss=0.0015, mean_r=0.969, q_acc=100%, q_rew=0.673, skip=3]2026-04-26 07:01:54,781 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:54,996 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,649 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:55,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:01:56,728 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.991 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,415 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.991 = clip(base=0.911 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.969 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:02,813 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.355 novelty=0.77 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.40)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:02:03,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:03,206 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.963 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.587 = clip(base=0.507 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.348 novelty=0.77 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.60)+0.20*lccp(0.67) | steps=3
+2026-04-26 07:02:03,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.956 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,792 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:03,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.950 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:04,193 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.967 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+
Iter 26 GRPO groups: 95%|#########5| 19/20 [08:29<00:26, 26.62s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
Iter 26 GRPO groups: 100%|##########| 20/20 [08:29<00:00, 23.66s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
Iter 26 GRPO groups: 100%|##########| 20/20 [08:29<00:00, 25.48s/q, loss=-0.0001, mean_r=0.924, q_acc=100%, q_rew=0.679, skip=3]
+2026-04-26 07:02:06,205 INFO __main__ - Iter 26 | loss=0.0004 | reward mean=0.867 std=0.180 | gt_match=78.9% | grounded_acc=92.7% | step_acc=89.0% | lccp=79.4% | batch_acc=92.0% | phase=SELFPLAY_RAMP sp_ratio=46% | groups=26 skipped=3(0var=3) | lr=3.80e-06 | 509.7s
+2026-04-26 07:02:06,206 INFO __main__ - Question generation: 9/9 valid (100%) | q_reward=0.679 | q_acc=100.0% (>0.5 quality) | topic=0.66 diff=0.25 clarity=1.00 novelty=0.45 solvability=0.93
+2026-04-26 07:02:06,207 INFO __main__ - ======================================================================
+2026-04-26 07:02:06,207 INFO __main__ - GRPO ITERATION 27/60
+2026-04-26 07:02:06,207 INFO __main__ - ======================================================================
+2026-04-26 07:02:06,226 INFO __main__ - LR this iteration: 3.80e-06 | T=0.624 | MATH ratio=48%
+
Iter 27 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:02:10,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.870[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:02:10,836 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.940 = 0.50×1.00(exact) + 0.40×proc(0.849[fin=0.80,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:10,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.925[fin=0.93,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:11,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.933 = 0.50×1.00(exact) + 0.40×proc(0.832[fin=0.80,mean=0.89]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:18,310 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.904 = 0.50×1.00(exact) + 0.40×proc(0.761[fin=0.80,mean=0.70]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:02:18,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.862[fin=0.82,mean=0.92]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:18,490 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.977 = 0.50×1.00(exact) + 0.40×proc(0.943[fin=0.95,mean=0.93]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:18,574 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.911 = 0.50×1.00(exact) + 0.40×proc(0.779[fin=0.73,mean=0.85]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:28,279 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.905 = 0.50×1.00(exact) + 0.40×proc(0.762[fin=0.68,mean=0.89]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:02:28,366 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.901 = 0.50×1.00(exact) + 0.40×proc(0.752[fin=0.67,mean=0.87]) + 0.10×fmt(1.000) | pred='41' gold='41' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 27 GRPO groups: 0%| | 0/20 [00:23, ?q/s, loss=-0.0008, mean_r=0.933, skip=0]
Iter 27 GRPO groups: 5%|5 | 1/20 [00:23<07:28, 23.62s/q, loss=-0.0008, mean_r=0.933, skip=0]2026-04-26 07:02:36,835 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.954 + mod=+0.080, cap=1.00) | Q=0.89 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,036 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,235 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:37,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:37,634 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:02:37,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,232 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,433 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:38,632 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:02:42,927 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.920 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,125 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.995 = clip(base=0.915 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.972 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,535 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.983 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:43,930 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.973 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.982 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.921 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:02:44,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.777 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:02:44,734 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 5%|5 | 1/20 [00:40<07:28, 23.62s/q, loss=-0.0002, mean_r=0.991, q_acc=100%, q_rew=0.820, skip=1]
Iter 27 GRPO groups: 10%|# | 2/20 [00:40<05:50, 19.45s/q, loss=-0.0002, mean_r=0.991, q_acc=100%, q_rew=0.820, skip=1]2026-04-26 07:02:50,688 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:50,772 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,313 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,395 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,479 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:02:59,561 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:03:08,557 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.967 = 0.50×1.00(exact) + 0.40×proc(0.917[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:03:08,640 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:03:08,723 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:03:08,806 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 10%|# | 2/20 [01:09<05:50, 19.45s/q, loss=0var, mean_r=0.996, skip=2]
Iter 27 GRPO groups: 15%|#5 | 3/20 [01:09<06:45, 23.88s/q, loss=0var, mean_r=0.996, skip=2]2026-04-26 07:03:23,764 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:23,962 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.985 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,169 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.890 = clip(base=0.810 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.963 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:03:24,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.985 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,585 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.984 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.982 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:24,985 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.972 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,191 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.987 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.989 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:25,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.964 novelty=0.68 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:03:38,867 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:39,064 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.756 = clip(base=0.676 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.718 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:03:39,272 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.768 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.76)+0.20*lccp(0.38) | steps=8
+2026-04-26 07:03:39,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.766 = clip(base=0.686 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.755 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:03:39,667 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.948 novelty=0.73 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:39,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:03:40,068 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.753 = clip(base=0.673 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.716 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.74)+0.20*lccp(0.12) | steps=8
+2026-04-26 07:03:40,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.988 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=11
+2026-04-26 07:03:40,477 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:03:40,700 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.802 novelty=0.73 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.89)+0.20*lccp(0.27) | steps=11
+
Iter 27 GRPO groups: 15%|#5 | 3/20 [01:36<06:45, 23.88s/q, loss=0.0002, mean_r=0.874, q_acc=100%, q_rew=0.709, skip=2]
Iter 27 GRPO groups: 20%|## | 4/20 [01:36<06:41, 25.06s/q, loss=0.0002, mean_r=0.874, q_acc=100%, q_rew=0.709, skip=2]2026-04-26 07:03:56,387 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:03:56,492 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.762 = 0.50×0.67(prox=0.67) + 0.40×proc(0.823[fin=1.00,mean=0.56]) + 0.10×fmt(1.000) | pred='3' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:03:56,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:03:56,705 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.610 = 0.50×0.40(prox=0.40) + 0.40×proc(0.775[fin=0.98,mean=0.46]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:04:01,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.324 = 0.50×0.33(prox=0.33) + 0.40×proc(0.230[fin=0.29,mean=0.15]) + 0.10×fmt(0.650) | pred='0' gold='4' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 07:04:01,592 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.879[fin=0.98,mean=0.72]) + 0.10×fmt(1.000) | pred='0' gold='4' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:04:01,703 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.989[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:04:01,808 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.623 = 0.50×0.40(prox=0.40) + 0.40×proc(0.807[fin=0.98,mean=0.55]) + 0.10×fmt(1.000) | pred='1' gold='4' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:04:04,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.896 = 0.50×1.00(exact) + 0.40×proc(0.739[fin=0.97,mean=0.39]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=20% lccp=0% (chain=0/5 ok_count=1) n_steps=5
+2026-04-26 07:04:04,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 20%|## | 4/20 [01:59<06:41, 25.06s/q, loss=0.0005, mean_r=0.775, q_acc=100%, q_rew=0.709, skip=2]
Iter 27 GRPO groups: 25%|##5 | 5/20 [01:59<06:07, 24.52s/q, loss=0.0005, mean_r=0.775, q_acc=100%, q_rew=0.709, skip=2]2026-04-26 07:04:15,120 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.90 sol=0.974 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,308 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.904 novelty=0.77 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.77)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,495 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.687 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:15,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.910 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.76)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:15,872 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.767 = clip(base=0.687 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.582 novelty=0.77 | sol=0.45*prm_final(0.84)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:16,058 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.481 = clip(base=0.401 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.208 novelty=0.77 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.39)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:04:16,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:04:16,441 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.691 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:04:16,627 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.689 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.70)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:04:16,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.907 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.75)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:25,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:25,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:04:26,593 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:26,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:04:27,002 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:27,219 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:04:27,425 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.973 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:04:27,630 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.994 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+
Iter 27 GRPO groups: 25%|##5 | 5/20 [02:23<06:07, 24.52s/q, loss=-0.0001, mean_r=0.891, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 30%|### | 6/20 [02:23<05:37, 24.14s/q, loss=-0.0001, mean_r=0.891, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:04:41,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:04:41,603 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.774 = 0.50×0.64(prox=0.64) + 0.40×proc(0.883[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=80% lccp=20% (chain=2/10 ok_count=8) n_steps=10
+2026-04-26 07:05:05,078 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.769 = 0.50×0.64(prox=0.64) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:05:05,170 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(0.700) | pred='' gold='57' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:05:05,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.942 = 0.50×1.00(exact) + 0.40×proc(0.856[fin=0.98,mean=0.66]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:05:05,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:15,148 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.788 = 0.50×0.64(prox=0.64) + 0.40×proc(0.920[fin=1.00,mean=0.80]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 07:05:15,240 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:15,332 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.959 = 0.50×1.00(exact) + 0.40×proc(0.898[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='57' gold='57' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 07:05:15,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.795 = 0.50×0.64(prox=0.64) + 0.40×proc(0.937[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='41' gold='57' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+
Iter 27 GRPO groups: 30%|### | 6/20 [03:16<05:37, 24.14s/q, loss=-0.0008, mean_r=0.858, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 35%|###5 | 7/20 [03:16<07:19, 33.80s/q, loss=-0.0008, mean_r=0.858, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:05:28,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:28,418 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.950 = 0.50×1.00(exact) + 0.40×proc(0.875[fin=0.89,mean=0.85]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:28,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.960[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:28,584 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:35,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:35,909 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:05:35,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.728 = 0.50×0.67(prox=0.67) + 0.40×proc(0.736[fin=0.88,mean=0.52]) + 0.10×fmt(1.000) | pred='30' gold='24' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:05:36,079 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.975 = 0.50×1.00(exact) + 0.40×proc(0.938[fin=0.94,mean=0.93]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:05:45,558 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.990 = 0.50×1.00(exact) + 0.40×proc(0.975[fin=0.99,mean=0.95]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:05:45,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='24' gold='24' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 27 GRPO groups: 35%|###5 | 7/20 [03:40<07:19, 33.80s/q, loss=-0.0007, mean_r=0.961, q_acc=100%, q_rew=0.712, skip=2]
Iter 27 GRPO groups: 40%|#### | 8/20 [03:40<06:08, 30.69s/q, loss=-0.0007, mean_r=0.961, q_acc=100%, q_rew=0.712, skip=2]2026-04-26 07:05:55,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.948 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.993 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:05:56,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.936 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,619 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.935 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.994 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:56,831 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.990 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,049 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.930 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:05:57,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.922 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.976 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,687 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:05:57,901 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.928 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:04,223 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.962 + mod=+0.080, cap=1.00) | Q=0.91 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:04,434 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.803 = clip(base=0.723 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.672 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:06:04,643 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:06:04,855 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:05,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:06:05,277 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:05,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:06:05,695 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.938 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:06:05,905 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.842 = clip(base=0.762 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.729 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:06:06,118 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.937 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+
Iter 27 GRPO groups: 40%|#### | 8/20 [04:01<06:08, 30.69s/q, loss=-0.0006, mean_r=0.982, q_acc=100%, q_rew=0.745, skip=3]
Iter 27 GRPO groups: 45%|####5 | 9/20 [04:01<05:03, 27.55s/q, loss=-0.0006, mean_r=0.982, q_acc=100%, q_rew=0.745, skip=3]2026-04-26 07:06:13,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:13,583 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:06:21,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:21,515 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:21,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:06:21,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:06:26,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:26,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:06:26,552 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:06:26,636 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.980[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='10' gold='10' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 27 GRPO groups: 45%|####5 | 9/20 [04:25<05:03, 27.55s/q, loss=0var, mean_r=0.995, skip=4]
Iter 27 GRPO groups: 50%|##### | 10/20 [04:25<04:24, 26.48s/q, loss=0var, mean_r=0.995, skip=4]2026-04-26 07:07:05,877 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:05,974 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.14(prox=0.14) + 0.40×proc(0.927[fin=0.96,mean=0.88]) + 0.10×fmt(1.000) | pred='3' gold='0' | step_acc=88% lccp=75% (chain=6/8 ok_count=7) n_steps=8
+2026-04-26 07:07:06,068 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.974[fin=1.00,mean=0.94]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:06,162 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:14,845 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.01(prox=0.01) + 0.40×proc(0.949[fin=1.00,mean=0.88]) + 0.10×fmt(1.000) | pred='84' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:14,938 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.978 = 0.50×1.00(exact) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:07:15,035 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='0' gold='0' | step_acc=100% lccp=100% (chain=12/12 ok_count=12) n_steps=12
+2026-04-26 07:07:15,132 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.949[fin=0.99,mean=0.89]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:07:25,542 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.990[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='2' gold='0' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 27 GRPO groups: 50%|##### | 10/20 [05:20<04:24, 26.48s/q, loss=0.0008, mean_r=0.647, q_acc=100%, q_rew=0.745, skip=4]
Iter 27 GRPO groups: 55%|#####5 | 11/20 [05:20<05:17, 35.27s/q, loss=0.0008, mean_r=0.647, q_acc=100%, q_rew=0.745, skip=4]2026-04-26 07:07:32,682 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:32,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,270 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.785 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:07:33,466 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:33,672 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.990 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:07:33,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,073 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:34,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:07:37,659 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:37,834 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,009 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,188 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,367 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,545 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:38,917 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.989 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:39,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:07:39,281 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+
Iter 27 GRPO groups: 55%|#####5 | 11/20 [05:34<05:17, 35.27s/q, loss=-0.0004, mean_r=0.898, q_acc=100%, q_rew=0.711, skip=4]
Iter 27 GRPO groups: 60%|###### | 12/20 [05:34<03:50, 28.79s/q, loss=-0.0004, mean_r=0.898, q_acc=100%, q_rew=0.711, skip=4]2026-04-26 07:07:49,102 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.760 = 0.50×0.59(prox=0.59) + 0.40×proc(0.908[fin=0.98,mean=0.79]) + 0.10×fmt(1.000) | pred='230' gold='350' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+2026-04-26 07:07:49,198 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:07:49,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:08:00,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×0.74(prox=0.74) + 0.40×proc(0.664[fin=0.68,mean=0.64]) + 0.10×fmt(1.000) | pred='410' gold='350' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:08:00,297 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:08:00,389 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.945 = 0.50×1.00(exact) + 0.40×proc(0.863[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:08:00,481 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.926[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:08:09,027 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.476 = 0.50×0.54(prox=0.54) + 0.40×proc(0.267[fin=0.18,mean=0.40]) + 0.10×fmt(1.000) | pred='200' gold='350' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 07:08:09,121 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='350' gold='350' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:08:09,217 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.779[fin=0.96,mean=0.50]) + 0.10×fmt(1.000) | pred='70' gold='350' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+
Iter 27 GRPO groups: 60%|###### | 12/20 [06:04<03:50, 28.79s/q, loss=-0.0007, mean_r=0.843, q_acc=100%, q_rew=0.711, skip=4]
Iter 27 GRPO groups: 65%|######5 | 13/20 [06:04<03:23, 29.06s/q, loss=-0.0007, mean_r=0.843, q_acc=100%, q_rew=0.711, skip=4]2026-04-26 07:08:17,850 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,069 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,298 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:18,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,182 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,404 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:19,847 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,444 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,646 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:26,849 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.893 = clip(base=0.813 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.977 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,051 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,464 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:27,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:08:28,081 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.747 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:08:28,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 27 GRPO groups: 65%|######5 | 13/20 [06:23<03:23, 29.06s/q, loss=-0.0009, mean_r=0.902, q_acc=100%, q_rew=0.689, skip=4]
Iter 27 GRPO groups: 70%|####### | 14/20 [06:23<02:36, 26.11s/q, loss=-0.0009, mean_r=0.902, q_acc=100%, q_rew=0.689, skip=4]2026-04-26 07:08:41,304 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.974 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:08:41,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.857 = clip(base=0.777 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.900 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:08:41,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.813 = clip(base=0.733 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.826 novelty=0.74 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.79)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:41,887 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.622 = clip(base=0.542 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.490 novelty=0.74 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.58)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:08:42,074 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.805 = clip(base=0.725 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.814 novelty=0.74 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.80)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:42,269 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.950 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:08:42,461 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.881 = clip(base=0.801 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.940 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:08:42,650 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.782 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.81)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:08:42,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.842 novelty=0.74 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.82)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:43,024 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.820 = clip(base=0.740 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.837 novelty=0.74 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.81)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:57,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.483 = clip(base=0.403 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.281 novelty=0.73 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.20)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:57,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.584 = clip(base=0.504 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.485 novelty=0.73 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.46)+0.20*lccp(0.14) | steps=7
+2026-04-26 07:08:58,140 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.454 = clip(base=0.374 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.272 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.37)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:08:58,369 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.570 = clip(base=0.490 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.470 novelty=0.73 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:58,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.356 = clip(base=0.276 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.110 novelty=0.73 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.15)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:58,821 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.369 = clip(base=0.289 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.123 novelty=0.73 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.18)+0.20*lccp(0.00) | steps=10
+2026-04-26 07:08:59,054 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.549 = clip(base=0.469 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.383 novelty=0.73 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.61)+0.20*lccp(0.11) | steps=9
+2026-04-26 07:08:59,282 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.376 = clip(base=0.296 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.148 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.24)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:08:59,506 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.644 = clip(base=0.564 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.553 novelty=0.73 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.64)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:08:59,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.695 = clip(base=0.615 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.639 novelty=0.73 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.67)+0.20*lccp(0.00) | steps=5
+
Iter 27 GRPO groups: 70%|####### | 14/20 [06:55<02:36, 26.11s/q, loss=-0.0002, mean_r=0.665, q_acc=100%, q_rew=0.673, skip=4]
Iter 27 GRPO groups: 75%|#######5 | 15/20 [06:55<02:18, 27.72s/q, loss=-0.0002, mean_r=0.665, q_acc=100%, q_rew=0.673, skip=4]2026-04-26 07:09:11,782 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:21,711 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:09:21,795 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:21,878 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.908[fin=0.97,mean=0.82]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:09:21,968 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,478 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,571 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:09:35,662 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:09:35,754 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.596 = 0.50×0.73(prox=0.73) + 0.40×proc(0.332[fin=0.19,mean=0.55]) + 0.10×fmt(1.000) | pred='19' gold='16' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 07:09:43,872 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 27 GRPO groups: 75%|#######5 | 15/20 [07:39<02:18, 27.72s/q, loss=-0.0001, mean_r=0.956, q_acc=100%, q_rew=0.673, skip=4]
Iter 27 GRPO groups: 80%|######## | 16/20 [07:39<02:10, 32.59s/q, loss=-0.0001, mean_r=0.956, q_acc=100%, q_rew=0.673, skip=4]2026-04-26 07:09:52,685 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.933 = clip(base=0.853 + mod=+0.080, cap=1.00) | Q=0.63 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:52,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,087 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,503 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,713 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:53,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,119 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:54,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:09:59,769 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.885 = clip(base=0.805 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.926 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:09:59,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.898 = clip(base=0.818 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:00,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.691 = clip(base=0.611 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.637 novelty=0.69 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.64)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:00,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.750 = clip(base=0.670 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.735 novelty=0.69 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:10:00,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.749 = clip(base=0.669 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.734 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.64)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:00,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:00,988 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.497 = clip(base=0.417 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.358 novelty=0.69 | sol=0.45*prm_final(0.33)+0.35*prm_mean(0.41)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:01,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.894 = clip(base=0.814 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.976 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:01,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.746 = clip(base=0.666 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.729 novelty=0.69 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:10:01,621 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.876 = clip(base=0.796 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.946 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 80%|######## | 16/20 [07:57<02:10, 32.59s/q, loss=-0.0004, mean_r=0.851, q_acc=100%, q_rew=0.661, skip=4]
Iter 27 GRPO groups: 85%|########5 | 17/20 [07:57<01:24, 28.21s/q, loss=-0.0004, mean_r=0.851, q_acc=100%, q_rew=0.661, skip=4]2026-04-26 07:10:39,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.967 + mod=+0.080, cap=1.00) | Q=0.93 sol=0.992 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:39,678 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.945 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:39,873 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.87 sol=0.993 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.943 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.942 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.995 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:10:40,454 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.949 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.997 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:40,647 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.950 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.999 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:40,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.949 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.996 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:41,031 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.939 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.998 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:43,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.976 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:43,944 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.981 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,328 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,522 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.976 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:44,909 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.989 novelty=0.65 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.977 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,296 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.982 novelty=0.65 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:10:45,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.968 novelty=0.65 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+
Iter 27 GRPO groups: 85%|########5 | 17/20 [08:40<01:24, 28.21s/q, loss=0.0001, mean_r=0.968, q_acc=100%, q_rew=0.674, skip=5]
Iter 27 GRPO groups: 90%|######### | 18/20 [08:40<01:05, 32.88s/q, loss=0.0001, mean_r=0.968, q_acc=100%, q_rew=0.674, skip=5]2026-04-26 07:10:59,507 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.11(prox=0.11) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='15' gold='3' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:10:59,591 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1/8' gold='3' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:10:59,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.510 = 0.50×0.14(prox=0.14) + 0.40×proc(0.847[fin=0.98,mean=0.65]) + 0.10×fmt(1.000) | pred='12' gold='3' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:11:12,171 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:11:12,256 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:11:12,355 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.706 = 0.50×0.50(prox=0.50) + 0.40×proc(0.890[fin=0.98,mean=0.75]) + 0.10×fmt(1.000) | pred='1.5' gold='3' | step_acc=75% lccp=62% (chain=5/8 ok_count=6) n_steps=8
+2026-04-26 07:11:12,441 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.169 = 0.50×0.02(prox=0.02) + 0.40×proc(0.141[fin=0.06,mean=0.26]) + 0.10×fmt(1.000) | pred='63' gold='3' | step_acc=14% lccp=0% (chain=0/7 ok_count=1) n_steps=7
+2026-04-26 07:11:35,839 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.203 = 0.50×0.00(prox=0.00) + 0.40×proc(0.252[fin=0.33,mean=0.13]) + 0.10×fmt(1.000) | pred='315' gold='3' | step_acc=0% lccp=0% (chain=0/3 ok_count=0) n_steps=3
+2026-04-26 07:11:35,920 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.333 = 0.50×0.11(prox=0.11) + 0.40×proc(0.443[fin=0.55,mean=0.28]) + 0.10×fmt(1.000) | pred='15' gold='3' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:11:36,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.300 = 0.50×0.02(prox=0.02) + 0.40×proc(0.355[fin=0.30,mean=0.44]) + 0.10×fmt(1.000) | pred='90' gold='3' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+
Iter 27 GRPO groups: 90%|######### | 18/20 [09:31<01:05, 32.88s/q, loss=0.0003, mean_r=0.532, q_acc=100%, q_rew=0.674, skip=5]
Iter 27 GRPO groups: 95%|#########5| 19/20 [09:31<00:38, 38.15s/q, loss=0.0003, mean_r=0.532, q_acc=100%, q_rew=0.674, skip=5]2026-04-26 07:11:42,739 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.992 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:42,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.994 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,314 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,505 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.997 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,705 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:43,901 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,091 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,283 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:11:44,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.987 novelty=0.81 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:11:59,943 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.708 = clip(base=0.628 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.499 novelty=0.74 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.87)+0.20*lccp(0.87) | steps=15
+2026-04-26 07:12:00,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.898 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(0.71) | steps=7
+2026-04-26 07:12:00,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.739 = clip(base=0.659 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.601 novelty=0.74 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.46)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:12:00,617 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.893 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.91)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:12:00,839 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.550 = clip(base=0.470 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.288 novelty=0.74 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.44)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:12:01,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.942 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:12:01,291 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.978 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:12:01,519 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.684 novelty=0.74 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.80)+0.20*lccp(0.71) | steps=7
+2026-04-26 07:12:01,746 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.742 = clip(base=0.662 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.551 novelty=0.74 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.60)+0.20*lccp(0.14) | steps=7
+2026-04-26 07:12:01,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.667 = clip(base=0.587 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.446 novelty=0.74 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.50)+0.20*lccp(0.40) | steps=5
+
Iter 27 GRPO groups: 95%|#########5| 19/20 [09:57<00:38, 38.15s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
Iter 27 GRPO groups: 100%|##########| 20/20 [09:57<00:00, 34.56s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
Iter 27 GRPO groups: 100%|##########| 20/20 [09:57<00:00, 29.87s/q, loss=0.0003, mean_r=0.887, q_acc=100%, q_rew=0.683, skip=5]
+2026-04-26 07:12:03,730 INFO __main__ - Iter 27 | loss=-0.0002 | reward mean=0.878 std=0.163 | gt_match=68.7% | grounded_acc=93.9% | step_acc=86.2% | lccp=74.1% | batch_acc=95.6% | phase=SELFPLAY_RAMP sp_ratio=50% | groups=25 skipped=5(0var=5) | lr=3.67e-06 | 597.5s
+2026-04-26 07:12:03,730 INFO __main__ - Question generation: 10/10 valid (100%) | q_reward=0.683 | q_acc=100.0% (>0.5 quality) | topic=0.57 diff=0.36 clarity=1.00 novelty=0.46 solvability=0.98
+2026-04-26 07:12:03,731 INFO __main__ - ======================================================================
+2026-04-26 07:12:03,731 INFO __main__ - GRPO ITERATION 28/60
+2026-04-26 07:12:03,731 INFO __main__ - ======================================================================
+2026-04-26 07:12:03,751 INFO __main__ - LR this iteration: 3.67e-06 | T=0.617 | MATH ratio=50%
+
Iter 28 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:12:09,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.879 = clip(base=0.799 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.925 novelty=0.65 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,050 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.740 = clip(base=0.660 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.723 novelty=0.65 | sol=0.45*prm_final(0.51)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.765 = clip(base=0.685 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.762 novelty=0.65 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.767 = clip(base=0.687 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.768 novelty=0.65 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,688 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.828 = clip(base=0.748 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.869 novelty=0.65 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:10,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.790 = clip(base=0.710 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.807 novelty=0.65 | sol=0.45*prm_final(0.66)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,108 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.719 = clip(base=0.639 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.684 novelty=0.65 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.65)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:12:11,316 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.877 novelty=0.65 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.738 novelty=0.65 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:11,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.844 novelty=0.65 | sol=0.45*prm_final(0.73)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:12:18,939 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,141 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,381 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,601 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:19,813 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,015 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,216 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:12:20,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 28 GRPO groups: 0%| | 0/20 [00:18, ?q/s, loss=0.0005, mean_r=0.872, q_acc=100%, q_rew=0.630, skip=0]
Iter 28 GRPO groups: 5%|5 | 1/20 [00:18<05:56, 18.76s/q, loss=0.0005, mean_r=0.872, q_acc=100%, q_rew=0.630, skip=0]2026-04-26 07:12:44,140 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.524 = 0.50×0.00(prox=0.00) + 0.40×proc(0.934[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{62}$' gold='13' | step_acc=78% lccp=33% (chain=3/9 ok_count=7) n_steps=9
+2026-04-26 07:13:07,451 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.891 = 0.50×0.85(prox=0.85) + 0.40×proc(0.914[fin=0.99,mean=0.80]) + 0.10×fmt(1.000) | pred='12' gold='13' | step_acc=76% lccp=29% (chain=5/17 ok_count=13) n_steps=17
+2026-04-26 07:13:07,544 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.438 = 0.50×0.00(prox=0.00) + 0.40×proc(0.844[fin=1.00,mean=0.61]) + 0.10×fmt(1.000) | pred='$10 + \\sqrt{170}$' gold='13' | step_acc=57% lccp=0% (chain=0/7 ok_count=4) n_steps=7
+2026-04-26 07:13:07,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.738 = 0.50×1.00(exact) + 0.40×proc(0.345[fin=0.25,mean=0.48]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=42% lccp=17% (chain=2/12 ok_count=5) n_steps=12
+2026-04-26 07:13:07,741 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.62(prox=0.62) + 0.40×proc(0.830[fin=0.98,mean=0.61]) + 0.10×fmt(1.000) | pred='9' gold='13' | step_acc=57% lccp=14% (chain=1/7 ok_count=4) n_steps=7
+2026-04-26 07:13:28,450 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.377 = 0.50×0.00(prox=0.00) + 0.40×proc(0.692[fin=0.72,mean=0.65]) + 0.10×fmt(1.000) | pred='No solution' gold='13' | step_acc=71% lccp=0% (chain=0/7 ok_count=5) n_steps=7
+2026-04-26 07:13:28,548 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.682 = 0.50×0.52(prox=0.52) + 0.40×proc(0.805[fin=0.98,mean=0.54]) + 0.10×fmt(1.000) | pred='19' gold='13' | step_acc=45% lccp=0% (chain=0/11 ok_count=5) n_steps=11
+2026-04-26 07:13:28,642 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.290 = 0.50×0.00(prox=0.00) + 0.40×proc(0.551[fin=0.62,mean=0.45]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=43% lccp=0% (chain=0/7 ok_count=3) n_steps=7
+2026-04-26 07:13:28,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.00(prox=0.00) + 0.40×proc(0.834[fin=0.95,mean=0.65]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{37+4\\sqrt{14}}$' gold='13' | step_acc=69% lccp=23% (chain=3/13 ok_count=9) n_steps=13
+2026-04-26 07:13:52,305 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.468 = 0.50×0.00(prox=0.00) + 0.40×proc(0.858[fin=0.98,mean=0.68]) + 0.10×fmt(1.000) | pred='$5+\\sqrt{129}$' gold='13' | step_acc=67% lccp=17% (chain=2/12 ok_count=8) n_steps=12
+
Iter 28 GRPO groups: 5%|5 | 1/20 [01:50<05:56, 18.76s/q, loss=0.0005, mean_r=0.562, q_acc=100%, q_rew=0.630, skip=0]
Iter 28 GRPO groups: 10%|# | 2/20 [01:50<18:26, 61.45s/q, loss=0.0005, mean_r=0.562, q_acc=100%, q_rew=0.630, skip=0]2026-04-26 07:14:03,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:03,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:03,748 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:14:03,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:04,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.576 = clip(base=0.496 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.339 novelty=0.69 | sol=0.45*prm_final(0.15)+0.35*prm_mean(0.58)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:14:04,353 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:04,560 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:04,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:04,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.985 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=8
+2026-04-26 07:14:05,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:22,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.608 novelty=0.68 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=9
+2026-04-26 07:14:22,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.838 = clip(base=0.758 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.779 novelty=0.68 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=8
+2026-04-26 07:14:23,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.748 novelty=0.68 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=9
+2026-04-26 07:14:23,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.698 novelty=0.68 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.69)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:14:23,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.781 = clip(base=0.701 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.672 novelty=0.68 | sol=0.45*prm_final(0.62)+0.35*prm_mean(0.89)+0.20*lccp(0.42) | steps=12
+2026-04-26 07:14:23,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.591 = clip(base=0.511 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.362 novelty=0.68 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.50)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:14:24,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.605 = clip(base=0.525 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.392 novelty=0.68 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.60)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:14:24,313 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.767 novelty=0.68 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.83)+0.20*lccp(0.38) | steps=8
+2026-04-26 07:14:24,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.312 novelty=0.68 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.63)+0.20*lccp(0.25) | steps=8
+2026-04-26 07:14:24,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.863 = clip(base=0.783 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.827 novelty=0.68 | sol=0.45*prm_final(0.69)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=7
+
Iter 28 GRPO groups: 10%|# | 2/20 [02:22<18:26, 61.45s/q, loss=-0.0003, mean_r=0.830, q_acc=100%, q_rew=0.673, skip=0]
Iter 28 GRPO groups: 15%|#5 | 3/20 [02:22<13:41, 48.30s/q, loss=-0.0003, mean_r=0.830, q_acc=100%, q_rew=0.673, skip=0]2026-04-26 07:14:34,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.747 = clip(base=0.667 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.656 novelty=0.70 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,029 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.762 novelty=0.70 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.75)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:14:35,233 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.962 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:35,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.495 = clip(base=0.415 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.285 novelty=0.70 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.50)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,637 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.498 = clip(base=0.418 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.286 novelty=0.70 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:14:35,846 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.965 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.967 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:14:36,257 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.966 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:36,665 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.522 = clip(base=0.442 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.330 novelty=0.70 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.59)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:14:42,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:42,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.812 = clip(base=0.732 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.788 novelty=0.69 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.80)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:14:43,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.995 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:43,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:43,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.734 novelty=0.69 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.68)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:14:43,797 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,004 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.961 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,208 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.993 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.992 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:44,617 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.930 = clip(base=0.850 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 15%|#5 | 3/20 [02:42<13:41, 48.30s/q, loss=0.0003, mean_r=0.836, q_acc=100%, q_rew=0.663, skip=0]
Iter 28 GRPO groups: 20%|## | 4/20 [02:42<09:52, 37.04s/q, loss=0.0003, mean_r=0.836, q_acc=100%, q_rew=0.663, skip=0]2026-04-26 07:14:53,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.975 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:53,558 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.726 novelty=0.76 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:14:53,763 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.986 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:53,966 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.978 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,175 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:54,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.989 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,587 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:54,796 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:14:54,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.991 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:14:55,207 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:15:02,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.952 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:02,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.577 novelty=0.77 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:15:02,691 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.947 novelty=0.77 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:02,899 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.947 = clip(base=0.867 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.986 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,110 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.940 = clip(base=0.860 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.992 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=7
+2026-04-26 07:15:03,320 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.984 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,526 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.985 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:03,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.982 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:15:04,145 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.831 = clip(base=0.751 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.800 novelty=0.77 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.84)+0.20*lccp(0.50) | steps=4
+
Iter 28 GRPO groups: 20%|## | 4/20 [03:02<09:52, 37.04s/q, loss=0.0001, mean_r=0.916, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 25%|##5 | 5/20 [03:02<07:40, 30.72s/q, loss=0.0001, mean_r=0.916, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:15:14,832 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.38(prox=0.38) + 0.40×proc(0.791[fin=0.92,mean=0.60]) + 0.10×fmt(1.000) | pred='180' gold='1008' | step_acc=50% lccp=33% (chain=2/6 ok_count=3) n_steps=6
+2026-04-26 07:15:14,928 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.480 = 0.50×0.08(prox=0.08) + 0.40×proc(0.796[fin=0.98,mean=0.52]) + 0.10×fmt(1.000) | pred='6720' gold='1008' | step_acc=43% lccp=14% (chain=1/7 ok_count=3) n_steps=7
+2026-04-26 07:15:15,023 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.898[fin=0.99,mean=0.76]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=71% lccp=29% (chain=2/7 ok_count=5) n_steps=7
+2026-04-26 07:15:38,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.894[fin=0.99,mean=0.75]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:38,588 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.753 = 0.50×0.54(prox=0.54) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1440' gold='1008' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:15:38,677 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.492 = 0.50×0.53(prox=0.53) + 0.40×proc(0.320[fin=0.21,mean=0.49]) + 0.10×fmt(1.000) | pred='1458' gold='1008' | step_acc=40% lccp=40% (chain=2/5 ok_count=2) n_steps=5
+2026-04-26 07:15:38,771 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.741 = 0.50×0.54(prox=0.54) + 0.40×proc(0.929[fin=0.99,mean=0.84]) + 0.10×fmt(1.000) | pred='1440' gold='1008' | step_acc=86% lccp=29% (chain=2/7 ok_count=6) n_steps=7
+2026-04-26 07:15:50,821 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.979 = 0.50×1.00(exact) + 0.40×proc(0.947[fin=1.00,mean=0.87]) + 0.10×fmt(1.000) | pred='1008' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:50,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.34(prox=0.34) + 0.40×proc(0.865[fin=0.96,mean=0.72]) + 0.10×fmt(1.000) | pred='1980' gold='1008' | step_acc=80% lccp=40% (chain=2/5 ok_count=4) n_steps=5
+2026-04-26 07:15:51,011 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.940[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='1008' gold='1008' | step_acc=86% lccp=14% (chain=1/7 ok_count=6) n_steps=7
+
Iter 28 GRPO groups: 25%|##5 | 5/20 [03:48<07:40, 30.72s/q, loss=0.0002, mean_r=0.662, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 30%|### | 6/20 [03:48<08:26, 36.16s/q, loss=0.0002, mean_r=0.662, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:15:56,375 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,043 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,127 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,210 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:01,293 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,370 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,453 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,535 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:16:05,619 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.420 = 0.50×0.40(prox=0.40) + 0.40×proc(0.174[fin=0.05,mean=0.36]) + 0.10×fmt(1.000) | pred='8' gold='32' | step_acc=33% lccp=33% (chain=1/3 ok_count=1) n_steps=3
+2026-04-26 07:16:11,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='32' gold='32' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 28 GRPO groups: 30%|### | 6/20 [04:09<08:26, 36.16s/q, loss=-0.0011, mean_r=0.941, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 35%|###5 | 7/20 [04:09<06:44, 31.15s/q, loss=-0.0011, mean_r=0.941, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:16:32,459 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.833 = 0.50×0.69(prox=0.69) + 0.40×proc(0.971[fin=1.00,mean=0.93]) + 0.10×fmt(1.000) | pred='155' gold='200' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:16:32,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.700 = 0.50×0.85(prox=0.85) + 0.40×proc(0.436[fin=0.19,mean=0.80]) + 0.10×fmt(1.000) | pred='215' gold='200' | step_acc=85% lccp=46% (chain=6/13 ok_count=11) n_steps=13
+2026-04-26 07:16:32,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:16:48,802 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.839 = 0.50×0.80(prox=0.80) + 0.40×proc(0.846[fin=1.00,mean=0.62]) + 0.10×fmt(1.000) | pred='225' gold='200' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:16:48,886 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:16:48,982 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.414 = 0.50×0.00(prox=0.00) + 0.40×proc(0.659[fin=0.63,mean=0.70]) + 0.10×fmt(1.000) | pred='125 or 335' gold='200' | step_acc=78% lccp=33% (chain=3/9 ok_count=7) n_steps=9
+2026-04-26 07:16:49,101 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.461 = 0.50×0.00(prox=0.00) + 0.40×proc(0.629[fin=0.52,mean=0.79]) + 0.10×fmt(1.000) | pred='50*sqrt(45)' gold='200' | step_acc=82% lccp=73% (chain=8/11 ok_count=9) n_steps=11
+2026-04-26 07:17:12,952 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.757 = 0.50×0.67(prox=0.67) + 0.40×proc(0.809[fin=0.92,mean=0.64]) + 0.10×fmt(1.000) | pred='250' gold='200' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:17:13,039 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.822 = 0.50×0.85(prox=0.85) + 0.40×proc(0.742[fin=0.96,mean=0.41]) + 0.10×fmt(1.000) | pred='187.5' gold='200' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:17:13,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.921 = 0.50×1.00(exact) + 0.40×proc(0.803[fin=1.00,mean=0.51]) + 0.10×fmt(1.000) | pred='200' gold='200' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+
Iter 28 GRPO groups: 35%|###5 | 7/20 [05:10<06:44, 31.15s/q, loss=-0.0019, mean_r=0.774, q_acc=100%, q_rew=0.667, skip=0]
Iter 28 GRPO groups: 40%|#### | 8/20 [05:10<08:08, 40.74s/q, loss=-0.0019, mean_r=0.774, q_acc=100%, q_rew=0.667, skip=0]2026-04-26 07:17:18,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:18,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.980 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:18,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.986 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,162 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.978 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,547 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,741 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:19,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:20,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.984 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:20,339 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.986 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.979 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:24,910 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.974 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,101 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.973 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,293 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.972 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,484 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.982 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.918 = clip(base=0.838 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.981 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:25,876 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.972 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:26,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.996 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:17:26,267 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.979 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 40%|#### | 8/20 [05:24<08:08, 40.74s/q, loss=0.0004, mean_r=0.920, q_acc=100%, q_rew=0.659, skip=0]
Iter 28 GRPO groups: 45%|####5 | 9/20 [05:24<05:53, 32.17s/q, loss=0.0004, mean_r=0.920, q_acc=100%, q_rew=0.659, skip=0]2026-04-26 07:17:34,946 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.80 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,154 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:35,973 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,182 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,392 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,594 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:36,794 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.974 = clip(base=0.894 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:17:44,113 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,335 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,764 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:44,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,417 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:45,852 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:17:46,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+
Iter 28 GRPO groups: 45%|####5 | 9/20 [05:44<05:53, 32.17s/q, loss=-0.0005, mean_r=0.951, q_acc=100%, q_rew=0.663, skip=0]
Iter 28 GRPO groups: 50%|##### | 10/20 [05:44<04:43, 28.35s/q, loss=-0.0005, mean_r=0.951, q_acc=100%, q_rew=0.663, skip=0]2026-04-26 07:17:51,468 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,667 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.964 = 0.50×1.00(exact) + 0.40×proc(0.909[fin=0.99,mean=0.79]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,751 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.966 = 0.50×1.00(exact) + 0.40×proc(0.915[fin=1.00,mean=0.79]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:17:57,834 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.860[fin=1.00,mean=0.65]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:17:57,917 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.972 = 0.50×1.00(exact) + 0.40×proc(0.929[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:01,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.968 = 0.50×1.00(exact) + 0.40×proc(0.920[fin=1.00,mean=0.81]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:01,250 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:01,334 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.948 = 0.50×1.00(exact) + 0.40×proc(0.871[fin=1.00,mean=0.68]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:18:01,419 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.952 = 0.50×1.00(exact) + 0.40×proc(0.879[fin=0.99,mean=0.72]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:01,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='70' gold='70' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 28 GRPO groups: 50%|##### | 10/20 [05:57<04:43, 28.35s/q, loss=0var, mean_r=0.963, skip=1]
Iter 28 GRPO groups: 55%|#####5 | 11/20 [05:57<03:34, 23.88s/q, loss=0var, mean_r=0.963, skip=1]2026-04-26 07:18:07,807 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.960 = 0.50×1.00(exact) + 0.40×proc(0.900[fin=1.00,mean=0.75]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:18:07,892 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 07:18:07,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:12,666 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:12,758 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:12,849 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:12,932 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.963 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(0.650) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=2/2 ok_count=2) n_steps=2
+2026-04-26 07:18:15,594 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.284 = 0.50×0.00(prox=0.00) + 0.40×proc(0.273[fin=0.13,mean=0.49]) + 0.10×fmt(1.000) | pred='$50\\sqrt{41}$' gold='250' | step_acc=50% lccp=50% (chain=2/4 ok_count=2) n_steps=4
+2026-04-26 07:18:15,686 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:15,777 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='250' gold='250' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 28 GRPO groups: 55%|#####5 | 11/20 [06:13<03:34, 23.88s/q, loss=-0.0013, mean_r=0.916, q_acc=100%, q_rew=0.663, skip=1]
Iter 28 GRPO groups: 60%|###### | 12/20 [06:13<02:51, 21.40s/q, loss=-0.0013, mean_r=0.916, q_acc=100%, q_rew=0.663, skip=1]2026-04-26 07:18:24,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:34,615 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:34,698 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:34,781 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:18:34,863 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:44,621 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.938 = 0.50×1.00(exact) + 0.40×proc(0.846[fin=0.96,mean=0.67]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:18:44,706 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:44,790 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:44,874 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:18:50,085 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='15' gold='15' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 28 GRPO groups: 60%|###### | 12/20 [06:46<02:51, 21.40s/q, loss=0var, mean_r=0.993, skip=2]
Iter 28 GRPO groups: 65%|######5 | 13/20 [06:46<02:54, 24.87s/q, loss=0var, mean_r=0.993, skip=2]2026-04-26 07:18:54,359 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.984[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:18:54,447 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.828 = 0.50×0.67(prox=0.67) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='15' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:18:54,533 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:19:05,752 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:19:05,835 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:05,918 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:06,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:19:17,929 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:19:18,012 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.990[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:19:18,099 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+
Iter 28 GRPO groups: 65%|######5 | 13/20 [07:15<02:54, 24.87s/q, loss=0.0008, mean_r=0.981, q_acc=100%, q_rew=0.663, skip=2]
Iter 28 GRPO groups: 70%|####### | 14/20 [07:15<02:37, 26.25s/q, loss=0.0008, mean_r=0.981, q_acc=100%, q_rew=0.663, skip=2]2026-04-26 07:19:29,767 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.915[fin=0.98,mean=0.81]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=91% lccp=9% (chain=1/11 ok_count=10) n_steps=11
+2026-04-26 07:19:54,032 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:19:54,129 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:19:54,215 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.503 = 0.50×0.00(prox=0.00) + 0.40×proc(0.821[fin=0.90,mean=0.70]) + 0.10×fmt(1.000) | pred='10101' gold='1' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:19:54,303 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.925[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=82% lccp=9% (chain=1/11 ok_count=9) n_steps=11
+2026-04-26 07:20:17,114 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.961[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=89% lccp=11% (chain=1/9 ok_count=8) n_steps=9
+2026-04-26 07:20:17,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.935 = 0.50×1.00(exact) + 0.40×proc(0.837[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=62% lccp=12% (chain=1/8 ok_count=5) n_steps=8
+2026-04-26 07:20:17,291 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.20(prox=0.20) + 0.40×proc(0.872[fin=0.98,mean=0.71]) + 0.10×fmt(1.000) | pred='3' gold='1' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 07:20:17,383 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:20:41,075 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.871 = 0.50×1.00(exact) + 0.40×proc(0.764[fin=0.92,mean=0.53]) + 0.10×fmt(0.650) | pred='1' gold='1' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+
Iter 28 GRPO groups: 70%|####### | 14/20 [08:38<02:37, 26.25s/q, loss=0.0002, mean_r=0.793, q_acc=100%, q_rew=0.663, skip=2]
Iter 28 GRPO groups: 75%|#######5 | 15/20 [08:38<03:36, 43.37s/q, loss=0.0002, mean_r=0.793, q_acc=100%, q_rew=0.663, skip=2]2026-04-26 07:20:45,936 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.929 + mod=+0.080, cap=1.00) | Q=0.84 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,131 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,330 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,522 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,717 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:46,914 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.998 = clip(base=0.918 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.986 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,502 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.984 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:47,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.988 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,280 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.941 + mod=+0.080, cap=1.00) | Q=0.86 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,472 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:51,666 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:51,859 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:52,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,247 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:20:52,441 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.652 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,632 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.784 = clip(base=0.704 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.652 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:52,827 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.654 novelty=0.69 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.63)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:20:53,026 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.925 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 28 GRPO groups: 75%|#######5 | 15/20 [08:50<03:36, 43.37s/q, loss=0.0030, mean_r=0.945, q_acc=100%, q_rew=0.684, skip=2]
Iter 28 GRPO groups: 80%|######## | 16/20 [08:50<02:15, 33.97s/q, loss=0.0030, mean_r=0.945, q_acc=100%, q_rew=0.684, skip=2]2026-04-26 07:20:58,722 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:20:58,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,300 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.987 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,498 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.993 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.997 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:20:59,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,278 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:00,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.987 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,304 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:05,853 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,038 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,225 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,415 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,791 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:06,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 80%|######## | 16/20 [09:04<02:15, 33.97s/q, loss=-0.0000, mean_r=0.935, q_acc=100%, q_rew=0.679, skip=2]
Iter 28 GRPO groups: 85%|########5 | 17/20 [09:04<01:23, 27.96s/q, loss=-0.0000, mean_r=0.935, q_acc=100%, q_rew=0.679, skip=2]2026-04-26 07:21:16,448 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.980 = clip(base=0.900 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.974 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:16,655 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.988 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:16,863 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.987 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:17,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.969 = clip(base=0.889 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.980 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.967 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:17,692 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.970 = clip(base=0.890 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.990 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:17,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:18,109 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.993 novelty=0.70 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:18,324 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.615 = clip(base=0.535 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.357 novelty=0.70 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:21:25,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.995 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,487 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,686 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:25,885 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.291 novelty=0.73 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:21:26,088 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,295 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,496 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.996 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:26,893 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:27,095 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 28 GRPO groups: 85%|########5 | 17/20 [09:25<01:23, 27.96s/q, loss=-0.0016, mean_r=0.929, q_acc=100%, q_rew=0.685, skip=2]
Iter 28 GRPO groups: 90%|######### | 18/20 [09:25<00:51, 25.70s/q, loss=-0.0016, mean_r=0.929, q_acc=100%, q_rew=0.685, skip=2]2026-04-26 07:21:35,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,406 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:36,613 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.989 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:36,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:37,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:37,215 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.994 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:37,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.746 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.70)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:37,618 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.460 = clip(base=0.380 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.265 novelty=0.78 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.31)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:37,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.785 = clip(base=0.705 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.787 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.77)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:45,715 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.975 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:45,919 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.974 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:21:46,123 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.991 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:46,329 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:46,529 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.995 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:21:46,732 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.973 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:21:46,941 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:47,165 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.940 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:21:47,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:21:47,563 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.487 = clip(base=0.407 + mod=+0.080, cap=1.00) | Q=0.55 sol=0.311 novelty=0.78 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+
Iter 28 GRPO groups: 90%|######### | 18/20 [09:45<00:51, 25.70s/q, loss=0.0006, mean_r=0.851, q_acc=100%, q_rew=0.675, skip=2]
Iter 28 GRPO groups: 95%|#########5| 19/20 [09:45<00:24, 24.02s/q, loss=0.0006, mean_r=0.851, q_acc=100%, q_rew=0.675, skip=2]2026-04-26 07:21:56,128 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.607 = clip(base=0.527 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.393 novelty=0.68 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:21:56,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.452 = clip(base=0.372 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.221 novelty=0.68 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.35)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:56,518 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.510 = clip(base=0.430 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.335 novelty=0.68 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.42)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:56,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.775 = clip(base=0.695 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.728 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.66)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:21:56,910 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.397 novelty=0.68 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.46)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,114 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.527 = clip(base=0.447 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.363 novelty=0.68 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.44)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,311 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.714 = clip(base=0.634 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.632 novelty=0.68 | sol=0.45*prm_final(0.79)+0.35*prm_mean(0.60)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,520 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.560 = clip(base=0.480 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.421 novelty=0.68 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.47)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:21:57,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.815 = clip(base=0.735 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.793 novelty=0.68 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.79)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:21:57,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.473 = clip(base=0.393 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.258 novelty=0.68 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.34)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:22:03,443 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.977 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:03,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.990 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:03,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.983 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:22:04,005 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.747 novelty=0.70 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:22:04,197 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.421 = clip(base=0.341 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.191 novelty=0.70 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.34)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:22:04,388 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:22:04,581 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.711 novelty=0.70 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:22:04,765 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.982 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:22:04,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.743 = clip(base=0.663 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.682 novelty=0.70 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:22:05,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.744 = clip(base=0.664 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.700 novelty=0.70 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.71)+0.20*lccp(0.00) | steps=3
+
Iter 28 GRPO groups: 95%|#########5| 19/20 [10:03<00:24, 24.02s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
Iter 28 GRPO groups: 100%|##########| 20/20 [10:03<00:00, 22.09s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
Iter 28 GRPO groups: 100%|##########| 20/20 [10:03<00:00, 30.15s/q, loss=-0.0007, mean_r=0.699, q_acc=100%, q_rew=0.669, skip=2]
+2026-04-26 07:22:06,833 INFO __main__ - Iter 28 | loss=0.0000 | reward mean=0.870 std=0.159 | gt_match=65.6% | grounded_acc=87.8% | step_acc=83.3% | lccp=61.9% | batch_acc=94.2% | phase=SELFPLAY_RAMP sp_ratio=54% | groups=29 skipped=2(0var=2) | lr=3.55e-06 | 603.1s
+2026-04-26 07:22:06,833 INFO __main__ - Question generation: 11/11 valid (100%) | q_reward=0.669 | q_acc=100.0% (>0.5 quality) | topic=0.59 diff=0.23 clarity=1.00 novelty=0.45 solvability=0.98
+2026-04-26 07:22:06,834 INFO __main__ - ======================================================================
+2026-04-26 07:22:06,834 INFO __main__ - GRPO ITERATION 29/60
+2026-04-26 07:22:06,834 INFO __main__ - ======================================================================
+2026-04-26 07:22:06,856 INFO __main__ - LR this iteration: 3.55e-06 | T=0.610 | MATH ratio=50%
+
Iter 29 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:22:16,231 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:16,325 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.984 = 0.50×1.00(exact) + 0.40×proc(0.959[fin=1.00,mean=0.90]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:22:16,420 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:26,898 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:26,995 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.563 = 0.50×0.51(prox=0.51) + 0.40×proc(0.521[fin=0.52,mean=0.52]) + 0.10×fmt(1.000) | pred='13400' gold='9050' | step_acc=67% lccp=33% (chain=2/6 ok_count=4) n_steps=6
+2026-04-26 07:22:27,089 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.989 = 0.50×1.00(exact) + 0.40×proc(0.972[fin=0.99,mean=0.94]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:27,183 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.799 = 0.50×0.69(prox=0.69) + 0.40×proc(0.881[fin=0.98,mean=0.74]) + 0.10×fmt(1.000) | pred='7050' gold='9050' | step_acc=75% lccp=25% (chain=1/4 ok_count=3) n_steps=4
+2026-04-26 07:22:33,238 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.977[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:22:33,322 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.48(prox=0.48) + 0.40×proc(0.585[fin=0.65,mean=0.49]) + 0.10×fmt(1.000) | pred='4050' gold='9050' | step_acc=50% lccp=25% (chain=1/4 ok_count=2) n_steps=4
+2026-04-26 07:22:33,416 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.981[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='9050' gold='9050' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 29 GRPO groups: 0%| | 0/20 [00:28, ?q/s, loss=-0.0001, mean_r=0.886, skip=0]
Iter 29 GRPO groups: 5%|5 | 1/20 [00:28<08:52, 28.03s/q, loss=-0.0001, mean_r=0.886, skip=0]2026-04-26 07:22:40,489 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.756 = 0.50×0.71(prox=0.71) + 0.40×proc(0.746[fin=0.94,mean=0.46]) + 0.10×fmt(1.000) | pred='260' gold='325' | step_acc=40% lccp=20% (chain=1/5 ok_count=2) n_steps=5
+2026-04-26 07:22:49,480 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.47(prox=0.47) + 0.40×proc(0.833[fin=0.94,mean=0.68]) + 0.10×fmt(1.000) | pred='145' gold='325' | step_acc=83% lccp=17% (chain=1/6 ok_count=5) n_steps=6
+2026-04-26 07:22:49,572 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.793 = 0.50×0.63(prox=0.63) + 0.40×proc(0.943[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='420' gold='325' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+2026-04-26 07:22:49,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.761 = 0.50×0.63(prox=0.63) + 0.40×proc(0.865[fin=1.00,mean=0.66]) + 0.10×fmt(1.000) | pred='420' gold='325' | step_acc=71% lccp=14% (chain=1/7 ok_count=5) n_steps=7
+2026-04-26 07:22:49,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.949 = 0.50×1.00(exact) + 0.40×proc(0.872[fin=0.99,mean=0.69]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:23:02,195 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:23:02,280 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.506 = 0.50×0.44(prox=0.44) + 0.40×proc(0.400[fin=0.39,mean=0.42]) + 0.10×fmt(1.000) | pred='120' gold='325' | step_acc=17% lccp=17% (chain=1/6 ok_count=1) n_steps=6
+2026-04-26 07:23:02,371 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.992[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:23:02,462 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.944 = 0.50×1.00(exact) + 0.40×proc(0.860[fin=0.98,mean=0.67]) + 0.10×fmt(1.000) | pred='325' gold='325' | step_acc=67% lccp=0% (chain=0/6 ok_count=4) n_steps=6
+2026-04-26 07:23:15,555 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.755 = 0.50×0.71(prox=0.71) + 0.40×proc(0.744[fin=0.90,mean=0.51]) + 0.10×fmt(1.000) | pred='260' gold='325' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+
Iter 29 GRPO groups: 5%|5 | 1/20 [01:10<08:52, 28.03s/q, loss=-0.0000, mean_r=0.801, skip=0]
Iter 29 GRPO groups: 10%|# | 2/20 [01:10<10:53, 36.33s/q, loss=-0.0000, mean_r=0.801, skip=0]2026-04-26 07:23:19,007 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:19,088 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:19,165 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.993[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,124 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,206 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,286 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:25,365 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:29,894 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:29,971 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:23:30,047 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 29 GRPO groups: 10%|# | 2/20 [01:23<10:53, 36.33s/q, loss=0var, mean_r=0.999, skip=1]
Iter 29 GRPO groups: 15%|#5 | 3/20 [01:23<07:16, 25.69s/q, loss=0var, mean_r=0.999, skip=1]2026-04-26 07:23:35,744 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.989 = clip(base=0.909 + mod=+0.080, cap=1.00) | Q=0.77 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:35,940 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,139 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,337 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,532 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,731 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:36,926 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,127 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,327 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:37,534 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:23:41,616 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:41,814 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,014 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.990 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,210 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.974 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,401 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.959 = clip(base=0.879 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:42,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:43,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.995 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:23:43,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.960 = clip(base=0.880 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 15%|#5 | 3/20 [01:38<07:16, 25.69s/q, loss=-0.0006, mean_r=0.965, q_acc=100%, q_rew=0.717, skip=1]
Iter 29 GRPO groups: 20%|## | 4/20 [01:38<05:43, 21.46s/q, loss=-0.0006, mean_r=0.965, q_acc=100%, q_rew=0.717, skip=1]2026-04-26 07:23:50,151 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:23:58,500 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:23:58,585 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:23:58,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=9/9 ok_count=9) n_steps=9
+2026-04-26 07:23:58,750 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:24:14,747 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.423 = 0.50×0.14(prox=0.14) + 0.40×proc(0.330[fin=0.03,mean=0.78]) + 0.10×fmt(1.000) | pred='32' gold='8' | step_acc=80% lccp=80% (chain=4/5 ok_count=4) n_steps=5
+2026-04-26 07:24:14,824 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.996[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:24:14,906 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:24:14,988 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:24:26,679 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8' gold='8' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 29 GRPO groups: 20%|## | 4/20 [02:21<05:43, 21.46s/q, loss=-0.0015, mean_r=0.942, q_acc=100%, q_rew=0.717, skip=1]
Iter 29 GRPO groups: 25%|##5 | 5/20 [02:21<07:19, 29.29s/q, loss=-0.0015, mean_r=0.942, q_acc=100%, q_rew=0.717, skip=1]2026-04-26 07:24:36,690 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.995 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:24:36,886 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,284 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:37,482 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.987 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:24:37,680 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:37,877 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:38,076 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:38,276 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:38,476 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:47,540 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.745 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:24:47,749 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:47,959 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:48,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:48,393 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:48,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.909 = clip(base=0.829 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:24:48,832 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,040 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,258 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:49,471 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.459 = clip(base=0.379 + mod=+0.080, cap=1.00) | Q=0.51 sol=0.291 novelty=0.71 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.45)+0.20*lccp(0.50) | steps=4
+
Iter 29 GRPO groups: 25%|##5 | 5/20 [02:44<07:19, 29.29s/q, loss=0.0016, mean_r=0.902, q_acc=100%, q_rew=0.674, skip=1]
Iter 29 GRPO groups: 30%|### | 6/20 [02:44<06:19, 27.14s/q, loss=0.0016, mean_r=0.902, q_acc=100%, q_rew=0.674, skip=1]2026-04-26 07:24:55,347 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.946 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.922 = clip(base=0.842 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.987 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,720 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:55,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:56,098 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:56,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.62 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:24:56,468 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:24:56,651 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.946 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.85)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:56,833 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.912 = clip(base=0.832 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.971 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:24:57,013 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.990 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,723 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:00,895 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,067 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,245 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=1.000 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:25:01,589 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.956 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.993 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:25:01,947 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.995 novelty=0.61 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=2
+
Iter 29 GRPO groups: 30%|### | 6/20 [02:56<06:19, 27.14s/q, loss=0.0051, mean_r=0.904, q_acc=100%, q_rew=0.649, skip=1]
Iter 29 GRPO groups: 35%|###5 | 7/20 [02:56<04:50, 22.34s/q, loss=0.0051, mean_r=0.904, q_acc=100%, q_rew=0.649, skip=1]2026-04-26 07:25:10,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.996 = clip(base=0.916 + mod=+0.080, cap=1.00) | Q=0.79 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:10,923 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.976 = clip(base=0.896 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:11,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,565 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,774 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:11,987 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:12,199 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:12,409 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.786 = clip(base=0.706 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.658 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.59)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:25:12,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:25:20,072 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.979 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,287 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.941 novelty=0.67 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,500 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.928 novelty=0.67 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,709 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.939 novelty=0.67 | sol=0.45*prm_final(0.89)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:20,921 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.992 novelty=0.67 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,138 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.977 novelty=0.67 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.991 novelty=0.67 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:25:21,995 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 29 GRPO groups: 35%|###5 | 7/20 [03:16<04:50, 22.34s/q, loss=-0.0004, mean_r=0.951, q_acc=100%, q_rew=0.667, skip=1]
Iter 29 GRPO groups: 40%|#### | 8/20 [03:16<04:19, 21.62s/q, loss=-0.0004, mean_r=0.951, q_acc=100%, q_rew=0.667, skip=1]2026-04-26 07:26:00,279 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.809 = clip(base=0.729 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.781 novelty=0.72 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.78)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:00,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.639 = clip(base=0.559 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.539 novelty=0.72 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.84)+0.20*lccp(0.36) | steps=11
+2026-04-26 07:26:00,735 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.789 = clip(base=0.709 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.773 novelty=0.72 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.77)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:00,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.961 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:26:01,169 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.594 = clip(base=0.514 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.453 novelty=0.72 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.74)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:01,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.850 novelty=0.72 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.84)+0.20*lccp(0.57) | steps=7
+2026-04-26 07:26:01,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.629 = clip(base=0.549 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.503 novelty=0.72 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:01,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.555 = clip(base=0.475 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.397 novelty=0.72 | sol=0.45*prm_final(0.07)+0.35*prm_mean(0.67)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:26:02,044 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.606 = clip(base=0.526 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.473 novelty=0.72 | sol=0.45*prm_final(0.37)+0.35*prm_mean(0.72)+0.20*lccp(0.27) | steps=11
+2026-04-26 07:26:07,350 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,549 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.896 = clip(base=0.816 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,754 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:07,955 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,568 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,768 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=1.000 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:08,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:26:09,184 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.892 = clip(base=0.812 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.978 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 40%|#### | 8/20 [04:03<04:19, 21.62s/q, loss=-0.0003, mean_r=0.817, q_acc=100%, q_rew=0.652, skip=1]
Iter 29 GRPO groups: 45%|####5 | 9/20 [04:03<05:25, 29.60s/q, loss=-0.0003, mean_r=0.817, q_acc=100%, q_rew=0.652, skip=1]2026-04-26 07:26:16,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:16,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:16,438 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:32,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:32,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:26:32,272 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=8/8 ok_count=8) n_steps=8
+2026-04-26 07:26:32,356 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:26:44,302 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:26:44,386 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=7/7 ok_count=7) n_steps=7
+2026-04-26 07:26:44,469 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='17' gold='17' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+
Iter 29 GRPO groups: 45%|####5 | 9/20 [04:37<05:25, 29.60s/q, loss=0var, mean_r=1.000, skip=2]
Iter 29 GRPO groups: 50%|##### | 10/20 [04:37<05:08, 30.84s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 07:26:52,630 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.743 = 0.50×0.56(prox=0.56) + 0.40×proc(0.914[fin=1.00,mean=0.78]) + 0.10×fmt(1.000) | pred='35' gold='25' | step_acc=67% lccp=17% (chain=1/6 ok_count=4) n_steps=6
+2026-04-26 07:27:04,263 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.980 = 0.50×1.00(exact) + 0.40×proc(0.950[fin=0.96,mean=0.93]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:27:04,349 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.992 = 0.50×1.00(exact) + 0.40×proc(0.979[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:04,433 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:04,517 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.985[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,566 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.944[fin=1.00,mean=0.86]) + 0.10×fmt(1.000) | pred='$\\sqrt{505}$' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,649 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.644 = 0.50×0.61(prox=0.61) + 0.40×proc(0.597[fin=0.59,mean=0.61]) + 0.10×fmt(1.000) | pred='17' gold='25' | step_acc=60% lccp=40% (chain=2/5 ok_count=3) n_steps=5
+2026-04-26 07:27:12,731 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='25' gold='25' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:27:12,833 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.263 = 0.50×0.00(prox=0.00) + 0.40×proc(0.407[fin=0.34,mean=0.51]) + 0.10×fmt(1.000) | pred='sqrt(545)' gold='25' | step_acc=40% lccp=0% (chain=0/5 ok_count=2) n_steps=5
+2026-04-26 07:27:21,458 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.776 = 0.50×0.61(prox=0.61) + 0.40×proc(0.927[fin=0.98,mean=0.84]) + 0.10×fmt(1.000) | pred='17' gold='25' | step_acc=80% lccp=60% (chain=3/5 ok_count=4) n_steps=5
+
Iter 29 GRPO groups: 50%|##### | 10/20 [05:16<05:08, 30.84s/q, loss=-0.0008, mean_r=0.793, q_acc=100%, q_rew=0.652, skip=2]
Iter 29 GRPO groups: 55%|#####5 | 11/20 [05:16<04:58, 33.18s/q, loss=-0.0008, mean_r=0.793, q_acc=100%, q_rew=0.652, skip=2]2026-04-26 07:27:32,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.778 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:27:32,775 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.852 = clip(base=0.772 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.781 novelty=0.76 | sol=0.45*prm_final(0.94)+0.35*prm_mean(0.74)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:27:32,993 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.796 = clip(base=0.716 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.668 novelty=0.76 | sol=0.45*prm_final(0.82)+0.35*prm_mean(0.63)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:27:33,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.874 = clip(base=0.794 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.804 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.77)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:27:33,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.765 = clip(base=0.685 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.615 novelty=0.76 | sol=0.45*prm_final(0.90)+0.35*prm_mean(0.60)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:27:33,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.703 = clip(base=0.623 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.592 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:27:33,860 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.590 novelty=0.76 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:27:34,075 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.843 = clip(base=0.763 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.775 novelty=0.76 | sol=0.45*prm_final(0.72)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:27:34,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.482 = clip(base=0.402 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.183 novelty=0.76 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.29)+0.20*lccp(0.00) | steps=6
+2026-04-26 07:27:34,508 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.672 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.66)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:27:38,766 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:38,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,149 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.983 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,525 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,712 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.982 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:39,903 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:40,099 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.780 = clip(base=0.700 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.691 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.69)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:27:40,292 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.946 = clip(base=0.866 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.983 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:27:40,486 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.984 novelty=0.64 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+
Iter 29 GRPO groups: 55%|#####5 | 11/20 [05:35<04:58, 33.18s/q, loss=-0.0001, mean_r=0.854, q_acc=100%, q_rew=0.665, skip=2]
Iter 29 GRPO groups: 60%|###### | 12/20 [05:35<03:51, 28.95s/q, loss=-0.0001, mean_r=0.854, q_acc=100%, q_rew=0.665, skip=2]2026-04-26 07:27:45,362 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:45,446 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:45,528 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,105 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:27:50,188 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,270 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:50,353 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,276 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,354 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:27:55,436 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='30' gold='30' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 29 GRPO groups: 60%|###### | 12/20 [05:48<03:51, 28.95s/q, loss=0var, mean_r=0.999, skip=3]
Iter 29 GRPO groups: 65%|######5 | 13/20 [05:48<02:49, 24.18s/q, loss=0var, mean_r=0.999, skip=3]2026-04-26 07:28:02,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.931 = clip(base=0.851 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:02,948 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.999 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,158 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:03,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.997 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:03,774 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:03,978 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.601 = clip(base=0.521 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.470 novelty=0.69 | sol=0.45*prm_final(0.46)+0.35*prm_mean(0.66)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:28:04,185 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.824 = clip(base=0.744 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.860 novelty=0.69 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:04,391 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.583 = clip(base=0.503 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.453 novelty=0.69 | sol=0.45*prm_final(0.45)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:28:04,598 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.998 novelty=0.69 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:11,111 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.997 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:11,316 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,714 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:11,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,319 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.999 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:28:12,521 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.658 = clip(base=0.578 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.580 novelty=0.73 | sol=0.45*prm_final(0.57)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:12,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:12,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=1.000 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+
Iter 29 GRPO groups: 65%|######5 | 13/20 [06:08<02:49, 24.18s/q, loss=-0.0006, mean_r=0.861, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 70%|####### | 14/20 [06:08<02:16, 22.77s/q, loss=-0.0006, mean_r=0.861, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:28:26,407 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.640 = clip(base=0.560 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.425 novelty=0.73 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.66)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:26,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.388 novelty=0.73 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.63)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:26,808 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.600 = clip(base=0.520 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.401 novelty=0.73 | sol=0.45*prm_final(0.18)+0.35*prm_mean(0.57)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:28:27,006 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.441 novelty=0.73 | sol=0.45*prm_final(0.51)+0.35*prm_mean(0.49)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,203 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.671 = clip(base=0.591 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.518 novelty=0.73 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.71)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:27,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.389 novelty=0.73 | sol=0.45*prm_final(0.05)+0.35*prm_mean(0.67)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:27,607 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.509 = clip(base=0.429 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.294 novelty=0.73 | sol=0.45*prm_final(0.22)+0.35*prm_mean(0.45)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,803 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.671 novelty=0.73 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.57)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:28:27,999 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.548 = clip(base=0.468 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.320 novelty=0.73 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:28,212 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.809 = clip(base=0.729 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.751 novelty=0.73 | sol=0.45*prm_final(0.78)+0.35*prm_mean(0.76)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:28:33,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.942 = clip(base=0.862 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.997 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:33,935 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.938 = clip(base=0.858 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.998 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.943 = clip(base=0.863 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.999 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,336 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.937 = clip(base=0.857 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.996 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,538 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.960 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:34,928 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.65 sol=1.000 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:35,129 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.734 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.63)+0.20*lccp(0.33) | steps=3
+
Iter 29 GRPO groups: 70%|####### | 14/20 [06:29<02:16, 22.77s/q, loss=0.0013, mean_r=0.779, q_acc=100%, q_rew=0.655, skip=3]
Iter 29 GRPO groups: 75%|#######5 | 15/20 [06:29<01:52, 22.49s/q, loss=0.0013, mean_r=0.779, q_acc=100%, q_rew=0.655, skip=3]2026-04-26 07:28:46,274 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.944 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:46,481 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.460 novelty=0.74 | sol=0.45*prm_final(0.68)+0.35*prm_mean(0.44)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:28:46,696 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.760 = clip(base=0.680 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.733 novelty=0.74 | sol=0.45*prm_final(0.87)+0.35*prm_mean(0.69)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:28:46,906 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.608 novelty=0.74 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.47)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:28:47,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.993 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:28:47,323 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.989 novelty=0.74 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:28:47,533 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.462 = clip(base=0.382 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.284 novelty=0.74 | sol=0.45*prm_final(0.30)+0.35*prm_mean(0.33)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:28:47,740 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.914 novelty=0.74 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.79)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:28:47,956 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.542 = clip(base=0.462 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.362 novelty=0.74 | sol=0.45*prm_final(0.14)+0.35*prm_mean(0.51)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:28:48,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.408 = clip(base=0.328 + mod=+0.080, cap=1.00) | Q=0.54 sol=0.187 novelty=0.74 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.33)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:29:24,148 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.462 = clip(base=0.382 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.225 novelty=0.79 | sol=0.45*prm_final(0.12)+0.35*prm_mean(0.40)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:29:24,377 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.536 = clip(base=0.456 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.410 novelty=0.79 | sol=0.45*prm_final(0.44)+0.35*prm_mean(0.44)+0.20*lccp(0.29) | steps=7
+2026-04-26 07:29:24,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.648 = clip(base=0.568 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.536 novelty=0.79 | sol=0.45*prm_final(0.50)+0.35*prm_mean(0.60)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:24,830 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.410 = clip(base=0.330 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.206 novelty=0.79 | sol=0.45*prm_final(0.03)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:29:25,102 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.688 = clip(base=0.608 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.580 novelty=0.79 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:25,338 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.511 = clip(base=0.431 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.333 novelty=0.79 | sol=0.45*prm_final(0.13)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:29:25,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.739 novelty=0.79 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.64)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:29:25,777 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.456 = clip(base=0.376 + mod=+0.080, cap=1.00) | Q=0.52 sol=0.281 novelty=0.79 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:29:26,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.345 novelty=0.79 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.63)+0.20*lccp(0.43) | steps=7
+
Iter 29 GRPO groups: 75%|#######5 | 15/20 [07:20<01:52, 22.49s/q, loss=0.0001, mean_r=0.639, q_acc=100%, q_rew=0.647, skip=3]
Iter 29 GRPO groups: 80%|######## | 16/20 [07:20<02:04, 31.02s/q, loss=0.0001, mean_r=0.639, q_acc=100%, q_rew=0.647, skip=3]2026-04-26 07:29:31,786 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:31,980 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,177 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,366 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,561 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,752 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:32,945 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.995 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,143 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,338 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.949 = clip(base=0.869 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.996 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:33,528 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.66 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:29:39,560 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.982 = clip(base=0.902 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:39,759 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:39,958 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,161 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,356 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.71 sol=1.000 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:40,953 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:41,148 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:29:41,348 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+
Iter 29 GRPO groups: 80%|######## | 16/20 [07:36<02:04, 31.02s/q, loss=-0.0005, mean_r=0.957, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 85%|########5 | 17/20 [07:36<01:18, 26.33s/q, loss=-0.0005, mean_r=0.957, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:30:12,532 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.400 = 0.50×0.33(prox=0.33) + 0.40×proc(0.258[fin=0.06,mean=0.56]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=60% lccp=20% (chain=1/5 ok_count=3) n_steps=5
+2026-04-26 07:30:22,927 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.933[fin=0.92,mean=0.96]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=96% lccp=91% (chain=21/23 ok_count=22) n_steps=23
+2026-04-26 07:30:23,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.982[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:30:23,098 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:30:23,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.968[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='1' gold='1' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:30:31,319 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.33(prox=0.33) + 0.40×proc(0.417[fin=0.08,mean=0.92]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=91% lccp=91% (chain=21/23 ok_count=21) n_steps=23
+2026-04-26 07:30:31,403 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.415 = 0.50×0.33(prox=0.33) + 0.40×proc(0.370[fin=0.36,mean=0.39]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=33% lccp=0% (chain=0/3 ok_count=1) n_steps=3
+2026-04-26 07:30:31,495 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.199 = 0.50×0.02(prox=0.02) + 0.40×proc(0.156[fin=0.01,mean=0.38]) + 0.10×fmt(1.000) | pred='22' gold='1' | step_acc=33% lccp=17% (chain=1/6 ok_count=2) n_steps=6
+2026-04-26 07:30:31,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.33(prox=0.33) + 0.40×proc(0.578[fin=0.64,mean=0.48]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=50% lccp=0% (chain=0/4 ok_count=2) n_steps=4
+2026-04-26 07:30:35,668 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.498 = 0.50×0.33(prox=0.33) + 0.40×proc(0.579[fin=0.71,mean=0.37]) + 0.10×fmt(1.000) | pred='0' gold='1' | step_acc=25% lccp=0% (chain=0/4 ok_count=1) n_steps=4
+
Iter 29 GRPO groups: 85%|########5 | 17/20 [08:30<01:18, 26.33s/q, loss=0.0010, mean_r=0.651, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 90%|######### | 18/20 [08:30<01:09, 34.69s/q, loss=0.0010, mean_r=0.651, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:31:10,904 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.970 = 0.50×1.00(exact) + 0.40×proc(0.924[fin=0.99,mean=0.82]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=85% lccp=31% (chain=4/13 ok_count=11) n_steps=13
+2026-04-26 07:31:11,003 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.973 = 0.50×1.00(exact) + 0.40×proc(0.932[fin=1.00,mean=0.83]) + 0.10×fmt(1.000) | pred='3' gold='3' | step_acc=88% lccp=12% (chain=1/8 ok_count=7) n_steps=8
+2026-04-26 07:31:11,004 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.000 = 0.85×0.00 + 0.15×fmt(0.000) | pred='' gold='3' | step_acc=0% lccp=0% (chain=0/0 ok_count=0) n_steps=0
+2026-04-26 07:31:18,775 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.125 = 0.50×0.00(prox=0.00) + 0.40×proc(0.136[fin=0.03,mean=0.30]) + 0.10×fmt(0.700) | pred='' gold='3' | step_acc=17% lccp=0% (chain=0/6 ok_count=1) n_steps=6
+
Iter 29 GRPO groups: 90%|######### | 18/20 [09:12<01:09, 34.69s/q, loss=0.0005, mean_r=0.517, q_acc=100%, q_rew=0.652, skip=3]
Iter 29 GRPO groups: 95%|#########5| 19/20 [09:12<00:36, 36.96s/q, loss=0.0005, mean_r=0.517, q_acc=100%, q_rew=0.652, skip=3]2026-04-26 07:31:30,861 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.821 = clip(base=0.741 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.771 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.73)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:31:31,083 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.999 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,297 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,509 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,723 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:31,935 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,151 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.61 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,364 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:32,576 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:32,788 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.917 = clip(base=0.837 + mod=+0.080, cap=1.00) | Q=0.59 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:42,748 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:42,981 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,213 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,445 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:43,677 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.973 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:43,908 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.923 = clip(base=0.843 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,142 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,371 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.989 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,599 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.920 = clip(base=0.840 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.992 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:31:44,826 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.925 = clip(base=0.845 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.987 novelty=0.71 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+
Iter 29 GRPO groups: 95%|#########5| 19/20 [09:39<00:36, 36.96s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
Iter 29 GRPO groups: 100%|##########| 20/20 [09:39<00:00, 33.99s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
Iter 29 GRPO groups: 100%|##########| 20/20 [09:39<00:00, 28.98s/q, loss=0.0001, mean_r=0.919, q_acc=100%, q_rew=0.650, skip=3]
+2026-04-26 07:31:46,526 INFO __main__ - Iter 29 | loss=0.0004 | reward mean=0.867 std=0.171 | gt_match=72.6% | grounded_acc=89.3% | step_acc=86.7% | lccp=76.0% | batch_acc=94.7% | phase=SELFPLAY_RAMP sp_ratio=57% | groups=28 skipped=3(0var=3) | lr=3.42e-06 | 579.7s
+2026-04-26 07:31:46,527 INFO __main__ - Question generation: 11/11 valid (100%) | q_reward=0.650 | q_acc=100.0% (>0.5 quality) | topic=0.56 diff=0.17 clarity=1.00 novelty=0.44 solvability=0.97
+2026-04-26 07:31:46,528 INFO __main__ - ======================================================================
+2026-04-26 07:31:46,528 INFO __main__ - GRPO ITERATION 30/60
+2026-04-26 07:31:46,528 INFO __main__ - ======================================================================
+2026-04-26 07:31:46,548 INFO __main__ - LR this iteration: 3.42e-06 | T=0.603 | MATH ratio=50%
+
Iter 30 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:31:52,783 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:52,976 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,174 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,368 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:53,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:31:53,758 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:53,953 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,155 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.999 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,346 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:31:54,542 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:32:00,868 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,071 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,273 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.950 = clip(base=0.870 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,672 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:01,875 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.954 = clip(base=0.874 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.991 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.996 novelty=0.65 | sol=0.45*prm_final(0.99)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,280 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:32:02,680 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.65 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+
Iter 30 GRPO groups: 0%| | 0/20 [00:17, ?q/s, loss=0.0004, mean_r=0.955, q_acc=100%, q_rew=0.690, skip=0]
Iter 30 GRPO groups: 5%|5 | 1/20 [00:17<05:38, 17.82s/q, loss=0.0004, mean_r=0.955, q_acc=100%, q_rew=0.690, skip=0]2026-04-26 07:32:10,987 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.666 = clip(base=0.586 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.524 novelty=0.66 | sol=0.45*prm_final(0.47)+0.35*prm_mean(0.61)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,198 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.666 = clip(base=0.586 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.564 novelty=0.66 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.57)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,413 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.513 = clip(base=0.433 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.305 novelty=0.66 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:11,626 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.558 = clip(base=0.478 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.382 novelty=0.66 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.65)+0.20*lccp(0.75) | steps=4
+2026-04-26 07:32:11,838 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.687 = clip(base=0.607 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.598 novelty=0.66 | sol=0.45*prm_final(0.61)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,047 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.506 = clip(base=0.426 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.305 novelty=0.66 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.56)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,256 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.596 = clip(base=0.516 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.455 novelty=0.66 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.58)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,466 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.715 = clip(base=0.635 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.645 novelty=0.66 | sol=0.45*prm_final(0.70)+0.35*prm_mean(0.66)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,674 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.506 = clip(base=0.426 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.288 novelty=0.66 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.51)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:32:12,884 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.992 novelty=0.66 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:32:17,615 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.987 = clip(base=0.907 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:17,824 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.994 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,033 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.985 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,255 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,465 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.980 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,681 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:18,892 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.982 novelty=0.71 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,116 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.993 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,331 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.986 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:32:19,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.983 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 5%|5 | 1/20 [00:34<05:38, 17.82s/q, loss=0.0001, mean_r=0.798, q_acc=100%, q_rew=0.682, skip=0]
Iter 30 GRPO groups: 10%|# | 2/20 [00:34<05:10, 17.27s/q, loss=0.0001, mean_r=0.798, q_acc=100%, q_rew=0.682, skip=0]2026-04-26 07:32:25,818 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.675 = 0.50×0.50(prox=0.50) + 0.40×proc(0.814[fin=0.95,mean=0.60]) + 0.10×fmt(1.000) | pred='4000' gold='8000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:32:25,901 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.923 = 0.50×1.00(exact) + 0.40×proc(0.807[fin=0.86,mean=0.73]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:32:25,984 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.702 = 0.50×0.50(prox=0.50) + 0.40×proc(0.880[fin=1.00,mean=0.71]) + 0.10×fmt(1.000) | pred='4000' gold='8000' | step_acc=67% lccp=0% (chain=0/3 ok_count=2) n_steps=3
+2026-04-26 07:32:28,805 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.961 = 0.50×1.00(exact) + 0.40×proc(0.902[fin=0.98,mean=0.78]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=67% lccp=33% (chain=1/3 ok_count=2) n_steps=3
+2026-04-26 07:32:28,888 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:28,972 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.530 = 0.50×0.05(prox=0.05) + 0.40×proc(0.822[fin=0.90,mean=0.70]) + 0.10×fmt(1.000) | pred='80000' gold='8000' | step_acc=75% lccp=50% (chain=2/4 ok_count=3) n_steps=4
+2026-04-26 07:32:29,054 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.996 = 0.50×1.00(exact) + 0.40×proc(0.991[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:35,829 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.958 = 0.50×1.00(exact) + 0.40×proc(0.894[fin=0.98,mean=0.76]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:32:35,910 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.609 = 0.50×0.50(prox=0.50) + 0.40×proc(0.735[fin=0.86,mean=0.54]) + 0.10×fmt(0.650) | pred='4000' gold='8000' | step_acc=50% lccp=0% (chain=0/2 ok_count=1) n_steps=2
+2026-04-26 07:32:35,992 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.939[fin=0.99,mean=0.86]) + 0.10×fmt(1.000) | pred='8000' gold='8000' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 30 GRPO groups: 10%|# | 2/20 [00:50<05:10, 17.27s/q, loss=0.0009, mean_r=0.833, q_acc=100%, q_rew=0.682, skip=0]
Iter 30 GRPO groups: 15%|#5 | 3/20 [00:50<04:44, 16.76s/q, loss=0.0009, mean_r=0.833, q_acc=100%, q_rew=0.682, skip=0]2026-04-26 07:32:43,160 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:32:54,850 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:32:54,933 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:32:55,019 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:32:55,104 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:03,763 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:03,846 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.988[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:33:03,930 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:04,013 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.993 = 0.50×1.00(exact) + 0.40×proc(0.983[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:33:13,714 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 30 GRPO groups: 15%|#5 | 3/20 [01:27<04:44, 16.76s/q, loss=0var, mean_r=0.998, skip=1]
Iter 30 GRPO groups: 20%|## | 4/20 [01:27<06:31, 24.48s/q, loss=0var, mean_r=0.998, skip=1]2026-04-26 07:33:49,869 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.525 = clip(base=0.445 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.369 novelty=0.77 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.50)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:33:50,085 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.518 = clip(base=0.438 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.344 novelty=0.77 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.61)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:33:50,305 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.561 = clip(base=0.481 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.405 novelty=0.77 | sol=0.45*prm_final(0.25)+0.35*prm_mean(0.55)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:33:50,531 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.807 = clip(base=0.727 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.825 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(0.43) | steps=7
+2026-04-26 07:33:50,743 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.713 = clip(base=0.633 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.639 novelty=0.77 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.56)+0.20*lccp(0.00) | steps=2
+2026-04-26 07:33:50,964 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.496 = clip(base=0.416 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.307 novelty=0.77 | sol=0.45*prm_final(0.04)+0.35*prm_mean(0.54)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:33:51,189 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.602 = clip(base=0.522 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.484 novelty=0.77 | sol=0.45*prm_final(0.42)+0.35*prm_mean(0.74)+0.20*lccp(0.18) | steps=11
+2026-04-26 07:33:51,409 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.638 = clip(base=0.558 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.543 novelty=0.77 | sol=0.45*prm_final(0.71)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=7
+2026-04-26 07:33:51,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.489 = clip(base=0.409 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.295 novelty=0.77 | sol=0.45*prm_final(0.02)+0.35*prm_mean(0.53)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:11,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.844 = clip(base=0.764 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.835 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:11,738 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.793 = clip(base=0.713 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.784 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.82)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:34:11,942 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.981 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,150 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.975 novelty=0.72 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.993 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=13
+2026-04-26 07:34:12,579 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.919 = clip(base=0.839 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.989 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:12,782 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.817 = clip(base=0.737 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.830 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:34:12,983 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.926 = clip(base=0.846 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.991 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:13,196 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.61 sol=0.981 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:13,402 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.939 = clip(base=0.859 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.994 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 20%|## | 4/20 [02:28<06:31, 24.48s/q, loss=0.0003, mean_r=0.762, q_acc=100%, q_rew=0.657, skip=1]
Iter 30 GRPO groups: 25%|##5 | 5/20 [02:28<09:26, 37.78s/q, loss=0.0003, mean_r=0.762, q_acc=100%, q_rew=0.657, skip=1]2026-04-26 07:34:18,579 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:18,663 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:18,746 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,294 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,378 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,461 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:27,546 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,708 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,792 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:34:35,875 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='20' gold='20' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+
Iter 30 GRPO groups: 25%|##5 | 5/20 [02:49<09:26, 37.78s/q, loss=0var, mean_r=1.000, skip=2]
Iter 30 GRPO groups: 30%|### | 6/20 [02:49<07:28, 32.01s/q, loss=0var, mean_r=1.000, skip=2]2026-04-26 07:34:41,499 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.988 = clip(base=0.908 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:41,698 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.962 = clip(base=0.882 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:41,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,100 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,306 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,511 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.996 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,716 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:42,915 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.961 = clip(base=0.881 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.990 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:43,117 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:43,317 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:49,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.979 = clip(base=0.899 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:49,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:49,971 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,171 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.964 = clip(base=0.884 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.995 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,372 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.965 = clip(base=0.885 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.997 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,571 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.967 = clip(base=0.887 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:50,770 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:50,974 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.973 = clip(base=0.893 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.998 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:34:51,180 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:51,384 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.999 novelty=0.77 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 30%|### | 6/20 [03:06<07:28, 32.01s/q, loss=-0.0003, mean_r=0.967, q_acc=100%, q_rew=0.674, skip=2]
Iter 30 GRPO groups: 35%|###5 | 7/20 [03:06<05:53, 27.17s/q, loss=-0.0003, mean_r=0.967, q_acc=100%, q_rew=0.674, skip=2]2026-04-26 07:34:58,253 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,448 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,642 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:58,837 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.887 = clip(base=0.807 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.967 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:34:59,035 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.994 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,231 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.992 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,426 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:34:59,620 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:34:59,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:00,011 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.734 = clip(base=0.654 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.712 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.75)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:06,001 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.997 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:35:06,194 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.60 sol=0.410 novelty=0.60 | sol=0.45*prm_final(0.49)+0.35*prm_mean(0.40)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:35:06,382 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.992 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:35:06,578 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.758 = clip(base=0.678 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.715 novelty=0.60 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.68)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:35:06,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.710 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:06,968 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.991 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:35:07,157 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.626 = clip(base=0.546 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.538 novelty=0.60 | sol=0.45*prm_final(0.86)+0.35*prm_mean(0.43)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:35:07,349 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.984 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:35:07,543 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.999 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:35:07,733 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.886 = clip(base=0.806 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.929 novelty=0.60 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.80)+0.20*lccp(1.00) | steps=2
+
Iter 30 GRPO groups: 35%|###5 | 7/20 [03:22<05:53, 27.17s/q, loss=-0.0015, mean_r=0.840, q_acc=100%, q_rew=0.659, skip=2]
Iter 30 GRPO groups: 40%|#### | 8/20 [03:22<04:44, 23.71s/q, loss=-0.0015, mean_r=0.840, q_acc=100%, q_rew=0.659, skip=2]2026-04-26 07:35:14,418 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.975 = clip(base=0.895 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.997 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:14,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.994 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:14,820 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.997 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,023 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.996 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,221 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,422 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.952 = clip(base=0.872 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.992 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,624 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.957 = clip(base=0.877 + mod=+0.080, cap=1.00) | Q=0.69 sol=1.000 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:15,823 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.989 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:16,022 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.956 = clip(base=0.876 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.998 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:16,220 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.980 novelty=0.56 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,379 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,578 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,776 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:20,969 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,163 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.899 = clip(base=0.819 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.971 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,355 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.911 = clip(base=0.831 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.992 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,552 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.995 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,750 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:21,954 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.916 = clip(base=0.836 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.999 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:35:22,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.915 = clip(base=0.835 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.998 novelty=0.72 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 40%|#### | 8/20 [03:37<04:44, 23.71s/q, loss=0.0005, mean_r=0.935, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 45%|####5 | 9/20 [03:37<03:49, 20.90s/q, loss=0.0005, mean_r=0.935, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:35:32,501 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:41,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.987 = 0.50×1.00(exact) + 0.40×proc(0.967[fin=1.00,mean=0.92]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:41,948 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.538 = 0.50×0.18(prox=0.18) + 0.40×proc(0.868[fin=1.00,mean=0.67]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=83% lccp=0% (chain=0/6 ok_count=5) n_steps=6
+2026-04-26 07:35:42,042 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.527 = 0.50×0.18(prox=0.18) + 0.40×proc(0.839[fin=1.00,mean=0.60]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=50% lccp=0% (chain=0/6 ok_count=3) n_steps=6
+2026-04-26 07:35:42,137 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.971 = 0.50×1.00(exact) + 0.40×proc(0.928[fin=1.00,mean=0.82]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=75% lccp=0% (chain=0/4 ok_count=3) n_steps=4
+2026-04-26 07:35:49,315 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.991 = 0.50×1.00(exact) + 0.40×proc(0.978[fin=1.00,mean=0.95]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:49,401 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.532 = 0.50×0.18(prox=0.18) + 0.40×proc(0.852[fin=1.00,mean=0.64]) + 0.10×fmt(1.000) | pred='52' gold='16' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 07:35:49,493 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.995 = 0.50×1.00(exact) + 0.40×proc(0.987[fin=1.00,mean=0.97]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:49,577 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.985 = 0.50×1.00(exact) + 0.40×proc(0.963[fin=1.00,mean=0.91]) + 0.10×fmt(1.000) | pred='16' gold='16' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:35:58,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.80(prox=0.80) + 0.40×proc(0.938[fin=1.00,mean=0.84]) + 0.10×fmt(1.000) | pred='14' gold='16' | step_acc=86% lccp=57% (chain=4/7 ok_count=6) n_steps=7
+
Iter 30 GRPO groups: 45%|####5 | 9/20 [04:13<03:49, 20.90s/q, loss=-0.0010, mean_r=0.840, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 50%|##### | 10/20 [04:13<04:14, 25.44s/q, loss=-0.0010, mean_r=0.840, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:36:28,156 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.179 = 0.50×0.00(prox=0.00) + 0.40×proc(0.210[fin=0.04,mean=0.47]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=50% lccp=17% (chain=1/6 ok_count=3) n_steps=6
+2026-04-26 07:36:28,255 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=100% lccp=100% (chain=6/6 ok_count=6) n_steps=6
+2026-04-26 07:36:28,352 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.986 = 0.50×1.00(exact) + 0.40×proc(0.964[fin=0.99,mean=0.92]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=100% lccp=100% (chain=5/5 ok_count=5) n_steps=5
+2026-04-26 07:36:43,740 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.441 = 0.50×0.00(prox=0.00) + 0.40×proc(0.927[fin=1.00,mean=0.82]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=80% lccp=0% (chain=0/5 ok_count=4) n_steps=5
+2026-04-26 07:36:43,854 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.708 = 0.50×0.57(prox=0.57) + 0.40×proc(0.814[fin=0.94,mean=0.63]) + 0.10×fmt(1.000) | pred='8' gold='13' | step_acc=70% lccp=0% (chain=0/10 ok_count=7) n_steps=10
+2026-04-26 07:36:43,950 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.875 = 0.50×0.85(prox=0.85) + 0.40×proc(0.875[fin=0.99,mean=0.71]) + 0.10×fmt(1.000) | pred='12' gold='13' | step_acc=78% lccp=11% (chain=1/9 ok_count=7) n_steps=9
+2026-04-26 07:36:44,049 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.418 = 0.50×0.00(prox=0.00) + 0.40×proc(0.869[fin=0.97,mean=0.71]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=60% lccp=0% (chain=0/5 ok_count=3) n_steps=5
+2026-04-26 07:36:56,411 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.550 = 0.50×0.00(prox=0.00) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:36:56,504 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.099 = 0.50×0.00(prox=0.00) + 0.40×proc(0.072[fin=0.09,mean=0.05]) + 0.10×fmt(0.700) | pred='' gold='13' | step_acc=0% lccp=0% (chain=0/2 ok_count=0) n_steps=2
+2026-04-26 07:36:56,597 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.888 = 0.50×1.00(exact) + 0.40×proc(0.721[fin=0.92,mean=0.42]) + 0.10×fmt(1.000) | pred='13' gold='13' | step_acc=33% lccp=0% (chain=0/6 ok_count=2) n_steps=6
+
Iter 30 GRPO groups: 50%|##### | 10/20 [05:11<04:14, 25.44s/q, loss=0.0001, mean_r=0.569, q_acc=100%, q_rew=0.657, skip=2]
Iter 30 GRPO groups: 55%|#####5 | 11/20 [05:11<05:19, 35.52s/q, loss=0.0001, mean_r=0.569, q_acc=100%, q_rew=0.657, skip=2]2026-04-26 07:37:04,669 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.448 = clip(base=0.368 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.144 novelty=0.71 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.26)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:04,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.770 = clip(base=0.690 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.694 novelty=0.71 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.60)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:05,065 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.507 = clip(base=0.427 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.266 novelty=0.71 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.40)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:37:05,263 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.500 = clip(base=0.420 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.245 novelty=0.71 | sol=0.45*prm_final(0.16)+0.35*prm_mean(0.39)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:37:05,460 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.833 = clip(base=0.753 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.793 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:37:05,661 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.748 = clip(base=0.668 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.637 novelty=0.71 | sol=0.45*prm_final(0.83)+0.35*prm_mean(0.61)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:05,857 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.555 = clip(base=0.475 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.350 novelty=0.71 | sol=0.45*prm_final(0.24)+0.35*prm_mean(0.47)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:37:06,053 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.801 = clip(base=0.721 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.732 novelty=0.71 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.72)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:06,259 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.941 = clip(base=0.861 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.979 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:06,459 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.929 = clip(base=0.849 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.958 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.88)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:11,097 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.936 = clip(base=0.856 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:11,285 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.995 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:11,473 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:11,660 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:11,847 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,036 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:12,226 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,416 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.913 = clip(base=0.833 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:12,612 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.996 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:12,804 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.914 = clip(base=0.834 + mod=+0.080, cap=1.00) | Q=0.59 sol=0.997 novelty=0.71 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+
Iter 30 GRPO groups: 55%|#####5 | 11/20 [05:27<05:19, 35.52s/q, loss=-0.0011, mean_r=0.810, q_acc=100%, q_rew=0.655, skip=2]
Iter 30 GRPO groups: 60%|###### | 12/20 [05:27<03:57, 29.70s/q, loss=-0.0011, mean_r=0.810, q_acc=100%, q_rew=0.655, skip=2]2026-04-26 07:37:16,414 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,237 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,314 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,390 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:21,471 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,197 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,273 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.994 = 0.50×1.00(exact) + 0.40×proc(0.986[fin=1.00,mean=0.96]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,348 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.998 = 0.50×1.00(exact) + 0.40×proc(0.995[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:26,432 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:37:31,061 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.997 = 0.50×1.00(exact) + 0.40×proc(0.994[fin=1.00,mean=0.98]) + 0.10×fmt(1.000) | pred='4' gold='4' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+
Iter 30 GRPO groups: 60%|###### | 12/20 [05:44<03:57, 29.70s/q, loss=0var, mean_r=0.998, skip=3]
Iter 30 GRPO groups: 65%|######5 | 13/20 [05:44<03:00, 25.73s/q, loss=0var, mean_r=0.998, skip=3]2026-04-26 07:37:36,633 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.870 = clip(base=0.790 + mod=+0.080, cap=1.00) | Q=0.85 sol=0.750 novelty=0.79 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.72)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:36,807 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.923 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.985 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:36,986 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.534 = clip(base=0.454 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.307 novelty=0.79 | sol=0.45*prm_final(0.08)+0.35*prm_mean(0.49)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:37,170 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.931 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.997 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:37,354 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.927 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:37,538 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.979 novelty=0.79 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:37,718 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.932 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:37,898 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.933 + mod=+0.080, cap=1.00) | Q=0.83 sol=0.999 novelty=0.79 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:38,077 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.978 = clip(base=0.898 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.986 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:38,264 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.983 = clip(base=0.903 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.983 novelty=0.79 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:44,435 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:44,628 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.948 = clip(base=0.868 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.960 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:37:44,825 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.649 = clip(base=0.569 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.450 novelty=0.76 | sol=0.45*prm_final(0.32)+0.35*prm_mean(0.59)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:37:45,030 INFO src.rl.math_environment_curriculum - PRM reward: combined=1.000 = clip(base=0.924 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:45,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.566 = clip(base=0.486 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.377 novelty=0.76 | sol=0.45*prm_final(0.38)+0.35*prm_mean(0.45)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:37:45,432 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.953 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.87)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:45,622 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.985 = clip(base=0.905 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.983 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:45,815 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.993 = clip(base=0.913 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.998 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:37:46,008 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.986 = clip(base=0.906 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.988 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:46,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.999 = clip(base=0.919 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.992 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=4
+
Iter 30 GRPO groups: 65%|######5 | 13/20 [06:01<03:00, 25.73s/q, loss=-0.0009, mean_r=0.922, q_acc=100%, q_rew=0.671, skip=3]
Iter 30 GRPO groups: 70%|####### | 14/20 [06:01<02:18, 23.04s/q, loss=-0.0009, mean_r=0.922, q_acc=100%, q_rew=0.671, skip=3]2026-04-26 07:37:55,694 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.859 = clip(base=0.779 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.793 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.79)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:37:55,897 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.840 = clip(base=0.760 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.807 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.83)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:37:56,103 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.634 = clip(base=0.554 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.489 novelty=0.75 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.37)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,309 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.446 = clip(base=0.366 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.171 novelty=0.75 | sol=0.45*prm_final(0.27)+0.35*prm_mean(0.14)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,514 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.448 = clip(base=0.368 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.174 novelty=0.75 | sol=0.45*prm_final(0.28)+0.35*prm_mean(0.14)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:37:56,724 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.738 = clip(base=0.658 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.608 novelty=0.75 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.52)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:37:56,929 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.653 novelty=0.75 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.57)+0.20*lccp(0.17) | steps=6
+2026-04-26 07:37:57,137 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.921 = clip(base=0.841 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.941 novelty=0.75 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:37:57,341 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.762 = clip(base=0.682 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.647 novelty=0.75 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.58)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:37:57,544 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.965 novelty=0.75 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.91)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:38:03,779 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.861 = clip(base=0.781 + mod=+0.080, cap=1.00) | Q=0.77 sol=0.790 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.74)+0.20*lccp(0.40) | steps=5
+2026-04-26 07:38:03,977 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.958 = clip(base=0.878 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:38:04,176 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.755 = clip(base=0.675 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.650 novelty=0.68 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.52)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:38:04,380 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.778 = clip(base=0.698 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.701 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.72)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:38:04,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.562 = clip(base=0.482 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.374 novelty=0.68 | sol=0.45*prm_final(0.43)+0.35*prm_mean(0.37)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:38:04,790 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.951 = clip(base=0.871 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.998 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:38:04,998 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.935 = clip(base=0.855 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.964 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:05,201 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.953 = clip(base=0.873 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.999 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:05,398 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.763 = clip(base=0.683 + mod=+0.080, cap=1.00) | Q=0.70 sol=0.671 novelty=0.68 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.64)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:38:05,596 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.595 = clip(base=0.515 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.421 novelty=0.68 | sol=0.45*prm_final(0.52)+0.35*prm_mean(0.39)+0.20*lccp(0.25) | steps=4
+
Iter 30 GRPO groups: 70%|####### | 14/20 [06:20<02:18, 23.04s/q, loss=-0.0000, mean_r=0.773, q_acc=100%, q_rew=0.674, skip=3]
Iter 30 GRPO groups: 75%|#######5 | 15/20 [06:20<01:49, 21.93s/q, loss=-0.0000, mean_r=0.773, q_acc=100%, q_rew=0.674, skip=3]2026-04-26 07:38:34,645 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.88 sol=0.825 novelty=0.76 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.81)+0.20*lccp(0.50) | steps=12
+2026-04-26 07:38:34,896 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.872 = clip(base=0.792 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.786 novelty=0.76 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.83)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:35,132 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.782 = clip(base=0.702 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.630 novelty=0.76 | sol=0.45*prm_final(0.59)+0.35*prm_mean(0.72)+0.20*lccp(0.57) | steps=14
+2026-04-26 07:38:35,375 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.888 = clip(base=0.808 + mod=+0.080, cap=1.00) | Q=0.82 sol=0.800 novelty=0.76 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.82)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:35,609 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.822 = clip(base=0.742 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.705 novelty=0.76 | sol=0.45*prm_final(0.60)+0.35*prm_mean(0.86)+0.20*lccp(0.67) | steps=9
+2026-04-26 07:38:35,866 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.882 = clip(base=0.802 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.802 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.81)+0.20*lccp(0.40) | steps=15
+2026-04-26 07:38:36,130 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.894 = clip(base=0.814 + mod=+0.080, cap=1.00) | Q=0.80 sol=0.826 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.88)+0.20*lccp(0.36) | steps=14
+2026-04-26 07:38:36,373 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.823 = clip(base=0.743 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.700 novelty=0.76 | sol=0.45*prm_final(0.77)+0.35*prm_mean(0.75)+0.20*lccp(0.46) | steps=13
+2026-04-26 07:38:36,606 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.673 = clip(base=0.593 + mod=+0.080, cap=1.00) | Q=0.81 sol=0.451 novelty=0.76 | sol=0.45*prm_final(0.09)+0.35*prm_mean(0.78)+0.20*lccp(0.70) | steps=10
+2026-04-26 07:38:36,843 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.891 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.90)+0.20*lccp(0.64) | steps=11
+2026-04-26 07:38:46,562 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.932 = clip(base=0.852 + mod=+0.080, cap=1.00) | Q=0.71 sol=0.948 novelty=0.76 | sol=0.45*prm_final(0.97)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:46,780 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.985 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:38:47,016 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.751 = clip(base=0.671 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.660 novelty=0.76 | sol=0.45*prm_final(0.80)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=6
+2026-04-26 07:38:47,234 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.777 = clip(base=0.697 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.718 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.78)+0.20*lccp(0.00) | steps=5
+2026-04-26 07:38:47,452 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.787 = clip(base=0.707 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.732 novelty=0.76 | sol=0.45*prm_final(0.91)+0.35*prm_mean(0.64)+0.20*lccp(0.50) | steps=4
+2026-04-26 07:38:47,684 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.735 = clip(base=0.655 + mod=+0.080, cap=1.00) | Q=0.69 sol=0.634 novelty=0.76 | sol=0.45*prm_final(0.58)+0.35*prm_mean(0.69)+0.20*lccp(0.67) | steps=6
+2026-04-26 07:38:47,913 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.769 = clip(base=0.689 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.696 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.60)+0.20*lccp(0.20) | steps=5
+2026-04-26 07:38:48,136 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.856 = clip(base=0.776 + mod=+0.080, cap=1.00) | Q=0.65 sol=0.858 novelty=0.76 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.83)+0.20*lccp(0.60) | steps=5
+2026-04-26 07:38:48,361 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.924 = clip(base=0.844 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.978 novelty=0.76 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.94)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:38:48,591 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.488 = clip(base=0.408 + mod=+0.080, cap=1.00) | Q=0.68 sol=0.230 novelty=0.76 | sol=0.45*prm_final(0.01)+0.35*prm_mean(0.65)+0.20*lccp(0.00) | steps=5
+
Iter 30 GRPO groups: 75%|#######5 | 15/20 [07:03<01:49, 21.93s/q, loss=-0.0003, mean_r=0.822, q_acc=100%, q_rew=0.681, skip=3]
Iter 30 GRPO groups: 80%|######## | 16/20 [07:03<01:53, 28.28s/q, loss=-0.0003, mean_r=0.822, q_acc=100%, q_rew=0.681, skip=3]2026-04-26 07:38:54,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:38:54,351 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:38:54,435 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,164 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,248 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:02,331 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:02,413 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,316 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,400 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:09,483 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='22' gold='22' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 30 GRPO groups: 80%|######## | 16/20 [07:22<01:53, 28.28s/q, loss=0var, mean_r=1.000, skip=4]
Iter 30 GRPO groups: 85%|########5 | 17/20 [07:22<01:16, 25.55s/q, loss=0var, mean_r=1.000, skip=4]2026-04-26 07:39:12,900 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:17,106 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:17,191 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.998[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:17,268 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:17,350 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:23,958 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(1.000[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:24,033 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:24,111 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.976 = 0.50×1.00(exact) + 0.40×proc(0.941[fin=1.00,mean=0.85]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=3/3 ok_count=3) n_steps=3
+2026-04-26 07:39:24,189 INFO src.rl.math_environment_curriculum - Grounded reward: combined=0.999 = 0.50×1.00(exact) + 0.40×proc(0.997[fin=1.00,mean=0.99]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+2026-04-26 07:39:30,737 INFO src.rl.math_environment_curriculum - Grounded reward: combined=1.000 = 0.50×1.00(exact) + 0.40×proc(0.999[fin=1.00,mean=1.00]) + 0.10×fmt(1.000) | pred='125' gold='125' | step_acc=100% lccp=100% (chain=4/4 ok_count=4) n_steps=4
+
Iter 30 GRPO groups: 85%|########5 | 17/20 [07:44<01:16, 25.55s/q, loss=0var, mean_r=0.997, skip=5]
Iter 30 GRPO groups: 90%|######### | 18/20 [07:44<00:48, 24.26s/q, loss=0var, mean_r=0.997, skip=5]2026-04-26 07:39:37,480 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.771 = clip(base=0.691 + mod=+0.080, cap=1.00) | Q=0.64 sol=0.727 novelty=0.73 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.67)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:37,693 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.901 = clip(base=0.821 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.985 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:37,902 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.908 = clip(base=0.828 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,115 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.977 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,326 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.878 = clip(base=0.798 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.947 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.86)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,536 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.873 = clip(base=0.793 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.938 novelty=0.73 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.84)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,746 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.988 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:38,959 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.889 = clip(base=0.809 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.964 novelty=0.73 | sol=0.45*prm_final(0.98)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:39,173 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.990 novelty=0.73 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:39,383 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.850 = clip(base=0.770 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.900 novelty=0.73 | sol=0.45*prm_final(0.92)+0.35*prm_mean(0.82)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:45,343 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.928 = clip(base=0.848 + mod=+0.080, cap=1.00) | Q=0.63 sol=0.997 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:45,547 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.907 = clip(base=0.827 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:45,747 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:45,952 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:46,153 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,351 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,551 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,756 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.991 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:46,963 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.996 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:47,168 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.903 = clip(base=0.823 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.993 novelty=0.67 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=3
+
Iter 30 GRPO groups: 90%|######### | 18/20 [08:02<00:48, 24.26s/q, loss=-0.0007, mean_r=0.892, q_acc=100%, q_rew=0.671, skip=5]
Iter 30 GRPO groups: 95%|#########5| 19/20 [08:02<00:22, 22.41s/q, loss=-0.0007, mean_r=0.892, q_acc=100%, q_rew=0.671, skip=5]2026-04-26 07:39:55,588 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.829 = clip(base=0.749 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.727 novelty=0.77 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.70)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:55,785 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.774 = clip(base=0.694 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.675 novelty=0.77 | sol=0.45*prm_final(0.88)+0.35*prm_mean(0.61)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:55,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.811 = clip(base=0.731 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.736 novelty=0.77 | sol=0.45*prm_final(0.63)+0.35*prm_mean(0.72)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:39:56,195 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.797 = clip(base=0.717 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.706 novelty=0.77 | sol=0.45*prm_final(0.96)+0.35*prm_mean(0.64)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:39:56,394 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.945 = clip(base=0.865 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.960 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.90)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:39:56,595 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.540 = clip(base=0.460 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.329 novelty=0.77 | sol=0.45*prm_final(0.55)+0.35*prm_mean(0.23)+0.20*lccp(0.00) | steps=3
+2026-04-26 07:39:56,793 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.779 = clip(base=0.699 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.673 novelty=0.77 | sol=0.45*prm_final(0.93)+0.35*prm_mean(0.58)+0.20*lccp(0.25) | steps=4
+2026-04-26 07:39:57,000 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.477 = clip(base=0.397 + mod=+0.080, cap=1.00) | Q=0.66 sol=0.219 novelty=0.77 | sol=0.45*prm_final(0.06)+0.35*prm_mean(0.36)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:57,205 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.546 = clip(base=0.466 + mod=+0.080, cap=1.00) | Q=0.67 sol=0.332 novelty=0.77 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.32)+0.20*lccp(0.33) | steps=3
+2026-04-26 07:39:57,403 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.944 = clip(base=0.864 + mod=+0.080, cap=1.00) | Q=0.72 sol=0.958 novelty=0.77 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.89)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:40:05,989 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.992 = clip(base=0.912 + mod=+0.080, cap=1.00) | Q=0.79 sol=0.997 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:06,202 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.955 = clip(base=0.875 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.971 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.92)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:40:06,412 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.654 = clip(base=0.574 + mod=+0.080, cap=1.00) | Q=0.78 sol=0.436 novelty=0.78 | sol=0.45*prm_final(0.34)+0.35*prm_mean(0.52)+0.20*lccp(0.50) | steps=6
+2026-04-26 07:40:06,625 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.966 = clip(base=0.886 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.982 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:06,848 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.972 = clip(base=0.892 + mod=+0.080, cap=1.00) | Q=0.75 sol=0.983 novelty=0.78 | sol=0.45*prm_final(0.99)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:07,061 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.963 = clip(base=0.883 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.967 novelty=0.78 | sol=0.45*prm_final(0.95)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,282 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.968 = clip(base=0.888 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.994 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,497 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.73 sol=0.998 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=5
+2026-04-26 07:40:07,717 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.971 = clip(base=0.891 + mod=+0.080, cap=1.00) | Q=0.74 sol=0.992 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=6
+2026-04-26 07:40:07,930 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.977 = clip(base=0.897 + mod=+0.080, cap=1.00) | Q=0.76 sol=0.992 novelty=0.78 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.98)+0.20*lccp(1.00) | steps=5
+
Iter 30 GRPO groups: 95%|#########5| 19/20 [08:23<00:22, 22.41s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
Iter 30 GRPO groups: 100%|##########| 20/20 [08:23<00:00, 21.92s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
Iter 30 GRPO groups: 100%|##########| 20/20 [08:23<00:00, 25.15s/q, loss=-0.0004, mean_r=0.842, q_acc=100%, q_rew=0.676, skip=5]
+2026-04-26 07:40:09,618 INFO src.rl.llm_question_classifier - LLMClassifier cache=90% llm=1% fallback=9% (cache_size=218/10000)
+2026-04-26 07:40:09,618 INFO __main__ - Iter 30 | loss=-0.0003 | reward mean=0.871 std=0.160 | gt_match=80.0% | grounded_acc=95.0% | step_acc=91.6% | lccp=80.0% | batch_acc=96.6% | phase=SELFPLAY_RAMP sp_ratio=61% | groups=27 skipped=5(0var=5) | lr=3.29e-06 | 503.1s
+2026-04-26 07:40:09,618 INFO __main__ - Question generation: 12/12 valid (100%) | q_reward=0.676 | q_acc=100.0% (>0.5 quality) | topic=0.68 diff=0.15 clarity=1.00 novelty=0.46 solvability=0.97
+2026-04-26 07:40:09,618 INFO __main__ - Evaluating GSM8K (150 samples)...
+
GSM8K eval: 0%| | 0/150 [00:00, ?q/s]
GSM8K eval: 1%| | 1/150 [00:03<08:32, 3.44s/q, correct=1/1, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 1%|1 | 2/150 [00:08<10:38, 4.32s/q, correct=2/2, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 2%|2 | 3/150 [00:10<08:39, 3.53s/q, correct=3/3, lccp=100.0%, score=1.000, step_acc=100.0%]
GSM8K eval: 3%|2 | 4/150 [00:13<07:23, 3.04s/q, correct=3/4, lccp=83.3%, score=0.887, step_acc=91.7%]
GSM8K eval: 3%|3 | 5/150 [00:14<06:07, 2.54s/q, correct=4/5, lccp=86.7%, score=0.910, step_acc=93.3%]
GSM8K eval: 4%|4 | 6/150 [00:20<08:36, 3.59s/q, correct=4/6, lccp=75.6%, score=0.887, step_acc=87.8%]
GSM8K eval: 5%|4 | 7/150 [00:23<08:25, 3.53s/q, correct=5/7, lccp=79.0%, score=0.903, step_acc=89.5%]
GSM8K eval: 5%|5 | 8/150 [00:26<07:31, 3.18s/q, correct=6/8, lccp=81.7%, score=0.915, step_acc=90.8%]
GSM8K eval: 6%|6 | 9/150 [00:29<07:33, 3.22s/q, correct=7/9, lccp=83.7%, score=0.924, step_acc=91.9%]
GSM8K eval: 7%|6 | 10/150 [00:34<08:47, 3.77s/q, correct=7/10, lccp=81.3%, score=0.908, step_acc=90.7%]
GSM8K eval: 7%|7 | 11/150 [00:37<08:06, 3.50s/q, correct=8/11, lccp=83.0%, score=0.916, step_acc=91.5%]
GSM8K eval: 8%|8 | 12/150 [00:39<07:06, 3.09s/q, correct=9/12, lccp=84.4%, score=0.923, step_acc=92.2%]
GSM8K eval: 9%|8 | 13/150 [00:42<06:43, 2.95s/q, correct=10/13, lccp=85.6%, score=0.926, step_acc=92.8%]
GSM8K eval: 9%|9 | 14/150 [00:46<07:47, 3.44s/q, correct=11/14, lccp=86.7%, score=0.931, step_acc=93.3%]
GSM8K eval: 10%|# | 15/150 [00:49<07:09, 3.18s/q, correct=12/15, lccp=87.6%, score=0.936, step_acc=93.8%]
GSM8K eval: 11%|# | 16/150 [00:51<06:37, 2.97s/q, correct=12/16, lccp=88.3%, score=0.911, step_acc=94.2%]
GSM8K eval: 11%|#1 | 17/150 [00:55<07:13, 3.26s/q, correct=13/17, lccp=89.0%, score=0.917, step_acc=94.5%]
GSM8K eval: 12%|#2 | 18/150 [01:00<07:49, 3.55s/q, correct=13/18, lccp=85.0%, score=0.894, step_acc=91.1%]
GSM8K eval: 13%|#2 | 19/150 [01:02<07:08, 3.27s/q, correct=14/19, lccp=85.8%, score=0.899, step_acc=91.6%]
GSM8K eval: 13%|#3 | 20/150 [01:06<07:26, 3.43s/q, correct=15/20, lccp=86.5%, score=0.904, step_acc=92.0%]
GSM8K eval: 14%|#4 | 21/150 [01:09<06:51, 3.19s/q, correct=16/21, lccp=87.1%, score=0.909, step_acc=92.4%]
GSM8K eval: 15%|#4 | 22/150 [01:12<06:34, 3.08s/q, correct=17/22, lccp=84.7%, score=0.904, step_acc=91.2%]
GSM8K eval: 15%|#5 | 23/150 [01:15<06:48, 3.22s/q, correct=17/23, lccp=82.1%, score=0.887, step_acc=88.3%]
GSM8K eval: 16%|#6 | 24/150 [01:18<06:17, 3.00s/q, correct=17/24, lccp=79.7%, score=0.871, step_acc=85.7%]
GSM8K eval: 17%|#6 | 25/150 [01:20<06:05, 2.92s/q, correct=17/25, lccp=77.5%, score=0.868, step_acc=85.3%]
GSM8K eval: 17%|#7 | 26/150 [01:25<06:58, 3.38s/q, correct=18/26, lccp=78.4%, score=0.873, step_acc=85.8%]
GSM8K eval: 18%|#8 | 27/150 [01:28<06:35, 3.21s/q, correct=18/27, lccp=79.2%, score=0.869, step_acc=86.4%]
GSM8K eval: 19%|#8 | 28/150 [01:30<05:55, 2.91s/q, correct=19/28, lccp=79.9%, score=0.874, step_acc=86.8%]
GSM8K eval: 19%|#9 | 29/150 [01:33<05:47, 2.87s/q, correct=20/29, lccp=80.6%, score=0.878, step_acc=87.3%]
GSM8K eval: 20%|## | 30/150 [01:36<06:18, 3.16s/q, correct=21/30, lccp=81.3%, score=0.882, step_acc=87.7%]
GSM8K eval: 21%|## | 31/150 [01:39<05:54, 2.98s/q, correct=22/31, lccp=81.9%, score=0.886, step_acc=88.1%]
GSM8K eval: 21%|##1 | 32/150 [01:41<05:08, 2.62s/q, correct=23/32, lccp=82.4%, score=0.889, step_acc=88.5%]
GSM8K eval: 22%|##2 | 33/150 [01:43<05:11, 2.66s/q, correct=24/33, lccp=83.0%, score=0.893, step_acc=88.8%]
GSM8K eval: 23%|##2 | 34/150 [01:45<04:46, 2.47s/q, correct=25/34, lccp=83.5%, score=0.896, step_acc=89.2%]
GSM8K eval: 23%|##3 | 35/150 [01:48<04:46, 2.49s/q, correct=26/35, lccp=84.0%, score=0.899, step_acc=89.5%]
GSM8K eval: 24%|##4 | 36/150 [01:52<05:17, 2.78s/q, correct=27/36, lccp=84.4%, score=0.901, step_acc=89.8%]
GSM8K eval: 25%|##4 | 37/150 [01:53<04:47, 2.55s/q, correct=28/37, lccp=84.8%, score=0.903, step_acc=90.0%]
GSM8K eval: 25%|##5 | 38/150 [01:56<05:00, 2.68s/q, correct=29/38, lccp=85.2%, score=0.906, step_acc=90.3%]
GSM8K eval: 26%|##6 | 39/150 [02:01<06:09, 3.33s/q, correct=30/39, lccp=85.6%, score=0.908, step_acc=90.6%]
GSM8K eval: 27%|##6 | 40/150 [02:07<07:38, 4.17s/q, correct=31/40, lccp=86.0%, score=0.910, step_acc=90.8%]
GSM8K eval: 27%|##7 | 41/150 [02:11<06:58, 3.84s/q, correct=31/41, lccp=86.3%, score=0.910, step_acc=91.0%]
GSM8K eval: 28%|##8 | 42/150 [02:16<07:40, 4.27s/q, correct=32/42, lccp=85.0%, score=0.912, step_acc=90.8%]
GSM8K eval: 29%|##8 | 43/150 [02:18<06:26, 3.61s/q, correct=33/43, lccp=85.4%, score=0.914, step_acc=91.0%]
GSM8K eval: 29%|##9 | 44/150 [02:24<07:48, 4.42s/q, correct=34/44, lccp=85.7%, score=0.916, step_acc=91.2%]
GSM8K eval: 30%|### | 45/150 [02:28<07:15, 4.15s/q, correct=35/45, lccp=86.0%, score=0.918, step_acc=91.4%]
GSM8K eval: 31%|### | 46/150 [02:33<07:34, 4.37s/q, correct=35/46, lccp=84.2%, score=0.913, step_acc=91.4%]
GSM8K eval: 31%|###1 | 47/150 [02:36<06:49, 3.98s/q, correct=36/47, lccp=84.5%, score=0.915, step_acc=91.6%]
GSM8K eval: 32%|###2 | 48/150 [02:37<05:38, 3.32s/q, correct=37/48, lccp=84.8%, score=0.917, step_acc=91.7%]
GSM8K eval: 33%|###2 | 49/150 [02:41<05:42, 3.39s/q, correct=38/49, lccp=83.8%, score=0.918, step_acc=91.6%]
GSM8K eval: 33%|###3 | 50/150 [02:44<05:32, 3.32s/q, correct=38/50, lccp=83.1%, score=0.910, step_acc=90.7%]
GSM8K eval: 34%|###4 | 51/150 [02:46<04:33, 2.76s/q, correct=39/51, lccp=83.4%, score=0.911, step_acc=90.9%]
GSM8K eval: 35%|###4 | 52/150 [02:50<05:15, 3.22s/q, correct=39/52, lccp=81.8%, score=0.911, step_acc=90.8%]
GSM8K eval: 35%|###5 | 53/150 [02:55<05:54, 3.66s/q, correct=39/53, lccp=81.4%, score=0.904, step_acc=90.2%]
GSM8K eval: 36%|###6 | 54/150 [02:58<05:41, 3.55s/q, correct=40/54, lccp=81.8%, score=0.905, step_acc=90.4%]
GSM8K eval: 37%|###6 | 55/150 [03:02<06:05, 3.85s/q, correct=41/55, lccp=82.1%, score=0.907, step_acc=90.6%]
GSM8K eval: 37%|###7 | 56/150 [03:06<05:54, 3.77s/q, correct=42/56, lccp=82.4%, score=0.908, step_acc=90.7%]
GSM8K eval: 38%|###8 | 57/150 [03:08<05:09, 3.33s/q, correct=43/57, lccp=82.7%, score=0.910, step_acc=90.9%]
GSM8K eval: 39%|###8 | 58/150 [03:12<05:30, 3.59s/q, correct=44/58, lccp=83.0%, score=0.911, step_acc=91.0%]
GSM8K eval: 39%|###9 | 59/150 [03:17<05:59, 3.95s/q, correct=44/59, lccp=81.6%, score=0.909, step_acc=90.6%]
GSM8K eval: 40%|#### | 60/150 [03:22<06:21, 4.24s/q, correct=45/60, lccp=81.9%, score=0.910, step_acc=90.8%]
GSM8K eval: 41%|#### | 61/150 [03:25<05:49, 3.93s/q, correct=46/61, lccp=82.2%, score=0.912, step_acc=90.9%]
GSM8K eval: 41%|####1 | 62/150 [03:29<05:24, 3.68s/q, correct=47/62, lccp=82.5%, score=0.913, step_acc=91.1%]
GSM8K eval: 42%|####2 | 63/150 [03:32<05:15, 3.62s/q, correct=47/63, lccp=82.2%, score=0.907, step_acc=90.7%]
GSM8K eval: 43%|####2 | 64/150 [03:35<04:51, 3.39s/q, correct=48/64, lccp=82.5%, score=0.909, step_acc=90.8%]
GSM8K eval: 43%|####3 | 65/150 [03:38<04:32, 3.20s/q, correct=49/65, lccp=82.8%, score=0.910, step_acc=91.0%]
GSM8K eval: 44%|####4 | 66/150 [03:40<03:58, 2.84s/q, correct=50/66, lccp=83.1%, score=0.911, step_acc=91.1%]
GSM8K eval: 45%|####4 | 67/150 [03:42<03:41, 2.67s/q, correct=51/67, lccp=83.3%, score=0.913, step_acc=91.3%]
GSM8K eval: 45%|####5 | 68/150 [03:45<03:39, 2.68s/q, correct=52/68, lccp=83.6%, score=0.914, step_acc=91.4%]
GSM8K eval: 46%|####6 | 69/150 [03:46<03:09, 2.34s/q, correct=53/69, lccp=83.8%, score=0.915, step_acc=91.5%]
GSM8K eval: 47%|####6 | 70/150 [03:49<03:23, 2.55s/q, correct=54/70, lccp=82.6%, score=0.916, step_acc=91.3%]
GSM8K eval: 47%|####7 | 71/150 [03:52<03:34, 2.72s/q, correct=55/71, lccp=81.4%, score=0.917, step_acc=91.2%]
GSM8K eval: 48%|####8 | 72/150 [03:54<03:02, 2.34s/q, correct=56/72, lccp=81.7%, score=0.918, step_acc=91.3%]
GSM8K eval: 49%|####8 | 73/150 [03:55<02:45, 2.15s/q, correct=57/73, lccp=81.9%, score=0.919, step_acc=91.4%]
GSM8K eval: 49%|####9 | 74/150 [03:59<03:15, 2.57s/q, correct=58/74, lccp=82.2%, score=0.920, step_acc=91.5%]
GSM8K eval: 50%|##### | 75/150 [04:01<02:53, 2.31s/q, correct=59/75, lccp=82.4%, score=0.921, step_acc=91.7%]
GSM8K eval: 51%|##### | 76/150 [04:07<04:26, 3.60s/q, correct=59/76, lccp=82.5%, score=0.916, step_acc=91.6%]
GSM8K eval: 51%|#####1 | 77/150 [04:11<04:30, 3.70s/q, correct=60/77, lccp=82.7%, score=0.917, step_acc=91.7%]
GSM8K eval: 52%|#####2 | 78/150 [04:14<04:03, 3.38s/q, correct=61/78, lccp=82.9%, score=0.918, step_acc=91.8%]
GSM8K eval: 53%|#####2 | 79/150 [04:17<03:52, 3.28s/q, correct=61/79, lccp=82.1%, score=0.913, step_acc=91.1%]
GSM8K eval: 53%|#####3 | 80/150 [04:20<03:46, 3.23s/q, correct=62/80, lccp=82.3%, score=0.914, step_acc=91.2%]
GSM8K eval: 54%|#####4 | 81/150 [04:22<03:24, 2.97s/q, correct=63/81, lccp=82.5%, score=0.915, step_acc=91.3%]
GSM8K eval: 55%|#####4 | 82/150 [04:25<03:21, 2.96s/q, correct=64/82, lccp=82.8%, score=0.916, step_acc=91.4%]
GSM8K eval: 55%|#####5 | 83/150 [04:28<03:17, 2.95s/q, correct=65/83, lccp=83.0%, score=0.917, step_acc=91.5%]
GSM8K eval: 56%|#####6 | 84/150 [04:31<03:09, 2.86s/q, correct=66/84, lccp=83.2%, score=0.918, step_acc=91.6%]
GSM8K eval: 57%|#####6 | 85/150 [04:35<03:27, 3.19s/q, correct=67/85, lccp=83.4%, score=0.919, step_acc=91.7%]
GSM8K eval: 57%|#####7 | 86/150 [04:38<03:29, 3.28s/q, correct=68/86, lccp=83.6%, score=0.920, step_acc=91.8%]
GSM8K eval: 58%|#####8 | 87/150 [04:44<04:12, 4.02s/q, correct=69/87, lccp=83.7%, score=0.920, step_acc=91.9%]
GSM8K eval: 59%|#####8 | 88/150 [04:46<03:29, 3.37s/q, correct=70/88, lccp=83.9%, score=0.921, step_acc=92.0%]
GSM8K eval: 59%|#####9 | 89/150 [04:49<03:15, 3.21s/q, correct=71/89, lccp=84.1%, score=0.922, step_acc=92.1%]
GSM8K eval: 60%|###### | 90/150 [04:51<02:58, 2.98s/q, correct=72/90, lccp=84.3%, score=0.923, step_acc=92.2%]
GSM8K eval: 61%|###### | 91/150 [04:56<03:23, 3.45s/q, correct=73/91, lccp=84.5%, score=0.924, step_acc=92.2%]
GSM8K eval: 61%|######1 | 92/150 [04:59<03:13, 3.34s/q, correct=74/92, lccp=84.6%, score=0.924, step_acc=92.3%]
GSM8K eval: 62%|######2 | 93/150 [05:06<04:22, 4.61s/q, correct=75/93, lccp=84.8%, score=0.925, step_acc=92.4%]
GSM8K eval: 63%|######2 | 94/150 [05:09<03:47, 4.06s/q, correct=75/94, lccp=83.9%, score=0.922, step_acc=91.8%]
GSM8K eval: 63%|######3 | 95/150 [05:14<03:51, 4.21s/q, correct=75/95, lccp=83.0%, score=0.919, step_acc=91.0%]
GSM8K eval: 64%|######4 | 96/150 [05:17<03:31, 3.91s/q, correct=75/96, lccp=82.5%, score=0.915, step_acc=90.4%]
GSM8K eval: 65%|######4 | 97/150 [05:20<03:07, 3.54s/q, correct=75/97, lccp=82.2%, score=0.912, step_acc=90.3%]
GSM8K eval: 65%|######5 | 98/150 [05:24<03:14, 3.74s/q, correct=75/98, lccp=81.8%, score=0.909, step_acc=90.1%]
GSM8K eval: 66%|######6 | 99/150 [05:26<02:49, 3.32s/q, correct=76/99, lccp=81.9%, score=0.909, step_acc=90.2%]
GSM8K eval: 67%|######6 | 100/150 [05:28<02:24, 2.90s/q, correct=77/100, lccp=81.1%, score=0.910, step_acc=89.9%]
GSM8K eval: 67%|######7 | 101/150 [05:31<02:25, 2.96s/q, correct=77/101, lccp=80.8%, score=0.906, step_acc=89.8%]
GSM8K eval: 68%|######8 | 102/150 [05:33<02:01, 2.53s/q, correct=78/102, lccp=81.0%, score=0.907, step_acc=89.9%]
GSM8K eval: 69%|######8 | 103/150 [05:35<01:51, 2.38s/q, correct=79/103, lccp=81.2%, score=0.908, step_acc=90.0%]
GSM8K eval: 69%|######9 | 104/150 [05:39<02:21, 3.07s/q, correct=80/104, lccp=81.4%, score=0.909, step_acc=90.1%]
GSM8K eval: 70%|####### | 105/150 [05:42<02:10, 2.89s/q, correct=81/105, lccp=81.5%, score=0.910, step_acc=90.2%]
GSM8K eval: 71%|####### | 106/150 [05:43<01:48, 2.47s/q, correct=82/106, lccp=81.7%, score=0.910, step_acc=90.3%]
GSM8K eval: 71%|#######1 | 107/150 [05:45<01:33, 2.18s/q, correct=83/107, lccp=81.9%, score=0.911, step_acc=90.4%]
GSM8K eval: 72%|#######2 | 108/150 [05:48<01:38, 2.34s/q, correct=84/108, lccp=82.1%, score=0.912, step_acc=90.5%]
GSM8K eval: 73%|#######2 | 109/150 [05:53<02:08, 3.14s/q, correct=84/109, lccp=81.6%, score=0.911, step_acc=90.4%]
GSM8K eval: 73%|#######3 | 110/150 [05:55<01:54, 2.85s/q, correct=85/110, lccp=81.8%, score=0.911, step_acc=90.5%]
GSM8K eval: 74%|#######4 | 111/150 [05:56<01:37, 2.49s/q, correct=86/111, lccp=81.9%, score=0.912, step_acc=90.6%]
GSM8K eval: 75%|#######4 | 112/150 [06:02<02:04, 3.28s/q, correct=86/112, lccp=82.1%, score=0.912, step_acc=90.7%]
GSM8K eval: 75%|#######5 | 113/150 [06:03<01:44, 2.83s/q, correct=87/113, lccp=82.3%, score=0.913, step_acc=90.7%]
GSM8K eval: 76%|#######6 | 114/150 [06:08<02:06, 3.50s/q, correct=88/114, lccp=81.8%, score=0.913, step_acc=90.7%]
GSM8K eval: 77%|#######6 | 115/150 [06:11<01:55, 3.31s/q, correct=89/115, lccp=81.9%, score=0.914, step_acc=90.8%]
GSM8K eval: 77%|#######7 | 116/150 [06:14<01:48, 3.19s/q, correct=90/116, lccp=82.1%, score=0.915, step_acc=90.8%]
GSM8K eval: 78%|#######8 | 117/150 [06:20<02:13, 4.03s/q, correct=91/117, lccp=82.3%, score=0.915, step_acc=90.9%]
GSM8K eval: 79%|#######8 | 118/150 [06:25<02:13, 4.17s/q, correct=91/118, lccp=81.6%, score=0.913, step_acc=90.9%]
GSM8K eval: 79%|#######9 | 119/150 [06:28<02:04, 4.00s/q, correct=91/119, lccp=81.7%, score=0.912, step_acc=90.9%]
GSM8K eval: 80%|######## | 120/150 [06:31<01:49, 3.66s/q, correct=92/120, lccp=81.9%, score=0.912, step_acc=91.0%]
GSM8K eval: 81%|######## | 121/150 [06:34<01:40, 3.48s/q, correct=93/121, lccp=82.0%, score=0.913, step_acc=91.1%]
GSM8K eval: 81%|########1 | 122/150 [06:37<01:33, 3.35s/q, correct=94/122, lccp=82.2%, score=0.914, step_acc=91.2%]
GSM8K eval: 82%|########2 | 123/150 [06:41<01:30, 3.37s/q, correct=95/123, lccp=82.3%, score=0.914, step_acc=91.2%]
GSM8K eval: 83%|########2 | 124/150 [06:43<01:18, 3.02s/q, correct=96/124, lccp=82.5%, score=0.915, step_acc=91.3%]
GSM8K eval: 83%|########3 | 125/150 [06:45<01:08, 2.74s/q, correct=97/125, lccp=82.6%, score=0.916, step_acc=91.4%]
GSM8K eval: 84%|########4 | 126/150 [06:48<01:05, 2.72s/q, correct=98/126, lccp=82.7%, score=0.916, step_acc=91.4%]
GSM8K eval: 85%|########4 | 127/150 [06:52<01:14, 3.25s/q, correct=99/127, lccp=82.9%, score=0.917, step_acc=91.5%]
GSM8K eval: 85%|########5 | 128/150 [06:55<01:09, 3.17s/q, correct=100/128, lccp=83.0%, score=0.918, step_acc=91.6%]
GSM8K eval: 86%|########6 | 129/150 [06:59<01:09, 3.29s/q, correct=101/129, lccp=83.1%, score=0.918, step_acc=91.6%]
GSM8K eval: 87%|########6 | 130/150 [07:01<00:56, 2.85s/q, correct=102/130, lccp=83.3%, score=0.919, step_acc=91.7%]
GSM8K eval: 87%|########7 | 131/150 [07:05<01:03, 3.37s/q, correct=103/131, lccp=83.4%, score=0.919, step_acc=91.8%]
GSM8K eval: 88%|########8 | 132/150 [07:07<00:50, 2.83s/q, correct=104/132, lccp=83.5%, score=0.920, step_acc=91.8%]
GSM8K eval: 89%|########8 | 133/150 [07:10<00:48, 2.83s/q, correct=105/133, lccp=83.6%, score=0.921, step_acc=91.9%]
GSM8K eval: 89%|########9 | 134/150 [07:14<00:52, 3.30s/q, correct=106/134, lccp=83.8%, score=0.921, step_acc=92.0%]
GSM8K eval: 90%|######### | 135/150 [07:17<00:48, 3.23s/q, correct=107/135, lccp=83.9%, score=0.922, step_acc=92.0%]
GSM8K eval: 91%|######### | 136/150 [07:22<00:53, 3.83s/q, correct=108/136, lccp=84.0%, score=0.922, step_acc=92.1%]
GSM8K eval: 91%|#########1| 137/150 [07:29<01:01, 4.72s/q, correct=109/137, lccp=84.1%, score=0.923, step_acc=92.1%]
GSM8K eval: 92%|#########2| 138/150 [07:33<00:53, 4.48s/q, correct=110/138, lccp=84.2%, score=0.923, step_acc=92.2%]
GSM8K eval: 93%|#########2| 139/150 [07:36<00:46, 4.20s/q, correct=111/139, lccp=84.3%, score=0.924, step_acc=92.2%]
GSM8K eval: 93%|#########3| 140/150 [07:41<00:41, 4.19s/q, correct=111/140, lccp=84.2%, score=0.920, step_acc=92.1%]
GSM8K eval: 94%|#########3| 141/150 [07:44<00:36, 4.06s/q, correct=112/141, lccp=84.3%, score=0.921, step_acc=92.1%]
GSM8K eval: 95%|#########4| 142/150 [07:49<00:33, 4.18s/q, correct=113/142, lccp=84.4%, score=0.921, step_acc=92.2%]
GSM8K eval: 95%|#########5| 143/150 [07:51<00:25, 3.61s/q, correct=114/143, lccp=84.5%, score=0.922, step_acc=92.2%]
GSM8K eval: 96%|#########6| 144/150 [07:53<00:19, 3.23s/q, correct=115/144, lccp=84.7%, score=0.922, step_acc=92.3%]
GSM8K eval: 97%|#########6| 145/150 [07:56<00:14, 2.90s/q, correct=115/145, lccp=84.8%, score=0.920, step_acc=92.3%]
GSM8K eval: 97%|#########7| 146/150 [07:59<00:11, 2.93s/q, correct=116/146, lccp=84.9%, score=0.920, step_acc=92.4%]
GSM8K eval: 98%|#########8| 147/150 [08:02<00:09, 3.19s/q, correct=117/147, lccp=85.0%, score=0.921, step_acc=92.4%]
GSM8K eval: 99%|#########8| 148/150 [08:06<00:06, 3.34s/q, correct=118/148, lccp=85.1%, score=0.921, step_acc=92.5%]
GSM8K eval: 99%|#########9| 149/150 [08:09<00:03, 3.36s/q, correct=119/149, lccp=85.2%, score=0.922, step_acc=92.5%]
GSM8K eval: 100%|##########| 150/150 [08:14<00:00, 3.83s/q, correct=119/150, lccp=85.0%, score=0.920, step_acc=92.3%]
GSM8K eval: 100%|##########| 150/150 [08:14<00:00, 3.30s/q, correct=119/150, lccp=85.0%, score=0.920, step_acc=92.3%]
+2026-04-26 07:48:24,548 INFO __main__ - Training Score [iter 30]: 0.9204 (best=0.9262) | n=150
+2026-04-26 07:48:24,548 INFO __main__ - Components : 0.50×correct(79.3%) + 0.40×process + 0.10×fmt(1.000)
+2026-04-26 07:48:24,548 INFO __main__ - Process score : prm_mean=0.904 prm_final=0.929 → weighted=0.919
+2026-04-26 07:48:24,548 INFO __main__ - Step accuracy : 92.3% (bag-of-steps: fraction of steps PRM >0.5)
+2026-04-26 07:48:24,548 INFO __main__ - Chain integrity (LCCP): 85.0% ← fraction of steps before first failure
+ [LCCP=100% → all steps correct; LCCP=0% → first step wrong]
+2026-04-26 07:48:24,549 INFO __main__ - (debug) final-answer accuracy: 79.3%
+2026-04-26 07:48:26,758 INFO __main__ - Pruned old checkpoint: iter_0010
+2026-04-26 07:48:26,764 INFO __main__ - ======================================================================
+2026-04-26 07:48:26,764 INFO __main__ - GRPO ITERATION 31/60
+2026-04-26 07:48:26,764 INFO __main__ - ======================================================================
+2026-04-26 07:48:26,783 INFO __main__ - LR this iteration: 3.29e-06 | T=0.597 | MATH ratio=50%
+
Iter 31 GRPO groups: 0%| | 0/20 [00:00, ?q/s]2026-04-26 07:48:33,771 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.927 = clip(base=0.847 + mod=+0.080, cap=1.00) | Q=0.62 sol=0.997 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:33,967 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.904 = clip(base=0.824 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.998 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:34,164 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.902 = clip(base=0.822 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.991 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.97)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,363 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.905 = clip(base=0.825 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.999 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=2
+2026-04-26 07:48:34,569 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.906 = clip(base=0.826 + mod=+0.080, cap=1.00) | Q=0.57 sol=1.000 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(1.00)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,773 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.900 = clip(base=0.820 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.987 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.96)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:34,975 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.485 = clip(base=0.405 + mod=+0.080, cap=1.00) | Q=0.53 sol=0.320 novelty=0.63 | sol=0.45*prm_final(0.54)+0.35*prm_mean(0.23)+0.20*lccp(0.00) | steps=4
+2026-04-26 07:48:35,179 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.910 = clip(base=0.830 + mod=+0.080, cap=1.00) | Q=0.58 sol=0.996 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.99)+0.20*lccp(1.00) | steps=4
+2026-04-26 07:48:35,386 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.891 = clip(base=0.811 + mod=+0.080, cap=1.00) | Q=0.56 sol=0.976 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.93)+0.20*lccp(1.00) | steps=3
+2026-04-26 07:48:35,584 INFO src.rl.math_environment_curriculum - PRM reward: combined=0.897 = clip(base=0.817 + mod=+0.080, cap=1.00) | Q=0.57 sol=0.984 novelty=0.63 | sol=0.45*prm_final(1.00)+0.35*prm_mean(0.95)+0.20*lccp(1.00) | steps=4
+
Iter 31 GRPO groups: 0%| | 0/20 [00:12, ?q/s]
diff --git a/logs/grpo/grpo_20260426_032827/metrics.csv b/logs/grpo/grpo_20260426_032827/metrics.csv
new file mode 100644
index 0000000000000000000000000000000000000000..50f2c8f0d1d3b3e41202746d2cb51c72c878e466
--- /dev/null
+++ b/logs/grpo/grpo_20260426_032827/metrics.csv
@@ -0,0 +1,31 @@
+iteration,timestamp,loss,mean_reward,std_reward,batch_accuracy,grounded_acc,gt_match_rate,step_accuracy,lccp,n_groups,skipped_groups,n_sp_groups,sp_ratio,sp_suspended,training_phase,learning_rate,iter_time_s,q_reward,q_valid_rate,q_novelty,q_solvability,chain_prm_corr,chain_scoring_on,eval_combined,eval_correct_rt,eval_prm,eval_step_acc,eval_lccp,eval_format,eval_n_scored,eval_final_ans
+1,2026-04-26T03:38:38,0.000610,0.914309,0.163605,0.960000,0.960000,0.780000,0.894861,0.814111,12,8,0,0.000000,0,GROUNDED_ONLY,0.000001,127.637996,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+2,2026-04-26T03:41:58,-0.000034,0.847892,0.216018,0.914141,0.914141,0.651500,0.866692,0.765381,18,2,0,0.000000,0,GROUNDED_ONLY,0.000002,199.518393,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+3,2026-04-26T03:45:08,0.000366,0.896391,0.170699,0.954545,0.954545,0.707100,0.876898,0.765238,12,8,0,0.000000,0,GROUNDED_ONLY,0.000002,189.836063,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+4,2026-04-26T03:48:10,0.000942,0.865431,0.218756,0.893939,0.893939,0.732300,0.858504,0.764982,11,9,0,0.000000,0,GROUNDED_ONLY,0.000003,182.125475,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+5,2026-04-26T03:59:39,0.000081,0.856875,0.239487,0.884422,0.884422,0.693500,0.918500,0.843100,16,4,0,0.000000,0,GROUNDED_ONLY,0.000003,201.679190,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.919200,0.793300,0.903500,0.918500,0.843100,0.997700,150,0.793333
+6,2026-04-26T04:02:52,-0.000063,0.879253,0.215318,0.909548,0.909548,0.748700,0.884646,0.805897,12,8,0,0.000000,0,GROUNDED_ONLY,0.000004,193.350312,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+7,2026-04-26T04:06:20,0.001071,0.837888,0.223356,0.883249,0.883249,0.639600,0.813073,0.658069,14,6,0,0.000000,0,GROUNDED_ONLY,0.000004,208.223944,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+8,2026-04-26T04:09:11,-0.000257,0.875536,0.200109,0.895000,0.895000,0.690000,0.864722,0.747928,13,7,0,0.000000,0,GROUNDED_ONLY,0.000005,170.595953,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+9,2026-04-26T04:12:52,0.000060,0.906506,0.176914,0.964646,0.964646,0.803000,0.893573,0.817532,15,5,0,0.000000,0,GROUNDED_ONLY,0.000005,221.350669,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+10,2026-04-26T04:24:49,0.000425,0.880765,0.175501,0.954774,0.954774,0.683400,0.920500,0.842600,14,6,0,0.000000,0,GROUNDED_ONLY,0.000005,188.981772,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.919900,0.793300,0.906600,0.920500,0.842600,0.998000,150,0.793333
+11,2026-04-26T04:27:11,-0.000557,0.969814,0.098322,0.985000,0.985000,0.930000,0.966268,0.921810,8,12,0,0.000000,0,GROUNDED_ONLY,0.000005,141.966778,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+12,2026-04-26T04:30:09,0.000073,0.849274,0.212864,0.900000,0.900000,0.650000,0.820526,0.687272,14,6,0,0.000000,0,SELFPLAY_RAMP,0.000005,177.954757,0.000000,0.000000,0.000000,0.000000,0.000000,0,,,,,,,,
+13,2026-04-26T04:39:26,0.000268,0.898824,0.185992,0.930000,0.930000,0.780000,0.870960,0.788730,14,6,0,0.000000,0,SELFPLAY_RAMP,0.000005,556.185637,0.000000,0.000000,0.000000,0.000000,-0.040000,0,,,,,,,,
+14,2026-04-26T04:48:54,0.000496,0.855832,0.208499,0.952381,0.947368,0.673700,0.857607,0.747807,18,3,1,0.036000,0,SELFPLAY_RAMP,0.000005,568.400518,0.763000,1.000000,0.428900,1.000000,0.209000,0,,,,,,,,
+15,2026-04-26T05:06:28,0.000023,0.927972,0.167187,0.937799,0.931217,0.836000,0.924200,0.842400,12,9,1,0.071000,0,SELFPLAY_RAMP,0.000005,550.143772,0.721800,1.000000,0.458000,1.000000,0.079000,0,0.926200,0.800000,0.907200,0.924200,0.842400,1.000000,150,0.800000
+16,2026-04-26T05:16:04,0.000330,0.914605,0.172733,0.949772,0.938547,0.832400,0.895523,0.843899,15,7,2,0.107000,0,SELFPLAY_RAMP,0.000005,575.528946,0.787800,1.000000,0.447500,0.960000,0.089000,0,,,,,,,,
+17,2026-04-26T05:26:20,-0.000137,0.888123,0.195006,0.938326,0.916168,0.700600,0.855796,0.768235,20,3,3,0.143000,0,SELFPLAY_RAMP,0.000005,616.018573,0.798200,1.000000,0.461600,1.000000,-0.191000,0,,,,,,,,
+18,2026-04-26T05:35:30,0.000079,0.866401,0.178010,0.953975,0.943396,0.591200,0.830780,0.692011,19,5,4,0.179000,0,SELFPLAY_RAMP,0.000005,550.572628,0.739400,1.000000,0.452000,0.976200,0.021000,0,,,,,,,,
+19,2026-04-26T05:44:13,0.000151,0.891281,0.172665,0.953586,0.949045,0.764300,0.851398,0.756874,16,8,4,0.214000,0,SELFPLAY_RAMP,0.000005,522.428960,0.733100,1.000000,0.456400,0.972500,0.075000,0,,,,,,,,
+20,2026-04-26T06:02:54,0.000244,0.896291,0.177842,0.927711,0.906040,0.798700,0.925300,0.842800,18,7,5,0.250000,0,SELFPLAY_RAMP,0.000004,619.886349,0.770000,1.000000,0.474100,0.945000,-0.118000,0,0.923400,0.800000,0.905600,0.925300,0.842800,1.000000,150,0.800000
+21,2026-04-26T06:11:04,0.000192,0.841732,0.187981,0.923077,0.914286,0.735700,0.819504,0.693061,21,5,6,0.286000,0,SELFPLAY_RAMP,0.000004,490.366938,0.697200,1.000000,0.449300,0.962500,0.209000,0,,,,,,,,
+22,2026-04-26T06:21:16,0.000579,0.917519,0.124242,0.984314,0.985294,0.904400,0.964735,0.928489,20,6,6,0.321000,0,SELFPLAY_RAMP,0.000004,611.872286,0.699800,1.000000,0.457100,0.979000,0.145000,0,,,,,,,,
+23,2026-04-26T06:28:41,0.000614,0.920698,0.147419,0.977011,0.950820,0.803300,0.907500,0.847631,18,9,7,0.357000,0,SELFPLAY_RAMP,0.000004,444.320885,0.726000,1.000000,0.441200,0.988500,0.143000,0,,,,,,,,
+24,2026-04-26T06:36:32,-0.000213,0.879590,0.173313,0.935714,0.933333,0.791700,0.898819,0.812292,20,8,8,0.393000,0,SELFPLAY_RAMP,0.000004,471.698962,0.662100,1.000000,0.440800,0.968800,0.082000,0,,,,,,,,
+25,2026-04-26T06:53:36,0.000344,0.844528,0.208658,0.927336,0.853211,0.605500,0.919800,0.846800,28,1,9,0.429000,0,SELFPLAY_RAMP,0.000004,524.655717,0.647100,1.000000,0.439400,0.967200,0.127000,0,0.922100,0.793300,0.903400,0.919800,0.846800,1.000000,150,0.793333
+26,2026-04-26T07:02:06,0.000421,0.866649,0.179636,0.920415,0.926606,0.789000,0.889846,0.794302,26,3,9,0.464000,0,SELFPLAY_RAMP,0.000004,509.677450,0.679200,1.000000,0.448800,0.931700,0.065000,0,,,,,,,,
+27,2026-04-26T07:12:03,-0.000227,0.877934,0.162866,0.956376,0.939394,0.686900,0.861628,0.740657,25,5,10,0.500000,0,SELFPLAY_RAMP,0.000004,597.521238,0.683100,1.000000,0.458400,0.975900,0.067000,0,,,,,,,,
+28,2026-04-26T07:22:06,0.000042,0.869600,0.159154,0.941935,0.877778,0.655600,0.833443,0.618623,29,2,11,0.536000,0,SELFPLAY_RAMP,0.000004,603.099793,0.669300,1.000000,0.448900,0.983600,0.047000,0,,,,,,,,
+29,2026-04-26T07:31:46,0.000377,0.867441,0.170826,0.947020,0.892857,0.726200,0.867407,0.760394,28,3,11,0.571000,0,SELFPLAY_RAMP,0.000003,579.690467,0.649600,1.000000,0.442500,0.973900,0.123000,0,,,,,,,,
+30,2026-04-26T07:48:26,-0.000299,0.870581,0.160260,0.965517,0.950000,0.800000,0.923200,0.850000,27,5,12,0.607000,0,SELFPLAY_RAMP,0.000003,503.087982,0.676400,1.000000,0.456600,0.969900,0.099000,0,0.920400,0.793300,0.904400,0.923200,0.850000,1.000000,150,0.793333
diff --git a/logs/metrics.jsonl b/logs/metrics.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..9b727ff60fc4d7ede8ee65bb27808a7910c2b7a3
--- /dev/null
+++ b/logs/metrics.jsonl
@@ -0,0 +1,31 @@
+{"iteration": 0, "accuracy": 0.9162, "combined_score": 0.9162, "step_accuracy": 0.9111, "lccp": 0.8392, "correct_rate": 0.7867, "prm_mean": 0.8988, "prm_final": 0.9275, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 118, "final_answer_accuracy": 0.7866666666666666}
+{"iteration": 1, "loss": 0.0006103356778718686, "mean_reward": 0.914308755129325, "std_reward": 0.1636050993381563, "batch_accuracy": 0.96, "grounded_accuracy": 0.96, "gt_match_rate": 0.78, "step_accuracy": 0.8948611111111111, "lccp": 0.8141111111111111, "n_groups": 12, "skipped_groups": 8, "learning_rate": 1.0625000000000002e-06, "iter_time_s": 127.63799649500288, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 2, "loss": -3.432962815471304e-05, "mean_reward": 0.8478923191518654, "std_reward": 0.2160182166583165, "batch_accuracy": 0.9141414141414141, "grounded_accuracy": 0.9141414141414141, "gt_match_rate": 0.6515, "step_accuracy": 0.8666916416916417, "lccp": 0.7653809153809155, "n_groups": 18, "skipped_groups": 2, "learning_rate": 1.6250000000000001e-06, "iter_time_s": 199.5183933188673, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 3, "loss": 0.0003658987698145211, "mean_reward": 0.8963912433066207, "std_reward": 0.17069859725714537, "batch_accuracy": 0.9545454545454546, "grounded_accuracy": 0.9545454545454546, "gt_match_rate": 0.7071, "step_accuracy": 0.876897947731281, "lccp": 0.765237694404361, "n_groups": 12, "skipped_groups": 8, "learning_rate": 2.1875000000000002e-06, "iter_time_s": 189.83606291818433, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 4, "loss": 0.0009415318305731158, "mean_reward": 0.8654313890820613, "std_reward": 0.21875612713334075, "batch_accuracy": 0.8939393939393939, "grounded_accuracy": 0.8939393939393939, "gt_match_rate": 0.7323, "step_accuracy": 0.8585036876703543, "lccp": 0.7649821628988295, "n_groups": 11, "skipped_groups": 9, "learning_rate": 2.7500000000000004e-06, "iter_time_s": 182.12547484994866, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 5, "loss": 8.118284122815567e-05, "mean_reward": 0.8568747993989829, "std_reward": 0.23948718740823036, "batch_accuracy": 0.8844221105527639, "grounded_accuracy": 0.8844221105527639, "gt_match_rate": 0.6935, "step_accuracy": 0.9185, "lccp": 0.8431, "n_groups": 16, "skipped_groups": 4, "learning_rate": 3.3125000000000005e-06, "iter_time_s": 201.67919013393112, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0, "accuracy": 0.9192, "combined_score": 0.9192, "correct_rate": 0.7933, "prm_mean": 0.9035, "prm_final": 0.9305, "sympy_mean": 0.0, "format_mean": 0.9977, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 6, "loss": -6.271734067316477e-05, "mean_reward": 0.8792530329566163, "std_reward": 0.21531797453446344, "batch_accuracy": 0.9095477386934674, "grounded_accuracy": 0.9095477386934674, "gt_match_rate": 0.7487, "step_accuracy": 0.8846455219822055, "lccp": 0.8058971263242619, "n_groups": 12, "skipped_groups": 8, "learning_rate": 3.875e-06, "iter_time_s": 193.35031225602143, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 7, "loss": 0.0010708057920315436, "mean_reward": 0.8378877251545859, "std_reward": 0.2233563664223874, "batch_accuracy": 0.883248730964467, "grounded_accuracy": 0.883248730964467, "gt_match_rate": 0.6396, "step_accuracy": 0.8130725309659319, "lccp": 0.6580686304671076, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.4375e-06, "iter_time_s": 208.22394350194372, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 8, "loss": -0.0002566667799678376, "mean_reward": 0.8755362041151912, "std_reward": 0.20010863742401203, "batch_accuracy": 0.895, "grounded_accuracy": 0.895, "gt_match_rate": 0.69, "step_accuracy": 0.8647215007215007, "lccp": 0.7479280303030303, "n_groups": 13, "skipped_groups": 7, "learning_rate": 5e-06, "iter_time_s": 170.59595341305248, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 9, "loss": 5.9516330460004004e-05, "mean_reward": 0.906506146327221, "std_reward": 0.1769136401553803, "batch_accuracy": 0.9646464646464646, "grounded_accuracy": 0.9646464646464646, "gt_match_rate": 0.803, "step_accuracy": 0.8935726310726311, "lccp": 0.8175324675324676, "n_groups": 15, "skipped_groups": 5, "learning_rate": 4.995894997002465e-06, "iter_time_s": 221.35066892812029, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 10, "loss": 0.0004252615440886335, "mean_reward": 0.8807654454859567, "std_reward": 0.17550108931309533, "batch_accuracy": 0.9547738693467337, "grounded_accuracy": 0.9547738693467337, "gt_match_rate": 0.6834, "step_accuracy": 0.9205, "lccp": 0.8426, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.983594966720622e-06, "iter_time_s": 188.98177218902856, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0, "accuracy": 0.9199, "combined_score": 0.9199, "correct_rate": 0.7933, "prm_mean": 0.9066, "prm_final": 0.9408, "sympy_mean": 0.0, "format_mean": 0.998, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 11, "loss": -0.0005566358695432427, "mean_reward": 0.9698135460130081, "std_reward": 0.0983216960471261, "batch_accuracy": 0.985, "grounded_accuracy": 0.985, "gt_match_rate": 0.93, "step_accuracy": 0.9662678571428571, "lccp": 0.9218095238095237, "n_groups": 8, "skipped_groups": 12, "learning_rate": 4.963144790631074e-06, "iter_time_s": 141.96677790791728, "training_phase": "GROUNDED_ONLY", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 12, "loss": 7.270745637859883e-05, "mean_reward": 0.8492740230597824, "std_reward": 0.2128636238290247, "batch_accuracy": 0.9, "grounded_accuracy": 0.9, "gt_match_rate": 0.65, "step_accuracy": 0.8205257936507937, "lccp": 0.6872718253968253, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.934619089208618e-06, "iter_time_s": 177.9547567779664, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.0, "extraction_success_rate": 0.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 13, "loss": 0.00026773045517204864, "mean_reward": 0.8988236995312778, "std_reward": 0.18599151493605476, "batch_accuracy": 0.93, "grounded_accuracy": 0.93, "gt_match_rate": 0.78, "step_accuracy": 0.8709603174603174, "lccp": 0.7887301587301587, "n_groups": 14, "skipped_groups": 6, "learning_rate": 4.898121949644228e-06, "iter_time_s": 556.1856374200433, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.0, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.04, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 0, "q_gen_attempts": 0, "q_gen_valid": 0, "q_gen_valid_rate": 0.0, "mean_question_reward": 0.0, "q_quality_rate": 0.0, "q_topic_match": 0.0, "q_difficulty_fit": 0.0, "q_clarity": 0.0, "q_novelty": 0.0, "q_solvability": 0.0}
+{"iteration": 14, "loss": 0.0004961729192069066, "mean_reward": 0.8558324048863098, "std_reward": 0.20849902292009304, "batch_accuracy": 0.9523809523809523, "grounded_accuracy": 0.9473684210526315, "gt_match_rate": 0.6737, "step_accuracy": 0.8576065162907268, "lccp": 0.7478070175438597, "n_groups": 18, "skipped_groups": 3, "learning_rate": 4.853786546042184e-06, "iter_time_s": 568.4005180909298, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.036, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.209, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 1, "q_gen_attempts": 1, "q_gen_valid": 1, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.763, "q_quality_rate": 1.0, "q_topic_match": 0.575, "q_difficulty_fit": 0.89, "q_clarity": 1.0, "q_novelty": 0.4289, "q_solvability": 1.0}
+{"iteration": 15, "loss": 2.3262581635208335e-05, "mean_reward": 0.927972135586315, "std_reward": 0.16718736928397065, "batch_accuracy": 0.937799043062201, "grounded_accuracy": 0.9312169312169312, "gt_match_rate": 0.836, "step_accuracy": 0.9242, "lccp": 0.8424, "n_groups": 12, "skipped_groups": 9, "learning_rate": 4.801774653482204e-06, "iter_time_s": 550.1437717408407, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.071, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.079, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 1, "q_gen_attempts": 1, "q_gen_valid": 1, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7218, "q_quality_rate": 1.0, "q_topic_match": 0.35, "q_difficulty_fit": 0.9511, "q_clarity": 1.0, "q_novelty": 0.458, "q_solvability": 1.0, "accuracy": 0.9262, "combined_score": 0.9262, "correct_rate": 0.8, "prm_mean": 0.9072, "prm_final": 0.9404, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 120, "final_answer_accuracy": 0.8}
+{"iteration": 16, "loss": 0.0003296181123005226, "mean_reward": 0.9146047620088099, "std_reward": 0.17273258044260062, "batch_accuracy": 0.9497716894977168, "grounded_accuracy": 0.9385474860335196, "gt_match_rate": 0.8324, "step_accuracy": 0.8955234709424654, "lccp": 0.8438994897095455, "n_groups": 15, "skipped_groups": 7, "learning_rate": 4.742276057719723e-06, "iter_time_s": 575.5289459908381, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.107, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.089, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 2, "q_gen_attempts": 2, "q_gen_valid": 2, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7878, "q_quality_rate": 1.0, "q_topic_match": 0.875, "q_difficulty_fit": 0.5838, "q_clarity": 1.0, "q_novelty": 0.4475, "q_solvability": 0.96}
+{"iteration": 17, "loss": -0.00013719029248022708, "mean_reward": 0.8881227328092163, "std_reward": 0.1950058307020988, "batch_accuracy": 0.9383259911894273, "grounded_accuracy": 0.9161676646706587, "gt_match_rate": 0.7006, "step_accuracy": 0.8557955517536356, "lccp": 0.7682349586541203, "n_groups": 20, "skipped_groups": 3, "learning_rate": 4.675507862678258e-06, "iter_time_s": 616.0185732548125, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.143, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.191, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 3, "q_gen_attempts": 3, "q_gen_valid": 3, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7982, "q_quality_rate": 1.0, "q_topic_match": 0.69, "q_difficulty_fit": 0.8892, "q_clarity": 1.0, "q_novelty": 0.4616, "q_solvability": 1.0}
+{"iteration": 18, "loss": 7.917114673641903e-05, "mean_reward": 0.8664005137011263, "std_reward": 0.178010205898339, "batch_accuracy": 0.9539748953974896, "grounded_accuracy": 0.9433962264150944, "gt_match_rate": 0.5912, "step_accuracy": 0.830780173704702, "lccp": 0.6920110811620246, "n_groups": 19, "skipped_groups": 5, "learning_rate": 4.601713698260728e-06, "iter_time_s": 550.572628196096, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.179, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.021, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 4, "q_gen_attempts": 4, "q_gen_valid": 4, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7394, "q_quality_rate": 1.0, "q_topic_match": 0.6375, "q_difficulty_fit": 0.6293, "q_clarity": 1.0, "q_novelty": 0.452, "q_solvability": 0.9762}
+{"iteration": 19, "loss": 0.00015087392284840462, "mean_reward": 0.8912812767256229, "std_reward": 0.1726645221785555, "batch_accuracy": 0.9535864978902954, "grounded_accuracy": 0.9490445859872612, "gt_match_rate": 0.7643, "step_accuracy": 0.8513975055376328, "lccp": 0.7568744772566428, "n_groups": 16, "skipped_groups": 8, "learning_rate": 4.521162831370364e-06, "iter_time_s": 522.4289600129705, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.214, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.075, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 4, "q_gen_attempts": 4, "q_gen_valid": 4, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.7331, "q_quality_rate": 1.0, "q_topic_match": 0.4813, "q_difficulty_fit": 0.8466, "q_clarity": 1.0, "q_novelty": 0.4564, "q_solvability": 0.9725}
+{"iteration": 20, "loss": 0.00024373266084391312, "mean_reward": 0.8962914079724992, "std_reward": 0.1778417367801085, "batch_accuracy": 0.927710843373494, "grounded_accuracy": 0.9060402684563759, "gt_match_rate": 0.7987, "step_accuracy": 0.9253, "lccp": 0.8428, "n_groups": 18, "skipped_groups": 7, "learning_rate": 4.434149183384978e-06, "iter_time_s": 619.8863487117924, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.25, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": -0.118, "extraction_success_rate": 0.96, "chain_scoring_active": 0, "n_self_play_groups": 5, "q_gen_attempts": 5, "q_gen_valid": 5, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.77, "q_quality_rate": 1.0, "q_topic_match": 0.723, "q_difficulty_fit": 0.703, "q_clarity": 1.0, "q_novelty": 0.4741, "q_solvability": 0.945, "accuracy": 0.9234, "combined_score": 0.9234, "correct_rate": 0.8, "prm_mean": 0.9056, "prm_final": 0.9353, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 120, "final_answer_accuracy": 0.8}
+{"iteration": 21, "loss": 0.0001916794737033862, "mean_reward": 0.8417323480901788, "std_reward": 0.1879809468583581, "batch_accuracy": 0.9230769230769231, "grounded_accuracy": 0.9142857142857143, "gt_match_rate": 0.7357, "step_accuracy": 0.8195039682539682, "lccp": 0.6930612244897959, "n_groups": 21, "skipped_groups": 5, "learning_rate": 4.340990257669732e-06, "iter_time_s": 490.36693838005885, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.286, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.209, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 6, "q_gen_attempts": 6, "q_gen_valid": 6, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6972, "q_quality_rate": 1.0, "q_topic_match": 0.5742, "q_difficulty_fit": 0.4754, "q_clarity": 1.0, "q_novelty": 0.4493, "q_solvability": 0.9625}
+{"iteration": 22, "loss": 0.000578732604299148, "mean_reward": 0.9175190043251262, "std_reward": 0.12424225720214971, "batch_accuracy": 0.984313725490196, "grounded_accuracy": 0.9852941176470589, "gt_match_rate": 0.9044, "step_accuracy": 0.9647345301757068, "lccp": 0.9284886681945506, "n_groups": 20, "skipped_groups": 6, "learning_rate": 4.2420259810417895e-06, "iter_time_s": 611.8722857821267, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.321, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.145, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 6, "q_gen_attempts": 6, "q_gen_valid": 6, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6998, "q_quality_rate": 1.0, "q_topic_match": 0.6189, "q_difficulty_fit": 0.3856, "q_clarity": 1.0, "q_novelty": 0.4571, "q_solvability": 0.979}
+{"iteration": 23, "loss": 0.0006137362383419208, "mean_reward": 0.9206978778568132, "std_reward": 0.14741914089456262, "batch_accuracy": 0.9770114942528736, "grounded_accuracy": 0.9508196721311475, "gt_match_rate": 0.8033, "step_accuracy": 0.9075003548364204, "lccp": 0.847631466893762, "n_groups": 18, "skipped_groups": 9, "learning_rate": 4.137617463414222e-06, "iter_time_s": 444.32088500098325, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.357, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.143, "extraction_success_rate": 1.0, "chain_scoring_active": 0, "n_self_play_groups": 7, "q_gen_attempts": 7, "q_gen_valid": 7, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.726, "q_quality_rate": 1.0, "q_topic_match": 0.5621, "q_difficulty_fit": 0.6634, "q_clarity": 1.0, "q_novelty": 0.4412, "q_solvability": 0.9885}
+{"iteration": 24, "loss": -0.00021296025724950595, "mean_reward": 0.8795895609748888, "std_reward": 0.1733128827089799, "batch_accuracy": 0.9357142857142857, "grounded_accuracy": 0.9333333333333333, "gt_match_rate": 0.7917, "step_accuracy": 0.8988194444444446, "lccp": 0.8122916666666666, "n_groups": 20, "skipped_groups": 8, "learning_rate": 4.0281456801451e-06, "iter_time_s": 471.6989622868132, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.393, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.082, "extraction_success_rate": 0.98, "chain_scoring_active": 0, "n_self_play_groups": 8, "q_gen_attempts": 8, "q_gen_valid": 8, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6621, "q_quality_rate": 1.0, "q_topic_match": 0.5344, "q_difficulty_fit": 0.3108, "q_clarity": 1.0, "q_novelty": 0.4408, "q_solvability": 0.9688}
+{"iteration": 25, "loss": 0.0003441530472758002, "mean_reward": 0.8445275205076134, "std_reward": 0.20865777545087066, "batch_accuracy": 0.9273356401384083, "grounded_accuracy": 0.8532110091743119, "gt_match_rate": 0.6055, "step_accuracy": 0.9198, "lccp": 0.8468, "n_groups": 28, "skipped_groups": 1, "learning_rate": 3.9140100818997275e-06, "iter_time_s": 524.655717118876, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.429, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.127, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 9, "q_gen_attempts": 9, "q_gen_valid": 9, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6471, "q_quality_rate": 1.0, "q_topic_match": 0.505, "q_difficulty_fit": 0.2634, "q_clarity": 1.0, "q_novelty": 0.4394, "q_solvability": 0.9672, "accuracy": 0.9221, "combined_score": 0.9221, "correct_rate": 0.7933, "prm_mean": 0.9034, "prm_final": 0.9329, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
+{"iteration": 26, "loss": 0.0004209962865808428, "mean_reward": 0.8666489827432893, "std_reward": 0.1796360842988206, "batch_accuracy": 0.9204152249134948, "grounded_accuracy": 0.926605504587156, "gt_match_rate": 0.789, "step_accuracy": 0.8898463666812292, "lccp": 0.7943024610455803, "n_groups": 26, "skipped_groups": 3, "learning_rate": 3.795627137098479e-06, "iter_time_s": 509.6774504878558, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.464, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.065, "extraction_success_rate": 0.94, "chain_scoring_active": 0, "n_self_play_groups": 9, "q_gen_attempts": 9, "q_gen_valid": 9, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6792, "q_quality_rate": 1.0, "q_topic_match": 0.6639, "q_difficulty_fit": 0.2476, "q_clarity": 1.0, "q_novelty": 0.4488, "q_solvability": 0.9317}
+{"iteration": 27, "loss": -0.00022697661013808103, "mean_reward": 0.877933982604161, "std_reward": 0.1628662024521015, "batch_accuracy": 0.9563758389261745, "grounded_accuracy": 0.9393939393939394, "gt_match_rate": 0.6869, "step_accuracy": 0.8616281866281865, "lccp": 0.7406565656565657, "n_groups": 25, "skipped_groups": 5, "learning_rate": 3.673428812268702e-06, "iter_time_s": 597.5212381640449, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.5, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.067, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 10, "q_gen_attempts": 10, "q_gen_valid": 10, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6831, "q_quality_rate": 1.0, "q_topic_match": 0.5699, "q_difficulty_fit": 0.3583, "q_clarity": 1.0, "q_novelty": 0.4584, "q_solvability": 0.9759}
+{"iteration": 28, "loss": 4.199455770111822e-05, "mean_reward": 0.8695997487614422, "std_reward": 0.15915376074701193, "batch_accuracy": 0.9419354838709677, "grounded_accuracy": 0.8777777777777778, "gt_match_rate": 0.6556, "step_accuracy": 0.8334434828062279, "lccp": 0.6186230200445887, "n_groups": 29, "skipped_groups": 2, "learning_rate": 3.5478609958457035e-06, "iter_time_s": 603.0997926741838, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.536, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.047, "extraction_success_rate": 0.8, "chain_scoring_active": 0, "n_self_play_groups": 11, "q_gen_attempts": 11, "q_gen_valid": 11, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6693, "q_quality_rate": 1.0, "q_topic_match": 0.5931, "q_difficulty_fit": 0.23, "q_clarity": 1.0, "q_novelty": 0.4489, "q_solvability": 0.9836}
+{"iteration": 29, "loss": 0.0003765096731578004, "mean_reward": 0.8674408392873937, "std_reward": 0.17082623284979875, "batch_accuracy": 0.9470198675496688, "grounded_accuracy": 0.8928571428571429, "gt_match_rate": 0.7262, "step_accuracy": 0.8674065194639727, "lccp": 0.7603936306964257, "n_groups": 28, "skipped_groups": 3, "learning_rate": 3.419381871174205e-06, "iter_time_s": 579.6904674370307, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.571, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.123, "extraction_success_rate": 0.84, "chain_scoring_active": 0, "n_self_play_groups": 11, "q_gen_attempts": 11, "q_gen_valid": 11, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6496, "q_quality_rate": 1.0, "q_topic_match": 0.5636, "q_difficulty_fit": 0.1695, "q_clarity": 1.0, "q_novelty": 0.4425, "q_solvability": 0.9739}
+{"iteration": 30, "loss": -0.00029927124827130075, "mean_reward": 0.8705812118012987, "std_reward": 0.16025951815561293, "batch_accuracy": 0.9655172413793104, "grounded_accuracy": 0.95, "gt_match_rate": 0.8, "step_accuracy": 0.9232, "lccp": 0.85, "n_groups": 27, "skipped_groups": 5, "learning_rate": 3.2884602446470037e-06, "iter_time_s": 503.08798154001124, "training_phase": "SELFPLAY_RAMP", "effective_sp_ratio": 0.607, "selfplay_suspended": 0, "chain_arith_score": null, "chain_dep_score": null, "chain_integrity_score": null, "sp_chain_integrity_score": null, "chain_prm_correlation": 0.099, "extraction_success_rate": 0.92, "chain_scoring_active": 0, "n_self_play_groups": 12, "q_gen_attempts": 12, "q_gen_valid": 12, "q_gen_valid_rate": 1.0, "mean_question_reward": 0.6764, "q_quality_rate": 1.0, "q_topic_match": 0.6752, "q_difficulty_fit": 0.1485, "q_clarity": 1.0, "q_novelty": 0.4566, "q_solvability": 0.9699, "accuracy": 0.9204, "combined_score": 0.9204, "correct_rate": 0.7933, "prm_mean": 0.9044, "prm_final": 0.9289, "sympy_mean": 0.0, "format_mean": 1.0, "n_scored": 150, "total": 150, "final_answer_correct": 119, "final_answer_accuracy": 0.7933333333333333}
\ No newline at end of file
diff --git a/models.py b/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb61bc48a9aa91e1103cf57e434c8ba2aeec7f2e
--- /dev/null
+++ b/models.py
@@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Data models for the AxiomForgeAI math RL environment.
+
+The AxiomForgeAI environment presents math questions drawn from an adaptive
+curriculum; external agents submit step-by-step solutions and receive scored
+observations. The environment integrates with the GRPO training pipeline
+defined in scripts/run_grpo_training.py.
+"""
+
+from openenv.core.env_server.types import Action, Observation
+from pydantic import Field
+
+
+class AxiomforgeaiAction(Action):
+ """Action for the AxiomForgeAI math environment.
+
+ The agent submits a step-by-step solution to the current question.
+ Solutions should follow the format::
+
+ Step 1:
+ Step 2:
+ ...
+ Final Answer:
+ """
+
+ solution: str = Field(
+ default="",
+ description=(
+ "Step-by-step solution to the current math question. "
+ "Use 'Step N: ...' lines and end with 'Final Answer: '."
+ ),
+ )
+
+
+class AxiomforgeaiObservation(Observation):
+ """Observation from the AxiomForgeAI math environment.
+
+ On reset the question is populated and reward/feedback are empty.
+ After a step the reward and feedback reflect the quality of the submitted
+ solution; done=True signals the end of the single-step episode.
+ """
+
+ question: str = Field(
+ default="",
+ description="Math question the agent must solve.",
+ )
+ topic: str = Field(
+ default="",
+ description="Mathematical topic of the question (e.g. 'algebra', 'geometry').",
+ )
+ difficulty: float = Field(
+ default=0.5,
+ description="Estimated difficulty of the question in [0, 1].",
+ )
+ feedback: str = Field(
+ default="",
+ description=(
+ "Human-readable feedback on the submitted solution "
+ "(empty on reset, populated after step)."
+ ),
+ )
diff --git a/openenv.yaml b/openenv.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..50d7a49293e04d02e4ce7bdc9e869d0a4af39ab6
--- /dev/null
+++ b/openenv.yaml
@@ -0,0 +1,7 @@
+spec_version: 1
+name: AxiomForgeAI
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000
+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..747311bfce07ae224621bff8f494e8e37b1919e7
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "openenv-AxiomForgeAI"
+version = "0.1.0"
+description = "Axiomforgeai environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+ # install from github
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+ "openenv-core[core]>=0.2.2",
+ # Environment-specific dependencies
+ # Add all dependencies needed for your environment here
+ # Examples:
+ # "numpy>=1.19.0",
+ # "torch>=2.0.0",
+ # "gymnasium>=0.29.0",
+ # "openspiel>=1.0.0",
+ # "smolagents>=1.22.0,<2",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=8.0.0",
+ "pytest-cov>=4.0.0",
+]
+
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m AxiomForgeAI.server.app
+server = "AxiomForgeAI.server.app:main"
+
+[tool.setuptools]
+include-package-data = true
+packages = [
+ "AxiomForgeAI",
+ "AxiomForgeAI.server",
+ "src",
+ "src.config",
+ "src.rl",
+ "src.sft",
+ "src.utils",
+ "src.self_play",
+ "scripts",
+]
+package-dir = { "AxiomForgeAI" = ".", "AxiomForgeAI.server" = "server" }
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..52a1a149b31f6f0e70860248b61621a510d2cb6d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,160 @@
+accelerate==1.2.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.5
+aiohttp-cors==0.8.1
+aiosignal==1.4.0
+airportsdata==20260315
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+astor==0.8.1
+attrs==26.1.0
+bitsandbytes==0.44.1
+blake3==1.0.8
+certifi==2026.4.22
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.2
+cloudpickle==3.1.2
+colorful==0.5.8
+compressed-tensors==0.9.0
+cryptography==46.0.7
+datasets==3.2.0
+depyf==0.18.0
+dill==0.3.8
+diskcache==5.6.3
+distlib==0.4.0
+distro==1.9.0
+einops==0.8.2
+fastapi==0.136.0
+filelock==3.29.0
+frozenlist==1.8.0
+fsspec==2024.9.0
+gguf==0.10.0
+google-api-core==2.30.3
+google-auth==2.49.2
+googleapis-common-protos==1.74.0
+grpcio==1.80.0
+h11==0.16.0
+hf-xet==1.4.3
+hjson==3.1.0
+httpcore==1.0.9
+httptools==0.7.1
+httpx==0.28.1
+huggingface-hub==0.36.2
+idna==3.12
+importlib-metadata==9.0.0
+interegular==0.3.3
+jinja2==3.1.6
+jiter==0.14.0
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+lark==1.2.2
+linkify-it-py==2.1.0
+lm-format-enforcer==0.10.12
+markdown-it-py==4.0.0
+markupsafe==3.0.3
+mdit-py-plugins==0.5.0
+mdurl==0.1.2
+memray==1.19.3
+mistral-common==1.11.0
+mpmath==1.3.0
+msgpack==1.1.2
+msgspec==0.21.1
+multidict==6.7.1
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+networkx==3.6.1
+ninja==1.13.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-ml-py==13.595.45
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==2.32.0
+opencensus==0.11.4
+opencensus-context==0.1.3
+opencv-python-headless==4.11.0.86
+outlines==0.1.11
+outlines-core==0.1.26
+packaging==26.1
+pandas==3.0.2
+partial-json-parser==0.2.1.1.post7
+peft==0.19.1
+pillow==12.2.0
+platformdirs==4.9.6
+prometheus-client==0.25.0
+prometheus-fastapi-instrumentator==7.1.0
+propcache==0.4.1
+proto-plus==1.27.2
+protobuf==7.34.1
+psutil==7.2.2
+py-cpuinfo==9.0.0
+py-spy==0.4.1
+pyarrow==24.0.0
+pyasn1==0.6.3
+pyasn1-modules==0.4.2
+pycountry==26.2.16
+pycparser==3.0
+pydantic==2.13.3
+pydantic-core==2.46.3
+pydantic-extra-types==2.11.1
+pygments==2.20.0
+python-dateutil==2.9.0.post0
+python-discovery==1.2.2
+python-dotenv==1.2.2
+pyyaml==6.0.3
+pyzmq==27.1.0
+ray==2.39.0
+referencing==0.37.0
+regex==2026.4.4
+requests==2.33.1
+rich==15.0.0
+rpds-py==0.30.0
+safetensors==0.7.0
+scipy>=1.14.0
+sentencepiece==0.2.1
+setuptools==82.0.1
+six==1.17.0
+smart-open==7.6.0
+sniffio==1.3.1
+starlette==0.52.1
+sympy==1.13.1
+textual==8.2.4
+tiktoken==0.12.0
+tokenizers==0.20.3
+torch==2.5.1
+torchaudio==2.5.1
+torchvision==0.20.1
+tqdm==4.67.3
+transformers==4.46.3
+triton==3.1.0
+trl==0.12.1
+typing-extensions==4.15.0
+typing-inspection==0.4.2
+uc-micro-py==2.0.0
+urllib3==2.6.3
+uvicorn==0.45.0
+uvloop==0.22.1
+virtualenv==21.2.4
+vllm==0.7.0
+watchfiles==1.1.1
+websockets==16.0
+wrapt==2.1.2
+xformers==0.0.28.post3
+xgrammar==0.1.33
+xxhash==3.6.0
+yarl==1.23.0
+zipp==3.23.1
+matplotlib==3.10.9
+flash-attn==2.8.3
+gradio>=4.44.0
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..774c17c653a8aa84262a5a04d1ff910bd226e2a7
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1 @@
+"""Training and evaluation scripts for math reasoning models."""
diff --git a/scripts/convert_gsm8k_to_sft.py b/scripts/convert_gsm8k_to_sft.py
new file mode 100644
index 0000000000000000000000000000000000000000..28804b81a5a1551cc5ea79dbdca1b94efc61cec8
--- /dev/null
+++ b/scripts/convert_gsm8k_to_sft.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+"""
+Convert OpenAI GSM8K to SFT JSONL aligned with MathAgent solver format:
+
+ Step 1: ...
+ Step 2: ...
+ ...
+ Final Answer:
+
+Each record uses a chat messages list for Qwen-style fine-tuning.
+
+Usage
+-----
+ # From Hugging Face (default; same data as in test.ipynb)
+ python scripts/convert_gsm8k_to_sft.py \\
+ --output data/sft/gsm8k_sft.jsonl \\
+ --splits train test
+
+ # From a saved JSONL with columns \"question\" and \"answer\" (GSM8K schema)
+ python scripts/convert_gsm8k_to_sft.py \\
+ --source jsonl \\
+ --input path/to/file.jsonl \\
+ --output data/sft/gsm8k_sft.jsonl
+
+Requires: pip install datasets (and datasets will pull pyarrow as needed)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Any, Iterator
+
+# Keep in sync with src.agent.math_agent.SOLVER_SYSTEM_PROMPT
+SOLVER_SYSTEM_PROMPT = (
+ "You are a step-by-step math solver. "
+ "Solve the given problem one step at a time. "
+ "Each step must be on its own line, starting with 'Step N:'. "
+ "End with a line starting with 'Final Answer:'. "
+ "Write every mathematical expression in Python/SymPy syntax "
+ "so it can be verified programmatically."
+)
+
+USER_WRAPPER = (
+ "Solve the following problem. Show your reasoning as numbered steps, "
+ "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+
+
+def parse_gsm8k_answer(raw_answer: str) -> tuple[str, str]:
+ """
+ Split GSM8K 'answer' field into reasoning text and final integer string.
+
+ GSM8K ends solutions with a line like: #### 42
+ """
+ text = raw_answer.strip()
+ parts = re.split(r"\s*####\s*", text, maxsplit=1)
+ reasoning = parts[0].strip()
+ final = parts[1].strip() if len(parts) > 1 else ""
+ # Normalize final (sometimes extra whitespace or commas)
+ final = re.sub(r"[,\s]+", "", final)
+ final_match = re.search(r"-?\d+", final)
+ final_clean = final_match.group(0) if final_match else final
+ return reasoning, final_clean
+
+
+def reasoning_to_step_lines(reasoning: str) -> list[str]:
+ """Turn reasoning into non-empty lines; each line becomes one Step N:."""
+ lines: list[str] = []
+ for raw in reasoning.splitlines():
+ line = raw.strip()
+ if line:
+ lines.append(line)
+ if not lines:
+ # Rare: single blob without newlines — split on sentence boundaries lightly
+ blob = reasoning.strip()
+ if blob:
+ chunks = re.split(r"(?<=[.!?])\s+", blob)
+ lines = [c.strip() for c in chunks if c.strip()]
+ return lines
+
+
+def build_assistant_content(reasoning: str, final_answer: str) -> str:
+ lines = reasoning_to_step_lines(reasoning)
+ out_parts: list[str] = []
+ for i, line in enumerate(lines, start=1):
+ # Prefer SymPy-friendly numerics: ** not ^, ascii-friendly
+ cleaned = line.replace("^", "**")
+ out_parts.append(f"Step {i}: {cleaned}")
+ body = "\n".join(out_parts)
+ if final_answer:
+ body = f"{body}\nFinal Answer: {final_answer}" if body else f"Final Answer: {final_answer}"
+ return body
+
+
+def row_to_record(
+ question: str,
+ answer: str,
+ example_id: str,
+ split: str,
+) -> dict[str, Any] | None:
+ reasoning, final_answer = parse_gsm8k_answer(answer)
+ if not final_answer and "####" not in answer:
+ return None
+ assistant = build_assistant_content(reasoning, final_answer)
+ if not assistant.strip():
+ return None
+
+ user_content = USER_WRAPPER.format(question=question.strip())
+
+ return {
+ "id": f"gsm8k_{example_id}",
+ "skill_id": "gsm8k_grade_school",
+ "source": "openai/gsm8k",
+ "split": split,
+ "messages": [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": user_content},
+ {"role": "assistant", "content": assistant},
+ ],
+ # Convenience for non-chat trainers
+ "text": f"<|system|>\n{SOLVER_SYSTEM_PROMPT}\n<|user|>\n{user_content}\n<|assistant|>\n{assistant}",
+ }
+
+
+def iter_hf_rows(dataset_name: str, config: str, splits: list[str]) -> Iterator[tuple[str, str, dict]]:
+ from datasets import load_dataset
+
+ ds = load_dataset(dataset_name, config)
+ for split in splits:
+ if split not in ds:
+ raise KeyError(f"Split {split!r} not in dataset. Available: {list(ds.keys())}")
+ for i, row in enumerate(ds[split]):
+ yield f"{split}_{i}", split, row
+
+
+def main() -> None:
+ p = argparse.ArgumentParser(description="Convert GSM8K to SFT JSONL (chat messages).")
+ p.add_argument(
+ "--source",
+ choices=("hf", "jsonl"),
+ default="hf",
+ help="Load from Hugging Face dataset or a local JSONL file.",
+ )
+ p.add_argument("--dataset", default="openai/gsm8k", help="HF dataset id when --source hf.")
+ p.add_argument("--config", default="main", help="HF config name when --source hf.")
+ p.add_argument("--splits", nargs="+", default=["train", "test"], help="HF splits to export.")
+ p.add_argument("--input", type=Path, help="Local JSONL path when --source jsonl.")
+ p.add_argument(
+ "--output",
+ type=Path,
+ default=Path("data/sft/gsm8k_sft.jsonl"),
+ help="Output JSONL path.",
+ )
+ args = p.parse_args()
+
+ if args.source == "jsonl" and not args.input:
+ raise SystemExit("--input is required when --source jsonl")
+
+ args.output.parent.mkdir(parents=True, exist_ok=True)
+
+ n_ok, n_skip = 0, 0
+
+ def process(example_id: str, split: str, row: dict) -> None:
+ nonlocal n_ok, n_skip
+ q = row.get("question", "")
+ a = row.get("answer", "")
+ rec = row_to_record(q, a, example_id, split)
+ if rec is None:
+ n_skip += 1
+ return
+ out_f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+ n_ok += 1
+
+ with args.output.open("w", encoding="utf-8") as out_f:
+ if args.source == "hf":
+ for example_id, split, row in iter_hf_rows(args.dataset, args.config, args.splits):
+ process(example_id, split, row)
+ else:
+ for i, line in enumerate(args.input.open(encoding="utf-8")):
+ line = line.strip()
+ if not line:
+ continue
+ row = json.loads(line)
+ process(str(i), "jsonl", row)
+
+ print(f"Wrote {n_ok} examples to {args.output} ({n_skip} skipped).")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/create_dual_task_dataset.py b/scripts/create_dual_task_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..32e93ff68eba4170fd75859ed69fc706c911cbac
--- /dev/null
+++ b/scripts/create_dual_task_dataset.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+"""
+Create dual-task training dataset by mixing question-generation and solution-generation examples.
+
+This script:
+1. Loads existing solution data (GSM8K format)
+2. Loads question-generation data (synthetic)
+3. Adds task prefixes to distinguish tasks
+4. Mixes datasets according to specified ratio
+5. Shuffles and splits into train/validation
+
+Usage:
+ python scripts/create_dual_task_dataset.py \
+ --solution-data data/sft/gsm8k_sft.jsonl \
+ --question-data data/sft/question_generation.jsonl \
+ --output-train data/sft/dual_task_train.jsonl \
+ --output-val data/sft/dual_task_val.jsonl \
+ --mix-ratio 0.8 \
+ --val-split 0.1
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import sys
+from pathlib import Path
+from typing import Any
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from src.config.prompts import SOLVE_TASK_PREFIX, GENERATE_TASK_PREFIX
+
+
+def load_jsonl(path: Path) -> list[dict[str, Any]]:
+ """Load JSONL file into list of records."""
+ records = []
+ with path.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ records.append(json.loads(line))
+ return records
+
+
+def add_solve_prefix(record: dict[str, Any]) -> dict[str, Any]:
+ """
+ Add 'Solve Problem' task prefix to user message.
+
+ This signals the model to generate a step-by-step solution.
+ """
+ modified = record.copy()
+ modified["messages"] = []
+
+ for msg in record["messages"]:
+ new_msg = msg.copy()
+ if msg["role"] == "user":
+ # Add task prefix to user content
+ content = msg["content"]
+ if not content.startswith(SOLVE_TASK_PREFIX):
+ new_msg["content"] = SOLVE_TASK_PREFIX + content
+ modified["messages"].append(new_msg)
+
+ # Update text field if present
+ if "text" in modified:
+ # Find and update user section
+ text = modified["text"]
+ if "<|user|>" in text:
+ parts = text.split("<|user|>")
+ if len(parts) > 1:
+ user_part = parts[1]
+ if not user_part.strip().startswith(SOLVE_TASK_PREFIX):
+ parts[1] = f"\n{SOLVE_TASK_PREFIX}" + user_part
+ modified["text"] = "<|user|>".join(parts)
+
+ # Mark as solve task
+ modified["task_type"] = "solve"
+
+ return modified
+
+
+def verify_question_prefix(record: dict[str, Any]) -> dict[str, Any]:
+ """
+ Verify question generation record has proper prefix.
+
+ Should already have it from generation script, but double-check.
+ """
+ modified = record.copy()
+ modified["messages"] = []
+
+ for msg in record["messages"]:
+ new_msg = msg.copy()
+ if msg["role"] == "user":
+ content = msg["content"]
+ if not content.startswith(GENERATE_TASK_PREFIX):
+ new_msg["content"] = GENERATE_TASK_PREFIX + content
+ modified["messages"].append(new_msg)
+
+ # Update text field if present
+ if "text" in modified:
+ text = modified["text"]
+ if "<|user|>" in text:
+ parts = text.split("<|user|>")
+ if len(parts) > 1:
+ user_part = parts[1]
+ if not user_part.strip().startswith(GENERATE_TASK_PREFIX):
+ parts[1] = f"\n{GENERATE_TASK_PREFIX}" + user_part
+ modified["text"] = "<|user|>".join(parts)
+
+ # Mark as question generation task
+ modified["task_type"] = "generate"
+
+ return modified
+
+
+def sample_with_ratio(
+ solution_records: list[dict[str, Any]],
+ question_records: list[dict[str, Any]],
+ mix_ratio: float,
+ target_total: int | None = None,
+) -> list[dict[str, Any]]:
+ """
+ Sample and mix datasets according to specified ratio.
+
+ Args:
+ solution_records: Solution examples
+ question_records: Question generation examples
+ mix_ratio: Fraction of solutions in final dataset (0.8 = 80% solutions, 20% questions)
+ target_total: Target total examples (None = use all available data)
+
+ Returns:
+ Mixed dataset
+ """
+ n_solutions = len(solution_records)
+ n_questions = len(question_records)
+
+ if target_total is None:
+ # Use all available data
+ target_total = n_solutions + n_questions
+
+ # Calculate target counts
+ n_sol_target = int(target_total * mix_ratio)
+ n_q_target = target_total - n_sol_target
+
+ # Check availability
+ if n_sol_target > n_solutions:
+ print(f"Warning: Requested {n_sol_target} solutions but only {n_solutions} available.")
+ n_sol_target = n_solutions
+
+ if n_q_target > n_questions:
+ print(f"Warning: Requested {n_q_target} questions but only {n_questions} available.")
+ n_q_target = n_questions
+
+ # Sample
+ selected_solutions = random.sample(solution_records, n_sol_target)
+ selected_questions = random.sample(question_records, n_q_target)
+
+ print(f"Sampled {n_sol_target} solutions and {n_q_target} questions")
+ print(f"Actual ratio: {n_sol_target/(n_sol_target+n_q_target):.2%} solutions, "
+ f"{n_q_target/(n_sol_target+n_q_target):.2%} questions")
+
+ return selected_solutions + selected_questions
+
+
+def write_jsonl(records: list[dict[str, Any]], path: Path) -> None:
+ """Write records to JSONL file."""
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with path.open("w", encoding="utf-8") as f:
+ for record in records:
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Create dual-task training dataset from solution and question-generation examples."
+ )
+ parser.add_argument(
+ "--solution-data",
+ type=Path,
+ required=True,
+ help="Path to solution training data (GSM8K format)",
+ )
+ parser.add_argument(
+ "--question-data",
+ type=Path,
+ required=True,
+ help="Path to question-generation training data",
+ )
+ parser.add_argument(
+ "--output-train",
+ type=Path,
+ required=True,
+ help="Output path for training split",
+ )
+ parser.add_argument(
+ "--output-val",
+ type=Path,
+ required=True,
+ help="Output path for validation split",
+ )
+ parser.add_argument(
+ "--mix-ratio",
+ type=float,
+ default=0.8,
+ help="Fraction of solutions in mixed dataset (default: 0.8 = 80%% solutions)",
+ )
+ parser.add_argument(
+ "--val-split",
+ type=float,
+ default=0.1,
+ help="Fraction of data to use for validation (default: 0.1 = 10%%)",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=42,
+ help="Random seed for reproducibility",
+ )
+ parser.add_argument(
+ "--max-total",
+ type=int,
+ default=None,
+ help="Maximum total examples to include (None = use all available)",
+ )
+ args = parser.parse_args()
+
+ # Validate inputs
+ if not args.solution_data.exists():
+ raise SystemExit(f"Error: Solution data not found at {args.solution_data}")
+ if not args.question_data.exists():
+ raise SystemExit(f"Error: Question data not found at {args.question_data}")
+
+ if not (0 < args.mix_ratio < 1):
+ raise SystemExit("Error: --mix-ratio must be between 0 and 1")
+ if not (0 < args.val_split < 1):
+ raise SystemExit("Error: --val-split must be between 0 and 1")
+
+ # Set random seed
+ random.seed(args.seed)
+
+ print("=" * 60)
+ print("Dual-Task Dataset Creation")
+ print("=" * 60)
+
+ # Load data
+ print("\n1. Loading data...")
+ print(f" Solution data: {args.solution_data}")
+ solution_records = load_jsonl(args.solution_data)
+ print(f" Loaded {len(solution_records)} solution examples")
+
+ print(f" Question data: {args.question_data}")
+ question_records = load_jsonl(args.question_data)
+ print(f" Loaded {len(question_records)} question-generation examples")
+
+ # Add task prefixes
+ print("\n2. Adding task prefixes...")
+ print(" Adding 'Solve Problem' prefix to solution examples...")
+ solution_records = [add_solve_prefix(r) for r in solution_records]
+
+ print(" Verifying 'Generate Question' prefix on question examples...")
+ question_records = [verify_question_prefix(r) for r in question_records]
+
+ # Mix datasets
+ print(f"\n3. Mixing datasets (ratio: {args.mix_ratio:.0%} solutions, {1-args.mix_ratio:.0%} questions)...")
+ mixed_records = sample_with_ratio(
+ solution_records=solution_records,
+ question_records=question_records,
+ mix_ratio=args.mix_ratio,
+ target_total=args.max_total,
+ )
+
+ # Shuffle
+ print(f"\n4. Shuffling {len(mixed_records)} total examples...")
+ random.shuffle(mixed_records)
+
+ # Split train/val
+ n_val = int(len(mixed_records) * args.val_split)
+ n_train = len(mixed_records) - n_val
+
+ train_records = mixed_records[:n_train]
+ val_records = mixed_records[n_train:]
+
+ print(f"\n5. Splitting data:")
+ print(f" Training: {len(train_records)} examples ({len(train_records)/len(mixed_records):.1%})")
+ print(f" Validation: {len(val_records)} examples ({len(val_records)/len(mixed_records):.1%})")
+
+ # Verify split composition
+ train_solve = sum(1 for r in train_records if r.get("task_type") == "solve")
+ train_gen = sum(1 for r in train_records if r.get("task_type") == "generate")
+ val_solve = sum(1 for r in val_records if r.get("task_type") == "solve")
+ val_gen = sum(1 for r in val_records if r.get("task_type") == "generate")
+
+ print(f"\n Train composition:")
+ print(f" Solve: {train_solve} ({train_solve/len(train_records):.1%})")
+ print(f" Generate: {train_gen} ({train_gen/len(train_records):.1%})")
+
+ print(f" Val composition:")
+ print(f" Solve: {val_solve} ({val_solve/len(val_records):.1%})")
+ print(f" Generate: {val_gen} ({val_gen/len(val_records):.1%})")
+
+ # Write outputs
+ print(f"\n6. Writing output files...")
+ print(f" Training data: {args.output_train}")
+ write_jsonl(train_records, args.output_train)
+
+ print(f" Validation data: {args.output_val}")
+ write_jsonl(val_records, args.output_val)
+
+ print("\n" + "=" * 60)
+ print("Dual-task dataset creation complete!")
+ print("=" * 60)
+ print(f"\nOutput files:")
+ print(f" Train: {args.output_train} ({len(train_records)} examples)")
+ print(f" Val: {args.output_val} ({len(val_records)} examples)")
+ print(f"\nNext step: Train dual-task model using these files")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/demo_before_after.py b/scripts/demo_before_after.py
new file mode 100644
index 0000000000000000000000000000000000000000..a546f53244646b9fa391640a7549cf36eb064f19
--- /dev/null
+++ b/scripts/demo_before_after.py
@@ -0,0 +1,591 @@
+"""Before / after demo — baseline vs GRPO-trained policy.
+
+Designed for hackathon judges: loads both models, runs greedy evaluation on
+a fixed problem set, and prints a clean side-by-side comparison with full
+solution text for the most interesting examples.
+
+Features
+--------
+* Handles all checkpoint types: HF model IDs, GRPO full-weight saves,
+ PEFT/LoRA adapter directories.
+* Automatically loads the chat template from the base model when the
+ checkpoint tokenizer doesn't have one (fixes the 0% accuracy bug that
+ silently swallows TemplateErrors).
+* Reads ``metrics.jsonl`` (if present) and prints the full accuracy curve,
+ showing judges the training progression at a glance.
+* Saves machine-readable JSON (for grading scripts) and prints a human-
+ readable Markdown table.
+* Shows full solution text for the best wins and worst regressions.
+
+Quick-start
+-----------
+After a GRPO run, point at ``best_policy/``::
+
+ python scripts/demo_before_after.py \\
+ --baseline-model checkpoints/dual_task_v1 \\
+ --trained-model checkpoints/grpo//best_policy \\
+ --problems data/sft/gsm8k_sft.jsonl \\
+ --max-samples 100
+
+Include the training curve::
+
+ python scripts/demo_before_after.py \\
+ --baseline-model checkpoints/dual_task_v1 \\
+ --trained-model checkpoints/grpo//best_policy \\
+ --metrics-jsonl checkpoints/grpo//metrics.jsonl \\
+ --problems data/sft/gsm8k_sft.jsonl \\
+ --max-samples 100 \\
+ --records-out results/demo.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import re
+import sys
+import time
+import types
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from peft import PeftModel
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from src.sft.solution_format import extract_final_answer_numeric_str
+from src.utils.attn_backend import select_attn_implementation
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(levelname)-8s %(name)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+_SEP = "=" * 78
+_SEP2 = "-" * 78
+
+
+# ---------------------------------------------------------------------------
+# Data
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Problem:
+ question: str
+ gold_final: str
+
+
+def _parse_gold(answer: str) -> str:
+ m = re.search(r"####\s*([-0-9.,/ ]+)", answer)
+ if m:
+ return m.group(1).strip().replace(",", "")
+ return answer.strip().splitlines()[-1].strip()
+
+
+def _load_problems(path: Path, max_samples: int) -> List[Problem]:
+ """Accept GSM8K ``{question, answer}`` or SFT ``{messages}`` JSONL."""
+ out: List[Problem] = []
+ with path.open(encoding="utf-8") as fh:
+ for line in fh:
+ if max_samples > 0 and len(out) >= max_samples:
+ break
+ line = line.strip()
+ if not line:
+ continue
+ obj = json.loads(line)
+ if "question" in obj and "answer" in obj:
+ out.append(Problem(
+ question=obj["question"].strip(),
+ gold_final=_parse_gold(obj["answer"]),
+ ))
+ elif "messages" in obj:
+ user = next(
+ (m["content"] for m in obj["messages"] if m.get("role") == "user"), ""
+ ).strip()
+ asst = next(
+ (m["content"] for m in obj["messages"] if m.get("role") == "assistant"), ""
+ )
+ gold = extract_final_answer_numeric_str(asst) or ""
+ out.append(Problem(question=user, gold_final=gold.strip()))
+ return out
+
+
+# ---------------------------------------------------------------------------
+# Model loading — handles HF IDs, full-weight saves, and PEFT adapters
+# ---------------------------------------------------------------------------
+
+def _ensure_chat_template(
+ tokenizer: AutoTokenizer,
+ fallback_model: str = "Qwen/Qwen2.5-Math-1.5B-Instruct",
+) -> None:
+ """Load chat template from *fallback_model* when the checkpoint lacks one.
+
+ SFT adapter checkpoints often omit the chat_template from their tokenizer
+ config. Without it, ``apply_chat_template`` raises a TemplateError that
+ is silently swallowed inside ``evaluate_gsm8k``, returning 0% accuracy.
+ """
+ if tokenizer.chat_template is not None:
+ return
+ logger.info("Tokenizer missing chat_template — loading from %s", fallback_model)
+ try:
+ _base_tok = AutoTokenizer.from_pretrained(fallback_model, trust_remote_code=True)
+ if _base_tok.chat_template is not None:
+ tokenizer.chat_template = _base_tok.chat_template
+ logger.info("Chat template loaded.")
+ except Exception as exc:
+ logger.warning("Could not load chat template: %s", exc)
+
+
+def _load_model(
+ checkpoint: str,
+ base_model_id: str,
+ device: torch.device,
+ dtype: torch.dtype,
+ attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+ """Load model + tokenizer from any checkpoint style.
+
+ Handles:
+ * HuggingFace model ID (e.g. ``Qwen/Qwen2.5-Math-1.5B-Instruct``)
+ * GRPO full-weight save (directory with ``model.safetensors`` / pytorch_model*)
+ * PEFT/LoRA adapter dir (directory with ``adapter_config.json``)
+ """
+ # PEFT shim — prevents crash in merge_and_unload on some versions.
+ if "transformers.integrations.tensor_parallel" not in sys.modules:
+ sys.modules["transformers.integrations.tensor_parallel"] = types.ModuleType(
+ "tensor_parallel"
+ )
+
+ ckpt_path = Path(checkpoint)
+ is_adapter = ckpt_path.is_dir() and (ckpt_path / "adapter_config.json").exists()
+ is_local_full = ckpt_path.is_dir() and not is_adapter
+
+ # Tokenizer
+ tok_src = checkpoint if (ckpt_path.is_dir() and (ckpt_path / "tokenizer_config.json").exists()) else base_model_id
+ tokenizer = AutoTokenizer.from_pretrained(tok_src, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "left" # standard for generation
+ _ensure_chat_template(tokenizer, fallback_model=base_model_id)
+
+ load_kw = dict(
+ torch_dtype=dtype,
+ low_cpu_mem_usage=True,
+ device_map={"": device},
+ trust_remote_code=True,
+ attn_implementation=attn_impl,
+ )
+
+ if is_adapter:
+ # Read base model from pipeline_meta.json if present
+ meta_file = ckpt_path / "pipeline_meta.json"
+ _base = base_model_id
+ if meta_file.exists():
+ _base = json.loads(meta_file.read_text()).get("base_model", _base)
+ logger.info("PEFT adapter — loading base %s then merging %s", _base, checkpoint)
+ _base_mdl = AutoModelForCausalLM.from_pretrained(_base, **load_kw)
+ model = PeftModel.from_pretrained(_base_mdl, checkpoint).merge_and_unload()
+ model = model.to(device)
+ else:
+ # Full weights (GRPO save) or HF model ID
+ src = checkpoint if is_local_full else checkpoint
+ logger.info("Loading full-weight model from %s", src)
+ model = AutoModelForCausalLM.from_pretrained(src, **load_kw)
+
+ # Re-enable requires_grad isn't needed for eval, but ensure eval mode.
+ model.eval()
+ n = sum(p.numel() for p in model.parameters())
+ logger.info("Loaded: %s (%.2fB params, %.1f GB VRAM est.)",
+ checkpoint, n / 1e9, n * 2 / 1e9)
+ return model, tokenizer
+
+
+# ---------------------------------------------------------------------------
+# Generation
+# ---------------------------------------------------------------------------
+
+def _build_prompt(tokenizer: AutoTokenizer, question: str) -> str:
+ """Format question using the model's chat template (matches training format)."""
+ if tokenizer.chat_template is None:
+ return question
+ msgs = [
+ {"role": "system", "content": "You are a helpful math assistant. Solve the problem step-by-step and end with 'Final Answer: '."},
+ {"role": "user", "content": question},
+ ]
+ try:
+ return tokenizer.apply_chat_template(
+ msgs, tokenize=False, add_generation_prompt=True
+ )
+ except Exception:
+ return question
+
+
+def _stop_ids(tokenizer: AutoTokenizer) -> List[int]:
+ ids = []
+ if tokenizer.eos_token_id is not None:
+ ids.append(tokenizer.eos_token_id)
+ im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+ if isinstance(im_end, int) and im_end not in ids:
+ ids.append(im_end)
+ return ids or None # type: ignore[return-value]
+
+
+@torch.no_grad()
+def _generate(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ question: str,
+ max_new_tokens: int,
+ device: torch.device,
+) -> str:
+ prompt = _build_prompt(tokenizer, question)
+ enc = tokenizer(
+ prompt,
+ return_tensors="pt",
+ truncation=True,
+ max_length=1024,
+ ).to(device)
+ prompt_len = enc["input_ids"].shape[1]
+
+ out = model.generate(
+ input_ids=enc["input_ids"],
+ attention_mask=enc["attention_mask"],
+ max_new_tokens=max_new_tokens,
+ do_sample=False, # greedy — deterministic for reproducibility
+ temperature=1.0,
+ pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+ eos_token_id=_stop_ids(tokenizer),
+ use_cache=True,
+ )
+ return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
+
+
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+
+def _normalize(x: str) -> str:
+ if not x:
+ return ""
+ s = x.strip().replace(",", "").replace("$", "").strip()
+ try:
+ f = float(s)
+ return f"{int(f)}" if f == int(f) else f"{f}"
+ except ValueError:
+ return s
+
+
+@dataclass
+class Record:
+ question: str
+ gold: str
+ pred: str
+ correct: bool
+ solution_text: str
+
+
+def _score_model(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ problems: List[Problem],
+ max_new_tokens: int,
+ device: torch.device,
+ label: str,
+) -> Tuple[int, List[Record]]:
+ records: List[Record] = []
+ correct = 0
+ for prob in tqdm(problems, desc=f"Scoring {label}", unit="q", dynamic_ncols=True):
+ try:
+ text = _generate(model, tokenizer, prob.question, max_new_tokens, device)
+ except Exception as exc:
+ text = f"[generation error: {exc}]"
+ pred = extract_final_answer_numeric_str(text) or ""
+ ok = bool(pred) and _normalize(pred) == _normalize(prob.gold_final)
+ if ok:
+ correct += 1
+ records.append(Record(
+ question=prob.question,
+ gold=prob.gold_final,
+ pred=pred,
+ correct=ok,
+ solution_text=text,
+ ))
+ return correct, records
+
+
+# ---------------------------------------------------------------------------
+# Metrics curve
+# ---------------------------------------------------------------------------
+
+def _load_metrics_curve(path: Path) -> List[Dict]:
+ """Read metrics.jsonl and return rows that contain GSM8K accuracy."""
+ rows = []
+ if not path.exists():
+ return rows
+ with path.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ obj = json.loads(line)
+ if "accuracy" in obj or "iteration" in obj:
+ rows.append(obj)
+ except json.JSONDecodeError:
+ pass
+ return rows
+
+
+def _print_curve(rows: List[Dict]) -> None:
+ if not rows:
+ return
+ print(f"\n{_SEP}")
+ print("TRAINING ACCURACY CURVE (from metrics.jsonl)")
+ print(_SEP)
+ print(f"{'Iter':>5} {'GSM8K%':>7} {'Reward':>7} {'Batch%':>7} {'LR':>10} {'Time(s)':>8}")
+ print(_SEP2)
+ for r in rows:
+ it = r.get("iteration", "")
+ acc = r.get("accuracy", None)
+ rwd = r.get("mean_reward", None)
+ bat = r.get("batch_accuracy", None)
+ lr = r.get("learning_rate", None)
+ ts = r.get("iter_time_s", None)
+ acc_s = f"{100*acc:.1f}%" if acc is not None else "—"
+ rwd_s = f"{rwd:.3f}" if rwd is not None else "—"
+ bat_s = f"{100*bat:.1f}%" if bat is not None else "—"
+ lr_s = f"{lr:.2e}" if lr is not None else "—"
+ ts_s = f"{ts:.1f}" if ts is not None else "—"
+ print(f"{it:>5} {acc_s:>7} {rwd_s:>7} {bat_s:>7} {lr_s:>10} {ts_s:>8}")
+ print()
+
+
+# ---------------------------------------------------------------------------
+# Output
+# ---------------------------------------------------------------------------
+
+def _print_summary(
+ base_correct: int,
+ tr_correct: int,
+ base_records: List[Record],
+ tr_records: List[Record],
+ baseline_name: str,
+ trained_name: str,
+ n_solutions: int = 3,
+) -> None:
+ n = len(base_records)
+ wins = [(p, b, t) for p, b, t in zip(base_records, base_records, tr_records) if not b.correct and t.correct]
+ losses = [(p, b, t) for p, b, t in zip(base_records, base_records, tr_records) if b.correct and not t.correct]
+ both_wrong = sum(1 for b, t in zip(base_records, tr_records) if not b.correct and not t.correct)
+ both_right = sum(1 for b, t in zip(base_records, tr_records) if b.correct and t.correct)
+
+ delta = tr_correct - base_correct
+ sign = "+" if delta >= 0 else ""
+
+ print(f"\n{_SEP}")
+ print("BEFORE vs AFTER — GSM8K accuracy (greedy decoding, fixed seed)")
+ print(_SEP)
+ print(f" Baseline : {baseline_name}")
+ print(f" Trained : {trained_name}")
+ print(_SEP2)
+ print(f" Baseline accuracy : {base_correct}/{n} ({100*base_correct/n:.1f}%)")
+ print(f" Trained accuracy : {tr_correct}/{n} ({100*tr_correct/n:.1f}%)")
+ print(f" Delta : {sign}{delta} problems ({sign}{100*delta/n:.1f} pp)")
+ print(_SEP2)
+ print(f" Newly correct (wins) : {len(wins)}")
+ print(f" Newly wrong (losses) : {len(losses)}")
+ print(f" Both correct : {both_right}")
+ print(f" Both wrong : {both_wrong}")
+ print(_SEP)
+
+ if wins:
+ print(f"\n{'='*78}")
+ print(f"WINS — problems the RL model now solves that the baseline could not")
+ print(f"{'='*78}")
+ for i, (_, base_r, tr_r) in enumerate(wins[:n_solutions]):
+ print(f"\n[Win {i+1}/{min(n_solutions, len(wins))}]")
+ _print_problem(base_r, tr_r)
+
+ if losses:
+ print(f"\n{'='*78}")
+ print(f"REGRESSIONS — problems the baseline solved but the RL model now misses")
+ print(f"{'='*78}")
+ for i, (_, base_r, tr_r) in enumerate(losses[:min(2, len(losses))]):
+ print(f"\n[Regression {i+1}/{min(2, len(losses))}]")
+ _print_problem(base_r, tr_r, is_regression=True)
+
+ print(f"\n{_SEP}")
+ pct_gain = 100 * delta / max(n - base_correct, 1)
+ print(f"SUMMARY: RL training fixed {len(wins)} problems, regressed {len(losses)}.")
+ print(f" Net: {sign}{delta} pts. Relative gain on previously-wrong: {pct_gain:+.1f}%")
+ print(_SEP)
+
+
+def _print_problem(base_r: Record, tr_r: Record, is_regression: bool = False) -> None:
+ q = base_r.question
+ # Truncate long questions
+ if len(q) > 250:
+ q = q[:247] + "..."
+ print(f" Q : {q}")
+ print(f" Gold : {base_r.gold}")
+ if not is_regression:
+ print(f" Before : {base_r.pred!r:30s} ✗")
+ print(f" After : {tr_r.pred!r:30s} ✓")
+ # Show trained solution (truncated)
+ sol = tr_r.solution_text.strip()
+ if sol:
+ lines = sol.splitlines()
+ show = "\n ".join(lines[:12])
+ if len(lines) > 12:
+ show += f"\n ... ({len(lines)-12} more lines)"
+ print(f"\n Solution (trained model):\n {show}")
+ else:
+ print(f" Before : {base_r.pred!r:30s} ✓")
+ print(f" After : {tr_r.pred!r:30s} ✗")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main() -> int:
+ parser = argparse.ArgumentParser(
+ description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ )
+ parser.add_argument(
+ "--baseline-model", default="checkpoints/dual_task_v1",
+ help="Pre-RL checkpoint. HF model ID, full-weight dir, or PEFT adapter dir.",
+ )
+ parser.add_argument(
+ "--trained-model", required=True,
+ help="Post-RL checkpoint (GRPO best_policy/ dir, or iteration checkpoint).",
+ )
+ parser.add_argument(
+ "--base-model-for-adapter", default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+ help="Base model used when loading a PEFT adapter checkpoint.",
+ )
+ parser.add_argument(
+ "--problems", type=Path, default=Path("data/sft/gsm8k_sft.jsonl"),
+ help="JSONL eval set. Defaults to GSM8K training split (first --max-samples rows).",
+ )
+ parser.add_argument("--max-samples", type=int, default=100)
+ parser.add_argument("--max-new-tokens", type=int, default=512)
+ parser.add_argument(
+ "--metrics-jsonl", type=Path, default=None,
+ help="Path to metrics.jsonl from a GRPO run — prints the accuracy curve.",
+ )
+ parser.add_argument(
+ "--n-solutions", type=int, default=3,
+ help="Number of win/loss examples to print in full.",
+ )
+ parser.add_argument(
+ "--records-out", type=Path, default=None,
+ help="Save full per-problem JSON records here (for judge grading scripts).",
+ )
+ parser.add_argument(
+ "--device", default="cuda" if torch.cuda.is_available() else "cpu",
+ )
+ parser.add_argument(
+ "--dtype", default="bfloat16",
+ choices=["float32", "float16", "bfloat16"],
+ )
+ args = parser.parse_args()
+
+ if not args.problems.is_file():
+ logger.error("Problems file not found: %s", args.problems)
+ return 2
+
+ dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+ dtype = dtype_map[args.dtype]
+ device = torch.device(args.device)
+ attn = select_attn_implementation()
+ logger.info("Device: %s | dtype: %s | attn: %s", device, args.dtype, attn)
+
+ # Print training curve if available
+ if args.metrics_jsonl:
+ curve = _load_metrics_curve(args.metrics_jsonl)
+ _print_curve(curve)
+
+ problems = _load_problems(args.problems, args.max_samples)
+ if not problems:
+ logger.error("No problems loaded from %s", args.problems)
+ return 2
+ logger.info("Evaluating on %d problems from %s", len(problems), args.problems)
+
+ # ── Baseline ──────────────────────────────────────────────────────────
+ logger.info("%s\nScoring BASELINE: %s\n%s", _SEP, args.baseline_model, _SEP)
+ t0 = time.perf_counter()
+ base_model, base_tok = _load_model(
+ args.baseline_model, args.base_model_for_adapter, device, dtype, attn
+ )
+ base_correct, base_records = _score_model(
+ base_model, base_tok, problems, args.max_new_tokens, device, "baseline"
+ )
+ del base_model
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ logger.info("Baseline done in %.1fs — accuracy: %d/%d (%.1f%%)",
+ time.perf_counter() - t0,
+ base_correct, len(problems),
+ 100 * base_correct / len(problems))
+
+ # ── Trained ───────────────────────────────────────────────────────────
+ logger.info("%s\nScoring TRAINED: %s\n%s", _SEP, args.trained_model, _SEP)
+ t0 = time.perf_counter()
+ tr_model, tr_tok = _load_model(
+ args.trained_model, args.base_model_for_adapter, device, dtype, attn
+ )
+ tr_correct, tr_records = _score_model(
+ tr_model, tr_tok, problems, args.max_new_tokens, device, "trained"
+ )
+ del tr_model
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ logger.info("Trained done in %.1fs — accuracy: %d/%d (%.1f%%)",
+ time.perf_counter() - t0,
+ tr_correct, len(problems),
+ 100 * tr_correct / len(problems))
+
+ # ── Summary ───────────────────────────────────────────────────────────
+ _print_summary(
+ base_correct, tr_correct,
+ base_records, tr_records,
+ baseline_name=args.baseline_model,
+ trained_name=args.trained_model,
+ n_solutions=args.n_solutions,
+ )
+
+ # ── Save records ──────────────────────────────────────────────────────
+ if args.records_out:
+ args.records_out.parent.mkdir(parents=True, exist_ok=True)
+ payload = {
+ "baseline_model": args.baseline_model,
+ "trained_model": args.trained_model,
+ "n_problems": len(problems),
+ "baseline": {
+ "correct": base_correct,
+ "accuracy": base_correct / len(problems),
+ "records": [vars(r) for r in base_records],
+ },
+ "trained": {
+ "correct": tr_correct,
+ "accuracy": tr_correct / len(problems),
+ "records": [vars(r) for r in tr_records],
+ },
+ }
+ args.records_out.write_text(
+ json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
+ )
+ logger.info("Per-problem records saved to %s", args.records_out)
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/dual_task_sft_pipeline.py b/scripts/dual_task_sft_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..43926c06c98bfaaf714d7504fdf4fc033585ac49
--- /dev/null
+++ b/scripts/dual_task_sft_pipeline.py
@@ -0,0 +1,390 @@
+"""
+Dual-task SFT pipeline: train model on both question generation and solution tasks.
+
+This pipeline trains a single model that can:
+1. Generate math questions when prompted with "### Task: Generate Question"
+2. Solve math problems when prompted with "### Task: Solve Problem"
+
+Examples
+--------
+ # Train dual-task model
+ python scripts/dual_task_sft_pipeline.py train \\
+ --data data/sft/dual_task_train.jsonl \\
+ --output-dir checkpoints/dual_task_v1 \\
+ --epochs 2
+
+ # Infer - Question Generation
+ python scripts/dual_task_sft_pipeline.py infer \\
+ --adapter checkpoints/dual_task_v1 \\
+ --task generate \\
+ --prompt "Create a word problem about fractions and money requiring 3 steps."
+
+ # Infer - Solution Generation
+ python scripts/dual_task_sft_pipeline.py infer \\
+ --adapter checkpoints/dual_task_v1 \\
+ --task solve \\
+ --problem "Janet has 16 eggs. She eats 3. How many are left?"
+
+Dependencies: torch, transformers, peft, datasets, accelerate, bitsandbytes, trl
+"""
+
+from __future__ import annotations
+
+import os
+
+if "HF_HUB_DISABLE_XET" not in os.environ:
+ os.environ["HF_HUB_DISABLE_XET"] = "1"
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from src.config.prompts import (
+ SOLVE_TASK_PREFIX,
+ GENERATE_TASK_PREFIX,
+ SOLVER_SYSTEM_PROMPT,
+ GENERATOR_SYSTEM_PROMPT,
+)
+
+
+def _warmup_steps_from_ratio(
+ num_examples: int,
+ per_device_train_batch_size: int,
+ gradient_accumulation_steps: int,
+ num_train_epochs: float,
+ warmup_ratio: float,
+) -> int:
+ """Calculate warmup steps from ratio."""
+ if warmup_ratio <= 0:
+ return 0
+ num_batches = max(
+ 1,
+ (num_examples + per_device_train_batch_size - 1) // per_device_train_batch_size,
+ )
+ num_update_steps_per_epoch = max(1, num_batches // gradient_accumulation_steps)
+ total_optimizer_steps = max(1, math.ceil(num_train_epochs * num_update_steps_per_epoch))
+ return min(total_optimizer_steps, int(total_optimizer_steps * warmup_ratio))
+
+
+def cmd_train(args: argparse.Namespace) -> None:
+ try:
+ import torch
+ from datasets import load_dataset
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from trl import SFTConfig, SFTTrainer
+ except ImportError as e:
+ raise SystemExit(
+ "Missing dependency for training. Install:\n"
+ " pip install torch transformers peft datasets accelerate bitsandbytes trl\n"
+ f"Original error: {e}"
+ ) from e
+
+ data_path = Path(args.data)
+ if not data_path.is_file():
+ raise SystemExit(f"Data file not found: {data_path}")
+
+ out_dir = Path(args.output_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ compute_dtype = getattr(torch, args.bnb_compute_dtype)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"
+
+ print(f"Loading model {args.model} …")
+ model = AutoModelForCausalLM.from_pretrained(
+ args.model,
+ quantization_config=bnb_config,
+ device_map="auto",
+ trust_remote_code=True,
+ dtype=compute_dtype,
+ )
+ model = prepare_model_for_kbit_training(model)
+
+ peft = LoraConfig(
+ r=args.lora_rank,
+ lora_alpha=args.lora_alpha,
+ lora_dropout=args.lora_dropout,
+ bias="none",
+ task_type="CAUSAL_LM",
+ target_modules=list(args.target_modules.split(",")),
+ )
+ model = get_peft_model(model, peft)
+ model.config.use_cache = False
+ model.print_trainable_parameters()
+
+ print(f"Loading dual-task dataset from {data_path} …")
+ ds = load_dataset("json", data_files=str(data_path), split="train")
+ if args.max_samples and args.max_samples > 0:
+ ds = ds.select(range(min(args.max_samples, len(ds))))
+
+ task_counts = {"solve": 0, "generate": 0, "unknown": 0}
+ for example in ds:
+ task_type = example.get("task_type", "unknown")
+ task_counts[task_type] = task_counts.get(task_type, 0) + 1
+
+ print(f"Dataset composition:")
+ print(f" Total examples: {len(ds)}")
+ print(f" Solve tasks: {task_counts['solve']} ({task_counts['solve']/len(ds):.1%})")
+ print(f" Generate tasks: {task_counts['generate']} ({task_counts['generate']/len(ds):.1%})")
+ if task_counts['unknown'] > 0:
+ print(f" Unknown tasks: {task_counts['unknown']}")
+
+ def formatting_func(example):
+ return tokenizer.apply_chat_template(
+ example["messages"],
+ tokenize=False,
+ add_generation_prompt=False,
+ )
+
+ if args.warmup_steps is not None:
+ warmup_steps = max(0, args.warmup_steps)
+ else:
+ warmup_steps = _warmup_steps_from_ratio(
+ len(ds),
+ args.batch_size,
+ args.grad_accum,
+ args.epochs,
+ args.warmup_ratio,
+ )
+
+ sft_args = SFTConfig(
+ output_dir=str(out_dir),
+ num_train_epochs=args.epochs,
+ per_device_train_batch_size=args.batch_size,
+ gradient_accumulation_steps=args.grad_accum,
+ learning_rate=args.learning_rate,
+ logging_steps=args.logging_steps,
+ save_steps=args.save_steps,
+ save_total_limit=3,
+ bf16=args.bf16 and torch.cuda.is_available(),
+ fp16=args.fp16 and torch.cuda.is_available() and not args.bf16,
+ max_length=args.max_seq_length,
+ warmup_steps=warmup_steps,
+ lr_scheduler_type="cosine",
+ report_to="none",
+ gradient_checkpointing=True,
+ )
+
+ print("\nStarting dual-task training...")
+ trainer = SFTTrainer(
+ model=model,
+ args=sft_args,
+ train_dataset=ds,
+ processing_class=tokenizer,
+ formatting_func=formatting_func,
+ )
+
+ trainer.train()
+ trainer.save_model(str(out_dir))
+ tokenizer.save_pretrained(str(out_dir))
+
+ with (out_dir / "pipeline_meta.json").open("w", encoding="utf-8") as f:
+ json.dump(
+ {
+ "pipeline_type": "dual_task",
+ "base_model": args.model,
+ "data": str(data_path),
+ "lora_rank": args.lora_rank,
+ "epochs": args.epochs,
+ "task_distribution": task_counts,
+ },
+ f,
+ indent=2,
+ )
+ print(f"\nSaved dual-task adapter and tokenizer to {out_dir}")
+
+
+def cmd_infer(args: argparse.Namespace) -> None:
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ adapter = Path(args.adapter)
+ meta_path = adapter / "pipeline_meta.json"
+ base_model = args.base_model
+
+ if meta_path.is_file():
+ meta = json.loads(meta_path.read_text(encoding="utf-8"))
+ base_model = meta.get("base_model", base_model)
+ pipeline_type = meta.get("pipeline_type", "unknown")
+ if pipeline_type != "dual_task":
+ print(f"Warning: Adapter trained with pipeline_type='{pipeline_type}', expected 'dual_task'")
+
+ compute_dtype = getattr(torch, args.bnb_compute_dtype)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(adapter, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ print(f"Loading base {base_model} + adapter {adapter} …")
+ base = AutoModelForCausalLM.from_pretrained(
+ base_model,
+ quantization_config=bnb_config,
+ device_map="auto",
+ trust_remote_code=True,
+ )
+ model = PeftModel.from_pretrained(base, str(adapter))
+ model.eval()
+
+ if args.task == "solve":
+ system_prompt = SOLVER_SYSTEM_PROMPT
+ user_content = (
+ f"{SOLVE_TASK_PREFIX}"
+ "Solve the following problem. Show your reasoning as numbered steps, "
+ "then give the final numeric answer on the last line.\n\n"
+ f"Problem:\n{args.problem.strip()}"
+ )
+ elif args.task == "generate":
+ system_prompt = GENERATOR_SYSTEM_PROMPT
+ user_content = f"{GENERATE_TASK_PREFIX}{args.prompt.strip()}"
+ else:
+ raise ValueError(f"Unknown task: {args.task}. Must be 'solve' or 'generate'")
+
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_content},
+ ]
+
+ prompt = tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+ print(f"\nTask: {args.task}")
+ print(f"Prompt length: {inputs['input_ids'].shape[1]} tokens")
+ print("\nGenerating...")
+
+ with torch.no_grad():
+ out = model.generate(
+ **inputs,
+ max_new_tokens=args.max_new_tokens,
+ temperature=args.temperature,
+ top_p=args.top_p,
+ do_sample=not args.greedy,
+ pad_token_id=tokenizer.pad_token_id,
+ )
+
+ gen_ids = out[0, inputs["input_ids"].shape[1] :]
+ text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+
+ print("\n" + "=" * 60)
+ print("Generated Output")
+ print("=" * 60)
+ print(text)
+ print("=" * 60)
+
+ if args.task == "solve":
+ print("\n--- Format Validation ---")
+ from src.sft.solution_format import validate_sympy_solution_format
+ r = validate_sympy_solution_format(text)
+ print(json.dumps(r.__dict__, indent=2))
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(description="Dual-task SFT pipeline (train / infer)")
+ sub = p.add_subparsers(dest="command", required=True)
+
+ tr = sub.add_parser("train", help="Train dual-task model on mixed dataset")
+ tr.add_argument("--data", type=str, required=True, help="Dual-task training JSONL")
+ tr.add_argument("--output-dir", type=str, required=True, help="Output directory for adapter")
+ tr.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct", help="Base model")
+ tr.add_argument("--epochs", type=float, default=2.0, help="Training epochs (default: 2.0 for dual-task)")
+ tr.add_argument("--batch-size", type=int, default=1)
+ tr.add_argument("--grad-accum", type=int, default=8)
+ tr.add_argument("--learning-rate", type=float, default=2e-4)
+ tr.add_argument("--max-samples", type=int, default=0, help="0 = use full dataset")
+ tr.add_argument("--lora-rank", type=int, default=16)
+ tr.add_argument("--lora-alpha", type=int, default=32)
+ tr.add_argument("--lora-dropout", type=float, default=0.05)
+ tr.add_argument(
+ "--target-modules",
+ type=str,
+ default="q_proj,v_proj,o_proj,gate_proj",
+ )
+ tr.add_argument("--max-seq-length", type=int, default=2048)
+ tr.add_argument("--save-steps", type=int, default=200)
+ tr.add_argument("--logging-steps", type=int, default=10)
+ tr.add_argument("--warmup-ratio", type=float, default=0.03)
+ tr.add_argument("--warmup-steps", type=int, default=None)
+ tr.add_argument("--bf16", action="store_true", default=True)
+ tr.add_argument("--no-bf16", dest="bf16", action="store_false")
+ tr.add_argument("--fp16", action="store_true")
+ tr.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ tr.set_defaults(func=cmd_train)
+
+ inf = sub.add_parser("infer", help="Generate with dual-task model")
+ inf.add_argument("--adapter", type=str, required=True, help="Adapter directory")
+ inf.add_argument(
+ "--base-model",
+ type=str,
+ default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+ help="Base model (auto-detected from pipeline_meta.json if present)",
+ )
+ inf.add_argument(
+ "--task",
+ type=str,
+ required=True,
+ choices=["solve", "generate"],
+ help="Task type: 'solve' for problem solving, 'generate' for question generation",
+ )
+ inf.add_argument(
+ "--problem",
+ type=str,
+ default="",
+ help="Math problem to solve (required if --task solve)",
+ )
+ inf.add_argument(
+ "--prompt",
+ type=str,
+ default="",
+ help="Question generation prompt (required if --task generate)",
+ )
+ inf.add_argument("--max-new-tokens", type=int, default=1024)
+ inf.add_argument("--temperature", type=float, default=0.7)
+ inf.add_argument("--top-p", type=float, default=0.95)
+ inf.add_argument("--greedy", action="store_true", help="Use greedy decoding")
+ inf.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ inf.set_defaults(func=cmd_infer)
+
+ return p
+
+
+def main() -> None:
+ parser = build_parser()
+ args = parser.parse_args()
+
+ if args.command == "infer":
+ if args.task == "solve" and not args.problem:
+ raise SystemExit("Error: --problem is required when --task solve")
+ if args.task == "generate" and not args.prompt:
+ raise SystemExit("Error: --prompt is required when --task generate")
+
+ if str(ROOT) not in sys.path:
+ sys.path.insert(0, str(ROOT))
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/eval_sft_inference.py b/scripts/eval_sft_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..8747e6edea9acb27ea052d29434edf7e5f856a4a
--- /dev/null
+++ b/scripts/eval_sft_inference.py
@@ -0,0 +1,565 @@
+#!/usr/bin/env python3
+"""
+Run batch inference for a trained QLoRA adapter and report quality metrics.
+
+This helps decide whether another SFT epoch is needed before RL.
+
+Examples
+--------
+ # Evaluate on GSM8K test split (first 100 samples)
+ python scripts/eval_sft_inference.py \
+ --adapter checkpoints/gsm8k_sft \
+ --max-samples 100
+
+ # Evaluate on local JSONL with {question, answer} rows
+ python scripts/eval_sft_inference.py \
+ --adapter checkpoints/gsm8k_sft \
+ --source jsonl \
+ --input data/raw/gsm8k_test.jsonl \
+ --max-samples 50 \
+ --output-json reports/sft_eval.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+# Prefer classic HTTP Hub downloads by default.
+if "HF_HUB_DISABLE_XET" not in os.environ:
+ os.environ["HF_HUB_DISABLE_XET"] = "1"
+
+# Ensure project-root imports work when invoked as `python scripts/...`.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+ sys.path.insert(0, str(ROOT))
+
+import torch
+from datasets import load_dataset
+from peft import PeftModel
+from sympy import simplify
+from sympy.parsing.sympy_parser import parse_expr
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+from scripts.convert_gsm8k_to_sft import parse_gsm8k_answer
+from src.config.prompts import create_solver_messages
+from src.sft.solution_format import extract_final_answer_numeric_str, validate_sympy_solution_format
+from src.sft.sympy_normalize import normalize_for_parse_expr
+
+
+@dataclass
+class EvalRow:
+ index: int
+ question: str
+ gold_final: str
+ pred_final: str
+ exact_match: Optional[bool]
+ format_ok: bool
+ step_count: int
+ scratchpad_leak: bool
+ output_text: str
+
+
+def _norm_expr(s: str) -> str:
+ s = s.strip()
+ s = s.replace("^", "**")
+ s = re.sub(r"[,$€£\s]+", "", s)
+ return s
+
+
+def _equiv_expr(a: str, b: str) -> Optional[bool]:
+ """Check if two answer strings are mathematically equivalent.
+
+ Uses the same normalization as CurriculumMathEnvironment._answers_equivalent
+ so eval and training agree on what counts as "correct".
+ """
+ if not a or not b:
+ return None
+ a_n = normalize_for_parse_expr(_norm_expr(a))
+ b_n = normalize_for_parse_expr(_norm_expr(b))
+ try:
+ return bool(simplify(parse_expr(a_n) - parse_expr(b_n)) == 0)
+ except Exception:
+ return a_n == b_n
+
+
+def _iter_examples(args: argparse.Namespace) -> list[dict[str, str]]:
+ rows: list[dict[str, str]] = []
+ if args.source == "hf":
+ ds = load_dataset(args.dataset, args.config, split=args.split)
+ if args.max_samples > 0:
+ ds = ds.select(range(min(args.max_samples, len(ds))))
+ for row in ds:
+ _, final = parse_gsm8k_answer(row["answer"])
+ rows.append({"question": row["question"].strip(), "gold_final": final})
+ return rows
+
+ in_path = Path(args.input)
+ if not in_path.is_file():
+ raise SystemExit(f"Input JSONL not found: {in_path}")
+ with in_path.open(encoding="utf-8") as f:
+ for line in f:
+ if args.max_samples > 0 and len(rows) >= args.max_samples:
+ break
+ line = line.strip()
+ if not line:
+ continue
+ o = json.loads(line)
+ if "question" in o and "answer" in o:
+ _, final = parse_gsm8k_answer(o["answer"])
+ rows.append({"question": o["question"].strip(), "gold_final": final})
+ continue
+ if "messages" in o:
+ user = next((m["content"] for m in o["messages"] if m.get("role") == "user"), "").strip()
+ asst = next((m["content"] for m in o["messages"] if m.get("role") == "assistant"), "")
+ gold = extract_final_answer_numeric_str(asst) or ""
+ user = re.sub(r"^Solve the following problem\..*?Problem:\n", "", user, flags=re.S)
+ rows.append({"question": user.strip(), "gold_final": gold.strip()})
+ continue
+ raise SystemExit("JSONL rows must contain either {question, answer} or {messages}.")
+ return rows
+
+
+def _generate(
+ model: Any,
+ tokenizer: Any,
+ problem: str,
+ max_new_tokens: int,
+ temperature: float,
+ top_p: float,
+ greedy: bool,
+) -> str:
+ # Use the canonical solver prompt (same system + user format as GRPO training)
+ # so eval measures the model under the exact distribution it was trained on.
+ messages = create_solver_messages(problem.strip())
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ # HuggingFace warns once-per-call when `temperature`/`top_p` are passed
+ # alongside `do_sample=False`. Skip those kwargs entirely in greedy mode
+ # so long eval loops don't spam the log.
+ gen_kwargs = {
+ "max_new_tokens": max_new_tokens,
+ "do_sample": not greedy,
+ "pad_token_id": tokenizer.pad_token_id,
+ }
+ if not greedy:
+ gen_kwargs["temperature"] = temperature
+ gen_kwargs["top_p"] = top_p
+ with torch.no_grad():
+ out = model.generate(**inputs, **gen_kwargs)
+ gen_ids = out[0, inputs["input_ids"].shape[1] :]
+ return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+
+
+def main() -> None:
+ p = argparse.ArgumentParser(description="Batch eval for SFT adapter inference.")
+ p.add_argument("--adapter", type=Path, required=True, help="Adapter directory from training step.")
+ p.add_argument("--base-model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+ p.add_argument("--source", choices=("hf", "jsonl"), default="hf")
+ p.add_argument("--dataset", type=str, default="openai/gsm8k")
+ p.add_argument("--config", type=str, default="main")
+ p.add_argument("--split", type=str, default="test")
+ p.add_argument("--input", type=Path, help="JSONL path for --source jsonl")
+ p.add_argument("--max-samples", type=int, default=100)
+ p.add_argument("--max-new-tokens", type=int, default=512)
+ p.add_argument("--temperature", type=float, default=0.0)
+ p.add_argument("--top-p", type=float, default=1.0)
+ p.add_argument("--greedy", action="store_true", default=True)
+ p.add_argument("--no-greedy", dest="greedy", action="store_false")
+ p.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ p.add_argument("--show-samples", type=int, default=3)
+ p.add_argument("--output-json", type=Path, default=None)
+ args = p.parse_args()
+
+ if args.source == "jsonl" and not args.input:
+ raise SystemExit("--input is required when --source jsonl")
+
+ meta_path = args.adapter / "pipeline_meta.json"
+ base_model = args.base_model
+ if meta_path.is_file():
+ meta = json.loads(meta_path.read_text(encoding="utf-8"))
+ base_model = meta.get("base_model", base_model)
+
+ rows = _iter_examples(args)
+ if not rows:
+ raise SystemExit("No evaluation examples loaded.")
+ print(f"Loaded {len(rows)} evaluation examples.")
+
+ compute_dtype = getattr(torch, args.bnb_compute_dtype)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+
+ print(f"Loading base {base_model} + adapter {args.adapter} …")
+ tokenizer = AutoTokenizer.from_pretrained(args.adapter, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ base = AutoModelForCausalLM.from_pretrained(
+ base_model,
+ quantization_config=bnb_config,
+ device_map="auto",
+ trust_remote_code=True,
+ )
+ model = PeftModel.from_pretrained(base, str(args.adapter))
+ model.eval()
+
+ results: list[EvalRow] = []
+ for i, row in enumerate(rows):
+ text = _generate(
+ model=model,
+ tokenizer=tokenizer,
+ problem=row["question"],
+ max_new_tokens=args.max_new_tokens,
+ temperature=args.temperature,
+ top_p=args.top_p,
+ greedy=args.greedy,
+ )
+ fmt = validate_sympy_solution_format(text)
+ pred_final = extract_final_answer_numeric_str(text) or ""
+ exact = _equiv_expr(pred_final, row["gold_final"])
+ results.append(
+ EvalRow(
+ index=i,
+ question=row["question"],
+ gold_final=row["gold_final"],
+ pred_final=pred_final,
+ exact_match=exact,
+ format_ok=fmt.ok,
+ step_count=fmt.step_count,
+ scratchpad_leak=("<<" in text and ">>" in text),
+ output_text=text,
+ )
+ )
+ if i < args.show_samples:
+ print(f"\n=== Sample {i} ===")
+ print("Q:", row["question"])
+ print("Gold:", row["gold_final"])
+ print("Pred:", pred_final)
+ print("Format OK:", fmt.ok, "| Steps:", fmt.step_count)
+ print(text)
+
+ n = len(results)
+ n_format_ok = sum(1 for r in results if r.format_ok)
+ n_scratch = sum(1 for r in results if r.scratchpad_leak)
+ em_scored = [r for r in results if r.exact_match is not None]
+ n_em = sum(1 for r in em_scored if r.exact_match)
+
+ print("\n=== Summary ===")
+ print(f"Samples: {n}")
+ print(f"Format OK: {n_format_ok}/{n} ({100.0 * n_format_ok / n:.2f}%)")
+ print(f"Scratchpad leakage (<< >>): {n_scratch}/{n} ({100.0 * n_scratch / n:.2f}%)")
+ if em_scored:
+ print(f"Exact match (final answer): {n_em}/{len(em_scored)} ({100.0 * n_em / len(em_scored):.2f}%)")
+ else:
+ print("Exact match (final answer): N/A (missing gold labels)")
+
+ if args.output_json is not None:
+ args.output_json.parent.mkdir(parents=True, exist_ok=True)
+ payload = {
+ "summary": {
+ "samples": n,
+ "format_ok": n_format_ok,
+ "format_ok_rate": n_format_ok / n,
+ "scratchpad_leakage": n_scratch,
+ "scratchpad_leakage_rate": n_scratch / n,
+ "exact_match_scored": len(em_scored),
+ "exact_match": n_em,
+ "exact_match_rate": (n_em / len(em_scored)) if em_scored else None,
+ },
+ "results": [asdict(r) for r in results],
+ }
+ args.output_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
+ print(f"Wrote detailed report to {args.output_json}")
+
+
+def _infer_dataset_name(data_path: str) -> str:
+ """Derive a short human-readable dataset label from the file path."""
+ stem = Path(data_path).stem.lower() # e.g. "aqua_validation", "gsm8k_test"
+ if "aqua" in stem:
+ return "AQuA-RAT"
+ if "math" in stem:
+ return "MATH"
+ if "gsm" in stem:
+ return "GSM8K"
+ return Path(data_path).stem # fallback: raw filename stem
+
+
+def evaluate_gsm8k(
+ model: Any,
+ tokenizer: Any,
+ data_path: str = "data/sft/gsm8k_test.jsonl",
+ max_samples: int = 500,
+ max_new_tokens: int = 512,
+ temperature: float = 0.0,
+ top_p: float = 1.0,
+ reward_fn: Any = None,
+ pass_at_k: int = 0,
+ dataset_name: str = "",
+ pass_at_k_temperature: float = 0.8,
+) -> dict:
+ """
+ Evaluate *model* on a math JSONL file using the SAME scoring
+ function used during GRPO training.
+
+ Args:
+ model : AutoModelForCausalLM (already on correct device).
+ tokenizer : Matching AutoTokenizer.
+ data_path : Path to JSONL with {question, answer} rows.
+ max_samples : Evaluation cap.
+ max_new_tokens / temperature / top_p : generation hyper-params.
+ reward_fn : callable(question: str, solution: str, gold: str) -> dict
+ Must return at minimum {"combined_score": float} and
+ optionally {"gt_match": bool, "prm_mean_score": float,
+ "sympy_score": float, "format_score": float}.
+ When supplied the primary accuracy metric becomes the
+ mean combined_score — identical to the GRPO training
+ objective — so every component (correctness, PRM step
+ quality, SymPy verification, format) contributes and
+ improvements in any of them show up immediately.
+ When None the function falls back to final-answer
+ exact-match accuracy (coarse binary).
+
+ Returns dict keys:
+ accuracy – mean combined_score per solution (or exact-match if no reward_fn)
+ combined_score – same as accuracy (alias)
+ correct_rate – fraction of solutions with gt_match == True
+ prm_mean – mean PRM step-quality score per solution
+ sympy_mean – mean SymPy verification score
+ format_mean – mean format compliance score
+ n_scored – solutions successfully scored by reward_fn
+ total – total solutions evaluated
+ # fallback (no reward_fn):
+ exact_match_rate – fraction of final answers matching gold
+ """
+ import logging as _logging
+ _logger = _logging.getLogger(__name__)
+
+ greedy = temperature < 1e-6
+ rows: list[dict] = []
+
+ p = Path(data_path)
+ if p.is_file():
+ with p.open(encoding="utf-8") as fh:
+ for line in fh:
+ if max_samples > 0 and len(rows) >= max_samples:
+ break
+ line = line.strip()
+ if not line:
+ continue
+ obj = json.loads(line)
+ if "question" in obj and "gold_final" in obj and obj["gold_final"]:
+ # Pre-extracted format (our gsm8k_test.jsonl)
+ rows.append({"question": obj["question"].strip(), "gold_final": obj["gold_final"].strip()})
+ elif "question" in obj and "answer" in obj:
+ _, final = parse_gsm8k_answer(obj["answer"])
+ if final:
+ rows.append({"question": obj["question"].strip(), "gold_final": final})
+ elif "messages" in obj:
+ task_type = obj.get("task_type", "solve")
+ if task_type != "solve":
+ continue # skip question-generation entries
+ user = next(
+ (m["content"] for m in obj["messages"] if m.get("role") == "user"), ""
+ ).strip()
+ asst = next(
+ (m["content"] for m in obj["messages"] if m.get("role") == "assistant"), ""
+ )
+ gold = extract_final_answer_numeric_str(asst) or ""
+ if not gold:
+ continue # skip entries with no parseable gold answer
+ user = re.sub(r"^Solve the following problem\..*?Problem:\n", "", user, flags=re.S)
+ rows.append({"question": user.strip(), "gold_final": gold.strip()})
+ else:
+ _logger.warning(
+ f"evaluate_gsm8k: {data_path} not found; loading openai/gsm8k from Hub."
+ )
+ try:
+ ds = load_dataset("openai/gsm8k", "main", split="test")
+ if max_samples > 0:
+ ds = ds.select(range(min(max_samples, len(ds))))
+ for row in ds:
+ _, final = parse_gsm8k_answer(row["answer"])
+ rows.append({"question": row["question"].strip(), "gold_final": final})
+ except Exception as exc:
+ _logger.error(f"Could not load GSM8K: {exc}")
+ return {"accuracy": 0.0, "correct": 0, "total": 0, "exact_match_rate": 0.0}
+
+ if not rows:
+ return {"accuracy": 0.0, "correct": 0, "total": 0, "exact_match_rate": 0.0}
+
+ correct = 0
+ total = len(rows)
+ _n_errors = 0
+ _MAX_ERROR_WARNINGS = 3
+
+ # Per-solution reward accumulators (populated when reward_fn is supplied).
+ _combined: list[float] = []
+ _gt_match: list[float] = []
+ _prm_comp: list[float] = []
+ _prm_final: list[float] = []
+ _step_acc: list[float] = [] # fraction of steps rated correct by PRM (>0.5)
+ _lccp: list[float] = [] # longest correct consecutive prefix ratio
+ _sympy_comp:list[float] = []
+ _fmt_comp: list[float] = []
+
+ # Pass@K accumulators: for each problem, did ANY of K samples get it right?
+ _pak_any_correct: list[int] = [] # 1 if any of K samples correct, else 0
+
+ _eval_label = dataset_name or _infer_dataset_name(data_path)
+ pbar = tqdm(
+ rows, total=total, desc=f"{_eval_label} eval",
+ unit="q", dynamic_ncols=True, leave=True,
+ )
+ for i, row in enumerate(pbar):
+ pred_text = ""
+ try:
+ pred_text = _generate(
+ model=model, tokenizer=tokenizer,
+ problem=row["question"],
+ max_new_tokens=max_new_tokens,
+ temperature=temperature, top_p=top_p, greedy=greedy,
+ )
+ pred_final = extract_final_answer_numeric_str(pred_text) or ""
+ if _equiv_expr(pred_final, row["gold_final"]):
+ correct += 1
+ except Exception as exc:
+ _n_errors += 1
+ if _n_errors <= _MAX_ERROR_WARNINGS:
+ _logger.warning(
+ "evaluate_gsm8k: sample %d raised %s: %s. "
+ "If all fail check that tokenizer has a chat_template.",
+ i, type(exc).__name__, exc,
+ )
+ elif _n_errors == _MAX_ERROR_WARNINGS + 1:
+ _logger.warning(
+ "evaluate_gsm8k: suppressing further errors (%d so far).",
+ _n_errors,
+ )
+ _logger.debug("Sample %d error: %s", i, exc, exc_info=True)
+
+ # ── Pass@K: sample K solutions at T=0.8 and check if any is correct ─
+ # This is the fair comparison to batch_acc during training (also K samples
+ # at T=0.8). Greedy (pass@1) is pessimistic; pass@k shows the upper bound
+ # the model can achieve with sampling, matching the training regime.
+ if pass_at_k > 1 and row.get("gold_final"):
+ _any = 0
+ for _ in range(pass_at_k):
+ try:
+ s = _generate(
+ model=model, tokenizer=tokenizer,
+ problem=row["question"],
+ max_new_tokens=max_new_tokens,
+ temperature=pass_at_k_temperature,
+ top_p=top_p, greedy=False,
+ )
+ pf = extract_final_answer_numeric_str(s) or ""
+ if _equiv_expr(pf, row["gold_final"]):
+ _any = 1
+ break
+ except Exception:
+ pass
+ _pak_any_correct.append(_any)
+
+ # ── Apply the SAME reward function used during GRPO training ──────────
+ if reward_fn is not None and pred_text:
+ try:
+ r = reward_fn(row["question"], pred_text, row["gold_final"])
+ _combined.append(float(r.get("combined_score", 0.0)))
+ _gt_match.append(1.0 if r.get("gt_match", False) else 0.0)
+ _prm_comp.append(float(r.get("prm_mean_score", 0.0)))
+ _prm_final.append(float(r.get("prm_final_score", 0.0)))
+ _step_acc.append(float(r.get("step_accuracy", 0.0)))
+ _lccp.append(float(r.get("lccp", 0.0)))
+ _sympy_comp.append(float(r.get("sympy_score", 0.0)))
+ _fmt_comp.append(float(r.get("format_score", 0.0)))
+ except Exception as rfn_exc:
+ _logger.debug("reward_fn failed for sample %d: %s", i, rfn_exc)
+
+ done = i + 1
+ # Periodically flush the CUDA allocator's free-block pool so that
+ # fragmentation from large KV-cache + PRM tensors doesn't accumulate
+ # and cause per-sample allocation time to grow throughout the run.
+ if done % 20 == 0:
+ import gc; gc.collect()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+ # Live bar: show training-objective score when available, else acc.
+ if _combined:
+ _pf: dict = dict(
+ score=f"{sum(_combined) / len(_combined):.3f}",
+ correct=f"{sum(_gt_match):.0f}/{len(_combined)}",
+ step_acc=f"{sum(_step_acc)/len(_step_acc):.1%}" if _step_acc else "—",
+ lccp=f"{sum(_lccp)/len(_lccp):.1%}" if _lccp else "—",
+ )
+ else:
+ _pf = dict(acc=f"{correct / done:.1%}", correct=f"{correct}/{done}")
+ pbar.set_postfix(**_pf, refresh=False)
+
+ # ── Aggregate ──────────────────────────────────────────────────────────
+ n_scored = len(_combined)
+ _avg = lambda lst: round(sum(lst) / len(lst), 4) if lst else 0.0
+
+ # Pass@K: fraction of problems where any of K sampled solutions was correct.
+ pass_at_k_score = _avg(_pak_any_correct) if _pak_any_correct else None
+
+ if reward_fn is not None:
+ combined_score = _avg(_combined)
+ result: dict = {
+ # PRIMARY: mean training-objective score.
+ # Formula: 0.50×correct + 0.40×process(prm_final, prm_mean) + 0.10×format
+ "accuracy": combined_score,
+ "combined_score": combined_score,
+ # PROCESS metrics — improve before correct_rate does
+ "step_accuracy": _avg(_step_acc),
+ "lccp": _avg(_lccp), # chain integrity: how far into solution stays correct
+ # Answer correctness
+ "correct_rate": _avg(_gt_match),
+ # PRM components
+ "prm_mean": _avg(_prm_comp),
+ "prm_final": _avg(_prm_final),
+ # Format / SymPy (informational)
+ "sympy_mean": _avg(_sympy_comp),
+ "format_mean": _avg(_fmt_comp),
+ "n_scored": n_scored,
+ "total": total,
+ "final_answer_correct": correct,
+ "final_answer_accuracy": correct / total if total else 0.0,
+ }
+ else:
+ _logger.warning(
+ "evaluate_gsm8k: no reward_fn provided — using final-answer accuracy. "
+ "Pass reward_fn=math_env.compute_grounded_reward for full training-objective eval."
+ )
+ fa_acc = correct / total if total else 0.0
+ result = {
+ "accuracy": fa_acc,
+ "combined_score": fa_acc,
+ "correct_rate": fa_acc,
+ "prm_mean": 0.0,
+ "sympy_mean": 0.0,
+ "format_mean": 0.0,
+ "n_scored": 0,
+ "total": total,
+ "final_answer_correct": correct,
+ "final_answer_accuracy": fa_acc,
+ }
+ # Attach pass@k if it was computed
+ if pass_at_k_score is not None:
+ result["pass_at_k"] = pass_at_k_score
+ result["pass_at_k_k"] = pass_at_k
+ return result
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/gsm8k_sft_pipeline.py b/scripts/gsm8k_sft_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..72fdfa6a0bc3b5ceb63f121b212dd3c68dc581a4
--- /dev/null
+++ b/scripts/gsm8k_sft_pipeline.py
@@ -0,0 +1,475 @@
+#!/usr/bin/env python3
+"""
+End-to-end GSM8K pipeline: prepare JSONL → QLoRA SFT → save adapter → inference.
+
+The trained model follows ``Step N:`` / ``Final Answer:`` formatting with SymPy-friendly
+expressions (see ``src.agent.math_agent.SOLVER_SYSTEM_PROMPT``).
+
+Examples
+--------
+ # 1) Only build training JSONL from Hugging Face GSM8K
+ python scripts/gsm8k_sft_pipeline.py prepare --output data/sft/gsm8k_sft.jsonl
+
+ # 2) Fine-tune (requires GPU recommended)
+ python scripts/gsm8k_sft_pipeline.py train \\
+ --data data/sft/gsm8k_sft.jsonl \\
+ --output-dir checkpoints/gsm8k_sft
+
+ # 3) Run inference with saved adapter
+ python scripts/gsm8k_sft_pipeline.py infer \\
+ --adapter checkpoints/gsm8k_sft \\
+ --problem \"Janet has 16 eggs. She eats 3. How many are left?\"
+
+ # Full chain
+ python scripts/gsm8k_sft_pipeline.py all --output-dir checkpoints/gsm8k_sft
+
+Dependencies: torch, transformers, peft, datasets, accelerate, bitsandbytes, trl, sympy
+
+Tip: if downloads fail with XET / "Background writer channel closed", export ``HF_HUB_DISABLE_XET=1``
+before running (this script sets it by default unless already set).
+"""
+
+from __future__ import annotations
+
+import os
+
+# hf-xet can error or segfault on interrupted/large shards; classic HTTP download is more robust.
+if "HF_HUB_DISABLE_XET" not in os.environ:
+ os.environ["HF_HUB_DISABLE_XET"] = "1"
+
+import argparse
+import json
+import math
+import subprocess
+import sys
+from pathlib import Path
+
+# Project root (…/Maths_LLM)
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def cmd_prepare(args: argparse.Namespace) -> None:
+ cmd = [
+ sys.executable,
+ str(ROOT / "scripts" / "convert_gsm8k_to_sft.py"),
+ "--output",
+ str(Path(args.output)),
+ "--splits",
+ *args.splits,
+ ]
+ if args.source == "jsonl":
+ cmd.extend(["--source", "jsonl", "--input", str(args.input)])
+ print("Running:", " ".join(cmd))
+ subprocess.check_call(cmd, cwd=str(ROOT))
+ if args.strip_scratchpads:
+ _rewrite_jsonl_strip_scratchpads(Path(args.output))
+
+
+def _rewrite_jsonl_strip_scratchpads(jsonl_path: Path) -> None:
+ from src.sft.solution_format import strip_gsm8k_scratchpads
+
+ tmp = jsonl_path.with_suffix(".jsonl.tmp")
+ n = 0
+ with jsonl_path.open(encoding="utf-8") as fin, tmp.open("w", encoding="utf-8") as fout:
+ for line in fin:
+ o = json.loads(line)
+ for m in o.get("messages", []):
+ if m.get("role") == "assistant":
+ m["content"] = strip_gsm8k_scratchpads(m["content"])
+ if "text" in o:
+ sys_p = next(x["content"] for x in o["messages"] if x["role"] == "system")
+ usr = next(x["content"] for x in o["messages"] if x["role"] == "user")
+ asst = next(x["content"] for x in o["messages"] if x["role"] == "assistant")
+ o["text"] = (
+ f"<|system|>\n{sys_p}\n<|user|>\n{usr}\n<|assistant|>\n{asst}"
+ )
+ fout.write(json.dumps(o, ensure_ascii=False) + "\n")
+ n += 1
+ tmp.replace(jsonl_path)
+ print(f"Stripped <<>> scratchpads in {n} records → {jsonl_path}")
+
+
+def _warmup_steps_from_ratio(
+ num_examples: int,
+ per_device_train_batch_size: int,
+ gradient_accumulation_steps: int,
+ num_train_epochs: float,
+ warmup_ratio: float,
+) -> int:
+ """Approximate HF Trainer optimizer steps; used to map legacy warmup_ratio → warmup_steps."""
+ if warmup_ratio <= 0:
+ return 0
+ num_batches = max(
+ 1,
+ (num_examples + per_device_train_batch_size - 1) // per_device_train_batch_size,
+ )
+ num_update_steps_per_epoch = max(1, num_batches // gradient_accumulation_steps)
+ total_optimizer_steps = max(1, math.ceil(num_train_epochs * num_update_steps_per_epoch))
+ return min(total_optimizer_steps, int(total_optimizer_steps * warmup_ratio))
+
+
+def cmd_train(args: argparse.Namespace) -> None:
+ try:
+ import torch
+ from datasets import load_dataset
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from trl import SFTConfig, SFTTrainer
+ except ImportError as e:
+ raise SystemExit(
+ "Missing dependency for training. Install:\n"
+ " pip install torch transformers peft datasets accelerate bitsandbytes trl sympy\n"
+ f"Original error: {e}"
+ ) from e
+
+ data_path = Path(args.data)
+ if not data_path.is_file():
+ raise SystemExit(f"Data file not found: {data_path}")
+
+ out_dir = Path(args.output_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ compute_dtype = getattr(torch, args.bnb_compute_dtype)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"
+
+ print(f"Loading model {args.model} …")
+ model = AutoModelForCausalLM.from_pretrained(
+ args.model,
+ quantization_config=bnb_config,
+ device_map="auto",
+ trust_remote_code=True,
+ dtype=compute_dtype,
+ )
+ model = prepare_model_for_kbit_training(model)
+ peft = LoraConfig(
+ r=args.lora_rank,
+ lora_alpha=args.lora_alpha,
+ lora_dropout=args.lora_dropout,
+ bias="none",
+ task_type="CAUSAL_LM",
+ target_modules=list(args.target_modules.split(",")),
+ )
+ model = get_peft_model(model, peft)
+ model.config.use_cache = False
+ model.print_trainable_parameters()
+
+ ds = load_dataset("json", data_files=str(data_path), split="train")
+ if args.max_samples and args.max_samples > 0:
+ ds = ds.select(range(min(args.max_samples, len(ds))))
+
+ def formatting_func(example):
+ return tokenizer.apply_chat_template(
+ example["messages"],
+ tokenize=False,
+ add_generation_prompt=False,
+ )
+
+ if args.warmup_steps is not None:
+ warmup_steps = max(0, args.warmup_steps)
+ else:
+ warmup_steps = _warmup_steps_from_ratio(
+ len(ds),
+ args.batch_size,
+ args.grad_accum,
+ args.epochs,
+ args.warmup_ratio,
+ )
+
+ sft_args = SFTConfig(
+ output_dir=str(out_dir),
+ num_train_epochs=args.epochs,
+ per_device_train_batch_size=args.batch_size,
+ gradient_accumulation_steps=args.grad_accum,
+ learning_rate=args.learning_rate,
+ logging_steps=args.logging_steps,
+ save_steps=args.save_steps,
+ save_total_limit=3,
+ bf16=args.bf16 and torch.cuda.is_available(),
+ fp16=args.fp16 and torch.cuda.is_available() and not args.bf16,
+ max_length=args.max_seq_length,
+ warmup_steps=warmup_steps,
+ lr_scheduler_type="cosine",
+ report_to="none",
+ gradient_checkpointing=True,
+ )
+
+ trainer = SFTTrainer(
+ model=model,
+ args=sft_args,
+ train_dataset=ds,
+ processing_class=tokenizer,
+ formatting_func=formatting_func,
+ )
+
+ trainer.train()
+ trainer.save_model(str(out_dir))
+ tokenizer.save_pretrained(str(out_dir))
+
+ with (out_dir / "pipeline_meta.json").open("w", encoding="utf-8") as f:
+ json.dump(
+ {
+ "base_model": args.model,
+ "data": str(data_path),
+ "lora_rank": args.lora_rank,
+ "epochs": args.epochs,
+ },
+ f,
+ indent=2,
+ )
+ print(f"Saved adapter and tokenizer to {out_dir}")
+
+
+def cmd_infer(args: argparse.Namespace) -> None:
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ from src.agent.math_agent import SOLVER_SYSTEM_PROMPT
+
+ adapter = Path(args.adapter)
+ meta_path = adapter / "pipeline_meta.json"
+ base_model = args.base_model
+ if meta_path.is_file():
+ meta = json.loads(meta_path.read_text(encoding="utf-8"))
+ base_model = meta.get("base_model", base_model)
+
+ compute_dtype = getattr(torch, args.bnb_compute_dtype)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=compute_dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(adapter, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ print(f"Loading base {base_model} + adapter {adapter} …")
+ base = AutoModelForCausalLM.from_pretrained(
+ base_model,
+ quantization_config=bnb_config,
+ device_map="auto",
+ trust_remote_code=True,
+ )
+ model = PeftModel.from_pretrained(base, str(adapter))
+ model.eval()
+
+ user_content = (
+ "Solve the following problem. Show your reasoning as numbered steps, "
+ "then give the final numeric answer on the last line.\n\n"
+ f"Problem:\n{args.problem.strip()}"
+ )
+ messages = [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": user_content},
+ ]
+ prompt = tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+ with torch.no_grad():
+ out = model.generate(
+ **inputs,
+ max_new_tokens=args.max_new_tokens,
+ temperature=args.temperature,
+ top_p=args.top_p,
+ do_sample=not args.greedy,
+ pad_token_id=tokenizer.pad_token_id,
+ )
+
+ gen_ids = out[0, inputs["input_ids"].shape[1] :]
+ text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+ print("\n--- Generated ---\n")
+ print(text)
+ print("\n--- Format check ---")
+ from src.sft.solution_format import validate_sympy_solution_format
+
+ r = validate_sympy_solution_format(text)
+ print(json.dumps(r.__dict__, indent=2))
+
+
+def cmd_all(args: argparse.Namespace) -> None:
+ out_jsonl = Path(args.data) if args.data else ROOT / "data" / "sft" / "gsm8k_sft.jsonl"
+ ns = argparse.Namespace(
+ output=out_jsonl,
+ source=args.prepare_source,
+ input=args.input,
+ splits=args.splits,
+ strip_scratchpads=args.strip_scratchpads,
+ )
+ cmd_prepare(ns)
+ train_ns = argparse.Namespace(
+ data=str(out_jsonl),
+ output_dir=args.output_dir,
+ model=args.model,
+ epochs=args.epochs,
+ batch_size=args.batch_size,
+ grad_accum=args.grad_accum,
+ learning_rate=args.learning_rate,
+ max_samples=args.max_samples,
+ lora_rank=args.lora_rank,
+ lora_alpha=args.lora_alpha,
+ lora_dropout=args.lora_dropout,
+ target_modules=args.target_modules,
+ max_seq_length=args.max_seq_length,
+ save_steps=args.save_steps,
+ logging_steps=args.logging_steps,
+ warmup_ratio=args.warmup_ratio,
+ warmup_steps=args.warmup_steps,
+ bf16=args.bf16,
+ fp16=args.fp16,
+ bnb_compute_dtype=args.bnb_compute_dtype,
+ )
+ cmd_train(train_ns)
+ if args.problem:
+ infer_ns = argparse.Namespace(
+ adapter=Path(args.output_dir),
+ base_model=args.model,
+ problem=args.problem,
+ max_new_tokens=args.max_new_tokens,
+ temperature=args.temperature,
+ top_p=args.top_p,
+ greedy=args.greedy,
+ bnb_compute_dtype=args.bnb_compute_dtype,
+ )
+ cmd_infer(infer_ns)
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(description="GSM8K SFT pipeline (prepare / train / infer / all)")
+ sub = p.add_subparsers(dest="command", required=True)
+
+ pr = sub.add_parser("prepare", help="Run convert_gsm8k_to_sft.py")
+ pr.add_argument("--output", type=str, default=str(ROOT / "data" / "sft" / "gsm8k_sft.jsonl"))
+ pr.add_argument("--source", choices=("hf", "jsonl"), default="hf")
+ pr.add_argument("--input", type=str, help="JSONL path for --source jsonl")
+ pr.add_argument("--splits", nargs="+", default=["train", "test"])
+ pr.add_argument(
+ "--strip-scratchpads",
+ action="store_true",
+ help="Remove GSM8K <<...>> traces from assistant text after conversion.",
+ )
+ pr.set_defaults(func=cmd_prepare)
+
+ tr = sub.add_parser("train", help="QLoRA SFT on JSONL with messages field")
+ tr.add_argument("--data", type=str, required=True, help="JSONL from prepare step")
+ tr.add_argument("--output-dir", type=str, required=True)
+ tr.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+ tr.add_argument("--epochs", type=float, default=1.0)
+ tr.add_argument("--batch-size", type=int, default=1)
+ tr.add_argument("--grad-accum", type=int, default=8)
+ tr.add_argument("--learning-rate", type=float, default=2e-4)
+ tr.add_argument("--max-samples", type=int, default=0, help="0 = use full dataset")
+ tr.add_argument("--lora-rank", type=int, default=16)
+ tr.add_argument("--lora-alpha", type=int, default=32)
+ tr.add_argument("--lora-dropout", type=float, default=0.05)
+ tr.add_argument(
+ "--target-modules",
+ type=str,
+ default="q_proj,v_proj,o_proj,gate_proj",
+ )
+ tr.add_argument("--max-seq-length", type=int, default=2048)
+ tr.add_argument("--save-steps", type=int, default=200)
+ tr.add_argument("--logging-steps", type=int, default=10)
+ tr.add_argument(
+ "--warmup-ratio",
+ type=float,
+ default=0.03,
+ help="Used only if --warmup-steps is not set; converted to warmup_steps.",
+ )
+ tr.add_argument(
+ "--warmup-steps",
+ type=int,
+ default=None,
+ help="LR warmup steps; if set, overrides --warmup-ratio.",
+ )
+ tr.add_argument("--bf16", action="store_true", default=True)
+ tr.add_argument("--no-bf16", dest="bf16", action="store_false")
+ tr.add_argument("--fp16", action="store_true")
+ tr.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ tr.set_defaults(func=cmd_train)
+
+ inf = sub.add_parser("infer", help="Generate with saved adapter")
+ inf.add_argument("--adapter", type=str, required=True, help="Directory from train step")
+ inf.add_argument(
+ "--base-model",
+ type=str,
+ default="Qwen/Qwen2.5-Math-1.5B-Instruct",
+ help="Must match base used in training if no pipeline_meta.json",
+ )
+ inf.add_argument("--problem", type=str, required=True)
+ inf.add_argument("--max-new-tokens", type=int, default=1024)
+ inf.add_argument("--temperature", type=float, default=0.7)
+ inf.add_argument("--top-p", type=float, default=0.95)
+ inf.add_argument("--greedy", action="store_true")
+ inf.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ inf.set_defaults(func=cmd_infer)
+
+ al = sub.add_parser("all", help="prepare + train [+ infer if --problem]")
+ al.add_argument("--data", type=str, default=None, help="Output JSONL path (default data/sft/gsm8k_sft.jsonl)")
+ al.add_argument("--prepare-source", choices=("hf", "jsonl"), default="hf")
+ al.add_argument("--input", type=str, help="For jsonl prepare")
+ al.add_argument("--splits", nargs="+", default=["train", "test"])
+ al.add_argument("--strip-scratchpads", action="store_true")
+ al.add_argument("--output-dir", type=str, required=True)
+ al.add_argument("--model", type=str, default="Qwen/Qwen2.5-Math-1.5B-Instruct")
+ al.add_argument("--epochs", type=float, default=1.0)
+ al.add_argument("--batch-size", type=int, default=1)
+ al.add_argument("--grad-accum", type=int, default=8)
+ al.add_argument("--learning-rate", type=float, default=2e-4)
+ al.add_argument("--max-samples", type=int, default=0)
+ al.add_argument("--lora-rank", type=int, default=16)
+ al.add_argument("--lora-alpha", type=int, default=32)
+ al.add_argument("--lora-dropout", type=float, default=0.05)
+ al.add_argument("--target-modules", type=str, default="q_proj,v_proj,o_proj,gate_proj")
+ al.add_argument("--max-seq-length", type=int, default=2048)
+ al.add_argument("--save-steps", type=int, default=200)
+ al.add_argument("--logging-steps", type=int, default=10)
+ al.add_argument(
+ "--warmup-ratio",
+ type=float,
+ default=0.03,
+ help="Used only if --warmup-steps is not set; converted to warmup_steps.",
+ )
+ al.add_argument(
+ "--warmup-steps",
+ type=int,
+ default=None,
+ help="LR warmup steps; if set, overrides --warmup-ratio.",
+ )
+ al.add_argument("--bf16", action="store_true", default=True)
+ al.add_argument("--no-bf16", dest="bf16", action="store_false")
+ al.add_argument("--fp16", action="store_true")
+ al.add_argument("--bnb-compute-dtype", type=str, default="bfloat16")
+ al.add_argument("--problem", type=str, default="", help="If set, run infer after train")
+ al.add_argument("--max-new-tokens", type=int, default=1024)
+ al.add_argument("--temperature", type=float, default=0.7)
+ al.add_argument("--top-p", type=float, default=0.95)
+ al.add_argument("--greedy", action="store_true")
+ al.set_defaults(func=cmd_all)
+
+ return p
+
+
+def main() -> None:
+ parser = build_parser()
+ args = parser.parse_args()
+ if str(ROOT) not in sys.path:
+ sys.path.insert(0, str(ROOT))
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/launch_grpo.sh b/scripts/launch_grpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0a61c03db1b014f236dbd7b9b5d7e6c1d0743e32
--- /dev/null
+++ b/scripts/launch_grpo.sh
@@ -0,0 +1,127 @@
+set -euo pipefail
+
+# ── Flash-Attention 2 install (if missing) ────────────────────────────────────
+# flash-attn requires (torch version, CUDA version, Python version) alignment.
+# MAX_JOBS caps parallel compilation; prebuilt wheel installs in <30 s.
+# In the prior run (grpo_20260425_151304), flash-attn was absent → SDPA fallback
+# → iter times of 262-330 s once question-gen started (vs ~150 s with Flash).
+if ! python -c "import flash_attn; assert int(flash_attn.__version__.split('.')[0]) >= 2" 2>/dev/null; then
+ echo "[launch] flash-attn not found or < v2 — installing now …"
+ MAX_JOBS=4 pip install flash-attn --no-build-isolation -q
+ echo "[launch] flash-attn installed."
+else
+ FLASH_VER=$(python -c "import flash_attn; print(flash_attn.__version__)" 2>/dev/null)
+ echo "[launch] flash-attn ${FLASH_VER} already installed — skipping install."
+fi
+
+# ── GPU / allocator ───────────────────────────────────────────────────────────
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+# expandable_segments: recovers 2-4 GB fragmented VRAM during long Flash+HF runs
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+
+# ── CPU / threading ───────────────────────────────────────────────────────────
+export OMP_NUM_THREADS=${OMP_NUM_THREADS:-8}
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-8}
+export TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM:-false}
+
+# ── Triton / Flash-Attn compilation cache ─────────────────────────────────────
+# Persists JIT kernels across runs — avoids ~30 s recompile each launch.
+export TRITON_CACHE_DIR=${TRITON_CACHE_DIR:-/tmp/triton_cache}
+export FLASH_ATTENTION_SKIP_CUDA_BUILD=${FLASH_ATTENTION_SKIP_CUDA_BUILD:-FALSE}
+
+# ── HuggingFace hub robustness ────────────────────────────────────────────────
+export HF_HUB_DISABLE_XET=${HF_HUB_DISABLE_XET:-1}
+export HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER:-0}
+export TRANSFORMERS_VERBOSITY=${TRANSFORMERS_VERBOSITY:-warning}
+
+# ── Python path ───────────────────────────────────────────────────────────────
+export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
+
+# ── Pre-flight: GPU info ───────────────────────────────────────────────────────
+if command -v nvidia-smi >/dev/null 2>&1; then
+ echo "─── nvidia-smi ───────────────────────────────────────────────────"
+ nvidia-smi --query-gpu=name,memory.total,memory.free,driver_version \
+ --format=csv,noheader || true
+ echo "──────────────────────────────────────────────────────────────────"
+fi
+
+# ── Confirm attention backend ─────────────────────────────────────────────────
+python - <<'PYEOF'
+import sys; sys.path.insert(0, '.')
+from src.utils.attn_backend import select_attn_implementation
+impl = select_attn_implementation()
+tag = {
+ "flash_attention_2": "FAST — Flash-Attn 2 active (O(T) memory, ~1.5-2× faster)",
+ "sdpa": "OK — SDPA active (install flash-attn for ~2× speedup)",
+ "eager": "SLOW — Eager fallback (install flash-attn for best speed)",
+}.get(impl, impl)
+print(f"[launch] attn_backend = {tag}")
+PYEOF
+
+# ── Log tee ───────────────────────────────────────────────────────────────────
+RUN_NAME="grpo_$(date +%Y%m%d_%H%M%S)"
+LOG_DIR="logs/grpo"
+mkdir -p "$LOG_DIR"
+LOG_FILE="$LOG_DIR/${RUN_NAME}.log"
+
+echo "[launch] run_name = $RUN_NAME"
+echo "[launch] base_model = checkpoints/dual_task_v1"
+echo "[launch] train_data = data/sft/gsm8k_sft.jsonl + data/math/math_numeric.jsonl"
+echo "[launch] eval_data = data/sft/gsm8k_test.jsonl"
+echo "[launch] log_file = $LOG_FILE"
+echo "[launch] architecture = Two-phase self-play (K_q=2, K=10, N=20)"
+echo "[launch] fixes_applied = min-warmup↑12, selfplay-gt-thresh↑0.65, kl-coef↑0.06,"
+echo "[launch] math-ramp-start↑18, group-size↑10, num-iters↑60"
+echo "[launch] wall-time ≈ 3.3 h (Flash active) / 4.5 h (SDPA fallback)"
+
+# ── Train ─────────────────────────────────────────────────────────────────────
+python -u scripts/run_grpo_training.py \
+ --base-model checkpoints/dual_task_v1 \
+ --output-dir checkpoints/grpo \
+ --gsm8k-data data/sft/gsm8k_sft.jsonl \
+ --eval-data-path data/sft/gsm8k_test.jsonl \
+ \
+ --num-iterations 60 \
+ --group-size 10 \
+ --q-group-size 2 \
+ --questions-per-iter 20 \
+ \
+ --learning-rate 5e-6 \
+ --max-new-tokens 1000 \
+ --temperature 0.8 \
+ --max-grad-norm 0.5 \
+ --clip-eps 0.2 \
+ --kl-coef 0.06 \
+ --warmup-iters 8 \
+ --min-lr-ratio 0.1 \
+ \
+ --difficulty-alpha 3.5 \
+ --self-play-ratio 0.70 \
+ \
+ --math-mix-ratio 0.30 \
+ --math-mix-ratio-late 0.50 \
+ --math-ramp-start 18 \
+ --math-max-difficulty 3 \
+ \
+ --overlong-filter \
+ --min-warmup 12 \
+ --selfplay-gt-thresh 0.65 \
+ --selfplay-grounded-thresh 0.65 \
+ --selfplay-step-thresh 0.68 \
+ --selfplay-ramp-iters 28 \
+ --grounded-floor 0.55 \
+ \
+ --extractor-model Qwen/Qwen2.5-0.5B-Instruct \
+ --extraction-cache data/extraction_cache.json \
+ \
+ --eval-every 5 \
+ --eval-max-samples 150 \
+ --eval-max-new-tokens 1000 \
+ --eval-pass-at-k 0 \
+ --save-every 5 \
+ --keep-last 4 \
+ \
+ --use-prm \
+ --prm-model Qwen/Qwen2.5-Math-PRM-7B \
+ --run-name "$RUN_NAME" \
+ "$@" 2>&1 | tee "$LOG_FILE"
diff --git a/scripts/plot_grpo_run.py b/scripts/plot_grpo_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d095d17129bd6140986cd05951bc5cacbde183
--- /dev/null
+++ b/scripts/plot_grpo_run.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+"""
+Generate demo-quality plots from a completed (or in-progress) GRPO run.
+
+Usage
+-----
+ # from the run output directory
+ python scripts/plot_grpo_run.py checkpoints/grpo//metrics.jsonl
+
+ # auto-discover the latest run
+ python scripts/plot_grpo_run.py --latest
+
+ # custom output directory
+ python scripts/plot_grpo_run.py metrics.jsonl --out-dir plots/my_run
+
+Output
+------
+Six PNG files saved next to the JSONL (or --out-dir if given):
+
+ 01_training_objective.png – combined_score vs iteration (PRIMARY demo plot)
+ 02_reward_components.png – 4-panel breakdown: correct / PRM / SymPy / format
+ 03_training_dynamics.png – GRPO loss + batch reward + batch accuracy
+ 04_reward_vs_eval.png – training reward vs eval score on same axis
+ 05_component_area.png – stacked-area chart of the 4 weighted components
+ 06_summary_card.png – single-panel card: all key metrics in one view
+
+All figures use a clean dark-on-white academic style. They are saved at
+300 dpi so they look sharp in slides and posters.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import matplotlib
+matplotlib.use("Agg") # headless — no display needed on training servers
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import numpy as np
+
+
+# ── Style ────────────────────────────────────────────────────────────────────
+
+PALETTE = {
+ "combined": "#2563EB", # blue — training objective
+ "correct": "#16A34A", # green — correctness
+ "prm": "#DC2626", # red — PRM step quality
+ "sympy": "#D97706", # amber — SymPy verification
+ "fmt": "#7C3AED", # violet — format
+ "reward": "#0891B2", # cyan — mean batch reward
+ "loss": "#64748B", # slate — loss
+ "batch_acc": "#059669", # emerald — batch accuracy
+}
+
+plt.rcParams.update({
+ "figure.dpi": 150,
+ "savefig.dpi": 300,
+ "font.family": "DejaVu Sans",
+ "axes.spines.top": False,
+ "axes.spines.right": False,
+ "axes.grid": True,
+ "grid.alpha": 0.3,
+ "grid.linestyle": "--",
+ "axes.labelsize": 11,
+ "axes.titlesize": 13,
+ "legend.fontsize": 9,
+ "xtick.labelsize": 9,
+ "ytick.labelsize": 9,
+})
+
+
+# ── Data loading ─────────────────────────────────────────────────────────────
+
+def _load(path: Path) -> List[Dict[str, Any]]:
+ rows = []
+ with path.open(encoding="utf-8") as fh:
+ for line in fh:
+ line = line.strip()
+ if line:
+ rows.append(json.loads(line))
+ return rows
+
+
+def _field(rows: List[Dict], key: str) -> Tuple[List[int], List[float]]:
+ """Return (iterations, values) for rows that have a non-empty key."""
+ iters, vals = [], []
+ for r in rows:
+ v = r.get(key)
+ if v is not None and v != "" and not (isinstance(v, float) and np.isnan(v)):
+ try:
+ iters.append(int(r["iteration"]))
+ vals.append(float(v))
+ except (TypeError, ValueError):
+ pass
+ return iters, vals
+
+
+# ── Individual plots ─────────────────────────────────────────────────────────
+
+def plot_training_objective(rows: List[Dict], out: Path) -> None:
+ """Plot 01: combined_score — the single most important demo plot."""
+ xi, xv = _field(rows, "combined_score")
+ if not xi:
+ return
+
+ fig, ax = plt.subplots(figsize=(9, 5))
+ ax.plot(xi, xv, color=PALETTE["combined"], linewidth=2.5,
+ marker="o", markersize=5, label="Training-objective score")
+ ax.fill_between(xi, xv, alpha=0.12, color=PALETTE["combined"])
+
+ # annotate first and last eval points
+ ax.annotate(f"{xv[0]:.3f}", (xi[0], xv[0]), textcoords="offset points",
+ xytext=(8, 6), fontsize=8, color=PALETTE["combined"])
+ ax.annotate(f"{xv[-1]:.3f}", (xi[-1], xv[-1]), textcoords="offset points",
+ xytext=(8, 6), fontsize=8, color=PALETTE["combined"])
+
+ ax.set_xlabel("Iteration")
+ ax.set_ylabel("Score (0 – 1)")
+ ax.set_title(
+ "GRPO Training — Combined Reward Score\n"
+ "0.60 × correct + 0.15 × PRM + 0.15 × SymPy + 0.10 × format",
+ fontsize=12,
+ )
+ ax.set_ylim(0, 1.05)
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+ ax.legend(loc="lower right")
+ fig.tight_layout()
+ fig.savefig(out)
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+def plot_reward_components(rows: List[Dict], out: Path) -> None:
+ """Plot 02: four-panel breakdown of each reward component."""
+ specs = [
+ ("correct_rate", "correct", "Correctness (gt_match)", "60 %"),
+ ("prm_mean", "prm", "PRM Step Quality", "15 %"),
+ ("sympy_mean", "sympy", "SymPy Verification", "15 %"),
+ ("format_mean", "fmt", "Format Compliance", "10 %"),
+ ]
+
+ fig, axes = plt.subplots(2, 2, figsize=(12, 7), sharex=False)
+ axes = axes.flatten()
+
+ for ax, (key, pal, title, weight) in zip(axes, specs):
+ xi, xv = _field(rows, key)
+ if not xi:
+ ax.set_visible(False)
+ continue
+ ax.plot(xi, xv, color=PALETTE[pal], linewidth=2,
+ marker="o", markersize=4)
+ ax.fill_between(xi, xv, alpha=0.12, color=PALETTE[pal])
+ ax.set_title(f"{title} (weight {weight})", fontsize=11)
+ ax.set_xlabel("Iteration")
+ ax.set_ylabel("Score")
+ ax.set_ylim(0, 1.05)
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+
+ if xv:
+ delta = xv[-1] - xv[0]
+ sign = "+" if delta >= 0 else ""
+ ax.set_title(
+ f"{title} (weight {weight}) Δ={sign}{delta:+.1%}",
+ fontsize=10,
+ )
+
+ fig.suptitle("Reward Component Breakdown over Training", fontsize=13, y=1.01)
+ fig.tight_layout()
+ fig.savefig(out, bbox_inches="tight")
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+def plot_training_dynamics(rows: List[Dict], out: Path) -> None:
+ """Plot 03: loss, mean_reward, batch_accuracy over all iterations."""
+ li, lv = _field(rows, "loss")
+ ri, rv = _field(rows, "mean_reward")
+ bi, bv = _field(rows, "batch_accuracy")
+
+ fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
+
+ if lv:
+ axes[0].plot(li, lv, color=PALETTE["loss"], linewidth=1.8)
+ axes[0].fill_between(li, lv, alpha=0.1, color=PALETTE["loss"])
+ axes[0].set_ylabel("GRPO Loss")
+ axes[0].set_title("Training Loss", fontsize=11)
+ axes[0].axhline(0, color="black", linewidth=0.8, linestyle="--", alpha=0.4)
+
+ if rv:
+ axes[1].plot(ri, rv, color=PALETTE["reward"], linewidth=1.8)
+ axes[1].fill_between(ri, rv, alpha=0.1, color=PALETTE["reward"])
+ axes[1].set_ylabel("Reward")
+ axes[1].set_ylim(0, 1.05)
+ axes[1].set_title("Mean Batch Reward", fontsize=11)
+ axes[1].yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+
+ if bv:
+ axes[2].plot(bi, bv, color=PALETTE["batch_acc"], linewidth=1.8)
+ axes[2].fill_between(bi, bv, alpha=0.1, color=PALETTE["batch_acc"])
+ axes[2].set_ylabel("Accuracy")
+ axes[2].set_ylim(0, 1.05)
+ axes[2].set_title("Batch Accuracy (training rollouts)", fontsize=11)
+ axes[2].yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+
+ for ax in axes:
+ ax.set_xlabel("Iteration")
+
+ fig.suptitle("GRPO Training Dynamics", fontsize=13)
+ fig.tight_layout()
+ fig.savefig(out)
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+def plot_reward_vs_eval(rows: List[Dict], out: Path) -> None:
+ """Plot 04: mean_reward (all iters) + combined_score (eval iters) overlaid."""
+ ri, rv = _field(rows, "mean_reward")
+ ei, ev = _field(rows, "combined_score")
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+
+ if rv:
+ ax.plot(ri, rv, color=PALETTE["reward"], linewidth=1.4, alpha=0.7,
+ label="Batch reward (training)")
+ ax.fill_between(ri, rv, alpha=0.06, color=PALETTE["reward"])
+
+ if ev:
+ ax.plot(ei, ev, color=PALETTE["combined"], linewidth=2.5,
+ marker="D", markersize=6, label="Eval score (held-out GSM8K)")
+ for x, y in zip(ei, ev):
+ ax.annotate(f"{y:.3f}", (x, y), textcoords="offset points",
+ xytext=(0, 8), ha="center", fontsize=7,
+ color=PALETTE["combined"])
+
+ ax.set_xlabel("Iteration")
+ ax.set_ylabel("Score (0 – 1)")
+ ax.set_ylim(0, 1.05)
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+ ax.set_title("Training Reward vs Held-Out Eval Score", fontsize=12)
+ ax.legend()
+ fig.tight_layout()
+ fig.savefig(out)
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+def plot_component_area(rows: List[Dict], out: Path) -> None:
+ """Plot 05: stacked-area of the four WEIGHTED components summing to combined_score."""
+ ei, ev_combined = _field(rows, "combined_score")
+ if not ei:
+ return
+
+ # Build per-component weighted series aligned to eval iterations
+ iter_set = set(ei)
+ aligned: Dict[str, List[float]] = {k: [] for k in ("correct", "prm", "sympy", "fmt")}
+ weights = {"correct": 0.60, "prm": 0.15, "sympy": 0.15, "fmt": 0.10}
+ keys = {"correct": "correct_rate", "prm": "prm_mean",
+ "sympy": "sympy_mean", "fmt": "format_mean"}
+
+ # Build lookup per iteration
+ it_map: Dict[int, Dict] = {r["iteration"]: r for r in rows if r["iteration"] in iter_set}
+ iters_sorted = sorted(iter_set)
+
+ for it in iters_sorted:
+ row = it_map.get(it, {})
+ for comp, field in keys.items():
+ v = row.get(field)
+ if v is not None and v != "":
+ aligned[comp].append(float(v) * weights[comp])
+ else:
+ aligned[comp].append(0.0)
+
+ x = np.array(iters_sorted)
+ arr = np.array([aligned["correct"], aligned["prm"],
+ aligned["sympy"], aligned["fmt"]])
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+ labels = ["Correct (×0.60)", "PRM (×0.15)", "SymPy (×0.15)", "Format (×0.10)"]
+ colors = [PALETTE[k] for k in ("correct", "prm", "sympy", "fmt")]
+ ax.stackplot(x, arr, labels=labels, colors=colors, alpha=0.75)
+
+ ax.plot(x, ev_combined, color="black", linewidth=1.5,
+ linestyle="--", label="Combined score", zorder=5)
+
+ ax.set_xlabel("Iteration")
+ ax.set_ylabel("Weighted contribution to score")
+ ax.set_ylim(0, 1.0)
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+ ax.set_title("Contribution of Each Reward Component (Stacked)", fontsize=12)
+ ax.legend(loc="lower right", ncol=2)
+ fig.tight_layout()
+ fig.savefig(out)
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+def plot_summary_card(rows: List[Dict], run_name: str, out: Path) -> None:
+ """Plot 06: all key metrics on a single clean card — ideal for poster / slide."""
+ ei, ev = _field(rows, "combined_score")
+ _, crv = _field(rows, "correct_rate")
+ _, prmv = _field(rows, "prm_mean")
+ _, syv = _field(rows, "sympy_mean")
+ _, fmv = _field(rows, "format_mean")
+ _, lv = _field(rows, "loss")
+ _, rv = _field(rows, "mean_reward")
+ li = _field(rows, "loss")[0]
+ ri = _field(rows, "mean_reward")[0]
+
+ fig, axes = plt.subplots(2, 3, figsize=(15, 8))
+ axes = axes.flatten()
+
+ def _panel(ax, iters, vals, color, title, pct=True):
+ if not iters:
+ ax.set_visible(False)
+ return
+ ax.plot(iters, vals, color=color, linewidth=2, marker="o", markersize=4)
+ ax.fill_between(iters, vals, alpha=0.12, color=color)
+ ax.set_title(title, fontsize=11, fontweight="bold")
+ ax.set_xlabel("Iteration", fontsize=9)
+ if pct:
+ ax.set_ylim(0, 1.05)
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
+ if vals:
+ ax.annotate(f"{vals[-1]:.3f}", (iters[-1], vals[-1]),
+ textcoords="offset points", xytext=(6, 4),
+ fontsize=8, color=color)
+
+ _panel(axes[0], ei, ev, PALETTE["combined"], "Training-Objective Score")
+ _panel(axes[1], ei, crv, PALETTE["correct"], "Correctness Rate")
+ _panel(axes[2], ei, prmv, PALETTE["prm"], "PRM Step Quality")
+ _panel(axes[3], ei, syv, PALETTE["sympy"], "SymPy Verification")
+ _panel(axes[4], ei, fmv, PALETTE["fmt"], "Format Compliance")
+ _panel(axes[5], li, lv, PALETTE["loss"], "GRPO Loss", pct=False)
+
+ fig.suptitle(f"GRPO Training Summary — {run_name}", fontsize=14, fontweight="bold")
+ fig.tight_layout()
+ fig.savefig(out, bbox_inches="tight")
+ plt.close(fig)
+ print(f" saved {out.name}")
+
+
+# ── CLI ──────────────────────────────────────────────────────────────────────
+
+def find_latest_metrics() -> Optional[Path]:
+ """Find the most recently modified metrics.jsonl under checkpoints/grpo/."""
+ ckpt = Path("checkpoints/grpo")
+ if not ckpt.exists():
+ return None
+ candidates = sorted(
+ ckpt.rglob("metrics.jsonl"),
+ key=lambda p: p.stat().st_mtime,
+ )
+ return candidates[-1] if candidates else None
+
+
+def generate_plots(metrics_path: Path, out_dir: Optional[Path] = None) -> Path:
+ """Generate all six plots and return the output directory."""
+ rows = _load(metrics_path)
+ if not rows:
+ print(f"[plot] No data in {metrics_path}", file=sys.stderr)
+ return metrics_path.parent
+
+ out_dir = out_dir or metrics_path.parent / "plots"
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ # Derive run name from the directory name two levels up
+ run_name = metrics_path.parent.name
+
+ print(f"[plot] Generating plots for run '{run_name}' ({len(rows)} iterations)")
+ print(f"[plot] Output → {out_dir}")
+
+ plot_training_objective(rows, out_dir / "01_training_objective.png")
+ plot_reward_components(rows, out_dir / "02_reward_components.png")
+ plot_training_dynamics(rows, out_dir / "03_training_dynamics.png")
+ plot_reward_vs_eval(rows, out_dir / "04_reward_vs_eval.png")
+ plot_component_area(rows, out_dir / "05_component_area.png")
+ plot_summary_card(rows, run_name, out_dir / "06_summary_card.png")
+
+ print(f"[plot] Done — {len(list(out_dir.glob('*.png')))} PNGs in {out_dir}")
+ return out_dir
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Generate demo plots from a GRPO metrics.jsonl file."
+ )
+ parser.add_argument(
+ "metrics_jsonl", nargs="?", type=Path, default=None,
+ help="Path to metrics.jsonl produced by run_grpo_training.py",
+ )
+ parser.add_argument(
+ "--latest", action="store_true",
+ help="Auto-discover the most recent metrics.jsonl under checkpoints/grpo/",
+ )
+ parser.add_argument(
+ "--out-dir", type=Path, default=None,
+ help="Directory to write PNG files (default: /plots/)",
+ )
+ args = parser.parse_args()
+
+ if args.latest:
+ path = find_latest_metrics()
+ if path is None:
+ print("No metrics.jsonl found under checkpoints/grpo/", file=sys.stderr)
+ sys.exit(1)
+ print(f"[plot] Auto-selected {path}")
+ elif args.metrics_jsonl:
+ path = args.metrics_jsonl
+ else:
+ parser.print_help()
+ sys.exit(1)
+
+ if not path.exists():
+ print(f"File not found: {path}", file=sys.stderr)
+ sys.exit(1)
+
+ generate_plots(path, args.out_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/plot_training_results.py b/scripts/plot_training_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..818c7e92ed01bce74a6d4afaaa595ac23cd1655d
--- /dev/null
+++ b/scripts/plot_training_results.py
@@ -0,0 +1,521 @@
+#!/usr/bin/env python3
+"""
+AxiomForgeAI — Training Results Plots
+======================================
+Reads the metrics CSV from a GRPO training run and generates five focused plots
+that tell the story of what improved, how self-play was earned, and why step-level
+reasoning quality matters as much as final-answer accuracy.
+
+All plots are saved to images/ as high-resolution PNGs.
+
+Usage
+-----
+ python scripts/plot_training_results.py
+ python scripts/plot_training_results.py --metrics logs/grpo/grpo_20260426_032827/metrics.csv
+ python scripts/plot_training_results.py --out images/
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+from pathlib import Path
+from typing import Dict, List
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+
+# ── Style ──────────────────────────────────────────────────────────────────────
+PALETTE = {
+ "indigo": "#6366f1",
+ "pink": "#ec4899",
+ "cyan": "#06b6d4",
+ "amber": "#f59e0b",
+ "emerald": "#10b981",
+ "slate": "#94a3b8",
+ "red": "#ef4444",
+ "violet": "#8b5cf6",
+ "white": "#f8fafc",
+ "bg": "#0f172a",
+ "bg2": "#1e293b",
+ "gridline": "#1e293b",
+}
+
+plt.rcParams.update({
+ "figure.facecolor": PALETTE["bg"],
+ "axes.facecolor": PALETTE["bg"],
+ "axes.edgecolor": PALETTE["slate"],
+ "axes.labelcolor": PALETTE["white"],
+ "axes.titlecolor": PALETTE["white"],
+ "axes.titlesize": 13,
+ "axes.labelsize": 11,
+ "axes.grid": True,
+ "grid.color": "#1e293b",
+ "grid.linewidth": 0.8,
+ "xtick.color": PALETTE["slate"],
+ "ytick.color": PALETTE["slate"],
+ "xtick.labelsize": 9,
+ "ytick.labelsize": 9,
+ "legend.facecolor": "#1e293b",
+ "legend.edgecolor": PALETTE["slate"],
+ "legend.labelcolor": PALETTE["white"],
+ "legend.fontsize": 9,
+ "text.color": PALETTE["white"],
+ "font.family": "sans-serif",
+ "lines.linewidth": 2.0,
+})
+
+PHASE_COLORS = {
+ "GROUNDED_ONLY": ("#6366f120", "#6366f1"),
+ "SELFPLAY_RAMP": ("#10b98120", "#10b981"),
+}
+
+DPI = 160
+IMAGES_DIR = Path("images")
+
+DEFAULT_METRICS = (
+ "logs/grpo/grpo_20260426_032827/metrics.csv"
+)
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+def load_csv(path: str) -> List[Dict]:
+ rows = []
+ with open(path, encoding="utf-8") as f:
+ for r in csv.DictReader(f):
+ rows.append({k: v for k, v in r.items()})
+ return rows
+
+
+def f(row: Dict, key: str, default: float = float("nan")) -> float:
+ v = row.get(key, "")
+ try:
+ return float(v) if v != "" else default
+ except (ValueError, TypeError):
+ return default
+
+
+def moving_avg(values: List[float], w: int = 3) -> List[float]:
+ result = []
+ for i in range(len(values)):
+ lo = max(0, i - w + 1)
+ chunk = [v for v in values[lo : i + 1] if not np.isnan(v)]
+ result.append(float(np.mean(chunk)) if chunk else float("nan"))
+ return result
+
+
+def shade_phases(ax, iters, phases):
+ """Draw translucent background rectangles for each training phase."""
+ prev_phase, start = None, iters[0]
+ for it, ph in zip(iters, phases):
+ if ph != prev_phase:
+ if prev_phase is not None:
+ bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
+ ax.axvspan(start - 0.5, it - 0.5, facecolor=bg, linewidth=0, zorder=0)
+ prev_phase, start = ph, it
+ if prev_phase is not None:
+ bg, _ = PHASE_COLORS.get(prev_phase, ("#ffffff10", "#ffffff"))
+ ax.axvspan(start - 0.5, iters[-1] + 0.5, facecolor=bg, linewidth=0, zorder=0)
+
+
+def phase_legend_patches(phases):
+ seen = []
+ patches = []
+ for ph in phases:
+ if ph not in seen:
+ seen.append(ph)
+ _, edge = PHASE_COLORS.get(ph, ("#ffffff10", "#ffffff"))
+ label = ph.replace("_", " ").title()
+ patches.append(mpatches.Patch(facecolor=edge + "40", edgecolor=edge,
+ linewidth=1.2, label=label))
+ return patches
+
+
+def annotate_transition(ax, x_iter, label, ypos=0.97, color="#94a3b8"):
+ ax.axvline(x=x_iter - 0.5, color=color, linewidth=1, linestyle="--", alpha=0.7)
+ ax.text(x_iter, ypos, label, transform=ax.get_xaxis_transform(),
+ fontsize=7.5, color=color, ha="left", va="top",
+ bbox=dict(facecolor=PALETTE["bg2"], edgecolor="none", pad=2))
+
+
+def save(fig: plt.Figure, name: str, out: Path):
+ out.mkdir(parents=True, exist_ok=True)
+ path = out / name
+ fig.savefig(path, dpi=DPI, bbox_inches="tight", facecolor=fig.get_facecolor())
+ print(f" ✓ {path}")
+ plt.close(fig)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 1 — Hero: Reasoning quality at evaluation checkpoints
+# Shows four signals together: GSM8K accuracy, combined score, step accuracy,
+# and LCCP. The message: the model doesn't just get more answers right —
+# every step of the reasoning chain gets better.
+# ══════════════════════════════════════════════════════════════════════════════
+
+def plot_eval_quality(rows: List[Dict], out: Path):
+ eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
+ iters = [int(r["iteration"]) for r in eval_rows]
+
+ gsm8k_acc = [f(r, "eval_correct_rt") * 100 for r in eval_rows]
+ combined = [f(r, "eval_combined") * 100 for r in eval_rows]
+ step_acc = [f(r, "eval_step_acc") * 100 for r in eval_rows]
+ lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
+ prm = [f(r, "eval_prm") * 100 for r in eval_rows]
+
+ fig, ax = plt.subplots(figsize=(9, 5))
+ fig.suptitle("Evaluation Quality Over Training — AxiomForgeAI",
+ fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+
+ # --- lines
+ ax.plot(iters, gsm8k_acc, "o-", color=PALETTE["pink"], label="GSM8K Accuracy (final answer)", ms=7, zorder=5)
+ ax.plot(iters, combined, "s-", color=PALETTE["indigo"], label="Combined Score", ms=6, zorder=5)
+ ax.plot(iters, step_acc, "^-", color=PALETTE["cyan"], label="Step Accuracy (reasoning chain)", ms=6, zorder=5)
+ ax.plot(iters, lccp, "D-", color=PALETTE["emerald"], label="LCCP (chain integrity)", ms=6, zorder=5)
+ ax.plot(iters, prm, "v--", color=PALETTE["amber"], label="PRM Mean Score", ms=5, alpha=0.8, zorder=4)
+
+ # annotate best GSM8K
+ best_gsm = max(gsm8k_acc)
+ bi = gsm8k_acc.index(best_gsm)
+ ax.annotate(f" {best_gsm:.1f}%",
+ xy=(iters[bi], best_gsm), fontsize=9, color=PALETTE["pink"],
+ va="bottom", ha="left")
+
+ # annotate best combined
+ best_c = max(combined)
+ bci = combined.index(best_c)
+ ax.annotate(f" {best_c:.1f}",
+ xy=(iters[bci], best_c), fontsize=9, color=PALETTE["indigo"],
+ va="top", ha="left")
+
+ ax.set_xlabel("Training Iteration")
+ ax.set_ylabel("Score (%)")
+ ax.set_xticks(iters)
+ ax.set_ylim(78, 96)
+ ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax.legend(loc="lower right", framealpha=0.8)
+ ax.set_title(
+ "Four angles on quality — answer correctness, holistic score, per-step reasoning, and chain integrity",
+ fontsize=9, color=PALETTE["slate"], pad=6,
+ )
+
+ fig.tight_layout()
+ save(fig, "plot1_eval_quality.png", out)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 2 — Training Journey: full 30-iteration timeline with phase shading
+# Shows mean reward, GT match rate, and step accuracy over every iteration.
+# Phase backgrounds show when self-play unlocked and the curriculum ramped.
+# ══════════════════════════════════════════════════════════════════════════════
+
+def plot_training_journey(rows: List[Dict], out: Path):
+ iters = [int(r["iteration"]) for r in rows]
+ phases = [r["training_phase"] for r in rows]
+ mean_r = [f(r, "mean_reward") * 100 for r in rows]
+ gt_match = [f(r, "gt_match_rate") * 100 for r in rows]
+ step_acc = [f(r, "step_accuracy") * 100 for r in rows]
+ batch_acc = [f(r, "batch_accuracy") * 100 for r in rows]
+
+ ma_reward = moving_avg(mean_r, w=4)
+ ma_gt = moving_avg(gt_match, w=4)
+ ma_step = moving_avg(step_acc, w=4)
+
+ fig, ax = plt.subplots(figsize=(11, 5))
+ shade_phases(ax, iters, phases)
+
+ # raw (faint)
+ ax.plot(iters, mean_r, alpha=0.25, color=PALETTE["indigo"], linewidth=1)
+ ax.plot(iters, gt_match, alpha=0.25, color=PALETTE["pink"], linewidth=1)
+ ax.plot(iters, step_acc, alpha=0.25, color=PALETTE["cyan"], linewidth=1)
+
+ # smoothed (bold)
+ ax.plot(iters, ma_reward, color=PALETTE["indigo"], linewidth=2.5, label="Mean Reward (smooth)")
+ ax.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5, label="GT Match Rate (smooth)")
+ ax.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5, label="Step Accuracy (smooth)")
+
+ # self-play transition annotation
+ sp_start = next(i for i, p in enumerate(phases) if p == "SELFPLAY_RAMP")
+ annotate_transition(ax, iters[sp_start], "Self-play\nunlocked", ypos=0.98,
+ color=PALETTE["emerald"])
+
+ ax.set_xlabel("Training Iteration")
+ ax.set_ylabel("Score (%)")
+ ax.set_ylim(55, 105)
+ ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax.set_xticks(range(1, max(iters) + 1, 2))
+ ax.set_title("30-Iteration GRPO Training Timeline | Faint = raw · Bold = 4-iter moving average",
+ fontsize=9, color=PALETTE["slate"], pad=6)
+ fig.suptitle("Training Journey — Reward, GT Match & Step Accuracy",
+ fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+
+ legend_patches = phase_legend_patches(phases)
+ h, l = ax.get_legend_handles_labels()
+ ax.legend(handles=h + legend_patches, loc="lower right", framealpha=0.8, ncol=2)
+
+ fig.tight_layout()
+ save(fig, "plot2_training_journey.png", out)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 3 — Self-Play Success: the curriculum earning its right to generate
+# Shows the self-play ratio ramping up while question quality stays high.
+# The headline: by iteration 30 more than 60% of training is model-generated,
+# and those questions are 95-100% solvable and genuinely novel.
+# ══════════════════════════════════════════════════════════════════════════════
+
+def plot_selfplay_success(rows: List[Dict], out: Path):
+ sp_rows = [r for r in rows if f(r, "q_reward") > 0]
+ iters = [int(r["iteration"]) for r in sp_rows]
+ sp_rat = [f(r, "sp_ratio") * 100 for r in sp_rows]
+ q_sol = [f(r, "q_solvability") * 100 for r in sp_rows]
+ q_nov = [f(r, "q_novelty") * 100 for r in sp_rows]
+ q_rew = [f(r, "q_reward") * 100 for r in sp_rows]
+
+ fig, ax1 = plt.subplots(figsize=(10, 5))
+ ax2 = ax1.twinx()
+ ax2.tick_params(axis="y", labelcolor=PALETTE["slate"])
+ ax2.spines["right"].set_color(PALETTE["slate"])
+
+ # self-play ramp (left axis)
+ ax1.fill_between(iters, sp_rat, alpha=0.18, color=PALETTE["emerald"])
+ ax1.plot(iters, sp_rat, "o-", color=PALETTE["emerald"], ms=6,
+ label="Self-play ratio", linewidth=2.5)
+ ax1.set_ylabel("Self-play share of training (%)", color=PALETTE["emerald"])
+ ax1.tick_params(axis="y", labelcolor=PALETTE["emerald"])
+ ax1.set_ylim(0, 80)
+
+ # question quality (right axis)
+ ax2.plot(iters, q_sol, "s--", color=PALETTE["cyan"], ms=5, label="Solvability", linewidth=1.8)
+ ax2.plot(iters, q_nov, "^--", color=PALETTE["amber"], ms=5, label="Novelty", linewidth=1.8)
+ ax2.plot(iters, q_rew, "D--", color=PALETTE["pink"], ms=5, label="Q-Reward", linewidth=1.8)
+ ax2.set_ylabel("Question quality score (%)", color=PALETTE["slate"])
+ ax2.set_ylim(0, 115)
+
+ # merge legends
+ h1, l1 = ax1.get_legend_handles_labels()
+ h2, l2 = ax2.get_legend_handles_labels()
+ ax1.legend(h1 + h2, l1 + l2, loc="upper left", framealpha=0.8)
+
+ ax1.set_xlabel("Training Iteration")
+ ax1.set_xticks(iters)
+ ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+
+ # annotate final sp ratio
+ ax1.annotate(f" {sp_rat[-1]:.0f}% self-play\n by iter {iters[-1]}",
+ xy=(iters[-1], sp_rat[-1]), fontsize=9, color=PALETTE["emerald"],
+ va="center", ha="left")
+
+ fig.suptitle("Self-Play Curriculum — The Model Earns Its Own Training Data",
+ fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+ ax1.set_title(
+ "Self-play ratio ramps from 0 → 61% · Generated questions stay 93-100% solvable throughout",
+ fontsize=9, color=PALETTE["slate"], pad=6,
+ )
+ fig.tight_layout()
+ save(fig, "plot3_selfplay_success.png", out)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 4 — Reward Signal Tightening: mean ± std over 30 iterations
+# As the policy learns what "good" looks like, the spread between the best
+# and worst solutions in a group narrows. Lower variance = more consistent
+# reasoning, not lucky guessing.
+# ══════════════════════════════════════════════════════════════════════════════
+
+def plot_reward_confidence(rows: List[Dict], out: Path):
+ iters = [int(r["iteration"]) for r in rows]
+ phases = [r["training_phase"] for r in rows]
+ mean_r = np.array([f(r, "mean_reward") for r in rows])
+ std_r = np.array([f(r, "std_reward") for r in rows])
+ skipped = np.array([f(r, "skipped_groups", 0) for r in rows])
+ n_grps = np.array([f(r, "n_groups", 1) for r in rows])
+ skip_rt = skipped / np.maximum(n_grps, 1) * 100
+
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 7), sharex=True,
+ gridspec_kw={"height_ratios": [3, 1.2]})
+ fig.suptitle("Reward Confidence — Mean ± Std & Skipped Groups Over 30 Iterations",
+ fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+
+ shade_phases(ax1, iters, phases)
+
+ ax1.fill_between(iters, (mean_r - std_r) * 100, (mean_r + std_r) * 100,
+ alpha=0.20, color=PALETTE["indigo"])
+ ax1.plot(iters, mean_r * 100, color=PALETTE["indigo"], linewidth=2.5, label="Mean reward")
+ ax1.plot(iters, (mean_r - std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
+ alpha=0.6, label="±1 std")
+ ax1.plot(iters, (mean_r + std_r) * 100, "--", color=PALETTE["slate"], linewidth=1,
+ alpha=0.6)
+
+ # highlight the two tight-cluster peaks
+ for special_iter, label in [(11, "iter 11\nstd=0.098"), (22, "iter 22\nstd=0.124")]:
+ si = iters.index(special_iter)
+ ax1.annotate(label,
+ xy=(special_iter, (mean_r[si] + std_r[si]) * 100),
+ xytext=(special_iter + 1, (mean_r[si] + std_r[si]) * 100 + 2),
+ fontsize=8, color=PALETTE["amber"],
+ arrowprops=dict(arrowstyle="->", color=PALETTE["amber"], lw=1.2))
+
+ ax1.set_ylabel("Reward (%)")
+ ax1.set_ylim(55, 115)
+ ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ h1, l1 = ax1.get_legend_handles_labels()
+ ax1.legend(handles=h1 + phase_legend_patches(phases), framealpha=0.8, ncol=3)
+
+ # skip-rate bar chart (bottom panel)
+ shade_phases(ax2, iters, phases)
+ ax2.bar(iters, skip_rt, color=PALETTE["red"], alpha=0.7, width=0.7, label="Skipped groups %")
+ ax2.set_ylabel("Skipped\ngroups (%)")
+ ax2.set_xlabel("Training Iteration")
+ ax2.set_ylim(0, 75)
+ ax2.set_xticks(range(1, max(iters) + 1, 2))
+ ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax2.legend(loc="upper right", framealpha=0.8)
+
+ fig.tight_layout()
+ save(fig, "plot4_reward_confidence.png", out)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOT 5 — Step-Level Reasoning Quality: train vs eval
+# Breaks down the two signals that measure HOW the model thinks (not just
+# whether it gets the final answer right): step accuracy and LCCP.
+# Train lines are noisy; eval lines show clean upward trends.
+# ══════════════════════════════════════════════════════════════════════════════
+
+def plot_reasoning_quality(rows: List[Dict], out: Path):
+ iters = [int(r["iteration"]) for r in rows]
+ phases = [r["training_phase"] for r in rows]
+
+ # training
+ t_step = [f(r, "step_accuracy") * 100 for r in rows]
+ t_lccp = [f(r, "lccp") * 100 for r in rows]
+ t_gt = [f(r, "gt_match_rate") * 100 for r in rows]
+
+ # eval (only at checkpoint iters)
+ eval_rows = [r for r in rows if r.get("eval_combined", "") != ""]
+ e_iters = [int(r["iteration"]) for r in eval_rows]
+ e_step = [f(r, "eval_step_acc") * 100 for r in eval_rows]
+ e_lccp = [f(r, "eval_lccp") * 100 for r in eval_rows]
+
+ # moving averages
+ ma_step = moving_avg(t_step, w=4)
+ ma_lccp = moving_avg(t_lccp, w=4)
+ ma_gt = moving_avg(t_gt, w=4)
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5))
+ fig.suptitle("Step-Level Reasoning Quality — Training vs Held-Out Evaluation",
+ fontsize=14, fontweight="bold", color=PALETTE["white"], y=1.01)
+
+ # ── LEFT: step accuracy ──
+ shade_phases(ax1, iters, phases)
+ ax1.plot(iters, t_step, alpha=0.2, color=PALETTE["cyan"], linewidth=1)
+ ax1.plot(iters, ma_step, color=PALETTE["cyan"], linewidth=2.5, label="Train step acc (smooth)")
+ ax1.plot(iters, t_gt, alpha=0.15, color=PALETTE["pink"], linewidth=1)
+ ax1.plot(iters, ma_gt, color=PALETTE["pink"], linewidth=2.5, label="Train GT match (smooth)")
+ ax1.plot(e_iters, e_step, "o-", color=PALETTE["white"], ms=8, linewidth=2,
+ label="Eval step accuracy", zorder=6)
+
+ # annotate eval start/end
+ ax1.annotate(f"{e_step[0]:.1f}%", xy=(e_iters[0], e_step[0]),
+ xytext=(e_iters[0] - 0.3, e_step[0] - 1.2), fontsize=8.5,
+ color=PALETTE["white"], ha="right")
+ ax1.annotate(f"{e_step[-1]:.1f}%", xy=(e_iters[-1], e_step[-1]),
+ xytext=(e_iters[-1] + 0.3, e_step[-1] + 0.5), fontsize=8.5,
+ color=PALETTE["white"])
+ ax1.annotate("", xy=(e_iters[-1], e_step[-1]),
+ xytext=(e_iters[0], e_step[0]),
+ arrowprops=dict(arrowstyle="->", color=PALETTE["cyan"], lw=1.5,
+ connectionstyle="arc3,rad=-0.3"))
+
+ ax1.set_title("Step Accuracy — Did each reasoning step hold up?",
+ fontsize=9.5, color=PALETTE["slate"], pad=5)
+ ax1.set_xlabel("Training Iteration")
+ ax1.set_ylabel("Score (%)")
+ ax1.set_ylim(55, 105)
+ ax1.set_xticks(range(1, max(iters) + 1, 3))
+ ax1.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax1.legend(handles=ax1.get_legend_handles_labels()[0] + phase_legend_patches(phases),
+ framealpha=0.8, ncol=1, loc="lower right")
+
+ # ── RIGHT: LCCP ──
+ shade_phases(ax2, iters, phases)
+ ax2.plot(iters, t_lccp, alpha=0.2, color=PALETTE["emerald"], linewidth=1)
+ ax2.plot(iters, ma_lccp, color=PALETTE["emerald"], linewidth=2.5, label="Train LCCP (smooth)")
+ ax2.plot(e_iters, e_lccp, "o-", color=PALETTE["white"], ms=8, linewidth=2,
+ label="Eval LCCP", zorder=6)
+
+ ax2.annotate(f"{e_lccp[0]:.1f}%", xy=(e_iters[0], e_lccp[0]),
+ xytext=(e_iters[0] - 0.3, e_lccp[0] - 1.5), fontsize=8.5,
+ color=PALETTE["white"], ha="right")
+ ax2.annotate(f"{e_lccp[-1]:.1f}%", xy=(e_iters[-1], e_lccp[-1]),
+ xytext=(e_iters[-1] + 0.3, e_lccp[-1] + 0.5), fontsize=8.5,
+ color=PALETTE["white"])
+
+ # show LCCP delta
+ delta = e_lccp[-1] - e_lccp[0]
+ ax2.text(0.97, 0.06,
+ f"Eval LCCP Δ = +{delta:.2f}pp\n(iter {e_iters[0]} → {e_iters[-1]})",
+ transform=ax2.transAxes, ha="right", va="bottom",
+ fontsize=8.5, color=PALETTE["emerald"],
+ bbox=dict(facecolor=PALETTE["bg2"], edgecolor=PALETTE["emerald"],
+ linewidth=0.8, pad=5))
+
+ ax2.set_title("LCCP — Did the chain of reasoning stay correct until the first error?",
+ fontsize=9.5, color=PALETTE["slate"], pad=5)
+ ax2.set_xlabel("Training Iteration")
+ ax2.set_ylabel("LCCP (%)")
+ ax2.set_ylim(55, 100)
+ ax2.set_xticks(range(1, max(iters) + 1, 3))
+ ax2.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.0f%%"))
+ ax2.legend(handles=ax2.get_legend_handles_labels()[0] + phase_legend_patches(phases),
+ framealpha=0.8, ncol=1, loc="lower right")
+
+ fig.tight_layout()
+ save(fig, "plot5_reasoning_quality.png", out)
+
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Main
+# ══════════════════════════════════════════════════════════════════════════════
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Generate AxiomForgeAI training plots")
+ p.add_argument("--metrics", default=DEFAULT_METRICS,
+ help=f"Path to metrics.csv (default: {DEFAULT_METRICS})")
+ p.add_argument("--out", default="images",
+ help="Output directory for PNGs (default: images/)")
+ return p.parse_args()
+
+
+def main():
+ args = parse_args()
+ out = Path(args.out)
+
+ print(f"Loading metrics from : {args.metrics}")
+ print(f"Saving plots to : {out}/")
+ print()
+
+ rows = load_csv(args.metrics)
+ print(f"Loaded {len(rows)} iterations.\n")
+
+ print("Generating plots …")
+ plot_eval_quality(rows, out)
+ plot_training_journey(rows, out)
+ plot_selfplay_success(rows, out)
+ plot_reward_confidence(rows, out)
+ plot_reasoning_quality(rows, out)
+
+ print(f"\n✅ All 5 plots saved to {out}/")
+ print("\nFiles:")
+ for p in sorted(out.glob("plot*.png")):
+ print(f" {p} ({p.stat().st_size // 1024} KB)")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/precompute_extraction_cache.py b/scripts/precompute_extraction_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..aafc3d2b66cbc59caaf55ab867495d9b5e71468c
--- /dev/null
+++ b/scripts/precompute_extraction_cache.py
@@ -0,0 +1,174 @@
+"""
+Offline step-chain extraction cache builder.
+
+Run this once before training to pre-extract structured step chains from all
+grounded training data (GSM8K + MATH). The resulting cache file is passed to
+run_grpo_training.py via --extraction-cache so the extractor LLM is never
+called for fixed training examples — only novel self-play solutions require
+live extraction during training.
+
+Usage
+-----
+ python scripts/precompute_extraction_cache.py \\
+ --gsm8k-data data/sft/gsm8k_sft.jsonl \\
+ --math-data data/sft/math_sft.jsonl \\
+ --output-cache data/extraction_cache.json \\
+ --extractor-model Qwen/Qwen2.5-0.5B-Instruct \\
+ --device cuda
+
+Cache key: md5(question + "\\n" + solution) — keying on both prevents
+collisions when two MATH problems share identical solution text.
+Entries for solutions the extractor cannot parse are stored with
+success=False so training never re-attempts and correctly penalises them.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import pathlib
+import sys
+from typing import List, Tuple
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(levelname)-8s %(message)s",
+ handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger(__name__)
+
+
+def load_jsonl(path: str) -> list[dict]:
+ records: list[dict] = []
+ with open(path, encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ try:
+ records.append(json.loads(line))
+ except json.JSONDecodeError:
+ pass
+ return records
+
+
+def collect_qa_pairs(records: list[dict]) -> List[Tuple[str, str]]:
+ """
+ Extract (question, solution) pairs from dataset records.
+
+ Returns pairs where both fields are non-empty. Falls back to empty
+ string for the question when only the solution field is present.
+ """
+ pairs: List[Tuple[str, str]] = []
+ for rec in records:
+ sol = (
+ rec.get("solution")
+ or rec.get("output")
+ or rec.get("response")
+ or ""
+ )
+ q = (
+ rec.get("question")
+ or rec.get("problem")
+ or rec.get("input")
+ or ""
+ )
+ if sol.strip():
+ pairs.append((q.strip(), sol.strip()))
+ return pairs
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Pre-extract step chains for grounded training data."
+ )
+ parser.add_argument(
+ "--gsm8k-data", required=True,
+ help="Path to GSM8K training JSONL (e.g. data/sft/gsm8k_sft.jsonl).",
+ )
+ parser.add_argument(
+ "--math-data", default=None,
+ help="Optional path to MATH training JSONL. If provided, those solutions "
+ "are also extracted and added to the cache.",
+ )
+ parser.add_argument(
+ "--output-cache", required=True,
+ help="Destination JSON file for the extraction cache.",
+ )
+ parser.add_argument(
+ "--extractor-model", default="Qwen/Qwen2.5-0.5B-Instruct",
+ help="HuggingFace model ID for the step chain extractor. Default Qwen/Qwen2.5-0.5B-Instruct.",
+ )
+ parser.add_argument(
+ "--device", default="cuda",
+ help="Device for the extractor model (default: cuda).",
+ )
+ parser.add_argument(
+ "--batch-size", type=int, default=1,
+ help="Reserved for future batched extraction. Currently always 1.",
+ )
+ args = parser.parse_args()
+
+ # ── Load data ─────────────────────────────────────────────────────────────
+ logger.info("Loading GSM8K data from: %s", args.gsm8k_data)
+ gsm8k_records = load_jsonl(args.gsm8k_data)
+ qa_pairs = collect_qa_pairs(gsm8k_records)
+ logger.info("GSM8K: %d (question, solution) pairs", len(qa_pairs))
+
+ if args.math_data:
+ logger.info("Loading MATH data from: %s", args.math_data)
+ math_records = load_jsonl(args.math_data)
+ math_pairs = collect_qa_pairs(math_records)
+ logger.info("MATH: %d (question, solution) pairs", len(math_pairs))
+ qa_pairs += math_pairs
+
+ if not qa_pairs:
+ logger.error(
+ "No solutions found in provided files. "
+ "Check field names (question/problem/input + solution/output/response)."
+ )
+ sys.exit(1)
+
+ # Deduplicate by (question, solution) content
+ # Two different MATH problems can have identical solution text but different
+ # questions — the question+solution key keeps them distinct in the cache.
+ seen: set = set()
+ unique_pairs: List[Tuple[str, str]] = []
+ for q, sol in qa_pairs:
+ key = (q, sol)
+ if key not in seen:
+ seen.add(key)
+ unique_pairs.append((q, sol))
+
+ logger.info(
+ "Total: %d pairs (%d unique after dedup)", len(qa_pairs), len(unique_pairs)
+ )
+
+ # ── Load extractor ────────────────────────────────────────────────────────
+ sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
+ from src.rl.unified_accuracy import StepChainExtractor
+
+ extractor = StepChainExtractor(
+ model_name=args.extractor_model,
+ device=args.device,
+ cache_path=args.output_cache, # load existing cache if present (resume)
+ )
+
+ # ── Build cache ───────────────────────────────────────────────────────────
+ already_cached = len(extractor._cache)
+ if already_cached:
+ logger.info("Resuming: %d entries already in cache", already_cached)
+
+ extractor.build_cache(unique_pairs)
+
+ # ── Save ──────────────────────────────────────────────────────────────────
+ extractor.save_cache()
+ logger.info(
+ "Done. Cache contains %d entries → %s",
+ len(extractor._cache),
+ args.output_cache,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/prepare_aqua_dataset.py b/scripts/prepare_aqua_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab5e27997eb052151d14c7863b6c1bc626bc54c
--- /dev/null
+++ b/scripts/prepare_aqua_dataset.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""
+Download Chinar/AQuA-RAT from HuggingFace and convert it to the same JSONL
+format used by gsm8k_sft.jsonl so the GRPO training script can consume it
+directly via --gsm8k-data.
+
+Chinar/AQuA-RAT schema (processed version)
+-------------------------------------------
+ prompt : str — the math question
+ completion : str — step-by-step reasoning ending with:
+ "The answer is X . Therefore, the correct answer is: "
+
+Output schema (messages format expected by load_gsm8k)
+-------------------------------------------------------
+ {
+ "id": "aqua_",
+ "skill_id": "aqua_rat_algebra",
+ "source": "Chinar/AQuA-RAT",
+ "split": "train" | "validation",
+ "messages": [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": "Solve ... Problem:\\n"},
+ {"role": "assistant", "content": "Step 1: ...\\nFinal Answer: "}
+ ]
+ }
+
+The dataset has only a 'train' split — we reserve the last 500 rows as
+a validation set and use the rest for training.
+
+Usage
+-----
+ python scripts/prepare_aqua_dataset.py
+ python scripts/prepare_aqua_dataset.py --val-size 300 --dry-run
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+# ---------------------------------------------------------------------------
+# Prompt constants (kept in sync with src/config/prompts.py)
+# ---------------------------------------------------------------------------
+
+SOLVER_SYSTEM_PROMPT = (
+ "You are a step-by-step math solver. "
+ "Solve the given problem one step at a time. "
+ "Each step must be on its own line, starting with 'Step N:'. "
+ "End with a line starting with 'Final Answer:'. "
+ "Write every mathematical expression in Python/SymPy syntax "
+ "so it can be verified programmatically."
+)
+
+USER_WRAPPER = (
+ "Solve the following problem. Show your reasoning as numbered steps, "
+ "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+
+# ---------------------------------------------------------------------------
+# Answer extraction
+# ---------------------------------------------------------------------------
+
+# The completion always ends with a variant of:
+# "The answer is E . Therefore, the correct answer is: 23"
+_ANSWER_TAIL = re.compile(
+ r"(?:The answer is\s+[A-Ea-e]\s*[.\-]?\s*)?"
+ r"Therefore,?\s+the correct answer is\s*:?\s*(.+)$",
+ re.IGNORECASE,
+)
+
+
+def _extract_answer_and_rationale(completion: str) -> Optional[tuple[str, str]]:
+ """
+ Split the completion into (rationale_lines, final_answer_str).
+ Returns None if no extractable numeric answer is found.
+ """
+ # Find the tail marker
+ m = _ANSWER_TAIL.search(completion)
+ if not m:
+ return None
+
+ raw_answer = m.group(1).strip()
+ # Everything before the tail is the rationale
+ rationale = completion[: m.start()].strip()
+ # Also strip a standalone "The answer is X ." line at the end of rationale
+ rationale = re.sub(r"\s*The answer is\s+[A-Ea-e]\s*[.\-]?\s*$", "", rationale, flags=re.IGNORECASE).strip()
+
+ # Normalise the answer to a clean numeric string
+ final_answer = _normalise_answer(raw_answer)
+ if final_answer is None:
+ return None
+
+ return rationale, final_answer
+
+
+def _normalise_answer(raw: str) -> Optional[str]:
+ """
+ Extract a single numeric value from an answer string.
+
+ "23" → "23"
+ "$ 1600" → "1600"
+ "8 seconds" → "8"
+ "5 and 1" → None (multi-value — skip)
+ "I and II" → None (non-numeric — skip)
+ "− 3 ≤ x ≤ 4" → None (inequality — skip)
+ """
+ text = raw.strip()
+
+ # Remove currency / whitespace
+ text = text.replace("$", "").replace("Rs.", "").replace("Rs", "").replace(",", "").strip()
+
+ # Handle unicode minus
+ text = text.replace("\u2212", "-").replace("−", "-")
+
+ # Skip if "and" still present (multi-value like "5 and 1")
+ if re.search(r"\band\b", text, re.IGNORECASE):
+ return None
+
+ # Skip inequalities / expressions with variables
+ if re.search(r"[a-zA-Z≤≥<>]", text):
+ return None
+
+ # Single number (integer or decimal, optionally negative)
+ m = re.fullmatch(r"\s*(-?\d+(?:\.\d+)?)\s*(?:[a-zA-Z%°].*)?", text)
+ if m:
+ val_str = m.group(1)
+ try:
+ val = float(val_str)
+ return str(int(val)) if val == int(val) else val_str
+ except ValueError:
+ pass
+
+ return None
+
+
+# ---------------------------------------------------------------------------
+# Rationale → Step N: format
+# ---------------------------------------------------------------------------
+
+def _rationale_to_steps(rationale: str) -> list[str]:
+ lines: list[str] = []
+ for raw in rationale.splitlines():
+ line = raw.strip()
+ if line:
+ line = line.replace("^", "**")
+ lines.append(line)
+ if not lines and rationale.strip():
+ sentences = re.split(r"(?<=[.!?])\s+", rationale.strip())
+ lines = [s.strip() for s in sentences if s.strip()]
+ return lines
+
+
+def _build_assistant(rationale: str, final_answer: str) -> str:
+ steps = _rationale_to_steps(rationale)
+ parts = [f"Step {i}: {line}" for i, line in enumerate(steps, 1)]
+ body = "\n".join(parts)
+ return f"{body}\nFinal Answer: {final_answer}" if body else f"Final Answer: {final_answer}"
+
+
+# ---------------------------------------------------------------------------
+# Row conversion
+# ---------------------------------------------------------------------------
+
+def convert_row(row: dict[str, Any], idx: int, split: str) -> Optional[dict[str, Any]]:
+ question = (row.get("prompt") or "").strip()
+ completion = (row.get("completion") or "").strip()
+
+ if not question or not completion:
+ return None
+
+ result = _extract_answer_and_rationale(completion)
+ if result is None:
+ return None
+
+ rationale, final_answer = result
+ assistant_text = _build_assistant(rationale, final_answer)
+
+ return {
+ "id": f"aqua_{split}_{idx}",
+ "skill_id": "aqua_rat_algebra",
+ "source": "Chinar/AQuA-RAT",
+ "split": split,
+ "messages": [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": USER_WRAPPER.format(question=question)},
+ {"role": "assistant", "content": assistant_text},
+ ],
+ }
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output-dir", default="data/sft")
+ parser.add_argument("--val-size", type=int, default=500,
+ help="How many rows from the end of the dataset to use as validation.")
+ parser.add_argument("--dry-run", action="store_true")
+ parser.add_argument("--max-samples", type=int, default=None)
+ args = parser.parse_args()
+
+ try:
+ from datasets import load_dataset
+ except ImportError:
+ print("ERROR: pip install datasets", file=sys.stderr)
+ sys.exit(1)
+
+ print("Downloading Chinar/AQuA-RAT …")
+ ds = load_dataset("Chinar/AQuA-RAT")
+ all_rows = list(ds["train"])
+ total = len(all_rows)
+ print(f" Total rows: {total:,}")
+
+ val_rows = all_rows[-args.val_size:]
+ train_rows = all_rows[: -args.val_size]
+
+ splits = {
+ "train": train_rows,
+ "validation": val_rows,
+ }
+
+ out_dir = Path(args.output_dir)
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ for split, rows in splits.items():
+ if args.max_samples:
+ rows = rows[: args.max_samples]
+
+ records: list[dict] = []
+ skipped = 0
+ for idx, row in enumerate(rows):
+ rec = convert_row(row, idx, split)
+ if rec is None:
+ skipped += 1
+ else:
+ records.append(rec)
+
+ skip_pct = 100.0 * skipped / max(1, len(rows))
+
+ if args.dry_run:
+ print(f"\n── {split}: {len(records)} valid / {skipped} skipped ({skip_pct:.1f}%) ──")
+ for rec in records[:3]:
+ print(json.dumps(rec, indent=2))
+ continue
+
+ out_path = out_dir / f"aqua_{split}.jsonl"
+ with out_path.open("w", encoding="utf-8") as f:
+ for rec in records:
+ f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+
+ print(f" [{split:12s}] {len(records):6,d} valid {skipped:5,d} skipped ({skip_pct:.1f}%) → {out_path}")
+
+ if not args.dry_run:
+ print("\nDone. Launch continuation training with:")
+ print(" bash launch_grpo_aqua.sh")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/prepare_combined_dataset.py b/scripts/prepare_combined_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36e782df1d85596af0fc7998be2cbc055962545
--- /dev/null
+++ b/scripts/prepare_combined_dataset.py
@@ -0,0 +1,711 @@
+#!/usr/bin/env python3
+"""
+Combined dataset pipeline — NuminaMath-CoT + OpenMathInstruct-2
+================================================================
+Downloads, filters, normalises, and merges two large math datasets into a single
+JSONL file (train / val / test) that the GRPO training script can consume directly
+via --gsm8k-data.
+
+Why these two datasets
+----------------------
+ NuminaMath-CoT (AI-MO/NuminaMath-CoT)
+ 860 K problems. Clean \\boxed{} answers. 7 rich topic categories that map
+ directly to ZPD skill_ids. Sources span AMC, AIME, Chinese HS, olympiads,
+ and synthetic — giving natural difficulty diversity.
+
+ OpenMathInstruct-2 (nvidia/OpenMathInstruct-2)
+ 14 M synthetic problems with step-level CoT. `expected_answer` is pre-verified.
+ Diverse surface forms prevent pattern memorisation. We skip any row whose
+ problem_source is "gsm8k" (already in prior training).
+
+Output schema (identical to gsm8k_sft.jsonl / aqua_train.jsonl)
+---------------------------------------------------------------
+ {
+ "id": "__",
+ "skill_id": "", ← used by ZPD CurriculumManager
+ "source": "",
+ "split": "train" | "val" | "test",
+ "difficulty": 1 | 2 | 3, ← 1=easy 2=medium 3=hard (for ZPD)
+ "task_type": "solve",
+ "messages": [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": "Solve ... Problem:\\n"},
+ {"role": "assistant", "content": "Step 1: ...\\nFinal Answer: "}
+ ]
+ }
+
+Usage
+-----
+ # Quick test (no download, just show stats)
+ python scripts/prepare_combined_dataset.py --dry-run
+
+ # Full pipeline (default caps: 20 K numina + 15 K openmath)
+ python scripts/prepare_combined_dataset.py
+
+ # Larger run
+ python scripts/prepare_combined_dataset.py --max-numina 40000 --max-openmath 30000
+
+ # Only one source
+ python scripts/prepare_combined_dataset.py --skip-openmath
+ python scripts/prepare_combined_dataset.py --skip-numina
+
+ # Custom output dir
+ python scripts/prepare_combined_dataset.py --output-dir data/sft/combined
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import logging
+import math
+import random
+import re
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(levelname)-8s %(message)s",
+ datefmt="%H:%M:%S",
+)
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants — kept in sync with src/config/prompts.py
+# ---------------------------------------------------------------------------
+
+SOLVER_SYSTEM_PROMPT = (
+ "You are a step-by-step math solver. "
+ "Solve the given problem one step at a time. "
+ "Each step must be on its own line, starting with 'Step N:'. "
+ "End with a line starting with 'Final Answer:'. "
+ "Write every mathematical expression in Python/SymPy syntax "
+ "so it can be verified programmatically."
+)
+
+USER_WRAPPER = (
+ "Solve the following problem. Show your reasoning as numbered steps, "
+ "then give the final numeric answer on the last line.\n\nProblem:\n{question}"
+)
+
+# ---------------------------------------------------------------------------
+# Skill-ID mappings (drives ZPD CurriculumManager per-topic mastery)
+# ---------------------------------------------------------------------------
+
+# NuminaMath-CoT `type` field → skill_id
+NUMINA_TYPE_TO_SKILL: Dict[str, str] = {
+ "algebra": "numina_algebra",
+ "intermediate_algebra": "numina_algebra",
+ "prealgebra": "numina_prealgebra",
+ "number_theory": "numina_number_theory",
+ "geometry": "numina_geometry",
+ "counting_and_probability": "numina_combinatorics",
+ "precalculus": "numina_calculus",
+ "calculus": "numina_calculus",
+ "statistics": "numina_statistics",
+ "probability": "numina_statistics",
+ # competition-source buckets (fallback when type not in map above)
+ "cn_k12": "numina_algebra",
+ "olympiads": "numina_olympiad",
+ "amc_aime": "numina_competition",
+ "synthetic_math": "numina_synthetic",
+}
+
+# NuminaMath source → approximate difficulty (1=easy 2=medium 3=hard)
+NUMINA_SOURCE_DIFFICULTY: Dict[str, int] = {
+ "cn_k12": 1,
+ "synthetic_math": 2,
+ "amc_aime": 2,
+ "olympiads": 3,
+}
+
+# OpenMathInstruct-2 problem_source → skill_id / difficulty
+OPENMATH_SOURCE_TO_SKILL: Dict[str, str] = {
+ "math": "openmath_algebra", # overridden per-row by subject
+ "amc_aime_1983_2024": "openmath_competition",
+ "synthetic_math": "openmath_synthetic",
+ "number_theory": "openmath_number_theory",
+}
+
+OPENMATH_SOURCE_DIFFICULTY: Dict[str, int] = {
+ "math": 2,
+ "amc_aime_1983_2024": 3,
+ "synthetic_math": 1,
+}
+
+# OpenMathInstruct MATH-subject → skill_id (when problem_source == "math")
+OPENMATH_MATH_SUBJECT_SKILL: Dict[str, str] = {
+ "Algebra": "openmath_algebra",
+ "Number Theory": "openmath_number_theory",
+ "Geometry": "openmath_geometry",
+ "Counting & Probability": "openmath_combinatorics",
+ "Intermediate Algebra": "openmath_algebra",
+ "Prealgebra": "openmath_prealgebra",
+ "Precalculus": "openmath_calculus",
+ "Calculus": "openmath_calculus",
+}
+
+# ---------------------------------------------------------------------------
+# Answer normalisation
+# ---------------------------------------------------------------------------
+
+_BOXED_RE = re.compile(r"\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}")
+_LATEX_FRAC = re.compile(r"\\frac\{(\d+)\}\{(\d+)\}")
+_PLAIN_FRAC = re.compile(r"^(-?\d+)\s*/\s*(\d+)$")
+_CURRENCY = re.compile(r"(?:Rs\.?|USD|\$|€|£)\s*", re.IGNORECASE)
+_UNICODE_MINUS = str.maketrans({"\u2212": "-", "−": "-"})
+
+
+def extract_boxed(text: str) -> Optional[str]:
+ """Return the last \\boxed{} contents from a solution string."""
+ matches = _BOXED_RE.findall(text)
+ return matches[-1].strip() if matches else None
+
+
+def normalise_numeric(raw: str) -> Optional[str]:
+ """
+ Convert a raw answer string to a clean numeric string.
+
+ Returns None for:
+ - multi-value answers ("3 and 5")
+ - symbolic expressions ("3\\sqrt{2}", "x+1")
+ - inequalities
+ - fractions where num/den exceed safe range
+ """
+ text = raw.strip()
+
+ # Remove currency symbols and commas in numbers
+ text = _CURRENCY.sub("", text)
+ text = text.replace(",", "").translate(_UNICODE_MINUS).strip()
+
+ # Skip if still contains words other than units
+ if re.search(r"\b(and|or|none|no solution|undefined)\b", text, re.IGNORECASE):
+ return None
+
+ # Skip if contains letters (symbolic)
+ if re.search(r"[a-zA-Z]", text):
+ return None
+
+ # Skip inequalities / ranges
+ if re.search(r"[≤≥<>]", text):
+ return None
+
+ # Handle LaTeX fractions: \frac{3}{4}
+ m = _LATEX_FRAC.fullmatch(text)
+ if m:
+ num, den = int(m.group(1)), int(m.group(2))
+ if den:
+ v = num / den
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+ return None
+
+ # Handle plain fractions: 3/4
+ m = _PLAIN_FRAC.match(text)
+ if m:
+ num, den = int(m.group(1)), int(m.group(2))
+ if den:
+ v = num / den
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+ return None
+
+ # Handle percentage → decimal
+ pct = re.fullmatch(r"(-?\d+(?:\.\d+)?)\s*%", text)
+ if pct:
+ v = float(pct.group(1))
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+
+ # Plain integer or decimal (possibly negative, possibly with trailing unit like "km")
+ m = re.match(r"^\s*(-?\d+(?:\.\d+)?)\s*(?:[^0-9.\s].*)?\s*$", text)
+ if m:
+ val_str = m.group(1)
+ try:
+ v = float(val_str)
+ return str(int(v)) if v == int(v) else val_str
+ except ValueError:
+ pass
+
+ return None
+
+
+# ---------------------------------------------------------------------------
+# Solution → Step N: format
+# ---------------------------------------------------------------------------
+
+_SKIP_LINE_RE = re.compile(
+ r"^\s*("
+ r"\\boxed\{|"
+ r"(Therefore|Thus|Hence|So),?\s+(the\s+)?(final\s+)?answer\s+is|"
+ r"The\s+(final\s+)?answer\s+is|"
+ r"Answer\s*[:=]"
+ r")",
+ re.IGNORECASE,
+)
+
+
+def solution_to_steps(solution: str, final_answer: str, max_steps: int = 18) -> str:
+ """
+ Convert an arbitrary CoT solution to the pipeline's Step N: format.
+
+ Strategy:
+ 1. Split on newlines.
+ 2. Drop blank lines and lines that just announce the final answer
+ (those are replaced by the explicit Final Answer: line).
+ 3. Strip any existing "Step N:" prefix to avoid double-numbering.
+ 4. Re-number as "Step 1:", "Step 2:", …
+ 5. Append "Final Answer: ".
+ """
+ raw_lines = [l.strip() for l in solution.split("\n") if l.strip()]
+ clean: List[str] = []
+ for line in raw_lines:
+ if _SKIP_LINE_RE.match(line):
+ continue
+ # Strip old step prefix
+ line = re.sub(r"^Step\s*\d+\s*[:.)]\s*", "", line)
+ if line:
+ clean.append(line)
+
+ # Cap to max_steps to keep token count reasonable
+ clean = clean[:max_steps]
+
+ if not clean:
+ return f"Final Answer: {final_answer}"
+
+ parts = [f"Step {i}: {line}" for i, line in enumerate(clean, 1)]
+ return "\n".join(parts) + f"\nFinal Answer: {final_answer}"
+
+
+# ---------------------------------------------------------------------------
+# Record builders
+# ---------------------------------------------------------------------------
+
+def build_record(
+ idx: int,
+ split: str,
+ source_name: str,
+ skill_id: str,
+ difficulty: int,
+ question: str,
+ solution_text: str,
+ final_answer: str,
+) -> Dict[str, Any]:
+ assistant_content = solution_to_steps(solution_text, final_answer)
+ return {
+ "id": f"{source_name.replace('/', '_')}_{split}_{idx}",
+ "skill_id": skill_id,
+ "source": source_name,
+ "split": split,
+ "difficulty": difficulty,
+ "task_type": "solve",
+ "messages": [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": USER_WRAPPER.format(question=question.strip())},
+ {"role": "assistant", "content": assistant_content},
+ ],
+ }
+
+
+# ---------------------------------------------------------------------------
+# Deduplication
+# ---------------------------------------------------------------------------
+
+def problem_hash(text: str) -> str:
+ """Fast 16-char hash for near-dedup (exact-match on normalised text)."""
+ normalised = re.sub(r"\s+", " ", text.strip().lower())
+ return hashlib.md5(normalised.encode()).hexdigest()[:16]
+
+
+# ---------------------------------------------------------------------------
+# NuminaMath-CoT processing
+# ---------------------------------------------------------------------------
+
+def _numina_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
+ topic = (row.get("type") or "").lower().strip()
+ source = (row.get("source") or "").lower().strip()
+
+ skill = NUMINA_TYPE_TO_SKILL.get(topic)
+ if skill is None:
+ skill = NUMINA_TYPE_TO_SKILL.get(source, "numina_general")
+
+ difficulty = NUMINA_SOURCE_DIFFICULTY.get(source, 2)
+ return skill, difficulty
+
+
+def iter_numina(
+ max_samples: int,
+ per_skill_cap: int,
+ skip_olympiad: bool,
+ seed: int,
+) -> Iterator[Dict[str, Any]]:
+ """
+ Stream NuminaMath-CoT from HuggingFace and yield cleaned records.
+ Uses per-skill quota to guarantee topic diversity.
+ """
+ try:
+ from datasets import load_dataset # type: ignore
+ except ImportError:
+ log.error("pip install datasets huggingface_hub")
+ sys.exit(1)
+
+ log.info("Streaming AI-MO/NuminaMath-CoT …")
+ ds = load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True,
+ trust_remote_code=True)
+
+ skill_counts: Counter = Counter()
+ seen_hashes: set = set()
+ total_yielded = 0
+
+ rng = random.Random(seed)
+
+ for row in ds:
+ if total_yielded >= max_samples:
+ break
+
+ problem = (row.get("problem") or "").strip()
+ solution = (row.get("solution") or "").strip()
+ if not problem or not solution:
+ continue
+
+ # Extract and normalise answer from \boxed{}
+ raw_answer = extract_boxed(solution)
+ if raw_answer is None:
+ continue
+ final_answer = normalise_numeric(raw_answer)
+ if final_answer is None:
+ continue
+
+ skill, difficulty = _numina_skill_and_difficulty(row)
+
+ # Optionally skip very hard olympiad problems
+ if skip_olympiad and skill == "numina_olympiad":
+ continue
+
+ # Per-skill cap to guarantee diversity
+ if skill_counts[skill] >= per_skill_cap:
+ continue
+
+ # Dedup
+ h = problem_hash(problem)
+ if h in seen_hashes:
+ continue
+ seen_hashes.add(h)
+
+ skill_counts[skill] += 1
+ total_yielded += 1
+
+ yield build_record(
+ idx=total_yielded,
+ split="__assign__",
+ source_name="AI-MO/NuminaMath-CoT",
+ skill_id=skill,
+ difficulty=difficulty,
+ question=problem,
+ solution_text=solution,
+ final_answer=final_answer,
+ )
+
+ log.info("NuminaMath-CoT: yielded %d records | skill dist: %s",
+ total_yielded, dict(skill_counts.most_common()))
+
+
+# ---------------------------------------------------------------------------
+# OpenMathInstruct-2 processing
+# ---------------------------------------------------------------------------
+
+def _openmath_skill_and_difficulty(row: Dict) -> Tuple[str, int]:
+ src = (row.get("problem_source") or "").lower().strip()
+ subj = (row.get("subject") or "").strip()
+
+ if src == "math" and subj:
+ skill = OPENMATH_MATH_SUBJECT_SKILL.get(subj, "openmath_algebra")
+ else:
+ skill = OPENMATH_SOURCE_TO_SKILL.get(src, "openmath_general")
+
+ difficulty = OPENMATH_SOURCE_DIFFICULTY.get(src, 2)
+ return skill, difficulty
+
+
+def iter_openmath(
+ max_samples: int,
+ per_skill_cap: int,
+ skip_gsm8k: bool,
+ seed: int,
+) -> Iterator[Dict[str, Any]]:
+ """
+ Stream OpenMathInstruct-2 from HuggingFace and yield cleaned records.
+ Only yields rows where `is_correct_solution` is True (pre-verified by NVIDIA).
+ """
+ try:
+ from datasets import load_dataset # type: ignore
+ except ImportError:
+ log.error("pip install datasets huggingface_hub")
+ sys.exit(1)
+
+ log.info("Streaming nvidia/OpenMathInstruct-2 (this may take a moment) …")
+ ds = load_dataset(
+ "nvidia/OpenMathInstruct-2",
+ split="train",
+ streaming=True,
+ trust_remote_code=True,
+ )
+
+ skill_counts: Counter = Counter()
+ seen_hashes: set = set()
+ total_yielded = 0
+
+ for row in ds:
+ if total_yielded >= max_samples:
+ break
+
+ # Filter: skip gsm8k (contamination risk)
+ problem_src = (row.get("problem_source") or "").lower()
+ if skip_gsm8k and "gsm8k" in problem_src:
+ continue
+
+ # Filter: only verified correct solutions
+ if not row.get("is_correct_solution", True):
+ continue
+
+ problem = (row.get("problem") or "").strip()
+ solution = (row.get("generated_solution") or "").strip()
+ expected = (row.get("expected_answer") or "").strip()
+
+ if not problem or not solution or not expected:
+ continue
+
+ # Normalise the pre-extracted answer
+ final_answer = normalise_numeric(expected)
+ if final_answer is None:
+ continue
+
+ skill, difficulty = _openmath_skill_and_difficulty(row)
+
+ # Per-skill cap
+ if skill_counts[skill] >= per_skill_cap:
+ continue
+
+ # Dedup
+ h = problem_hash(problem)
+ if h in seen_hashes:
+ continue
+ seen_hashes.add(h)
+
+ skill_counts[skill] += 1
+ total_yielded += 1
+
+ yield build_record(
+ idx=total_yielded,
+ split="__assign__",
+ source_name="nvidia/OpenMathInstruct-2",
+ skill_id=skill,
+ difficulty=difficulty,
+ question=problem,
+ solution_text=solution,
+ final_answer=final_answer,
+ )
+
+ log.info("OpenMathInstruct-2: yielded %d records | skill dist: %s",
+ total_yielded, dict(skill_counts.most_common()))
+
+
+# ---------------------------------------------------------------------------
+# Dataset stats printer
+# ---------------------------------------------------------------------------
+
+def print_stats(records: List[Dict], label: str) -> None:
+ skill_c: Counter = Counter(r["skill_id"] for r in records)
+ diff_c: Counter = Counter(r["difficulty"] for r in records)
+ src_c: Counter = Counter(r["source"] for r in records)
+ split_c: Counter = Counter(r["split"] for r in records)
+
+ log.info("─── %s (%d records) ───────────────────────────────", label, len(records))
+ log.info(" by split: %s", dict(split_c))
+ log.info(" by source: %s", dict(src_c))
+ log.info(" by difficulty: %s", dict(sorted(diff_c.items())))
+ log.info(" by skill_id:")
+ for sk, cnt in skill_c.most_common():
+ log.info(" %-40s %5d", sk, cnt)
+
+
+# ---------------------------------------------------------------------------
+# Write JSONL
+# ---------------------------------------------------------------------------
+
+def write_jsonl(records: List[Dict], path: Path) -> None:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with path.open("w", encoding="utf-8") as f:
+ for rec in records:
+ f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+ log.info("Wrote %d records → %s", len(records), path)
+
+
+# ---------------------------------------------------------------------------
+# Train / val / test split (stratified by skill_id)
+# ---------------------------------------------------------------------------
+
+def stratified_split(
+ records: List[Dict],
+ train_frac: float = 0.85,
+ val_frac: float = 0.10,
+ seed: int = 42,
+) -> Tuple[List[Dict], List[Dict], List[Dict]]:
+ """
+ Stratified split by skill_id so every skill appears in all three sets.
+ Remaining fraction after train+val goes to test.
+ """
+ rng = random.Random(seed)
+
+ by_skill: Dict[str, List[Dict]] = defaultdict(list)
+ for r in records:
+ by_skill[r["skill_id"]].append(r)
+
+ train_, val_, test_ = [], [], []
+ for skill, items in by_skill.items():
+ rng.shuffle(items)
+ n = len(items)
+ n_train = math.floor(n * train_frac)
+ n_val = math.floor(n * val_frac)
+ train_ += items[:n_train]
+ val_ += items[n_train: n_train + n_val]
+ test_ += items[n_train + n_val:]
+
+ for r in train_: r["split"] = "train"
+ for r in val_: r["split"] = "val"
+ for r in test_: r["split"] = "test"
+
+ # Shuffle each split so skill interleaves during training
+ rng.shuffle(train_)
+ rng.shuffle(val_)
+ rng.shuffle(test_)
+
+ return train_, val_, test_
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def parse_args() -> argparse.Namespace:
+ p = argparse.ArgumentParser(
+ description="Build combined NuminaMath + OpenMathInstruct-2 training data."
+ )
+ p.add_argument("--output-dir", default="data/sft",
+ help="Directory for output JSONL files.")
+ p.add_argument("--max-numina", type=int, default=20_000,
+ help="Max records from NuminaMath-CoT (default 20 000).")
+ p.add_argument("--max-openmath", type=int, default=15_000,
+ help="Max records from OpenMathInstruct-2 (default 15 000).")
+ p.add_argument("--per-skill-cap", type=int, default=4_000,
+ help="Max records per skill_id to guarantee topic diversity.")
+ p.add_argument("--skip-numina", action="store_true",
+ help="Skip NuminaMath-CoT entirely.")
+ p.add_argument("--skip-openmath", action="store_true",
+ help="Skip OpenMathInstruct-2 entirely.")
+ p.add_argument("--skip-olympiad", action="store_true", default=True,
+ help="Skip numina_olympiad problems (too hard for 1.5B; default: True).")
+ p.add_argument("--no-skip-olympiad", dest="skip_olympiad", action="store_false",
+ help="Include olympiad-level problems.")
+ p.add_argument("--train-frac", type=float, default=0.85)
+ p.add_argument("--val-frac", type=float, default=0.10)
+ p.add_argument("--seed", type=int, default=42)
+ p.add_argument("--dry-run", action="store_true",
+ help="Process only 500 rows from each source and show stats (no write).")
+ return p.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+ rng = random.Random(args.seed)
+
+ if args.dry_run:
+ args.max_numina = min(args.max_numina, 500)
+ args.max_openmath = min(args.max_openmath, 500)
+ log.info("DRY RUN — capped at 500 samples per source, nothing written to disk.")
+
+ all_records: List[Dict] = []
+
+ # ── NuminaMath-CoT ────────────────────────────────────────────────────
+ if not args.skip_numina:
+ numina_recs = list(iter_numina(
+ max_samples = args.max_numina,
+ per_skill_cap = args.per_skill_cap,
+ skip_olympiad = args.skip_olympiad,
+ seed = args.seed,
+ ))
+ all_records.extend(numina_recs)
+ log.info("NuminaMath-CoT collected: %d records", len(numina_recs))
+ else:
+ log.info("Skipping NuminaMath-CoT (--skip-numina).")
+
+ # ── OpenMathInstruct-2 ────────────────────────────────────────────────
+ if not args.skip_openmath:
+ openmath_recs = list(iter_openmath(
+ max_samples = args.max_openmath,
+ per_skill_cap = args.per_skill_cap,
+ skip_gsm8k = True,
+ seed = args.seed,
+ ))
+ all_records.extend(openmath_recs)
+ log.info("OpenMathInstruct-2 collected: %d records", len(openmath_recs))
+ else:
+ log.info("Skipping OpenMathInstruct-2 (--skip-openmath).")
+
+ if not all_records:
+ log.error("No records collected — check dataset availability.")
+ sys.exit(1)
+
+ # ── Deduplicate across sources ─────────────────────────────────────────
+ seen: set = set()
+ deduped: List[Dict] = []
+ for r in all_records:
+ question = r["messages"][1]["content"]
+ h = problem_hash(question)
+ if h not in seen:
+ seen.add(h)
+ deduped.append(r)
+
+ log.info("After cross-source dedup: %d → %d records (removed %d dupes)",
+ len(all_records), len(deduped), len(all_records) - len(deduped))
+
+ # ── Stratified split ──────────────────────────────────────────────────
+ train_recs, val_recs, test_recs = stratified_split(
+ deduped, args.train_frac, args.val_frac, args.seed
+ )
+
+ print_stats(train_recs + val_recs + test_recs, "COMBINED DATASET")
+
+ # ── Write outputs ─────────────────────────────────────────────────────
+ if args.dry_run:
+ log.info("DRY RUN complete — no files written.")
+ log.info(" would write: combined_train.jsonl (%d rows)", len(train_recs))
+ log.info(" would write: combined_val.jsonl (%d rows)", len(val_recs))
+ log.info(" would write: combined_test.jsonl (%d rows)", len(test_recs))
+ log.info("Sample record:")
+ print(json.dumps(train_recs[0], indent=2, ensure_ascii=False))
+ return
+
+ out = Path(args.output_dir)
+ write_jsonl(train_recs, out / "combined_train.jsonl")
+ write_jsonl(val_recs, out / "combined_val.jsonl")
+ write_jsonl(test_recs, out / "combined_test.jsonl")
+
+ log.info("")
+ log.info("╔══════════════════════════════════════════════════════════════╗")
+ log.info("║ Pipeline complete. Next step: ║")
+ log.info("║ bash launch_grpo_combined.sh ║")
+ log.info("╚══════════════════════════════════════════════════════════════╝")
+ log.info(" train : %6d rows → %s/combined_train.jsonl", len(train_recs), out)
+ log.info(" val : %6d rows → %s/combined_val.jsonl", len(val_recs), out)
+ log.info(" test : %6d rows → %s/combined_test.jsonl", len(test_recs), out)
+ log.info("")
+ log.info("Skill coverage (for ZPD CurriculumManager):")
+ skill_c = Counter(r["skill_id"] for r in train_recs)
+ for sk, cnt in sorted(skill_c.items()):
+ log.info(" %-40s %5d train samples", sk, cnt)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/run_grpo_training.py b/scripts/run_grpo_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..1176178463a0381f90e66df10eabb648d4192a39
--- /dev/null
+++ b/scripts/run_grpo_training.py
@@ -0,0 +1,2693 @@
+"""
+GRPO training for self-improvement math environment.
+
+Group Relative Policy Optimization (GRPO) is dramatically simpler and more
+stable than PPO for LLM fine-tuning on math tasks:
+
+ - No value function / critic needed
+ - No GAE, no gamma, no lambda
+ - No KL instability from per-step advantage collapse
+ - Advantages computed as within-group z-scores: A_i = (R_i - mean_R) / std_R
+ - Proven on math RL: DeepSeek-Math, Qwen-Math, DAPO all use GRPO variants
+
+The algorithm per question:
+ 1. Generate K solutions (default K=4)
+ 2. Score each with the existing reward pipeline (PRM + SymPy + format)
+ 3. A_i = (R_i - mean(R)) / (std(R) + eps)
+ 4. policy_loss = -mean_i [ A_i * sum_t log pi(a_t | s_{ int:
+ self.primary.write(data)
+ self.secondary.write(data)
+ return len(data)
+
+ def flush(self) -> None:
+ self.primary.flush()
+ self.secondary.flush()
+
+ def isatty(self) -> bool:
+ return getattr(self.primary, "isatty", lambda: False)()
+
+ def fileno(self) -> int:
+ return self.primary.fileno()
+
+
+def _add_file_logging(log_path: Path) -> logging.FileHandler:
+ """Attach a FileHandler to the root logger.
+
+ Every logger.info / logger.warning / … call — from any module — will be
+ written to ``log_path`` in addition to the terminal. This complements
+ TeeStream: TeeStream captures bare print() / sys.stderr writes; this
+ handler captures the logging subsystem, which uses its own internal stream
+ reference that TeeStream cannot intercept.
+ """
+ fh = logging.FileHandler(log_path, mode="a", encoding="utf-8")
+ fh.setLevel(logging.DEBUG)
+ fh.setFormatter(logging.Formatter(
+ "%(asctime)s %(levelname)-8s %(name)s - %(message)s"
+ ))
+ logging.getLogger().addHandler(fh)
+ return fh
+
+
+if torch.cuda.is_available():
+ torch.set_float32_matmul_precision("high")
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+ torch.backends.cudnn.benchmark = True # auto-tune fastest conv algo per shape
+
+
+# ---------------------------------------------------------------------------
+# Data loading
+# ---------------------------------------------------------------------------
+
+def _infer_eval_dataset_name(data_path: str) -> str:
+ """Derive a short human-readable label from the eval data file path."""
+ stem = Path(data_path).stem.lower()
+ if "aqua" in stem:
+ return "AQuA-RAT"
+ if "math" in stem:
+ return "MATH"
+ if "gsm" in stem:
+ return "GSM8K"
+ return Path(data_path).stem
+
+
+def load_gsm8k(path: str) -> List[Dict[str, str]]:
+ """Return list of {"question": ..., "gold_final": ...} from a JSONL file."""
+ pairs: List[Dict[str, str]] = []
+ p = Path(path)
+ if not p.exists():
+ logger.warning("Training data not found at %s", path)
+ return pairs
+ with p.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ rec = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+
+ question = ""
+ gold = ""
+ if "question" in rec and "answer" in rec:
+ question = rec["question"].strip()
+ _, gold = parse_gsm8k_answer(str(rec["answer"]))
+ elif "messages" in rec:
+ user_text = ""
+ asst_text = ""
+ for msg in rec["messages"]:
+ if msg.get("role") == "user" and not user_text:
+ user_text = msg.get("content", "").strip()
+ elif msg.get("role") == "assistant" and not asst_text:
+ asst_text = msg.get("content", "")
+ if "Problem:" in user_text:
+ question = user_text.split("Problem:", 1)[1].strip()
+ else:
+ question = user_text
+ answer_str = extract_final_answer_numeric_str(asst_text) or ""
+ gold = answer_str.strip()
+
+ if question and gold:
+ pairs.append({"question": question, "gold_final": gold})
+ logger.info("Loaded %d QA pairs from %s", len(pairs), path)
+ return pairs
+
+
+# ---------------------------------------------------------------------------
+# MATH harder dataset
+# ---------------------------------------------------------------------------
+
+def _extract_boxed(text: str) -> Optional[str]:
+ r"""Extract the content of the first ``\boxed{...}`` in *text*."""
+ m = re.search(r"\\boxed\{([^}]*)\}", text)
+ return m.group(1).strip() if m else None
+
+
+def _boxed_to_numeric(answer: str) -> Optional[str]:
+ """
+ Convert a ``\\boxed{...}`` answer to a plain numeric string.
+
+ Returns a string of the form ``"42"`` or ``"3.5000"`` when the answer
+ is a recognisable integer, decimal, or simple fraction (``3/4`` or
+ ``\\frac{3}{4}``). Returns ``None`` for symbolic / multi-part answers
+ like ``3\\sqrt{2}`` or ``(1, 2)``.
+ """
+ ans = answer.strip()
+ # Direct integer
+ try:
+ return str(int(ans))
+ except ValueError:
+ pass
+ # Direct float (includes "3.5", "0.75", etc.)
+ try:
+ v = float(ans)
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+ except ValueError:
+ pass
+ # LaTeX fraction \frac{num}{den}
+ m = re.fullmatch(r"\\frac\{(\d+)\}\{(\d+)\}", ans)
+ if m:
+ num, den = int(m.group(1)), int(m.group(2))
+ if den:
+ v = num / den
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+ # Plain fraction num/den
+ m = re.fullmatch(r"(\d+)/(\d+)", ans)
+ if m:
+ num, den = int(m.group(1)), int(m.group(2))
+ if den:
+ v = num / den
+ return str(int(v)) if v == int(v) else f"{v:.4f}"
+ return None
+
+
+def load_math_dataset(
+ local_path: Optional[str] = None,
+ cache_path: str = "data/math/math_numeric.jsonl",
+ max_difficulty: int = 3,
+) -> List[Dict[str, str]]:
+ """
+ Load a subset of the MATH competition dataset filtered to problems with
+ numerically-verifiable answers (integers, decimals, simple fractions).
+
+ Loading order
+ -------------
+ 1. ``local_path`` if provided and the file exists.
+ 2. ``cache_path`` if that file exists (written on first HF download).
+ 3. HuggingFace ``competition_math`` dataset; filtered + written to
+ ``cache_path`` for subsequent runs.
+
+ Only problems with ``Level ≤ max_difficulty`` are included. Difficulty
+ 1-2 ≈ AMC-8 level (comparable to hard GSM8K); difficulty 3 ≈ AMC-10.
+ Levels 4-5 are graduate-level and usually too hard for a 1.5B model to
+ get any reward signal from (win_rate ≈ 0 → skipped groups every iter).
+ """
+ for candidate in filter(None, [local_path, cache_path]):
+ p = Path(candidate)
+ if p.exists():
+ pairs: List[Dict[str, str]] = []
+ with p.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ try:
+ pairs.append(json.loads(line))
+ except json.JSONDecodeError:
+ pass
+ if pairs:
+ logger.info("Loaded %d MATH pairs from %s", len(pairs), p)
+ return pairs
+
+ # Download from HuggingFace
+ logger.info(
+ "MATH dataset not found locally — downloading from HuggingFace "
+ "(qwedsacf/competition_math, difficulty ≤ %d, numeric answers only)...",
+ max_difficulty,
+ )
+ # Try HF sources in priority order. Only keep sources confirmed reachable;
+ # lighteval/MATH and hendrycks/competition_math have network/naming issues.
+ _HF_SOURCES = [
+ ("qwedsacf/competition_math", {}), # reliable public mirror
+ ("lighteval/MATH-Hard", {"name": "default"}), # hard subset
+ ]
+ ds = None
+ for hf_name, hf_kwargs in _HF_SOURCES:
+ try:
+ from datasets import load_dataset # type: ignore
+ ds = load_dataset(hf_name, split="train", trust_remote_code=True, **hf_kwargs)
+ logger.info("Loaded HuggingFace dataset: %s (%d items)", hf_name, len(ds))
+ break
+ except Exception as exc:
+ logger.warning("Could not load %s: %s — trying next source.", hf_name, exc)
+ if ds is None:
+ logger.warning(
+ "All MATH dataset sources failed. Proceeding with GSM8K only. "
+ "To load offline: download from https://github.com/hendrycks/math "
+ "and pass --math-data ."
+ )
+ return []
+
+ pairs = []
+ for item in ds:
+ level_str = item.get("level", "Level 5")
+ try:
+ level = int(level_str.split()[-1])
+ except (ValueError, IndexError):
+ level = 5
+ if level > max_difficulty:
+ continue
+
+ question = item.get("problem", "").strip()
+ solution = item.get("solution", "")
+ boxed = _extract_boxed(solution)
+ if not boxed:
+ continue
+ numeric = _boxed_to_numeric(boxed)
+ if not numeric:
+ continue
+ pairs.append({"question": question, "gold_final": numeric})
+
+ if pairs:
+ out_p = Path(cache_path)
+ out_p.parent.mkdir(parents=True, exist_ok=True)
+ with out_p.open("w", encoding="utf-8") as f:
+ for p_item in pairs:
+ f.write(json.dumps(p_item) + "\n")
+ logger.info("Cached %d MATH numeric pairs to %s", len(pairs), out_p)
+ else:
+ logger.warning("No MATH pairs passed the numeric filter — check the dataset.")
+
+ return pairs
+
+
+# ---------------------------------------------------------------------------
+# Reward
+# ---------------------------------------------------------------------------
+
+# ---------------------------------------------------------------------------
+# Self-play verification cascade
+# ---------------------------------------------------------------------------
+# Routes each self-play group to the right verification tool based on
+# problem type and difficulty, then gates the GRPO update on the result.
+# Returns False (→ skip group) when no tool can verify cleanly, preventing
+# circular PRM-only reward from anchoring the training signal.
+
+import re as _re
+
+_FINAL_ANSWER_RE = _re.compile(r"final answer[:\s]*([^\n]+)", _re.I)
+
+# Problem-type routing tables
+_PAL_TOPICS = frozenset({"arithmetic", "algebra", "prealgebra", "grounded"})
+_SYMPY_TOPICS = frozenset({
+ "number_theory", "intermediate_algebra", "precalculus",
+ "counting_and_probability",
+})
+_EXCLUDE_TOPICS = frozenset({"geometry"}) # spatial reasoning; cannot verify programmatically
+
+
+def _extract_final_answer(solution: str) -> Optional[str]:
+ """Extract the text after 'Final Answer:' from a solution string."""
+ m = _FINAL_ANSWER_RE.search(solution)
+ return m.group(1).strip() if m else None
+
+
+def _pal_eval(answer_str: str) -> Optional[float]:
+ """Tier 1: arithmetic / basic algebra via safe eval (no builtins, no names)."""
+ try:
+ val = eval(answer_str, {"__builtins__": {}}, {}) # noqa: S307
+ f = float(val)
+ return None if f != f else f # NaN guard
+ except Exception:
+ return None
+
+
+def _sympy_eval(answer_str: str) -> Optional[float]:
+ """Tier 2: symbolic evaluation via SymPy for algebra, number theory, etc."""
+ try:
+ from sympy import sympify, N as _N # type: ignore
+ f = float(_N(sympify(answer_str), 15))
+ return None if f != f else f # NaN guard
+ except Exception:
+ return None
+
+
+def _verify_self_play_answer(
+ solutions: List[str],
+ target_topic: str,
+ target_difficulty: float,
+) -> bool:
+ """
+ Tiered verification cascade for self-play groups.
+
+ Returns True only when a majority of solutions agree on an answer that an
+ independent tool (PAL eval or SymPy) can verify as a finite number.
+
+ Returns False — drop this group, no gradient — when:
+ * topic is geometry (spatial reasoning, can't verify programmatically)
+ * difficulty >= 4.0 (should have been blocked at generation, guard here too)
+ * no tool can parse a consistent numerical answer
+ * fewer than half of solutions agree on the majority answer
+
+ Coverage for GSM8K + MATH:
+ GSM8K → PAL tier, ~95%+ verified
+ MATH L1-L2 algebra → PAL + SymPy fallback, ~80% verified
+ MATH number theory / intermediate algebra → SymPy primary, ~70% verified
+ MATH geometry → excluded entirely (~3-5% of MATH)
+ MATH L4-L5 → excluded at generation time (see call site)
+ """
+ topic = target_topic.lower().replace(" ", "_")
+
+ # Hard exclusions (guard even if called after generation-time check)
+ if topic in _EXCLUDE_TOPICS or target_difficulty >= 4.0:
+ return False
+
+ answers: List[float] = []
+ for sol in solutions:
+ raw = _extract_final_answer(sol)
+ if raw is None:
+ continue
+
+ val: Optional[float]
+ if topic in _PAL_TOPICS or target_difficulty <= 2:
+ val = _pal_eval(raw) or _sympy_eval(raw)
+ elif topic in _SYMPY_TOPICS:
+ val = _sympy_eval(raw) or _pal_eval(raw)
+ else:
+ # Unknown topic: try both
+ val = _pal_eval(raw) or _sympy_eval(raw)
+
+ if val is not None:
+ answers.append(round(val, 6))
+
+ if not answers:
+ return False # Tier 4: cannot verify — exclude
+
+ majority = max(set(answers), key=answers.count)
+ return answers.count(majority) >= max(1, len(solutions) // 2)
+
+
+def compute_grounded_reward(
+ question: str,
+ solution: str,
+ gold_final: str,
+ math_env: CurriculumMathEnvironment,
+) -> Dict[str, float]:
+ """Score a solution against a known gold answer (grounded path).
+
+ Returns a dict with:
+ combined_score – 0.50×correct + 0.40×process(prm_final,prm_mean) + 0.10×fmt
+ step_accuracy – fraction of PRM steps rated > 0.5 (the core process metric)
+ prm_mean_score – PRM mean across all steps
+ prm_final_score – PRM score on the final reasoning step
+ gt_match – bool, whether pred matches gold
+ format_score – format compliance score
+ """
+ result = math_env.compute_grounded_reward(
+ question=question,
+ solution=solution,
+ gold_final=gold_final,
+ )
+ return {
+ "combined_score": float(result.get("combined_score", 0.0)),
+ "step_accuracy": float(result.get("step_accuracy", 0.0)),
+ "lccp": float(result.get("lccp", 0.0)),
+ "prm_mean_score": float(result.get("prm_mean_score", 0.0)),
+ "prm_final_score": float(result.get("prm_final_score", 0.0)),
+ "gt_match": bool(result.get("gt_match", False)),
+ "format_score": float(result.get("format_score", 0.0)),
+ }
+
+
+def compute_self_play_reward(
+ question: str,
+ solution: str,
+ target_topic: str,
+ target_difficulty: float,
+ math_env: CurriculumMathEnvironment,
+) -> Tuple[float, float, float, Dict]:
+ """Score a self-generated question + solution (self-play path).
+
+ Returns (combined_reward, question_reward, solution_reward, q_metrics).
+
+ Reward breakdown: R = 0.40×question_quality + 0.60×solution_quality,
+ where question_quality captures topic match, difficulty fit, clarity,
+ novelty, and solvability — completing the Theme #4 self-improvement loop
+ where the model is rewarded for generating *good challenges*, not only
+ for solving them.
+
+ q_metrics contains the full question quality breakdown:
+ topic_match, difficulty_fit, clarity, novelty, solvability, overall_score
+ """
+ result = math_env.compute_reward(
+ question=question,
+ solution=solution,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ )
+ combined = float(result["combined_score"])
+ sol_score = result.get("solution_metrics", {})
+ s_reward = float(sol_score.get("overall_score", 0.0)) if isinstance(sol_score, dict) else 0.0
+
+ # question_reward is NOT a top-level key in compute_reward()'s return dict.
+ # The question quality score lives inside question_metrics["overall_score"].
+ # Key mapping from QuestionEvalResult.to_dict():
+ # overall_score → scalar (overall question quality)
+ # topic_match → scalar
+ # difficulty_score → scalar (fit to target difficulty; named _score not _fit)
+ # clarity → scalar
+ # solvability_score→ scalar (the dict version is under "solvability" — don't use that)
+ # novelty_combined → scalar (the dict version is under "novelty" — don't use that)
+ q_metrics_raw = result.get("question_metrics", {}) or {}
+ # Use the gated question reward (zeroed when solution is invalid) — this is
+ # what actually contributed to combined_score, not the raw overall_score.
+ q_reward = float(result.get("effective_question_reward", q_metrics_raw.get("overall_score", 0.0)))
+ q_metrics: Dict = {
+ "overall_score": q_reward,
+ "topic_match": float(q_metrics_raw.get("topic_match", 0.0)),
+ "difficulty_fit": float(q_metrics_raw.get("difficulty_score", 0.0)),
+ "clarity": float(q_metrics_raw.get("clarity", 0.0)),
+ "novelty": float(q_metrics_raw.get("novelty_combined", 0.0)),
+ "solvability": float(q_metrics_raw.get("solvability_score", 0.0)),
+ # Chain integrity score from Phase 2+ unified calculator (None if inactive)
+ "sp_chain_integrity_score": result.get("sp_chain_integrity_score"),
+ }
+ return combined, q_reward, s_reward, q_metrics
+
+
+@torch.no_grad()
+def generate_question(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ instruction: str,
+ max_new_tokens: int,
+ device: torch.device,
+ temperature: float = 0.85,
+) -> str:
+ """Generate a math question from a curriculum instruction.
+
+ Uses centralized prompts from src/config/prompts.py to ensure consistency
+ across SFT training, GRPO, PPO, and inference.
+
+ Returns the raw decoded question text (no special tokens).
+ """
+ # Use centralized prompt configuration
+ messages = create_generator_messages(instruction)
+
+ try:
+ prompt = tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ except Exception:
+ # Fallback if chat template is missing
+ system = messages[0]["content"]
+ user = messages[1]["content"]
+ prompt = f"{system}\n\n{user}\n"
+
+ enc = tokenizer(
+ prompt, return_tensors="pt", truncation=True, max_length=512
+ ).to(device)
+ prompt_len = enc["input_ids"].shape[1]
+
+ stop_ids: List[int] = []
+ if tokenizer.eos_token_id is not None:
+ stop_ids.append(tokenizer.eos_token_id)
+ im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+ if isinstance(im_end, int) and im_end not in stop_ids:
+ stop_ids.append(im_end)
+
+ out = model.generate(
+ input_ids=enc["input_ids"],
+ attention_mask=enc["attention_mask"],
+ max_new_tokens=max_new_tokens,
+ do_sample=True,
+ temperature=temperature,
+ top_p=0.95,
+ pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+ eos_token_id=stop_ids or None,
+ use_cache=True,
+ )
+ return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
+
+
+# ---------------------------------------------------------------------------
+# Generation
+# ---------------------------------------------------------------------------
+
+@torch.no_grad()
+def generate_questions_batched(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ instruction: str,
+ K_q: int,
+ max_new_tokens: int,
+ temperature: float,
+ device: torch.device,
+) -> Tuple[List[str], List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
+ """
+ Generate K_q question candidates from a single curriculum instruction in
+ one batched model.generate() call. Returns the same four-tuple as
+ ``generate_solutions_batched`` so the question token IDs can be passed
+ directly to ``grpo_loss_for_group`` for the question-level GRPO update.
+
+ Uses the same centralized prompts (``create_generator_messages``) as
+ ``generate_question()`` so the chat format is identical whether running
+ single-question or batched two-phase generation.
+
+ Returns:
+ questions : K_q decoded question strings
+ input_ids_list : K_q full (prompt+response) token ID tensors
+ response_masks : K_q bool masks (True = non-pad response token)
+ old_log_probs : K_q scalar tensors (sum log π_old over response),
+ no_grad — used as denominator in IS ratio.
+ """
+ messages = create_generator_messages(instruction)
+ try:
+ prompt = tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ except Exception:
+ prompt = f"{system}\n\n{instruction}\n"
+
+ stop_ids = _build_stop_token_ids(tokenizer)
+ pad_id: int = (
+ tokenizer.pad_token_id
+ if tokenizer.pad_token_id is not None
+ else tokenizer.eos_token_id
+ )
+
+ enc = tokenizer(
+ prompt, return_tensors="pt", truncation=True, max_length=512
+ ).to(device)
+ prompt_len: int = enc["input_ids"].shape[1]
+
+ input_ids_batch = enc["input_ids"].expand(K_q, -1).contiguous()
+ attn_mask_batch = enc["attention_mask"].expand(K_q, -1).contiguous()
+
+ model.eval()
+ with torch.no_grad():
+ out = model.generate(
+ input_ids=input_ids_batch,
+ attention_mask=attn_mask_batch,
+ max_new_tokens=max_new_tokens,
+ do_sample=True,
+ temperature=temperature,
+ top_p=0.95,
+ pad_token_id=pad_id,
+ eos_token_id=stop_ids,
+ use_cache=True,
+ )
+
+ questions: List[str] = []
+ input_ids_list: List[torch.Tensor] = []
+ response_masks: List[torch.Tensor] = []
+
+ pad_id_t = torch.tensor(pad_id, device=device, dtype=out.dtype)
+ for i in range(K_q):
+ full_ids = out[i]
+ response_section = full_ids[prompt_len:]
+ mask = torch.zeros(full_ids.shape[0], dtype=torch.bool, device=device)
+ mask[prompt_len:] = response_section != pad_id_t
+ question = tokenizer.decode(response_section, skip_special_tokens=True).strip()
+ questions.append(question)
+ input_ids_list.append(full_ids)
+ response_masks.append(mask)
+
+ # Single batched forward pass for all K_q old log-probs (same trick as solutions).
+ old_log_probs: List[torch.Tensor] = []
+ with torch.no_grad():
+ attn_mask_lp = (out != pad_id_t)
+ attn_mask_lp[:, :prompt_len] = True
+ batch_logits = model(
+ input_ids=out,
+ attention_mask=attn_mask_lp.long(),
+ use_cache=False,
+ return_dict=True,
+ ).logits # [K_q, total_len, vocab]
+
+ for i in range(K_q):
+ full_ids = out[i]
+ mask = response_masks[i]
+ shift_logits = batch_logits[i, :-1]
+ shift_labels = full_ids[1:]
+ shift_mask = mask[1:]
+ lp_tokens = F.log_softmax(shift_logits, dim=-1)[
+ torch.arange(shift_logits.size(0), device=device),
+ shift_labels,
+ ]
+ resp_lps = lp_tokens[shift_mask]
+ old_log_probs.append(
+ resp_lps.sum().detach() if resp_lps.numel() > 0
+ else torch.tensor(0.0, device=device)
+ )
+
+ return questions, input_ids_list, response_masks, old_log_probs
+
+def _build_stop_token_ids(tokenizer: AutoTokenizer) -> List[int]:
+ """
+ Return a list of token IDs that should stop generation.
+
+ Qwen2.5-chat models end turns with <|im_end|> (ID 151645). If that
+ token is not the same as eos_token_id we include both so that .generate()
+ halts cleanly instead of running to max_new_tokens and emitting repetitive
+ garbage.
+ """
+ stop_ids: List[int] = []
+ if tokenizer.eos_token_id is not None:
+ stop_ids.append(tokenizer.eos_token_id)
+ im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+ if isinstance(im_end_id, int) and im_end_id not in stop_ids:
+ stop_ids.append(im_end_id)
+ return stop_ids or None # type: ignore[return-value]
+
+
+def generate_solutions_batched(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ prompt: str,
+ K: int,
+ max_new_tokens: int,
+ temperature: float,
+ device: torch.device,
+) -> Tuple[List[str], List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
+ """
+ Generate K solutions for a prompt in a **single batched** model.generate() call.
+
+ Batching all K sequences together achieves near-100% GPU utilisation vs
+ the old sequential loop (which was <20% utilised). On an A100 with K=8,
+ this is typically 4-8× faster than K sequential calls.
+
+ ``prompt`` must come from ``math_env.format_solution_prompt(question)``
+ so the chat-template system/user wrapping exactly matches the SFT
+ training format.
+
+ Returns:
+ solutions : K decoded strings (prompt stripped, specials removed)
+ input_ids_list : K full (prompt+response) token ID tensors
+ response_masks : K bool masks (True = non-pad response token)
+ old_log_probs : K scalar tensors, sum(log π_old(token)) over response,
+ computed no_grad — used for IS clip ratio in the loss.
+ """
+ stop_ids = _build_stop_token_ids(tokenizer)
+ pad_id: int = (
+ tokenizer.pad_token_id
+ if tokenizer.pad_token_id is not None
+ else tokenizer.eos_token_id
+ )
+
+ enc = tokenizer(
+ prompt,
+ return_tensors="pt",
+ padding=False,
+ truncation=True,
+ max_length=1024,
+ ).to(device)
+ prompt_len: int = enc["input_ids"].shape[1]
+
+ # Expand prompt K times along the batch dimension (no data copy).
+ input_ids_batch = enc["input_ids"].expand(K, -1).contiguous()
+ attn_mask_batch = enc["attention_mask"].expand(K, -1).contiguous()
+
+ model.eval()
+ with torch.no_grad():
+ out = model.generate(
+ input_ids=input_ids_batch,
+ attention_mask=attn_mask_batch,
+ max_new_tokens=max_new_tokens,
+ do_sample=True,
+ temperature=temperature,
+ top_p=0.9,
+ pad_token_id=pad_id,
+ eos_token_id=stop_ids,
+ use_cache=True,
+ )
+ # out: [K, prompt_len + padded_response_len]
+
+ # ── 1. Build masks and decode solutions ──────────────────────────────────
+ solutions: List[str] = []
+ input_ids_list: List[torch.Tensor] = []
+ response_masks: List[torch.Tensor] = []
+
+ pad_id_t = torch.tensor(pad_id, device=device, dtype=out.dtype)
+ for i in range(K):
+ full_ids = out[i]
+ response_section = full_ids[prompt_len:]
+ mask = torch.zeros(full_ids.shape[0], dtype=torch.bool, device=device)
+ mask[prompt_len:] = response_section != pad_id_t
+ solution = tokenizer.decode(response_section, skip_special_tokens=True)
+ solutions.append(solution)
+ input_ids_list.append(full_ids)
+ response_masks.append(mask)
+
+ # ── 2. Batched old_log_probs — ONE forward pass for all K sequences ───────
+ # The old sequential approach called compute_sequence_log_prob K times
+ # (K separate CPU→GPU round-trips + K forward passes). A single batched
+ # forward pass over out[K, total_len] gives the same result K× faster.
+ #
+ # Attention mask: always attend to prompt tokens; attend to response tokens
+ # only where they are non-pad. This matches what the model saw during
+ # model.generate() and prevents padding from distorting log probs.
+ old_log_probs: List[torch.Tensor] = []
+ with torch.no_grad():
+ attn_mask_lp = (out != pad_id_t) # [K, total_len]
+ attn_mask_lp[:, :prompt_len] = True # prompt always attended
+
+ batch_logits = model(
+ input_ids=out,
+ attention_mask=attn_mask_lp.long(),
+ use_cache=False,
+ return_dict=True,
+ ).logits # [K, total_len, vocab]
+
+ for i in range(K):
+ full_ids = out[i]
+ mask = response_masks[i]
+
+ shift_logits = batch_logits[i, :-1] # [total_len-1, vocab]
+ shift_labels = full_ids[1:] # [total_len-1]
+ shift_mask = mask[1:] # [total_len-1]
+
+ lp_tokens = F.log_softmax(shift_logits, dim=-1)[
+ torch.arange(shift_logits.size(0), device=device),
+ shift_labels,
+ ] # [total_len-1]
+ resp_lps = lp_tokens[shift_mask]
+ old_log_probs.append(
+ resp_lps.sum().detach() if resp_lps.numel() > 0
+ else torch.tensor(0.0, device=device)
+ )
+
+ return solutions, input_ids_list, response_masks, old_log_probs
+
+
+def compute_sequence_log_prob(
+ model: AutoModelForCausalLM,
+ input_ids: torch.Tensor,
+ response_mask: torch.Tensor,
+) -> torch.Tensor:
+ """
+ Forward pass through model to get sum of log probs for response tokens.
+
+ Returns scalar tensor (differentiable).
+ """
+ # input_ids: [seq_len] → unsqueeze to [1, seq_len]
+ ids = input_ids.unsqueeze(0)
+ # Causal LM: logits[i] predicts token[i+1]
+ outputs = model(input_ids=ids, use_cache=False, return_dict=True)
+ logits = outputs.logits[0] # [seq_len, vocab]
+
+ # Shift: predict token t+1 from logit at position t
+ shift_logits = logits[:-1] # [seq_len-1, vocab]
+ shift_labels = input_ids[1:] # [seq_len-1]
+ shift_mask = response_mask[1:] # [seq_len-1] (response tokens)
+
+ log_probs = F.log_softmax(shift_logits, dim=-1) # [seq_len-1, vocab]
+ token_log_probs = log_probs[
+ torch.arange(shift_logits.size(0), device=shift_logits.device),
+ shift_labels,
+ ] # [seq_len-1]
+
+ # Sum log probs over response tokens only
+ response_log_probs = token_log_probs[shift_mask]
+ if response_log_probs.numel() == 0:
+ return torch.tensor(0.0, requires_grad=True, device=input_ids.device)
+ return response_log_probs.sum()
+
+
+# ---------------------------------------------------------------------------
+# GRPO update for one question group
+# ---------------------------------------------------------------------------
+
+def grpo_loss_for_group(
+ model: AutoModelForCausalLM,
+ input_ids_list: List[torch.Tensor],
+ response_masks: List[torch.Tensor],
+ rewards: List[float],
+ old_log_probs: List[torch.Tensor],
+ clip_eps: float = 0.2,
+ kl_coef: float = 0.0,
+ ref_model: Optional[AutoModelForCausalLM] = None,
+ eps: float = 1e-8,
+) -> Optional[torch.Tensor]:
+ """
+ Compute GRPO loss for a group of K solutions to the same question.
+
+ IS clip (``clip_eps > 0``):
+ ratio = π_θ(response) / π_old(response) [sequence level]
+ L_GRPO = -min(ratio × A, clip(ratio, 1-ε, 1+ε) × A) / T
+
+ Reference-policy KL penalty (``kl_coef > 0``, ``ref_model`` required):
+ KL(π_θ ‖ π_ref) ≈ (log π_θ − log π_ref) / T per sequence
+ L_total = L_GRPO + β × KL
+
+ The KL term acts as an anchor: it prevents the policy from drifting so
+ far from its starting point that it forgets the SFT knowledge baked in
+ during dual_task_v1 fine-tuning. β=0.04 is a conservative starting
+ value (matches DeepSeekMath GRPO default).
+
+ Returns None if all rewards are identical (zero gradient signal).
+ """
+ rewards_arr = np.array(rewards, dtype=np.float32)
+ std_r = rewards_arr.std()
+ if std_r < eps:
+ return None
+
+ mean_r = rewards_arr.mean()
+ advantages = (rewards_arr - mean_r) / (std_r + eps)
+ advantages = np.clip(advantages, -5.0, 5.0)
+
+ _device = next(model.parameters()).device
+ group_loss = torch.tensor(0.0, device=_device)
+ n_valid = 0
+
+ model.train()
+ for ids, mask, adv, old_lp in zip(
+ input_ids_list, response_masks, advantages, old_log_probs
+ ):
+ new_lp = compute_sequence_log_prob(model, ids, mask) # differentiable
+ n_response = int(mask[1:].sum().item())
+ if n_response == 0:
+ continue
+
+ adv_t = torch.tensor(adv, dtype=new_lp.dtype, device=_device)
+
+ # ── GRPO surrogate (with optional IS clip) ────────────────────────
+ if clip_eps > 0:
+ ratio = torch.exp(new_lp - old_lp.to(_device).detach())
+ surr_unclipped = ratio * adv_t / n_response
+ surr_clipped = (
+ torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
+ * adv_t / n_response
+ )
+ loss_i = -torch.min(surr_unclipped, surr_clipped)
+ else:
+ loss_i = -(adv_t * new_lp / n_response)
+
+ # ── Reference-policy KL penalty ───────────────────────────────────
+ # KL(π_θ ‖ π_ref) = mean_token(log π_θ − log π_ref)
+ # Adding +β×KL to the minimisation objective penalises drift from
+ # the reference (frozen) checkpoint. This is differentiable through
+ # new_lp; ref_lp is always detached (no grad through frozen model).
+ if kl_coef > 0.0 and ref_model is not None:
+ with torch.no_grad():
+ ref_lp = compute_sequence_log_prob(ref_model, ids, mask)
+ kl_per_token = (new_lp - ref_lp.to(_device).detach()) / n_response
+ loss_i = loss_i + kl_coef * kl_per_token
+
+ group_loss = group_loss + loss_i
+ n_valid += 1
+
+ if n_valid == 0:
+ return None
+ return group_loss / n_valid
+
+
+# ---------------------------------------------------------------------------
+# Evaluation helpers
+# ---------------------------------------------------------------------------
+
+def _log_eval_result(label: str, res: Dict, best: Optional[float]) -> None:
+ """Print a structured evaluation summary that mirrors the training objective."""
+ cs = float(res.get("combined_score", 0.0))
+ cr = float(res.get("correct_rate", 0.0))
+ step_a = float(res.get("step_accuracy", 0.0))
+ lccp = float(res.get("lccp", 0.0))
+ prm = float(res.get("prm_mean", 0.0))
+ prm_fin = float(res.get("prm_final", 0.0))
+ fmt = float(res.get("format_mean", 0.0))
+ n_sc = int(res.get("n_scored", res.get("total", 0)))
+ fa_acc = float(res.get("final_answer_accuracy", cr))
+ pak = res.get("pass_at_k")
+ pak_k = int(res.get("pass_at_k_k", 4))
+
+ best_str = f" (best={best:.4f})" if best is not None else ""
+ logger.info(
+ "Training Score [%s]: %.4f%s | n=%d",
+ label, cs, best_str, n_sc,
+ )
+ logger.info(
+ " Components : 0.50×correct(%.1f%%) + 0.40×process + 0.10×fmt(%.3f)",
+ 100 * cr, fmt,
+ )
+ logger.info(
+ " Process score : prm_mean=%.3f prm_final=%.3f → weighted=%.3f",
+ prm, prm_fin, 0.60 * prm_fin + 0.40 * prm,
+ )
+ logger.info(
+ " Step accuracy : %.1f%% (bag-of-steps: fraction of steps PRM >0.5)",
+ 100 * step_a,
+ )
+ logger.info(
+ " Chain integrity (LCCP): %.1f%% ← fraction of steps before first failure\n"
+ " [LCCP=100%% → all steps correct; LCCP=0%% → first step wrong]",
+ 100 * lccp,
+ )
+ if pak is not None:
+ logger.info(
+ " pass@%d (T=0.8): %.1f%% | greedy correct: %.1f%% "
+ "← ceiling vs floor gap",
+ pak_k, 100 * pak, 100 * cr,
+ )
+ logger.info(
+ " (debug) final-answer accuracy: %.1f%%",
+ 100 * fa_acc,
+ )
+
+
+def evaluate_policy(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ eval_data_path: str,
+ max_samples: int,
+ max_new_tokens: int,
+ math_env: Optional[Any] = None,
+ pass_at_k: int = 4,
+) -> Dict[str, object]:
+ """Run GSM8K evaluation using the SAME reward formula as GRPO training.
+
+ When *math_env* is supplied a ``reward_fn`` is constructed that calls
+ ``math_env.compute_grounded_reward(question, solution, gold)``. This
+ returns ``combined_score = 0.50×correct + 0.40×process(0.60×prm_final
+ + 0.40×prm_mean) + 0.10×format``, making the eval metric IDENTICAL to
+ the GRPO training objective. Any improvement in step quality, chain
+ integrity, or format compliance shows up immediately in the accuracy
+ number instead of being hidden behind the coarse binary final-answer
+ signal.
+ """
+ if not Path(eval_data_path).exists():
+ return {"accuracy": 0.0, "combined_score": 0.0, "total": 0}
+ model.eval()
+
+ reward_fn = None
+ if math_env is not None:
+ import logging as _log_mod
+ _mec_logger = _log_mod.getLogger("src.rl.math_environment_curriculum")
+ _prm_logger = _log_mod.getLogger("src.rl.prm_scorer")
+
+ def reward_fn(question: str, solution: str, gold: str) -> Dict:
+ """Thin wrapper that silences per-sample INFO logs during eval."""
+ _old_mec = _mec_logger.level
+ _old_prm = _prm_logger.level
+ _mec_logger.setLevel(_log_mod.WARNING)
+ _prm_logger.setLevel(_log_mod.WARNING)
+ try:
+ return math_env.compute_grounded_reward(question, solution, gold)
+ finally:
+ _mec_logger.setLevel(_old_mec)
+ _prm_logger.setLevel(_old_prm)
+
+ results = evaluate_gsm8k(
+ model=model,
+ tokenizer=tokenizer,
+ data_path=eval_data_path,
+ max_samples=max_samples,
+ max_new_tokens=max_new_tokens,
+ reward_fn=reward_fn,
+ pass_at_k=pass_at_k,
+ dataset_name=_infer_eval_dataset_name(eval_data_path),
+ )
+ model.train()
+ return results
+
+
+# ---------------------------------------------------------------------------
+# Main training loop
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description="GRPO training for self-improvement math")
+ parser.add_argument("--base-model", default="checkpoints/dual_task_v1")
+ parser.add_argument("--output-dir", default="checkpoints/grpo")
+ parser.add_argument("--gsm8k-data", default="data/sft/gsm8k_sft.jsonl")
+ parser.add_argument("--eval-data-path", default="data/sft/dual_task_val.jsonl")
+ parser.add_argument("--num-iterations", type=int, default=30)
+ parser.add_argument(
+ "--group-size", type=int, default=4,
+ help="K: number of solutions per question per GRPO group (default 4).",
+ )
+ parser.add_argument(
+ "--q-group-size", type=int, default=1,
+ help="K_q: question candidates per self-play group (default 1 = disabled). "
+ "When ≥2, a second question-level GRPO update is added: K_q questions are "
+ "sampled from the same instruction, each solved group-size times; the "
+ "per-question reward (mean of its M solution rewards) drives a GRPO update "
+ "on the question tokens. Recommended: 2 with --group-size 4 to keep "
+ "total self-play compute the same as K_q=1 with group-size 8.",
+ )
+ parser.add_argument(
+ "--questions-per-iter", type=int, default=16,
+ help="Number of questions per training iteration (default 16).",
+ )
+ parser.add_argument("--learning-rate", type=float, default=5e-6)
+ parser.add_argument("--max-new-tokens", type=int, default=400)
+ parser.add_argument("--temperature", type=float, default=0.8)
+ parser.add_argument("--eval-every", type=int, default=5)
+ parser.add_argument("--eval-max-samples", type=int, default=250)
+ parser.add_argument("--eval-max-new-tokens", type=int, default=512)
+ parser.add_argument(
+ "--eval-pass-at-k", type=int, default=0,
+ help="Number of sampled solutions per eval problem for pass@k (0 to disable). "
+ "Makes eval directly comparable to training batch_acc (both K samples at T=0.8). "
+ "Disabled by default — enable with e.g. --eval-pass-at-k 4 for demo runs only "
+ "(adds K×eval_samples extra forward passes).",
+ )
+ parser.add_argument("--use-prm", dest="use_prm", action="store_true", default=True)
+ parser.add_argument("--no-prm", dest="use_prm", action="store_false")
+ parser.add_argument("--prm-model", default="Qwen/Qwen2.5-Math-PRM-7B")
+ parser.add_argument("--skip-initial-eval", action="store_true")
+ parser.add_argument("--run-name", default=None)
+ parser.add_argument("--max-grad-norm", type=float, default=1.0)
+ parser.add_argument(
+ "--kl-coef", type=float, default=0.04,
+ help="Reference-policy KL penalty coefficient β. 0 = disabled. Default 0.04.",
+ )
+ parser.add_argument(
+ "--math-data", type=str, default=None,
+ help="Path to MATH dataset JSONL. If absent, downloads from HuggingFace "
+ "(competition_math) and caches to data/math/math_numeric.jsonl.",
+ )
+ parser.add_argument(
+ "--math-mix-ratio", type=float, default=0.3,
+ help="Fraction of each question batch drawn from MATH (vs GSM8K). "
+ "0 = GSM8K only, 1 = MATH only. Default 0.3.",
+ )
+ parser.add_argument(
+ "--math-mix-ratio-late", type=float, default=None,
+ help="If set, ramp MATH fraction from --math-mix-ratio to this value "
+ "starting at iter 15 (linear ramp over next 10 iters). "
+ "Example: --math-mix-ratio 0.3 --math-mix-ratio-late 0.5 "
+ "raises difficulty progressively once the policy is stable.",
+ )
+ parser.add_argument(
+ "--math-ramp-start", type=int, default=15,
+ help="Iteration at which to begin the MATH ratio ramp. Default 15.",
+ )
+ parser.add_argument(
+ "--math-max-difficulty", type=int, default=3,
+ help="Maximum MATH difficulty level to include (1-5). Default 3.",
+ )
+ parser.add_argument(
+ "--clip-eps", type=float, default=0.2,
+ help="Importance-sampling clip ratio ε (PPO-style clip applied inside GRPO). "
+ "0 = disabled (plain GRPO). Default 0.2.",
+ )
+ parser.add_argument(
+ "--warmup-iters", type=int, default=3,
+ help="Number of linear LR warmup iterations before cosine decay. Default 3.",
+ )
+ parser.add_argument(
+ "--min-lr-ratio", type=float, default=0.1,
+ help="Cosine decay floor as a fraction of peak LR (default 0.1 = 10%%).",
+ )
+ parser.add_argument(
+ "--difficulty-alpha", type=float, default=2.0,
+ help="Sharpness of difficulty-weighted question sampling. "
+ "Higher = stronger preference for on-the-margin questions (win_rate ≈ 0.5). "
+ "0 = uniform random (default behaviour). Default 2.0.",
+ )
+ parser.add_argument(
+ "--overlong-filter", dest="overlong_filter",
+ action="store_true", default=True,
+ help="Skip solutions that hit max-new-tokens (truncated = no Final Answer). Default on.",
+ )
+ parser.add_argument(
+ "--no-overlong-filter", dest="overlong_filter", action="store_false",
+ help="Disable overlong-response filtering.",
+ )
+ parser.add_argument(
+ "--save-every", type=int, default=1,
+ help="Save a full checkpoint every N iterations (default 1 = every iter). "
+ "Best-policy is always saved when accuracy improves, independently of this flag.",
+ )
+ parser.add_argument(
+ "--keep-last", type=int, default=0,
+ help="Keep only the last K iter_* checkpoints on disk (0 = keep all). "
+ "best_policy/ is never pruned.",
+ )
+ parser.add_argument(
+ "--self-play-ratio", type=float, default=0.3,
+ help="Fraction of each question batch that uses SELF-PLAY (model generates the "
+ "question from a curriculum instruction, then solves it, rewarded on "
+ "0.40 × question_quality + 0.60 × solution_quality). "
+ "The remaining (1 - ratio) uses GROUNDED questions from GSM8K / MATH with "
+ "gold-answer reward. "
+ "0.0 = fully grounded (original behaviour), 1.0 = fully self-play. "
+ "Default 0.3 — mirrors the PPO default of 30%% grounded / 70%% self-play "
+ "(inverted here because grounded is our primary accuracy signal).",
+ )
+ # ── Phase-curriculum parameters ───────────────────────────────────────────
+ parser.add_argument(
+ "--min-warmup", type=int, default=10,
+ help="Minimum iterations in Phase 1 (grounded-only) before considering graduation "
+ "to Phase 2 (self-play ramp). Prevents graduating on a lucky early batch. "
+ "Default 10.",
+ )
+ parser.add_argument(
+ "--selfplay-gt-thresh", type=float, default=0.55,
+ help="gt_match_rate threshold required to graduate from Phase 1 to Phase 2. "
+ "Measures raw answer correctness (SymPy exact match), not reward-gamed "
+ "combined_score. Default 0.55.",
+ )
+ parser.add_argument(
+ "--selfplay-grounded-thresh", type=float, default=0.60,
+ help="grounded_accuracy (combined_score > 0.5) threshold for Phase 1 graduation. "
+ "Default 0.60.",
+ )
+ parser.add_argument(
+ "--selfplay-step-thresh", type=float, default=0.65,
+ help="step_accuracy (PRM steps rated > 0.5) threshold for Phase 1 graduation. "
+ "Ensures the model has learned clean step format before entering self-play. "
+ "Default 0.65.",
+ )
+ parser.add_argument(
+ "--selfplay-ramp-iters", type=int, default=20,
+ help="Number of iterations to ramp self-play ratio from ~0%% to --self-play-ratio "
+ "(Phase 2). Grounded anchor stays at ≥30%% throughout. Default 20.",
+ )
+ parser.add_argument(
+ "--grounded-floor", type=float, default=0.50,
+ help="Minimum gt_match_rate to maintain during Phase 3. If it falls below this "
+ "value, self-play is suspended until grounded performance recovers. "
+ "Should be slightly below --selfplay-gt-thresh. Default 0.50.",
+ )
+ # ── Unified accuracy calculator parameters ────────────────────────────────
+ parser.add_argument(
+ "--extractor-model", default="Qwen/Qwen2.5-0.5B-Instruct",
+ help="Small model used for step chain extraction in the unified accuracy "
+ "calculator (Phase 2+). Loaded in 4-bit to minimise VRAM. "
+ "Default Qwen/Qwen2.5-0.5B-Instruct.",
+ )
+ parser.add_argument(
+ "--extraction-cache", default=None,
+ help="Path to a pre-built JSON extraction cache from "
+ "scripts/precompute_extraction_cache.py. When provided, grounded-data "
+ "extractions are served from cache instead of calling the extractor LLM "
+ "at training time. Only novel self-play solutions require live extraction. "
+ "Default None (extraction always uses the LLM).",
+ )
+ args = parser.parse_args()
+
+ # ── Run identity ─────────────────────────────────────────────────────────
+ # Establish run_name first — everything that follows (including log paths)
+ # derives from it.
+ run_name = args.run_name or f"grpo_{datetime.now():%Y%m%d_%H%M%S}"
+ out_dir = Path(args.output_dir) / run_name
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ # ── Log directory ─────────────────────────────────────────────────────────
+ # One canonical directory for ALL run artefacts that are not model weights:
+ # console_output.log — full terminal mirror (logger.* + print + tqdm)
+ # config.json — serialised CLI args for reproducibility
+ # metrics.csv — one row per iteration, written live
+ # summary.json — written at the end of training
+ log_dir = Path("logs") / "grpo" / run_name
+ log_dir.mkdir(parents=True, exist_ok=True)
+
+ # ── Console log file ─────────────────────────────────────────────────────
+ console_log_path = log_dir / "console_output.log"
+ _console_log_file = console_log_path.open("a", encoding="utf-8", buffering=1)
+
+ # 1) FileHandler on the root logger → every logger.*() call goes to file.
+ # This is necessary because logging.StreamHandler stores a reference to
+ # sys.stderr at *creation* time (inside logging.basicConfig above), so
+ # reassigning sys.stderr later has no effect on existing handlers.
+ _file_handler = _add_file_logging(console_log_path)
+
+ # 2) TeeStream on sys.stdout / sys.stderr → every print() / tqdm bar /
+ # library write also goes to file. Both together cover 100% of output.
+ _original_stdout = sys.stdout
+ _original_stderr = sys.stderr
+ sys.stdout = TeeStream(_original_stdout, _console_log_file)
+ sys.stderr = TeeStream(_original_stderr, _console_log_file)
+
+ logger.info("=" * 70)
+ logger.info("GRPO run: %s", run_name)
+ logger.info("Checkpoints : %s", out_dir)
+ logger.info("Logs : %s", log_dir)
+ logger.info("Console log : %s", console_log_path)
+ logger.info("=" * 70)
+
+ # ── Persist config for reproducibility ───────────────────────────────────
+ (log_dir / "config.json").write_text(
+ json.dumps(vars(args), indent=2, default=str), encoding="utf-8"
+ )
+
+ # ── Live CSV metrics writer ───────────────────────────────────────────────
+ # Written one row per iteration so you can tail / open in Excel mid-run.
+ _metrics_csv_path = log_dir / "metrics.csv"
+ _csv_file: Optional[Any] = None
+ _csv_writer: Optional[Any] = None
+
+ def _append_metrics_csv(row: Dict[str, Any]) -> None:
+ """Append one metrics row to metrics.csv; writes header on first call."""
+ nonlocal _csv_file, _csv_writer
+ # Normalise floats to fixed precision so the CSV is human-readable.
+ flat = {
+ k: (f"{v:.6f}" if isinstance(v, float) else v)
+ for k, v in row.items()
+ }
+ if _csv_writer is None:
+ _csv_file = _metrics_csv_path.open("w", newline="", encoding="utf-8")
+ _csv_writer = csv.DictWriter(
+ _csv_file,
+ fieldnames=list(flat.keys()),
+ extrasaction="ignore",
+ )
+ _csv_writer.writeheader()
+ _csv_writer.writerow(flat)
+ _csv_file.flush() # type: ignore[union-attr]
+
+ # ── Teardown: restore streams and close files on any exit path ───────────
+ # atexit runs unconditionally — on normal completion, keyboard interrupt,
+ # unhandled exception, or OOM crash. This is equivalent to a finally block
+ # without requiring the entire training body to be re-indented.
+ def _teardown_logging() -> None:
+ sys.stdout = _original_stdout
+ sys.stderr = _original_stderr
+ logging.getLogger().removeHandler(_file_handler)
+ if not getattr(_file_handler.stream, "closed", False):
+ _file_handler.close()
+ if _csv_file is not None and not getattr(_csv_file, "closed", False):
+ _csv_file.close()
+ if not _console_log_file.closed:
+ _console_log_file.close()
+
+ atexit.register(_teardown_logging)
+
+ random.seed(42)
+ np.random.seed(42)
+ torch.manual_seed(42)
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ attn_impl = select_attn_implementation()
+ logger.info("Device: %s | attn: %s", device, attn_impl)
+ if torch.cuda.is_available():
+ _gpu = torch.cuda.get_device_properties(0)
+ logger.info(
+ "GPU: %s | %.1f GB VRAM | capability sm_%d%d",
+ _gpu.name, _gpu.total_memory / 1e9, _gpu.major, _gpu.minor,
+ )
+ logger.info(
+ "Run config: K=%d K_q=%d N=%d lr=%.1e T=%.2f max_new=%d | "
+ "clip_eps=%.2f kl_coef=%.4f warmup=%d | diff_alpha=%.1f | "
+ "self_play=%.0f%% grounded=%.0f%% | "
+ "math_mix=%.0f%% math_maxdiff=%d | overlong_filter=%s | "
+ "eval_every=%d eval_N=%d | grad_clip=%.2f save_every=%d keep_last=%d | "
+ "question_GRPO=%s",
+ args.group_size, args.q_group_size, args.questions_per_iter, args.learning_rate,
+ args.temperature, args.max_new_tokens,
+ args.clip_eps, args.kl_coef, args.warmup_iters,
+ args.difficulty_alpha,
+ 100 * args.self_play_ratio, 100 * (1 - args.self_play_ratio),
+ 100 * args.math_mix_ratio, args.math_max_difficulty,
+ args.overlong_filter,
+ args.eval_every, args.eval_max_samples,
+ args.max_grad_norm, args.save_every, args.keep_last,
+ f"ENABLED (K_q={args.q_group_size})" if args.q_group_size > 1 else "disabled",
+ )
+
+ # ── Load model ──────────────────────────────────────────────────────────
+ logger.info("Loading model from %s ...", args.base_model)
+ tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"
+
+ # SFT adapter checkpoints often don't save chat_template, which causes
+ # tokenizer.apply_chat_template() to raise an error inside evaluate_gsm8k
+ # — silently swallowed there, giving 0% accuracy even for a capable model.
+ if tokenizer.chat_template is None:
+ _base_model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"
+ _meta_file = Path(args.base_model) / "pipeline_meta.json"
+ if _meta_file.exists():
+ _meta = json.loads(_meta_file.read_text(encoding="utf-8"))
+ _base_model_name = _meta.get("base_model", _base_model_name)
+ logger.info(
+ "Tokenizer has no chat_template; loading from base model %s", _base_model_name
+ )
+ try:
+ _base_tok = AutoTokenizer.from_pretrained(_base_model_name, trust_remote_code=True)
+ if _base_tok.chat_template is not None:
+ tokenizer.chat_template = _base_tok.chat_template
+ logger.info("Chat template loaded successfully.")
+ except Exception as _e:
+ logger.warning("Could not load chat template from base model: %s", _e)
+
+ # PEFT <= 0.12 crashes inside merge_and_unload() when the
+ # transformers.integrations.tensor_parallel module is missing.
+ if "transformers.integrations.tensor_parallel" not in sys.modules:
+ sys.modules["transformers.integrations.tensor_parallel"] = types.ModuleType(
+ "tensor_parallel"
+ )
+
+ model_path = Path(args.base_model)
+ is_adapter = (model_path / "adapter_config.json").exists()
+
+ load_kwargs = dict(
+ torch_dtype=torch.bfloat16,
+ low_cpu_mem_usage=True,
+ device_map={"": device},
+ trust_remote_code=True,
+ attn_implementation=attn_impl,
+ )
+
+ if is_adapter:
+ # Determine actual base model from pipeline_meta.json (written by SFT pipeline).
+ _meta_path = model_path / "pipeline_meta.json"
+ _base_for_weights = "Qwen/Qwen2.5-Math-1.5B-Instruct"
+ if _meta_path.exists():
+ _base_for_weights = json.loads(
+ _meta_path.read_text(encoding="utf-8")
+ ).get("base_model", _base_for_weights)
+ logger.info("Detected PEFT adapter — loading base %s then merging %s",
+ _base_for_weights, args.base_model)
+ _base = AutoModelForCausalLM.from_pretrained(_base_for_weights, **load_kwargs)
+ model = PeftModel.from_pretrained(_base, args.base_model).merge_and_unload()
+ model = model.to(device)
+ else:
+ model = AutoModelForCausalLM.from_pretrained(args.base_model, **load_kwargs)
+
+ # PEFT.merge_and_unload() leaves requires_grad=False on every param.
+ # Re-enable unconditionally so GRPO's optimizer actually updates weights.
+ params_before = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ for p in model.parameters():
+ p.requires_grad_(True)
+ params_after = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ if params_before == 0 and params_after > 0:
+ logger.warning(
+ "All parameters were frozen on load (PEFT merge_and_unload bug). "
+ "Re-enabled requires_grad — any prior frozen runs were training nothing."
+ )
+
+ # Flash-Attn 2 turns attention memory from O(T²) to O(T), so gradient
+ # checkpointing gives almost no extra saving while costing ~30% more
+ # backward time. Disable it when Flash is active (mirrors PPO runner).
+ # gradient_checkpointing_enable requires use_reentrant=False on modern
+ # PyTorch — the default True is deprecated and causes silent issues.
+ # Also set use_cache=False: HF models can't use KV cache together with
+ # gradient checkpointing (incompatible memory management).
+ flash_active = attn_impl == "flash_attention_2"
+ if not flash_active:
+ model.gradient_checkpointing_enable(
+ gradient_checkpointing_kwargs={"use_reentrant": False}
+ )
+ if hasattr(model, "config"):
+ model.config.use_cache = False
+ logger.info("Gradient checkpointing ENABLED (use_reentrant=False, use_cache=False).")
+ else:
+ logger.info(
+ "Flash-Attn 2 active — gradient checkpointing OFF "
+ "(Flash already gives O(T) attention memory)."
+ )
+
+ n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ n_total = sum(p.numel() for p in model.parameters())
+ logger.info(
+ "Trainable parameters: %s / %s (%.1f%%)",
+ f"{n_params:,}", f"{n_total:,}", 100.0 * n_params / max(n_total, 1),
+ )
+
+ # ── Reference policy (frozen copy) ───────────────────────────────────────
+ # A deep copy of the policy at t=0, kept frozen forever. Used in the KL
+ # penalty to anchor the policy against catastrophic forgetting of SFT
+ # knowledge: L += β × (log π_θ - log π_ref) / T.
+ # Memory cost: ~3 GB (1.5B × 2 bytes BF16) — negligible on 80 GB.
+ ref_model: Optional[AutoModelForCausalLM] = None
+ if args.kl_coef > 0.0:
+ logger.info(
+ "Creating frozen reference policy (kl_coef=%.4f, ~%.1f GB VRAM)...",
+ args.kl_coef, sum(p.numel() for p in model.parameters()) * 2 / 1e9,
+ )
+ ref_model = copy.deepcopy(model)
+ ref_model.requires_grad_(False)
+ ref_model.eval()
+ logger.info("Reference policy ready.")
+ else:
+ logger.info("KL coef = 0 — no reference policy created.")
+
+ optimizer = torch.optim.AdamW(
+ [p for p in model.parameters() if p.requires_grad],
+ lr=args.learning_rate,
+ fused=torch.cuda.is_available(),
+ )
+
+ # ── LR schedule: linear warmup → cosine decay ────────────────────────────
+ # Linear warmup avoids the large initial gradient spike when the policy
+ # starts updating from an SFT checkpoint. Cosine decay then smoothly
+ # reduces LR toward min_lr as training progresses (standard in RLHF runs).
+ from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
+ _n_warmup = max(1, args.warmup_iters)
+ _n_total = max(1, args.num_iterations)
+ _n_decay = max(1, _n_total - _n_warmup)
+ _min_lr = args.learning_rate * args.min_lr_ratio
+ _warmup_sched = LinearLR(
+ optimizer,
+ start_factor=0.1,
+ end_factor=1.0,
+ total_iters=_n_warmup,
+ )
+ _cosine_sched = CosineAnnealingLR(
+ optimizer,
+ T_max=_n_decay,
+ eta_min=_min_lr,
+ )
+ scheduler = SequentialLR(
+ optimizer,
+ schedulers=[_warmup_sched, _cosine_sched],
+ milestones=[_n_warmup],
+ )
+ logger.info(
+ "LR schedule: %.1e warmup(%d iters) → cosine decay(%d iters, min=%.1e)",
+ args.learning_rate, _n_warmup, _n_decay, _min_lr,
+ )
+
+ # ── Load data ────────────────────────────────────────────────────────────
+ gsm8k_pairs = load_gsm8k(args.gsm8k_data)
+ if not gsm8k_pairs:
+ logger.error("No training data found at %s — cannot train. Exiting.", args.gsm8k_data)
+ sys.exit(1)
+
+ # Optional MATH dataset mixing
+ math_pairs: List[Dict[str, str]] = []
+ if args.math_mix_ratio > 0.0:
+ math_pairs = load_math_dataset(
+ local_path=args.math_data,
+ max_difficulty=args.math_max_difficulty,
+ )
+ if math_pairs:
+ logger.info(
+ "MATH mixing: %.0f%% MATH (%d problems) + %.0f%% GSM8K (%d problems)",
+ 100 * args.math_mix_ratio, len(math_pairs),
+ 100 * (1 - args.math_mix_ratio), len(gsm8k_pairs),
+ )
+ else:
+ logger.warning("No MATH pairs loaded — using GSM8K only.")
+
+ # Combined pool used for difficulty sampling; kept separate for VRAM-aware
+ # batch construction (sampler draws from each pool proportionally).
+ qa_pairs = gsm8k_pairs # for reward env (all GSM8K gold answers needed)
+
+ # ── Load PRM (optional) ─────────────────────────────────────────────────
+ prm: Optional[ProcessRewardScorer] = None
+ if args.use_prm:
+ try:
+ prm = ProcessRewardScorer(
+ model_name=args.prm_model,
+ device=device,
+ load_in_4bit=True,
+ )
+ logger.info("PRM loaded: %s (4-bit)", args.prm_model)
+ except Exception as exc:
+ logger.warning("PRM load failed (%s); running without PRM.", exc)
+
+ # Build a minimal math_env just for its reward utilities (compute_grounded_reward).
+ # value_model=None is safe: it's only stored as self.value and never invoked on
+ # the grounded-reward path, so GRPO avoids the ~3 GB ValueHead backbone entirely.
+ from src.rl.unified_accuracy import StepChainExtractor, UnifiedAccuracyCalculator
+ _extractor = StepChainExtractor(
+ model_name=args.extractor_model,
+ device=str(device),
+ cache_path=args.extraction_cache,
+ )
+ _unified_calc = UnifiedAccuracyCalculator(extractor=_extractor, question_evaluator=None)
+ logger.info(
+ "Unified accuracy calculator ready (extractor=%s, cache=%s)",
+ args.extractor_model,
+ args.extraction_cache or "none",
+ )
+ # Eagerly load the extractor model now to avoid a 30–60 s stall on the
+ # first training iteration that triggers live (non-cached) extraction.
+ logger.info("Warming up step-chain extractor (eager load)...")
+ _extractor.warmup()
+ logger.info("Extractor warmup complete")
+
+ # ── LLM-backed question classifier (replaces keyword regex) ─────────────
+ # Uses the already-loaded policy model for topic classification during
+ # self-play reward computation. ~60-120 ms per call, cached, falls back
+ # to regex on any error. Dramatically more accurate than keyword matching
+ # for geometry, calculus, competition_math, and statistics.
+ from src.rl.llm_question_classifier import LLMQuestionClassifier
+ _llm_classifier = LLMQuestionClassifier(
+ model=model,
+ tokenizer=tokenizer,
+ device=device,
+ cache_size=10_000,
+ )
+
+ math_env = CurriculumMathEnvironment(
+ policy_model=model,
+ value_model=None,
+ tokenizer=tokenizer,
+ # Feed all training questions as the novelty reference set so
+ # session_novelty is measured against the actual training distribution —
+ # a self-play question that mimics a dataset question gets low novelty.
+ reference_questions=[p["question"] for p in gsm8k_pairs],
+ grounded_qa_pairs=qa_pairs,
+ prm_scorer=prm,
+ max_solution_tokens=args.max_new_tokens,
+ device=device,
+ unified_accuracy_calc=_unified_calc,
+ )
+ # Inject LLM classifier into the question quality evaluator
+ math_env.question_evaluator.classifier = _llm_classifier
+ # Wire the question_evaluator into the unified calc after math_env is available
+ _unified_calc.question_evaluator = math_env.question_evaluator
+
+ # Bootstrap curriculum from dataset skill_ids when the training data
+ # contains structured records (NuminaMath / OpenMathInstruct format).
+ # Falls back to the keyword-classifier path for plain GSM8K.
+ _raw_records: list = []
+ _train_path = Path(args.gsm8k_data)
+ if _train_path.exists():
+ with _train_path.open(encoding="utf-8") as _f:
+ for _line in _f:
+ _line = _line.strip()
+ if _line:
+ try:
+ _raw_records.append(json.loads(_line))
+ except json.JSONDecodeError:
+ pass
+ if any("skill_id" in r for r in _raw_records[:20]):
+ logger.info(
+ "Detected structured dataset (%d records) — bootstrapping "
+ "curriculum from skill_ids instead of keyword classifier.",
+ len(_raw_records),
+ )
+ math_env.curriculum_manager.initialize_from_dataset(_raw_records)
+ else:
+ logger.info("Plain dataset detected — using keyword-classifier bootstrap.")
+
+ # ── Difficulty-adaptive sampling state ───────────────────────────────────
+ # Track per-question win-rate. Questions where the model scores correctly
+ # 20-80% of the time are "on the margin" and provide the richest gradient
+ # signal. Questions it always gets right (win_rate≈1) or always gets wrong
+ # (win_rate≈0) contribute little after the first few iterations.
+ from collections import defaultdict
+ _q_wins: Dict[str, int] = defaultdict(int)
+ _q_attempts: Dict[str, int] = defaultdict(int)
+
+ def _question_key(q: str) -> str:
+ """Stable hash fingerprint — collision-resistant for any pool size."""
+ import hashlib
+ return hashlib.md5(q.encode(), usedforsecurity=False).hexdigest()
+
+ def _sample_by_difficulty(
+ pool: List[Dict[str, str]], n: int, alpha: float
+ ) -> List[Dict[str, str]]:
+ """
+ Sample ``n`` questions from ``pool``, weighting by how informative each is.
+
+ Informativeness = 1 - |win_rate - 0.5| × 2 ∈ [0, 1]
+ win_rate = 0.0 or 1.0 → informativeness = 0 (model already knows / lost cause)
+ win_rate = 0.5 → informativeness = 1 (most uncertain = best signal)
+
+ ``alpha`` sharpens the weighting (higher = stronger preference for win_rate≈0.5).
+ Unseen questions get weight 0.75 to encourage exploration.
+ A 5% floor prevents any question from being permanently excluded.
+ """
+ if alpha <= 0.0:
+ return random.sample(pool, min(n, len(pool)))
+
+ weights = []
+ for qa in pool:
+ key = _question_key(qa["question"])
+ att = _q_attempts[key]
+ if att == 0:
+ w = 0.75
+ else:
+ win_rate = _q_wins[key] / att
+ info = 1.0 - abs(win_rate - 0.5) * 2.0 # ∈ [0, 1]
+ w = max(info ** alpha, 0.05)
+ weights.append(w)
+
+ total_w = sum(weights)
+ probs = [w / total_w for w in weights]
+ chosen = np.random.choice(
+ len(pool), size=min(n, len(pool)), replace=False, p=probs
+ )
+ return [pool[i] for i in chosen]
+
+ # ── Metrics log ─────────────────────────────────────────────────────────
+ metrics_log: List[Dict] = []
+
+ # ── Initial eval ─────────────────────────────────────────────────────────
+ if not args.skip_initial_eval:
+ logger.info("=" * 70)
+ logger.info("INITIAL EVALUATION (Iteration 0)")
+ logger.info("=" * 70)
+ initial_eval = evaluate_policy(
+ model, tokenizer,
+ args.eval_data_path, args.eval_max_samples, args.eval_max_new_tokens,
+ math_env=math_env,
+ pass_at_k=args.eval_pass_at_k,
+ )
+ # accuracy == combined_score = 0.50×correct + 0.40×process(prm_final,prm_mean) + 0.10×fmt
+ # This is identical to the GRPO training objective.
+ _log_eval_result("INITIAL (iter 0)", initial_eval, best=None)
+ metrics_log.append({"iteration": 0, **initial_eval})
+ best_accuracy = float(initial_eval.get("accuracy", 0.0))
+ best_combined = float(initial_eval.get("combined_score", 0.0))
+ best_prm_mean = float(initial_eval.get("prm_mean", 0.0))
+ else:
+ best_accuracy = 0.0
+ best_combined = 0.0
+ best_prm_mean = 0.0
+
+ # ── Training curriculum phase FSM ────────────────────────────────────────
+ # Phase 1 — GROUNDED_ONLY: self-play ratio is forced to 0 until the model
+ # has established reliable answer correctness (gt_match_rate) and step
+ # quality (step_accuracy) on grounded data.
+ # Phase 2 — SELFPLAY_RAMP: self-play ratio ramps from ~0 → self_play_ratio
+ # ceiling over selfplay_ramp_iters, keeping ≥30% grounded as an anchor.
+ # Phase 3 — CONTINUOUS: ratio holds at ceiling; grounded floor is monitored
+ # and self-play is suspended whenever gt_match_rate drops below the floor.
+ from enum import Enum, auto as _auto
+
+ class _Phase(Enum):
+ GROUNDED_ONLY = _auto()
+ SELFPLAY_RAMP = _auto()
+ CONTINUOUS = _auto()
+
+ _phase: _Phase = _Phase.GROUNDED_ONLY
+ _selfplay_iterations: int = 0 # iterations spent in Phase 2+
+ _selfplay_suspended: bool = False
+ _effective_sp_ratio: float = 0.0 # computed each iteration from phase
+
+ # ── Chain scoring calibration state ──────────────────────────────────────
+ # During Phase 2 SELFPLAY_RAMP the extractor runs in shadow mode (computing
+ # scores but NOT affecting rewards) to build a rolling calibration window.
+ # use_chain_scoring only flips True when both the chain↔PRM correlation AND
+ # the extraction success rate cross their thresholds — a data-driven gate,
+ # not a schedule-driven one.
+ _use_chain_as_primary: bool = False # True once calibration passes
+ _chain_prm_correlation: float = 0.0 # rolling Pearson r (chain vs PRM)
+ _extraction_success_rate: float = 0.0 # rolling extraction success fraction
+ # Cross-iteration rolling window (up to 200 paired samples)
+ _rolling_chain_scores: List[float] = []
+ _rolling_prm_scores: List[float] = []
+ _rolling_successes: List[int] = [] # 1 = successful extraction, 0 = failed
+ _CALIB_WINDOW = 50 # minimum samples before computing correlation
+ _CALIB_MAX = 200 # cap rolling lists at this length
+ # Throttle shadow extraction: only run the extractor on every Nth grounded
+ # solution during calibration. Reduces overhead ~4× while still reaching
+ # the 50-sample window within a few iterations.
+ _SHADOW_EVERY = 4
+ _shadow_extract_counter: int = 0
+
+ # ── Training ─────────────────────────────────────────────────────────────
+ for iteration in range(1, args.num_iterations + 1):
+ iter_start = time.perf_counter()
+ logger.info("=" * 70)
+ logger.info("GRPO ITERATION %d/%d", iteration, args.num_iterations)
+ logger.info("=" * 70)
+
+ # Sample questions — difficulty-weighted from the mixed pool.
+ # When math_pairs is non-empty, draw proportionally: N*ratio from MATH
+ # and N*(1-ratio) from GSM8K. The difficulty sampler handles each pool
+ # independently so MATH problems get their own win-rate tracking.
+ #
+ # MATH ratio ramp: once past --math-ramp-start, linearly increase the
+ # MATH fraction toward --math-mix-ratio-late over the next 10 iterations.
+ # This progressively raises difficulty after the policy has stabilised.
+ _effective_math_ratio = args.math_mix_ratio
+ if args.math_mix_ratio_late is not None and iteration > args.math_ramp_start:
+ _ramp_progress = min(1.0, (iteration - args.math_ramp_start) / 10.0)
+ _effective_math_ratio = (
+ args.math_mix_ratio
+ + _ramp_progress * (args.math_mix_ratio_late - args.math_mix_ratio)
+ )
+
+ if math_pairs and _effective_math_ratio > 0.0:
+ n_math = max(1, round(args.questions_per_iter * _effective_math_ratio))
+ n_gsm8k = max(1, args.questions_per_iter - n_math)
+ math_batch = _sample_by_difficulty(math_pairs, n_math, alpha=args.difficulty_alpha)
+ gsm8k_batch = _sample_by_difficulty(gsm8k_pairs, n_gsm8k, alpha=args.difficulty_alpha)
+ questions_batch = math_batch + gsm8k_batch
+ random.shuffle(questions_batch)
+ else:
+ questions_batch = _sample_by_difficulty(
+ gsm8k_pairs, args.questions_per_iter, alpha=args.difficulty_alpha
+ )
+ cur_lr = optimizer.param_groups[0]["lr"]
+ # Temperature annealing: linearly decay T from peak → min_temp over the run.
+ # Early iterations need high T for exploration; later ones need lower T
+ # to consolidate learned strategies (and close the training/eval gap).
+ _anneal_frac = min(1.0, (iteration - 1) / max(1, args.num_iterations - 1))
+ _annealed_temp = args.temperature * (1.0 - 0.5 * _anneal_frac) # 0.8 → 0.4
+ logger.info(
+ "LR this iteration: %.2e | T=%.3f | MATH ratio=%.0f%%",
+ cur_lr, _annealed_temp, 100 * _effective_math_ratio,
+ )
+
+ all_rewards: List[float] = []
+ all_q_rewards: List[float] = []
+ _grounded_rewards: List[float] = []
+ _sp_rewards: List[float] = []
+ _grounded_step_accs: List[float] = []
+ _grounded_lccps: List[float] = []
+ _grounded_gt_matches: List[bool] = []
+ # Chain scoring accumulators (populated only in Phase 2+ when
+ # math_env.use_chain_scoring is True)
+ _chain_arith_scores: List[float] = []
+ _chain_dep_scores: List[float] = []
+ _chain_integrity_scores: List[float] = []
+ _sp_chain_scores: List[float] = [] # self-play chain integrity
+ _skipped_zero_var: int = 0 # groups skipped due to zero reward variance
+ # Per-component question quality accumulators
+ _qc_topic: List[float] = []
+ _qc_diff: List[float] = []
+ _qc_clarity: List[float] = []
+ _qc_novelty: List[float] = []
+ _qc_solvability: List[float] = []
+
+ skipped = 0
+ n_groups = 0
+ n_self_play = 0
+ q_gen_attempts = 0 # total generate_question() calls
+ q_gen_valid = 0 # non-empty questions produced (len > 10 chars)
+ q_quality_good = 0 # self-play groups where question_reward > 0.5
+ total_loss_val = 0.0
+
+ # Determine how many of this iteration's groups use self-play question
+ # generation vs grounded (dataset) questions.
+ # Phase-driven ratio: Phase 1 forces 0; Phase 2 ramps from 0 to ceiling;
+ # Phase 3 holds at ceiling (args.self_play_ratio). Grounded floor recovery
+ # (computed at end of previous iteration) overrides to 0 regardless of phase.
+ if _phase == _Phase.GROUNDED_ONLY:
+ _effective_sp_ratio = 0.0
+ elif _phase == _Phase.SELFPLAY_RAMP:
+ _grounded_anchor = max(0.30, 1.0 - (_selfplay_iterations / max(1, args.selfplay_ramp_iters)))
+ _effective_sp_ratio = 1.0 - _grounded_anchor
+ else: # CONTINUOUS
+ _effective_sp_ratio = args.self_play_ratio
+
+ if _selfplay_suspended:
+ _effective_sp_ratio = 0.0 # grounded floor recovery pass
+
+ n_self_play_target = int(round(len(questions_batch) * _effective_sp_ratio))
+
+ # Build a random set of group indices that will use self-play.
+ # Random interleaving distributes self-play uniformly across the batch
+ # instead of front-loading all self-play groups, which would cause the
+ # gradient to shift mid-batch as the objective changes character.
+ _all_indices = list(range(len(questions_batch)))
+ random.shuffle(_all_indices)
+ _self_play_indices = set(_all_indices[:n_self_play_target])
+
+ # Zero gradients once before the loop — we accumulate them via
+ # per-group .backward() calls instead of building one giant graph.
+ # Keeping all K*N forward passes alive until a single backward()
+ # at the end would hold O(K*N) computation graphs in GPU memory
+ # simultaneously (64 graphs at K=4, N=16), risking OOM. Calling
+ # .backward() immediately after each group frees that graph right
+ # away; gradients accumulate in .grad tensors without extra memory.
+ optimizer.zero_grad()
+
+ pbar = tqdm(questions_batch, desc=f"Iter {iteration} GRPO groups", unit="q")
+ for _group_idx, qa in enumerate(pbar):
+
+ # ── Decide: self-play (model generates question) or grounded ─────
+ # Random interleaving: self-play slots chosen before the loop.
+ use_self_play = _group_idx in _self_play_indices
+
+ if use_self_play:
+ # ── SELF-PLAY BRANCH ─────────────────────────────────────────
+ # 1. Sample a curriculum instruction (topic + difficulty target)
+ instruction, target_topic, target_difficulty = math_env.sample_instruction()
+
+ # MATH L4-L5: exclude from self-play generation — problems at this
+ # difficulty produce unanchored reward because the verification
+ # cascade cannot reliably confirm answers. Fall back to grounded.
+ if target_difficulty >= 4.0:
+ use_self_play = False
+
+ # 2. Model generates the question from the instruction.
+ # This is the "proposer" role in Theme #4 self-improvement:
+ # the model creates its own challenge.
+ q_gen_attempts += 1
+
+ # ── TWO-PHASE QUESTION GRPO (when --q-group-size ≥ 2) ────────
+ # Phase 1: sample K_q question candidates, store their token
+ # IDs for a question-level GRPO update.
+ # Phase 2: for each candidate, generate M=group_size solutions,
+ # score them, and run a solution-level GRPO update.
+ # The per-question reward (mean solution reward) is then used
+ # to run GRPO on the question tokens — gradients flow back
+ # through the question tokens for the first time.
+ if args.q_group_size > 1:
+ _q_temp = min(0.90, _annealed_temp + 0.05)
+ q_cands, q_ids_all, q_masks_all, q_olps_all = generate_questions_batched(
+ model=model,
+ tokenizer=tokenizer,
+ instruction=instruction,
+ K_q=args.q_group_size,
+ max_new_tokens=128,
+ temperature=_q_temp,
+ device=device,
+ )
+ # Keep only candidates with enough substance
+ _valid_q = [
+ (q, ids, mask, olp)
+ for q, ids, mask, olp
+ in zip(q_cands, q_ids_all, q_masks_all, q_olps_all)
+ if len(q.strip()) >= 10
+ ]
+ if not _valid_q:
+ logger.debug("Two-phase SP: all %d question candidates too short, skipping.", args.q_group_size)
+ skipped += 1
+ continue
+ q_gen_valid += 1
+ n_self_play += 1
+
+ # Phase 2: score solutions for each valid question candidate
+ _question_agg_rewards: List[float] = [] # one per valid candidate
+ _q_total_loss_val: float = 0.0
+
+ for _q_text, _q_ids, _q_mask, _q_olp in _valid_q:
+ solution_prompt = math_env.format_solution_prompt(_q_text)
+ sols_q, ids_q, masks_q, olps_q = generate_solutions_batched(
+ model=model,
+ tokenizer=tokenizer,
+ prompt=solution_prompt,
+ K=args.group_size,
+ max_new_tokens=args.max_new_tokens,
+ temperature=_annealed_temp,
+ device=device,
+ )
+ # Overlong filter
+ if args.overlong_filter:
+ _vf = [
+ t for t in zip(sols_q, ids_q, masks_q, olps_q)
+ if int(t[2].sum().item()) < args.max_new_tokens
+ ]
+ if _vf:
+ sols_q, ids_q, masks_q, olps_q = map(list, zip(*_vf)) # type: ignore
+ else:
+ skipped += 1
+ _question_agg_rewards.append(0.0)
+ continue
+
+ # Score solutions
+ _sol_rewards: List[float] = []
+ for _sol in sols_q:
+ _r, _q_rew, _, _q_met = compute_self_play_reward(
+ question=_q_text,
+ solution=_sol,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ math_env=math_env,
+ )
+ _sol_rewards.append(_r)
+ all_q_rewards.append(_q_rew)
+ _qc_topic.append(_q_met["topic_match"])
+ _qc_diff.append(_q_met["difficulty_fit"])
+ _qc_clarity.append(_q_met["clarity"])
+ _qc_novelty.append(_q_met["novelty"])
+ _qc_solvability.append(_q_met["solvability"])
+
+ all_rewards.extend(_sol_rewards)
+ _sp_rewards.extend(_sol_rewards)
+
+ # Aggregate question reward = mean of its solution rewards
+ _q_agg = float(np.mean(_sol_rewards))
+ _question_agg_rewards.append(_q_agg)
+
+ # ── Solution-level GRPO update ───────────────────────
+ _sol_loss = grpo_loss_for_group(
+ model=model,
+ input_ids_list=ids_q,
+ response_masks=masks_q,
+ rewards=_sol_rewards,
+ old_log_probs=olps_q,
+ clip_eps=args.clip_eps,
+ kl_coef=args.kl_coef,
+ ref_model=ref_model,
+ )
+ if _sol_loss is not None:
+ _sol_loss.backward()
+ total_loss_val += _sol_loss.item()
+ _q_total_loss_val += _sol_loss.item()
+ n_groups += 1
+ else:
+ skipped += 1
+ _skipped_zero_var += 1
+
+ # ── Question-level GRPO update ───────────────────────────
+ # Advantages are computed over the K_q question-reward
+ # scalars. The IS ratio is exp(new_lp_question - old_lp_question).
+ # kl_coef=0 here: there is no reference distribution for questions.
+ _q_ids_v = [t[1] for t in _valid_q]
+ _q_masks_v = [t[2] for t in _valid_q]
+ _q_olps_v = [t[3] for t in _valid_q]
+
+ _q_loss = grpo_loss_for_group(
+ model=model,
+ input_ids_list=_q_ids_v,
+ response_masks=_q_masks_v,
+ rewards=_question_agg_rewards,
+ old_log_probs=_q_olps_v,
+ clip_eps=args.clip_eps,
+ kl_coef=0.0, # no ref model for question tokens
+ ref_model=None,
+ )
+ if _q_loss is not None:
+ _q_loss.backward()
+ logger.debug(
+ "Q-GRPO: loss=%.4f q_rewards=%s (variance=%.4f)",
+ _q_loss.item(),
+ [f"{r:.3f}" for r in _question_agg_rewards],
+ float(np.var(_question_agg_rewards)),
+ )
+
+ # Group-level quality: at least one candidate scored > 0.5
+ if any(r > 0.5 for r in _question_agg_rewards):
+ q_quality_good += 1
+
+ # pbar update then skip to next group (all done above)
+ _mean_r_sp = float(np.mean(all_rewards[-len(_valid_q)*args.group_size:])) if all_rewards else 0.0
+ _q_acc_pct = 100.0 * q_quality_good / max(1, n_self_play)
+ pbar.set_postfix(
+ loss=f"{_q_total_loss_val / max(1, len(_valid_q)):.4f}",
+ mean_r=f"{_mean_r_sp:.3f}",
+ q_acc=f"{_q_acc_pct:.0f}%",
+ q_rew=f"{float(np.mean(all_q_rewards)):.3f}" if all_q_rewards else "n/a",
+ skip=skipped,
+ )
+ continue # ← everything handled above; jump to next group
+
+ # ── K_q=1: original single-question path (no question GRPO) ──
+ question = generate_question(
+ model=model,
+ tokenizer=tokenizer,
+ instruction=instruction,
+ max_new_tokens=128, # questions are short
+ device=device,
+ # Slightly warmer than solution temperature for diversity,
+ # but anneals with the same schedule to stay consistent.
+ temperature=min(0.90, _annealed_temp + 0.05),
+ )
+ # A valid question must have at least some substance.
+ # Reject single-word, empty, or nonsensical outputs.
+ if len(question.strip()) < 10:
+ logger.debug(
+ "Self-play: generated question too short (%d chars), skipping group.",
+ len(question.strip()),
+ )
+ skipped += 1
+ continue
+ q_gen_valid += 1
+ n_self_play += 1
+ gold = None # no gold answer — rewarded on question quality
+ else:
+ # ── GROUNDED BRANCH ──────────────────────────────────────────
+ # Use pre-existing dataset question with known gold answer.
+ question = qa["question"]
+ gold = qa["gold_final"]
+ target_topic = "grounded"
+ target_difficulty = 0.5
+
+ # --- Generate K solutions (batched — single model.generate call) ---
+ solution_prompt = math_env.format_solution_prompt(question)
+ solutions, input_ids_list, response_masks, old_log_probs_list = (
+ generate_solutions_batched(
+ model=model,
+ tokenizer=tokenizer,
+ prompt=solution_prompt,
+ K=args.group_size,
+ max_new_tokens=args.max_new_tokens,
+ temperature=_annealed_temp,
+ device=device,
+ )
+ )
+
+ # --- Overlong filter: drop truncated solutions (no Final Answer) ---
+ # A response that hit max_new_tokens was cut off mid-generation;
+ # it almost certainly didn't produce a valid "Final Answer: X" line,
+ # so its reward is unreliable noise. Dropping it keeps the group
+ # advantage estimates clean.
+ if args.overlong_filter:
+ _valid = [
+ (sol, ids, mask, olp)
+ for sol, ids, mask, olp
+ in zip(solutions, input_ids_list, response_masks, old_log_probs_list)
+ if int(mask.sum().item()) < args.max_new_tokens
+ ]
+ if _valid:
+ solutions, input_ids_list, response_masks, old_log_probs_list = (
+ zip(*_valid) # type: ignore[assignment]
+ )
+ solutions = list(solutions)
+ input_ids_list = list(input_ids_list)
+ response_masks = list(response_masks)
+ old_log_probs_list = list(old_log_probs_list)
+ else:
+ # All K solutions were truncated — skip group.
+ skipped += 1
+ continue
+
+ # --- Score each solution (self-play: Q+S reward; grounded: S only) ---
+ rewards = []
+ _sp_q_rew_this_group: List[float] = []
+ for sol in solutions:
+ if use_self_play:
+ # compute_reward = 0.40×question_quality + 0.60×solution_quality
+ # This is the core Theme #4 signal: the model is rewarded
+ # for generating a well-formed, appropriately difficult,
+ # solvable question AND for solving it correctly.
+ r, q_rew, _, q_met = compute_self_play_reward(
+ question=question,
+ solution=sol,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ math_env=math_env,
+ )
+ _sp_q_rew_this_group.append(q_rew)
+ all_q_rewards.append(q_rew)
+ # Collect per-component breakdown (same question, all K solutions
+ # get the same q_metrics — average to reduce noise).
+ _qc_topic.append(q_met["topic_match"])
+ _qc_diff.append(q_met["difficulty_fit"])
+ _qc_clarity.append(q_met["clarity"])
+ _qc_novelty.append(q_met["novelty"])
+ _qc_solvability.append(q_met["solvability"])
+ # Self-play chain integrity (Phase 2+ only; None in Phase 1)
+ _sp_ci = q_met.get("sp_chain_integrity_score")
+ if _sp_ci is not None:
+ _sp_chain_scores.append(float(_sp_ci))
+ else:
+ r_dict = compute_grounded_reward(
+ question=question,
+ solution=sol,
+ gold_final=gold,
+ math_env=math_env,
+ )
+ r = r_dict["combined_score"]
+ _grounded_step_accs.append(r_dict["step_accuracy"])
+ _grounded_lccps.append(r_dict["lccp"])
+ _grounded_gt_matches.append(bool(r_dict["gt_match"]))
+ if r_dict.get("chain_arith_score") is not None:
+ _chain_arith_scores.append(float(r_dict["chain_arith_score"]))
+ if r_dict.get("chain_dep_score") is not None:
+ _chain_dep_scores.append(float(r_dict["chain_dep_score"]))
+ if r_dict.get("chain_integrity_score") is not None:
+ _chain_integrity_scores.append(float(r_dict["chain_integrity_score"]))
+
+ # Shadow extraction for calibration: during SELFPLAY_RAMP,
+ # run the chain extractor even before use_chain_scoring is
+ # activated so we can measure chain↔PRM correlation. These
+ # scores do NOT affect the reward — they only feed the
+ # calibration window that decides when to flip use_chain_scoring.
+ # Throttled to every _SHADOW_EVERY solutions to avoid making
+ # each iteration ~10× slower (extractor adds ~8s per call).
+ _shadow_extract_counter += 1
+ if (
+ _phase == _Phase.SELFPLAY_RAMP
+ and not _use_chain_as_primary
+ and _unified_calc is not None
+ and _shadow_extract_counter % _SHADOW_EVERY == 0
+ ):
+ _prm_ps = (
+ 0.60 * r_dict.get("prm_final_score", 0.0)
+ + 0.40 * r_dict.get("prm_mean_score", 0.0)
+ )
+ try:
+ _shadow = _unified_calc.compute(
+ solution=sol,
+ gold_answer=gold,
+ question=question,
+ topic=target_topic,
+ phase="grounded",
+ )
+ _rolling_chain_scores.append(_shadow.chain_integrity_score)
+ _rolling_prm_scores.append(_prm_ps)
+ _rolling_successes.append(1 if _shadow.extraction_succeeded else 0)
+ except Exception:
+ _rolling_successes.append(0)
+ rewards.append(r)
+ all_rewards.extend(rewards)
+ # Route to path-specific accumulators for separate batch_acc reporting
+ if use_self_play:
+ _sp_rewards.extend(rewards)
+ else:
+ _grounded_rewards.extend(rewards)
+
+ # A self-play group is "accurate" if the question it generated scored
+ # above 0.5 on question quality — meaning it was clear, on-topic,
+ # appropriately difficult, and solvable.
+ if use_self_play and _sp_q_rew_this_group:
+ if float(np.mean(_sp_q_rew_this_group)) > 0.5:
+ q_quality_good += 1
+
+ # --- PAL/SymPy verification gate (self-play only) ---
+ # Drop the group if the tiered cascade cannot confirm a consistent,
+ # independently-verifiable answer. This prevents circular PRM reward
+ # from being the sole correctness anchor on self-play examples.
+ if use_self_play:
+ if not _verify_self_play_answer(solutions, target_topic, target_difficulty):
+ skipped += 1
+ continue # no gradient for this group
+
+ # --- Update difficulty stats (grounded questions only — self-play
+ # questions are ephemeral and have no stable key) ---
+ if not use_self_play:
+ _key = _question_key(question)
+ _q_attempts[_key] += len(solutions)
+ # Win = reward in the top half of THIS group, not an absolute 0.5 threshold.
+ # Using a relative threshold avoids the case where all solutions score 0.55
+ # (all "wins" → easy) or all score 0.45 (all "losses" → impossible) when the
+ # rewards are actually similar and carry no difficulty information.
+ _group_median = float(np.median(rewards))
+ _q_wins[_key] += sum(1 for r in rewards if r > _group_median)
+
+ # --- GRPO loss (IS clip + optional KL penalty) + immediate backward ---
+ # Skip near-uniform groups early: when reward std < 0.02 (on a [0,1]
+ # scale) all advantages collapse to ~0 and the gradient contribution
+ # is negligible — equivalent to wasted compute. This is a stricter
+ # guard than the eps=1e-8 inside grpo_loss_for_group, which only
+ # catches exactly-equal rewards (e.g. all 0.998 passes through it).
+ _reward_std = float(np.std(rewards))
+ if _reward_std < 0.02:
+ skipped += 1
+ _skipped_zero_var += 1
+ _pf_zv: Dict = dict(mean_r=f"{np.mean(rewards):.3f}", skip=skipped, loss="0var")
+ pbar.set_postfix(**_pf_zv)
+ continue
+
+ group_loss = grpo_loss_for_group(
+ model=model,
+ input_ids_list=input_ids_list,
+ response_masks=response_masks,
+ rewards=rewards,
+ old_log_probs=old_log_probs_list,
+ clip_eps=args.clip_eps,
+ kl_coef=args.kl_coef,
+ ref_model=ref_model,
+ )
+
+ if group_loss is None:
+ skipped += 1
+ _skipped_zero_var += 1
+ _pf: Dict = dict(mean_r=f"{np.mean(rewards):.3f}", skip=skipped, loss="skip")
+ if n_self_play > 0 and all_q_rewards:
+ _q_acc_pct = 100.0 * q_quality_good / max(1, n_self_play)
+ _pf["q_acc"] = f"{_q_acc_pct:.0f}%"
+ pbar.set_postfix(**_pf)
+ continue
+
+ # Backprop immediately — frees this group's computation graph.
+ # Gradients from all valid groups accumulate in param.grad.
+ group_loss.backward()
+ total_loss_val += group_loss.item()
+ n_groups += 1
+ _pf = dict(
+ mean_r=f"{np.mean(rewards):.3f}",
+ loss=f"{group_loss.item():.4f}",
+ skip=skipped,
+ )
+ if n_self_play > 0 and all_q_rewards:
+ # Show live question-gen accuracy in the tqdm bar.
+ # q_acc = fraction of self-play groups whose generated question
+ # scored > 0.5 on quality (clear, on-topic, solvable).
+ _q_acc_pct = 100.0 * q_quality_good / max(1, n_self_play)
+ _pf["q_acc"] = f"{_q_acc_pct:.0f}%"
+ _pf["q_rew"] = f"{float(np.mean(all_q_rewards)):.3f}"
+ pbar.set_postfix(**_pf)
+
+ # --- Gradient step: normalise accumulated grads then step ---
+ if n_groups > 0:
+ # Divide accumulated grads by n_groups to get the true average
+ # (equivalent to averaging the group losses before backward).
+ if n_groups > 1:
+ for p in model.parameters():
+ if p.grad is not None:
+ p.grad.div_(n_groups)
+ torch.nn.utils.clip_grad_norm_(
+ [p for p in model.parameters() if p.requires_grad],
+ args.max_grad_norm,
+ )
+ optimizer.step()
+ loss_val = total_loss_val / n_groups
+ else:
+ loss_val = 0.0
+ scheduler.step()
+
+ iter_time = time.perf_counter() - iter_start
+ mean_r = float(np.mean(all_rewards)) if all_rewards else 0.0
+ std_r = float(np.std(all_rewards)) if all_rewards else 0.0
+ acc_r = float(np.mean([r > 0.5 for r in all_rewards])) if all_rewards else 0.0
+ grounded_acc_r = (
+ float(np.mean([r > 0.5 for r in _grounded_rewards]))
+ if _grounded_rewards else 0.0
+ )
+ mean_step_acc = (
+ float(np.mean(_grounded_step_accs))
+ if _grounded_step_accs else 0.0
+ )
+ mean_lccp = (
+ float(np.mean(_grounded_lccps))
+ if _grounded_lccps else 0.0
+ )
+ mean_q_r = float(np.mean(all_q_rewards)) if all_q_rewards else 0.0
+
+ # Chain scoring batch means (non-None only in Phase 2+)
+ mean_chain_arith = float(np.mean(_chain_arith_scores)) if _chain_arith_scores else None
+ mean_chain_dep = float(np.mean(_chain_dep_scores)) if _chain_dep_scores else None
+ mean_chain_integrity = float(np.mean(_chain_integrity_scores)) if _chain_integrity_scores else None
+ mean_sp_chain = float(np.mean(_sp_chain_scores)) if _sp_chain_scores else None
+
+ # ── gt_match_rate: raw answer-correctness on grounded examples ────────
+ # This is the primary Phase-1 graduation signal — unlike grounded_acc_r
+ # which is (combined_score > 0.5), gt_match_rate is the direct SymPy
+ # exact-match fraction and cannot be gamed by a high PRM/format score.
+ gt_match_rate = (
+ float(sum(_grounded_gt_matches) / len(_grounded_gt_matches))
+ if _grounded_gt_matches else 0.0
+ )
+
+ # ── Phase FSM transitions ─────────────────────────────────────────────
+ if _phase == _Phase.GROUNDED_ONLY:
+ _graduation_ready = (
+ gt_match_rate >= args.selfplay_gt_thresh
+ and grounded_acc_r >= args.selfplay_grounded_thresh
+ and mean_step_acc >= args.selfplay_step_thresh
+ and iteration >= args.min_warmup
+ )
+ if _graduation_ready:
+ _phase = _Phase.SELFPLAY_RAMP
+ logger.info(
+ "PHASE → SELFPLAY_RAMP at iter %d "
+ "(gt_match=%.2f grounded_acc=%.2f step_acc=%.2f) — "
+ "shadow extraction active; chain scoring deferred until "
+ "calibration passes (corr≥0.70, success_rate≥0.80)",
+ iteration, gt_match_rate, grounded_acc_r, mean_step_acc,
+ )
+ # NOTE: do NOT set math_env.use_chain_scoring = True here.
+ # The extractor runs in shadow mode first; use_chain_scoring
+ # flips to True below once calibration thresholds are met.
+ elif _phase in (_Phase.SELFPLAY_RAMP, _Phase.CONTINUOUS):
+ _selfplay_iterations += 1
+ if _phase == _Phase.SELFPLAY_RAMP and _selfplay_iterations >= args.selfplay_ramp_iters:
+ _phase = _Phase.CONTINUOUS
+ logger.info(
+ "PHASE → CONTINUOUS at iter %d (ramp complete after %d iters)",
+ iteration, _selfplay_iterations,
+ )
+
+ # ── Data-driven chain scoring activation ─────────────────────────
+ # Trim rolling window to _CALIB_MAX before computing correlation.
+ if len(_rolling_chain_scores) > _CALIB_MAX:
+ _rolling_chain_scores = _rolling_chain_scores[-_CALIB_MAX:]
+ _rolling_prm_scores = _rolling_prm_scores[-_CALIB_MAX:]
+ _rolling_successes = _rolling_successes[-_CALIB_MAX:]
+
+ if not _use_chain_as_primary and len(_rolling_chain_scores) >= _CALIB_WINDOW:
+ from scipy.stats import pearsonr # noqa: PLC0415
+ try:
+ _r, _ = pearsonr(
+ _rolling_chain_scores[-_CALIB_WINDOW:],
+ _rolling_prm_scores[-_CALIB_WINDOW:],
+ )
+ _chain_prm_correlation = float(_r)
+ except Exception:
+ _chain_prm_correlation = 0.0
+ _rolling_n = len(_rolling_successes[-_CALIB_WINDOW:])
+ _extraction_success_rate = (
+ sum(_rolling_successes[-_CALIB_WINDOW:]) / _rolling_n
+ if _rolling_n > 0 else 0.0
+ )
+ if (
+ _chain_prm_correlation >= 0.70
+ and _extraction_success_rate >= 0.80
+ ):
+ _use_chain_as_primary = True
+ math_env.use_chain_scoring = True
+ logger.info(
+ "CHAIN PRIMARY activated at iter %d: "
+ "corr=%.2f extraction_rate=%.2f (window=%d) — "
+ "unified calculator now drives reward scoring",
+ iteration, _chain_prm_correlation,
+ _extraction_success_rate, _CALIB_WINDOW,
+ )
+ else:
+ logger.debug(
+ "Chain calibration: corr=%.2f success_rate=%.2f "
+ "(need corr≥0.70, success≥0.80; window=%d/%d)",
+ _chain_prm_correlation, _extraction_success_rate,
+ len(_rolling_chain_scores), _CALIB_WINDOW,
+ )
+
+ # Grounded floor monitoring: suspend self-play if answer correctness
+ # drops below the floor set at graduation minus 5pp. Self-play
+ # resumes automatically next iteration if performance recovers.
+ _prev_suspended = _selfplay_suspended
+ _selfplay_suspended = (
+ bool(_grounded_gt_matches) and gt_match_rate < args.grounded_floor
+ )
+ if _selfplay_suspended and not _prev_suspended:
+ logger.warning(
+ "GROUNDED FLOOR: gt_match_rate=%.2f fell below floor=%.2f — "
+ "suspending self-play for recovery",
+ gt_match_rate, args.grounded_floor,
+ )
+ elif not _selfplay_suspended and _prev_suspended:
+ logger.info(
+ "GROUNDED FLOOR: gt_match_rate=%.2f recovered above floor=%.2f — "
+ "resuming self-play",
+ gt_match_rate, args.grounded_floor,
+ )
+
+ # Question generation accuracy metrics (self-play only)
+ q_gen_valid_rate = (q_gen_valid / q_gen_attempts) if q_gen_attempts > 0 else 0.0
+ q_quality_rate = (q_quality_good / n_self_play) if n_self_play > 0 else 0.0
+ # Per-component averages (all non-empty across K solutions × groups)
+ mean_q_topic = float(np.mean(_qc_topic)) if _qc_topic else 0.0
+ mean_q_diff = float(np.mean(_qc_diff)) if _qc_diff else 0.0
+ mean_q_clarity = float(np.mean(_qc_clarity)) if _qc_clarity else 0.0
+ mean_q_novelty = float(np.mean(_qc_novelty)) if _qc_novelty else 0.0
+ mean_q_solvab = float(np.mean(_qc_solvability)) if _qc_solvability else 0.0
+
+ _cur_lr = optimizer.param_groups[0]["lr"]
+
+ # ── LLM classifier stats (every 5 iters to avoid log spam) ─────────
+ if iteration % 5 == 0:
+ _llm_classifier.log_stats()
+
+ # ── Primary summary line ─────────────────────────────────────────────
+ logger.info(
+ "Iter %d | loss=%.4f | reward mean=%.3f std=%.3f | "
+ "gt_match=%.1f%% | grounded_acc=%.1f%% | step_acc=%.1f%% | lccp=%.1f%% | "
+ "batch_acc=%.1f%% | phase=%s sp_ratio=%.0f%% | "
+ "groups=%d skipped=%d(0var=%d) | lr=%.2e | %.1fs",
+ iteration, loss_val, mean_r, std_r,
+ 100 * gt_match_rate,
+ 100 * grounded_acc_r,
+ 100 * mean_step_acc,
+ 100 * mean_lccp,
+ 100 * acc_r,
+ _phase.name, 100 * _effective_sp_ratio,
+ n_groups, skipped, _skipped_zero_var, _cur_lr, iter_time,
+ )
+ # Starvation warning: if >30% of groups were skipped due to zero reward
+ # variance (all K solutions same score), the curriculum difficulty is
+ # mis-calibrated — either too easy (all correct) or too hard (all wrong).
+ _total_attempted = n_groups + skipped
+ if _total_attempted > 0 and _skipped_zero_var / _total_attempted > 0.30:
+ logger.warning(
+ "STARVATION: %.0f%% of groups skipped (zero variance). "
+ "grounded_acc=%.1f%% suggests curriculum is %s. "
+ "Consider adjusting --difficulty-alpha.",
+ 100 * _skipped_zero_var / _total_attempted,
+ 100 * grounded_acc_r,
+ "too easy (raise alpha)" if grounded_acc_r > 0.75 else "too hard (lower alpha)",
+ )
+
+ # ── Question-generation accuracy line (only when self-play is active) ─
+ if n_self_play > 0:
+ logger.info(
+ " Question generation: %d/%d valid (%.0f%%) | "
+ "q_reward=%.3f | q_acc=%.1f%% (>0.5 quality) | "
+ "topic=%.2f diff=%.2f clarity=%.2f novelty=%.2f solvability=%.2f",
+ q_gen_valid, q_gen_attempts, 100 * q_gen_valid_rate,
+ mean_q_r, 100 * q_quality_rate,
+ mean_q_topic, mean_q_diff, mean_q_clarity,
+ mean_q_novelty, mean_q_solvab,
+ )
+
+ iter_metrics: Dict = {
+ "iteration": iteration,
+ "loss": loss_val,
+ "mean_reward": mean_r,
+ "std_reward": std_r,
+ "batch_accuracy": acc_r,
+ "grounded_accuracy": grounded_acc_r,
+ "gt_match_rate": round(gt_match_rate, 4),
+ "step_accuracy": mean_step_acc,
+ "lccp": mean_lccp,
+ "n_groups": n_groups,
+ "skipped_groups": skipped,
+ "learning_rate": _cur_lr,
+ "iter_time_s": iter_time,
+ # ── Phase curriculum metrics ────────────────────────────────────
+ "training_phase": _phase.name,
+ "effective_sp_ratio": round(_effective_sp_ratio, 3),
+ "selfplay_suspended": int(_selfplay_suspended),
+ # ── Chain scoring metrics (Phase 2+, None in Phase 1) ────────────
+ "chain_arith_score": round(mean_chain_arith, 4) if mean_chain_arith is not None else None,
+ "chain_dep_score": round(mean_chain_dep, 4) if mean_chain_dep is not None else None,
+ "chain_integrity_score": round(mean_chain_integrity, 4) if mean_chain_integrity is not None else None,
+ "sp_chain_integrity_score": round(mean_sp_chain, 4) if mean_sp_chain is not None else None,
+ # ── Chain calibration metrics (populated during SELFPLAY_RAMP shadow mode)
+ "chain_prm_correlation": round(_chain_prm_correlation, 3),
+ "extraction_success_rate": round(_extraction_success_rate, 3),
+ "chain_scoring_active": int(_use_chain_as_primary),
+ # ── Question-generation metrics ─────────────────────────────────
+ "n_self_play_groups": n_self_play,
+ "q_gen_attempts": q_gen_attempts,
+ "q_gen_valid": q_gen_valid,
+ "q_gen_valid_rate": round(q_gen_valid_rate, 4),
+ "mean_question_reward": round(mean_q_r, 4),
+ "q_quality_rate": round(q_quality_rate, 4),
+ "q_topic_match": round(mean_q_topic, 4),
+ "q_difficulty_fit": round(mean_q_diff, 4),
+ "q_clarity": round(mean_q_clarity, 4),
+ "q_novelty": round(mean_q_novelty, 4),
+ "q_solvability": round(mean_q_solvab, 4),
+ }
+
+ # --- Eval ---
+ if iteration % args.eval_every == 0:
+ _eval_ds_label = _infer_eval_dataset_name(args.eval_data_path)
+ logger.info("Evaluating %s (%d samples)...", _eval_ds_label, args.eval_max_samples)
+ eval_res = evaluate_policy(
+ model, tokenizer,
+ args.eval_data_path, args.eval_max_samples, args.eval_max_new_tokens,
+ math_env=math_env,
+ pass_at_k=args.eval_pass_at_k,
+ )
+ # accuracy == combined_score: 0.50×correct + 0.40×process(prm_final,prm_mean) + 0.10×fmt
+ cur_combined = float(eval_res.get("combined_score", best_combined))
+ cur_prm_mean = float(eval_res.get("prm_mean", best_prm_mean))
+
+ _log_eval_result(f"iter {iteration}", eval_res, best=best_combined)
+
+ # ── Checkpoint: save when combined_score strictly improves ────────
+ # combined_score is a continuous variable; any improvement in
+ # correctness, PRM quality, SymPy, or format moves it.
+ if cur_combined > best_combined + 1e-4:
+ reason = f"combined {cur_combined:.4f} > {best_combined:.4f}"
+ best_combined = cur_combined
+ best_prm_mean = max(best_prm_mean, cur_prm_mean)
+ best_accuracy = best_combined
+ best_path = out_dir / "best_policy"
+ model.save_pretrained(str(best_path))
+ tokenizer.save_pretrained(str(best_path))
+ logger.info("New best saved → %s (%s)", best_path, reason)
+
+ iter_metrics.update(eval_res)
+
+ # --- Save checkpoint (respect --save-every / --keep-last) ---
+ is_last_iter = iteration == args.num_iterations
+ should_save = is_last_iter or (
+ args.save_every > 0 and iteration % args.save_every == 0
+ )
+ if should_save:
+ ckpt_path = out_dir / f"iter_{iteration:04d}"
+ ckpt_path.mkdir(exist_ok=True)
+ model.save_pretrained(str(ckpt_path))
+ tokenizer.save_pretrained(str(ckpt_path))
+
+ # Prune older iter_* checkpoints beyond the rolling window.
+ if args.keep_last and args.keep_last > 0:
+ existing = sorted(
+ p for p in out_dir.iterdir()
+ if p.is_dir() and p.name.startswith("iter_")
+ )
+ to_remove = existing[: -args.keep_last]
+ for old in to_remove:
+ try:
+ shutil.rmtree(old)
+ logger.info("Pruned old checkpoint: %s", old.name)
+ except OSError as exc:
+ logger.warning("Could not prune %s: %s", old.name, exc)
+
+ # ── Write metrics to both JSONL (full history) and CSV (live row) ────
+ metrics_log.append(iter_metrics)
+ (out_dir / "metrics.jsonl").write_text(
+ "\n".join(json.dumps(m) for m in metrics_log), encoding="utf-8"
+ )
+ # CSV: one row per iteration, flushed immediately so you can
+ # `tail -f logs/grpo//metrics.csv` or open it in Excel mid-run.
+ # `iter_metrics.update(eval_res)` overwrites step_accuracy/lccp on eval iters.
+ # We capture the is_eval flag here for clarity.
+ _is_eval_iter = "combined_score" in iter_metrics
+ _append_metrics_csv({
+ "iteration": iter_metrics["iteration"],
+ "timestamp": datetime.now().isoformat(timespec="seconds"),
+ # ── Per-iteration training signal ───────────────────────────────
+ "loss": iter_metrics.get("loss", 0.0),
+ "mean_reward": iter_metrics.get("mean_reward", 0.0),
+ "std_reward": iter_metrics.get("std_reward", 0.0),
+ "batch_accuracy": iter_metrics.get("batch_accuracy", 0.0),
+ "grounded_acc": iter_metrics.get("grounded_accuracy", 0.0),
+ "gt_match_rate": iter_metrics.get("gt_match_rate", 0.0),
+ # step_accuracy / lccp: training value on non-eval iters,
+ # eval value on eval iters (update() overwrites them).
+ "step_accuracy": iter_metrics.get("step_accuracy", 0.0),
+ "lccp": iter_metrics.get("lccp", 0.0),
+ "n_groups": iter_metrics.get("n_groups", 0),
+ "skipped_groups": iter_metrics.get("skipped_groups", 0),
+ "n_sp_groups": iter_metrics.get("n_self_play_groups", 0),
+ "sp_ratio": iter_metrics.get("effective_sp_ratio", 0.0),
+ "sp_suspended": iter_metrics.get("selfplay_suspended", 0),
+ "training_phase": iter_metrics.get("training_phase", ""),
+ "learning_rate": iter_metrics.get("learning_rate", 0.0),
+ "iter_time_s": iter_metrics.get("iter_time_s", 0.0),
+ # ── Question-generation quality ─────────────────────────────────
+ "q_reward": iter_metrics.get("mean_question_reward", ""),
+ "q_valid_rate": iter_metrics.get("q_gen_valid_rate", ""),
+ "q_novelty": iter_metrics.get("q_novelty", ""),
+ "q_solvability": iter_metrics.get("q_solvability", ""),
+ # ── Chain scoring calibration ───────────────────────────────────
+ "chain_prm_corr": iter_metrics.get("chain_prm_correlation", ""),
+ "chain_scoring_on": iter_metrics.get("chain_scoring_active", ""),
+ # ── Eval checkpoint metrics (every eval_every iters) ────────────
+ "eval_combined": iter_metrics.get("combined_score", "") if _is_eval_iter else "",
+ "eval_correct_rt": iter_metrics.get("correct_rate", "") if _is_eval_iter else "",
+ "eval_prm": iter_metrics.get("prm_mean", "") if _is_eval_iter else "",
+ "eval_step_acc": iter_metrics.get("step_accuracy", "") if _is_eval_iter else "",
+ "eval_lccp": iter_metrics.get("lccp", "") if _is_eval_iter else "",
+ "eval_format": iter_metrics.get("format_mean", "") if _is_eval_iter else "",
+ "eval_n_scored": iter_metrics.get("n_scored", "") if _is_eval_iter else "",
+ "eval_final_ans": iter_metrics.get("final_answer_accuracy", "") if _is_eval_iter else "",
+ })
+
+ logger.info("=" * 70)
+ logger.info("GRPO training complete.")
+ logger.info(
+ "Best training-objective score : %.4f "
+ "(0.50×correct + 0.40×process[0.60×prm_final+0.40×prm_mean] + 0.10×fmt)",
+ best_combined,
+ )
+ logger.info("Best PRM component mean : %.3f", best_prm_mean)
+ logger.info("Checkpoints : %s", out_dir)
+ logger.info("Logs : %s", log_dir)
+ logger.info("Console log : %s", console_log_path)
+ logger.info("=" * 70)
+
+ # ── Final summary ─────────────────────────────────────────────────────────
+ summary: Dict[str, Any] = {
+ "run_name": run_name,
+ "best_accuracy": best_combined, # accuracy == combined_score
+ "best_combined": best_combined,
+ "best_prm_mean": best_prm_mean,
+ "total_iterations": args.num_iterations,
+ "checkpoints_dir": str(out_dir),
+ "log_dir": str(log_dir),
+ "console_log": str(console_log_path),
+ "metrics_csv": str(_metrics_csv_path),
+ "metrics_jsonl": str(out_dir / "metrics.jsonl"),
+ }
+ (log_dir / "summary.json").write_text(
+ json.dumps(summary, indent=2, default=str), encoding="utf-8"
+ )
+ logger.info("Summary written to %s", log_dir / "summary.json")
+
+ # ── Auto-generate demo plots ───────────────────────────────────────────────
+ _metrics_jsonl = out_dir / "metrics.jsonl"
+ if _metrics_jsonl.exists():
+ try:
+ import importlib
+ if importlib.util.find_spec("matplotlib") is None:
+ logger.warning(
+ "matplotlib not installed — skipping auto-plot. "
+ "Install with: pip install matplotlib then run: "
+ "python scripts/plot_grpo_run.py %s",
+ _metrics_jsonl,
+ )
+ else:
+ from scripts.plot_grpo_run import generate_plots as _gen_plots
+ _plot_dir = _gen_plots(_metrics_jsonl)
+ logger.info("Plots saved → %s", _plot_dir)
+ except Exception as _plot_exc:
+ logger.warning(
+ "Plot generation failed (%s: %s). "
+ "Run manually: python scripts/plot_grpo_run.py %s",
+ type(_plot_exc).__name__, _plot_exc, _metrics_jsonl,
+ )
+
+ # Explicit teardown (atexit is the safety net for crashes; calling here
+ # ensures everything is flushed and closed before the process returns
+ # normally — atexit won't double-close because _teardown_logging is
+ # idempotent via the .closed checks).
+ _teardown_logging()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/run_inference.py b/scripts/run_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..d63e3c05dca24d50d6f20f876e923de45a154116
--- /dev/null
+++ b/scripts/run_inference.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+"""
+Inference pipeline: Base Qwen2.5-Math-1.5B-Instruct vs RL fine-tuned checkpoint.
+
+For each sampled GSM8K question, both models generate a step-by-step solution.
+Results are saved to reports// as JSON files for the Gradio demo.
+
+Usage
+-----
+ # Full run (50 questions, both models):
+ python scripts/run_inference.py \\
+ --checkpoint checkpoints/grpo_run_v1 \\
+ --num-questions 50 \\
+ --run-name comparison_v1
+
+ # Quick smoke test (10 questions, no RL model):
+ python scripts/run_inference.py \\
+ --num-questions 10 \\
+ --base-only \\
+ --run-name smoke
+
+ # Custom data source:
+ python scripts/run_inference.py \\
+ --checkpoint checkpoints/grpo_run_v1 \\
+ --data data/sft/gsm8k_test.jsonl \\
+ --num-questions 30
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import random
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from src.config.prompts import create_solver_messages
+from src.sft.solution_format import extract_final_answer_numeric_str
+from src.utils.attn_backend import select_attn_implementation
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(levelname)-8s %(name)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+REPORTS_DIR = Path("reports")
+
+
+# ── Data loading ──────────────────────────────────────────────────────────────
+
+def load_gsm8k_questions(
+ data_path: Optional[str],
+ num_questions: int,
+ seed: int = 42,
+) -> List[Dict[str, str]]:
+ """
+ Load GSM8K questions from a local JSONL file or fall back to HuggingFace.
+
+ Each returned record has keys: ``question``, ``gold_final``, ``answer``.
+ """
+ # ── Try local JSONL first ────────────────────────────────────────────────
+ candidates = [data_path] if data_path else []
+ candidates += [
+ "data/sft/gsm8k_test.jsonl",
+ "data/sft/gsm8k_sft.jsonl",
+ ]
+
+ for path in candidates:
+ if path and Path(path).exists():
+ logger.info("Loading GSM8K from local file: %s", path)
+ rows: List[Dict] = []
+ with open(path, encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ rows.append(json.loads(line))
+ rng = random.Random(seed)
+ sample = rng.sample(rows, min(num_questions, len(rows)))
+ logger.info("Sampled %d / %d questions.", len(sample), len(rows))
+ return sample
+
+ # ── Fall back to HuggingFace datasets ────────────────────────────────────
+ logger.info("No local file found — downloading GSM8K from HuggingFace…")
+ try:
+ from datasets import load_dataset
+ ds = load_dataset("openai/gsm8k", "main", split="test")
+ except Exception as e:
+ raise RuntimeError(
+ "Could not load GSM8K. Provide --data or install datasets: pip install datasets"
+ ) from e
+
+ rows = []
+ for item in ds:
+ q = item["question"].strip()
+ a = item["answer"].strip()
+ # GSM8K answers end with "#### "
+ gold = a.split("####")[-1].strip() if "####" in a else ""
+ rows.append({"question": q, "gold_final": gold, "answer": a})
+
+ rng = random.Random(seed)
+ sample = rng.sample(rows, min(num_questions, len(rows)))
+ logger.info("Sampled %d questions from HF GSM8K test split.", len(sample))
+ return sample
+
+
+# ── Model loading ─────────────────────────────────────────────────────────────
+
+def load_base_model(
+ device: torch.device,
+ attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+ logger.info("Loading base model: %s", BASE_MODEL_ID)
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "left"
+
+ model = AutoModelForCausalLM.from_pretrained(
+ BASE_MODEL_ID,
+ torch_dtype=torch.bfloat16,
+ device_map={"": device},
+ trust_remote_code=True,
+ attn_implementation=attn_impl,
+ )
+ model.eval()
+ logger.info("Base model loaded.")
+ return model, tokenizer
+
+
+def load_rl_model(
+ checkpoint: str,
+ base_model: AutoModelForCausalLM,
+ base_tokenizer: AutoTokenizer,
+ device: torch.device,
+ attn_impl: str,
+) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
+ """
+ Load the RL fine-tuned checkpoint for comparison against the raw base model.
+
+ Two checkpoint formats are supported:
+
+ PEFT / LoRA adapter (has adapter_config.json)
+ The already-loaded base model weights are deep-copied in CPU memory,
+ the adapter is applied on top, then merged and unloaded.
+ This avoids downloading the 1.5B base weights from HuggingFace a
+ second time — the base model is downloaded only once per run.
+
+ Full saved model (has config.json, no adapter_config.json)
+ Loaded directly from disk with from_pretrained.
+ """
+ import copy
+
+ ckpt_path = Path(checkpoint)
+ if not ckpt_path.exists():
+ raise FileNotFoundError(f"Checkpoint not found: {checkpoint}")
+
+ is_peft = (ckpt_path / "adapter_config.json").exists()
+
+ if is_peft:
+ logger.info(
+ "Loading PEFT adapter from %s (reusing base weights — no second HF download)",
+ checkpoint,
+ )
+ from peft import PeftModel
+
+ # Deep-copy the already-loaded base model so the base remains untouched
+ # for side-by-side comparison. For a 1.5B bfloat16 model this takes
+ # ~1-2 s and avoids re-downloading ~3 GB from HuggingFace.
+ base_copy = copy.deepcopy(base_model)
+ model = PeftModel.from_pretrained(base_copy, checkpoint)
+ model = model.merge_and_unload()
+ model = model.to(device)
+ else:
+ logger.info("Loading full model checkpoint from %s", checkpoint)
+ model = AutoModelForCausalLM.from_pretrained(
+ checkpoint,
+ torch_dtype=torch.bfloat16,
+ device_map={"": device},
+ trust_remote_code=True,
+ attn_implementation=attn_impl,
+ )
+
+ # Patch chat_template from base tokenizer if missing
+ tokenizer = AutoTokenizer.from_pretrained(
+ checkpoint if (ckpt_path / "tokenizer_config.json").exists() else BASE_MODEL_ID,
+ trust_remote_code=True,
+ )
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "left"
+ if tokenizer.chat_template is None and base_tokenizer.chat_template:
+ tokenizer.chat_template = base_tokenizer.chat_template
+
+ model.eval()
+ logger.info("RL model loaded.")
+ return model, tokenizer
+
+
+# ── Inference ─────────────────────────────────────────────────────────────────
+
+def generate_solution(
+ model: AutoModelForCausalLM,
+ tokenizer: AutoTokenizer,
+ question: str,
+ device: torch.device,
+ max_new_tokens: int = 512,
+ temperature: float = 0.1,
+) -> Tuple[str, float]:
+ """
+ Generate a step-by-step solution for ``question``.
+
+ Returns ``(solution_text, elapsed_seconds)``.
+ Low temperature (0.1) for deterministic, greedy-like output during eval.
+ """
+ messages = create_solver_messages(question)
+ prompt = tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+ inputs = {k: v.to(device) for k, v in inputs.items()}
+ prompt_len = inputs["input_ids"].shape[1]
+
+ stop_ids = [tokenizer.eos_token_id]
+ im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
+ if isinstance(im_end, int) and im_end not in stop_ids:
+ stop_ids.append(im_end)
+
+ t0 = time.time()
+ with torch.no_grad():
+ output = model.generate(
+ **inputs,
+ max_new_tokens=max_new_tokens,
+ do_sample=temperature > 0.05,
+ temperature=temperature if temperature > 0.05 else None,
+ top_p=0.95 if temperature > 0.05 else None,
+ eos_token_id=stop_ids,
+ pad_token_id=tokenizer.pad_token_id,
+ use_cache=True,
+ )
+ elapsed = time.time() - t0
+
+ response_ids = output[0][prompt_len:]
+ solution = tokenizer.decode(response_ids, skip_special_tokens=True).strip()
+ return solution, elapsed
+
+
+def score_answer(solution: str, gold_final: str) -> Dict[str, Any]:
+ """
+ Extract the predicted final answer and compare with gold.
+ Returns a dict with ``predicted``, ``gold``, ``correct``, ``match_type``.
+ """
+ predicted_raw = extract_final_answer_numeric_str(solution)
+
+ if predicted_raw is None:
+ return {
+ "predicted": None,
+ "gold": gold_final,
+ "correct": False,
+ "match_type": "no_answer_found",
+ }
+
+ # Normalise: strip whitespace, remove commas (e.g. "1,200" → "1200")
+ def _norm(s: str) -> str:
+ return s.strip().replace(",", "").rstrip(".").lower()
+
+ pred_n = _norm(predicted_raw)
+ gold_n = _norm(gold_final)
+
+ # Direct string match
+ if pred_n == gold_n:
+ return {
+ "predicted": predicted_raw,
+ "gold": gold_final,
+ "correct": True,
+ "match_type": "exact",
+ }
+
+ # Numeric match (handles float/int equivalence)
+ try:
+ pred_f = float(pred_n)
+ gold_f = float(gold_n)
+ if abs(pred_f - gold_f) < 1e-6:
+ return {
+ "predicted": predicted_raw,
+ "gold": gold_final,
+ "correct": True,
+ "match_type": "numeric",
+ }
+ except (ValueError, TypeError):
+ pass
+
+ return {
+ "predicted": predicted_raw,
+ "gold": gold_final,
+ "correct": False,
+ "match_type": "wrong",
+ }
+
+
+# ── Report serialisation ──────────────────────────────────────────────────────
+
+def save_question_report(
+ report_dir: Path,
+ idx: int,
+ question: str,
+ gold_final: str,
+ base_result: Dict[str, Any],
+ rl_result: Optional[Dict[str, Any]],
+) -> Path:
+ record = {
+ "idx": idx,
+ "question": question,
+ "gold_final": gold_final,
+ "base_model": base_result,
+ "rl_model": rl_result,
+ }
+ out = report_dir / f"q_{idx:04d}.json"
+ out.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8")
+ return out
+
+
+def save_summary(
+ report_dir: Path,
+ run_name: str,
+ checkpoint: Optional[str],
+ base_correct: int,
+ rl_correct: Optional[int],
+ total: int,
+ total_time_s: float,
+ args_dict: Dict,
+) -> None:
+ summary = {
+ "run_name": run_name,
+ "timestamp": datetime.now().isoformat(),
+ "base_model": BASE_MODEL_ID,
+ "rl_checkpoint": checkpoint,
+ "num_questions": total,
+ "base_accuracy": round(base_correct / total, 4) if total else 0,
+ "rl_accuracy": round(rl_correct / total, 4) if (rl_correct is not None and total) else None,
+ "base_correct": base_correct,
+ "rl_correct": rl_correct,
+ "total_time_s": round(total_time_s, 1),
+ "args": args_dict,
+ }
+ out = report_dir / "summary.json"
+ out.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+ logger.info("Summary saved → %s", out)
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def parse_args() -> argparse.Namespace:
+ p = argparse.ArgumentParser(description="Run inference: base vs RL model on GSM8K")
+ p.add_argument("--checkpoint", type=str, default=None,
+ help="Path to RL fine-tuned model or PEFT adapter. "
+ "If omitted, only the base model is run.")
+ p.add_argument("--data", type=str, default=None,
+ help="Path to local GSM8K JSONL file. "
+ "Defaults to data/sft/gsm8k_test.jsonl or HuggingFace.")
+ p.add_argument("--num-questions", type=int, default=50)
+ p.add_argument("--seed", type=int, default=42)
+ p.add_argument("--max-new-tokens", type=int, default=512)
+ p.add_argument("--temperature", type=float, default=0.1)
+ p.add_argument("--run-name", type=str, default=None,
+ help="Report sub-folder name. Defaults to timestamp.")
+ p.add_argument("--base-only", action="store_true",
+ help="Skip RL model; only run the base model.")
+ p.add_argument("--reports-dir", type=str, default="reports")
+ return p.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+
+ run_name = args.run_name or f"run_{datetime.now():%Y%m%d_%H%M%S}"
+ report_dir = Path(args.reports_dir) / run_name
+ report_dir.mkdir(parents=True, exist_ok=True)
+ logger.info("Reports → %s", report_dir)
+
+ # ── Device ────────────────────────────────────────────────────────────────
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ attn_impl = select_attn_implementation()
+ logger.info("Device: %s | attn: %s", device, attn_impl)
+ if torch.cuda.is_available():
+ g = torch.cuda.get_device_properties(0)
+ logger.info("GPU: %s | %.1f GB", g.name, g.total_memory / 1e9)
+
+ # ── Data ──────────────────────────────────────────────────────────────────
+ questions = load_gsm8k_questions(args.data, args.num_questions, args.seed)
+
+ # ── Models ────────────────────────────────────────────────────────────────
+ base_model, base_tokenizer = load_base_model(device, attn_impl)
+
+ rl_model, rl_tokenizer = None, None
+ if not args.base_only and args.checkpoint:
+ rl_model, rl_tokenizer = load_rl_model(
+ args.checkpoint, base_model, base_tokenizer, device, attn_impl
+ )
+ elif not args.base_only and not args.checkpoint:
+ logger.warning("No --checkpoint provided. Running base model only.")
+
+ # ── Inference loop ────────────────────────────────────────────────────────
+ base_correct = 0
+ rl_correct = 0 if rl_model else None
+ t_total_start = time.time()
+
+ for idx, row in enumerate(tqdm(questions, desc="Inference")):
+ question = row["question"]
+ gold_final = row.get("gold_final", "").strip()
+
+ # Base model
+ base_solution, base_time = generate_solution(
+ base_model, base_tokenizer, question, device,
+ args.max_new_tokens, args.temperature,
+ )
+ base_score = score_answer(base_solution, gold_final)
+ if base_score["correct"]:
+ base_correct += 1
+
+ base_result = {
+ "solution": base_solution,
+ "predicted": base_score["predicted"],
+ "correct": base_score["correct"],
+ "match_type": base_score["match_type"],
+ "time_s": round(base_time, 2),
+ "num_tokens": len(base_tokenizer.encode(base_solution)),
+ }
+
+ # RL model
+ rl_result = None
+ if rl_model is not None:
+ rl_solution, rl_time = generate_solution(
+ rl_model, rl_tokenizer, question, device,
+ args.max_new_tokens, args.temperature,
+ )
+ rl_score = score_answer(rl_solution, gold_final)
+ if rl_score["correct"]:
+ rl_correct += 1
+
+ rl_result = {
+ "solution": rl_solution,
+ "predicted": rl_score["predicted"],
+ "correct": rl_score["correct"],
+ "match_type": rl_score["match_type"],
+ "time_s": round(rl_time, 2),
+ "num_tokens": len(rl_tokenizer.encode(rl_solution)),
+ }
+
+ save_question_report(report_dir, idx, question, gold_final, base_result, rl_result)
+
+ # Live progress log every 10 questions
+ if (idx + 1) % 10 == 0 or idx == len(questions) - 1:
+ done = idx + 1
+ b_acc = base_correct / done
+ log_str = f"[{done}/{len(questions)}] Base acc: {b_acc:.1%}"
+ if rl_correct is not None:
+ log_str += f" | RL acc: {rl_correct / done:.1%}"
+ logger.info(log_str)
+
+ total_time = time.time() - t_total_start
+
+ # ── Summary ───────────────────────────────────────────────────────────────
+ save_summary(
+ report_dir=report_dir,
+ run_name=run_name,
+ checkpoint=args.checkpoint,
+ base_correct=base_correct,
+ rl_correct=rl_correct,
+ total=len(questions),
+ total_time_s=total_time,
+ args_dict=vars(args),
+ )
+
+ logger.info("=" * 60)
+ logger.info("Run complete: %s", run_name)
+ logger.info("Base accuracy : %d / %d = %.1f%%",
+ base_correct, len(questions), 100 * base_correct / len(questions))
+ if rl_correct is not None:
+ logger.info("RL accuracy : %d / %d = %.1f%%",
+ rl_correct, len(questions), 100 * rl_correct / len(questions))
+ delta = rl_correct - base_correct
+ sign = "+" if delta >= 0 else ""
+ logger.info("Delta : %s%d questions (%s%.1f%%)",
+ sign, delta, sign, 100 * delta / len(questions))
+ logger.info("Reports : %s", report_dir)
+ logger.info("=" * 60)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/server/AxiomForgeAI_environment.py b/server/AxiomForgeAI_environment.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe66fa6978e6e6d80966a2e1dc099aadce2096ac
--- /dev/null
+++ b/server/AxiomForgeAI_environment.py
@@ -0,0 +1,359 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+AxiomForgeAI Math RL Environment.
+
+Wraps CurriculumMathEnvironment from src/rl/math_environment_curriculum.py
+to expose an OpenEnv-compatible interface (reset / step / state).
+
+Episode semantics
+-----------------
+* reset() — Samples a new question from the adaptive curriculum (or a
+ grounded QA pair when a dataset is configured). Returns the
+ question in the observation; reward is 0.0.
+* step(action) — Scores the agent's submitted solution with the full reward
+ pipeline (PRM + SymPy + format) and returns reward + feedback.
+ done=True always: one question per episode.
+
+Environment variables
+---------------------
+AXIOMFORGE_DATA_PATH Path to a JSONL file with {"question", "gold_final"}
+ records (e.g. data/sft/gsm8k_sft.jsonl). When set,
+ the environment uses grounded QA pairs for questions
+ and ground-truth answer verification.
+
+AXIOMFORGE_PRM_PATH HuggingFace model ID or local path for the Process
+ Reward Model (default: Qwen/Qwen2.5-Math-PRM-7B).
+ Set to "" to disable PRM scoring (uses SymPy only).
+
+AXIOMFORGE_CURRICULUM_DIR
+ Directory where the CurriculumManager persists its
+ state between runs. Defaults to
+ "checkpoints/curriculum".
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from uuid import uuid4
+
+import torch
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+
+try:
+ from ..models import AxiomforgeaiAction, AxiomforgeaiObservation
+
+except ImportError:
+ from models import AxiomforgeaiAction, AxiomforgeaiObservation
+
+# ── Heavy RL imports — fail gracefully so openenv validate passes even when
+# the ML stack is not installed (e.g. lightweight CI / schema validation).
+try:
+ from src.rl.math_environment_curriculum import CurriculumMathEnvironment
+ from src.rl.prm_scorer import ProcessRewardScorer
+ from src.sft.solution_format import extract_final_answer_numeric_str
+
+ _RL_AVAILABLE = True
+except Exception as _rl_import_err: # pragma: no cover
+ _RL_AVAILABLE = False
+ CurriculumMathEnvironment = None # type: ignore[assignment,misc]
+ ProcessRewardScorer = None # type: ignore[assignment,misc]
+ extract_final_answer_numeric_str = None # type: ignore[assignment]
+
+
+logger = logging.getLogger(__name__)
+
+# Fallback question used during validation / when no dataset is configured.
+_VALIDATION_QUESTION = (
+ "A store sells apples for $2 each and oranges for $3 each. "
+ "If Sarah buys 4 apples and 3 oranges, how much does she spend in total?"
+)
+_VALIDATION_GOLD = "17"
+_VALIDATION_TOPIC = "basic_arithmetic"
+_VALIDATION_DIFFICULTY = 0.1
+
+
+def _load_qa_pairs(data_path: str) -> List[Dict[str, str]]:
+ """Load {"question", "gold_final"} records from a JSONL file."""
+ pairs: List[Dict[str, str]] = []
+ p = Path(data_path)
+ if not p.exists():
+ logger.warning("AXIOMFORGE_DATA_PATH not found: %s", data_path)
+ return pairs
+ with p.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ rec = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+ q = rec.get("question", "").strip()
+ g = rec.get("gold_final", "").strip()
+ if q and g:
+ pairs.append({"question": q, "gold_final": g})
+ logger.info("Loaded %d QA pairs from %s", len(pairs), data_path)
+ return pairs
+
+
+class AxiomforgeaiEnvironment(Environment):
+ """
+ AxiomForgeAI math RL environment for OpenEnv.
+
+ Uses CurriculumMathEnvironment from src/rl/ for adaptive question
+ selection and reward computation. When the ML stack is unavailable
+ (e.g. during schema validation), falls back to a lightweight mode
+ that uses only the installed openenv-core dependencies.
+
+ Supports concurrent WebSocket sessions — each client gets its own
+ instance with independent episode state.
+ """
+
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
+
+ def __init__(self) -> None:
+ self._state = State(episode_id=str(uuid4()), step_count=0)
+
+ # Per-episode state
+ self._current_question: str = ""
+ self._gold_final: str = ""
+ self._current_topic: str = ""
+ self._current_difficulty: float = 0.5
+
+ self._math_env: Optional[Any] = None # CurriculumMathEnvironment or None
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ if not _RL_AVAILABLE:
+ logger.warning(
+ "RL stack (torch/transformers/sympy) not available — "
+ "running in schema-validation mode with fixed fallback responses."
+ )
+ return
+
+ # ── Load grounded QA pairs (optional) ─────────────────────────────
+ grounded_qa_pairs: List[Dict[str, str]] = []
+ data_path = os.environ.get("AXIOMFORGE_DATA_PATH", "")
+ if data_path:
+ grounded_qa_pairs = _load_qa_pairs(data_path)
+
+ # ── Load PRM scorer (optional) ────────────────────────────────────
+ prm: Optional[Any] = None # ProcessRewardScorer or None
+ prm_path = os.environ.get("AXIOMFORGE_PRM_PATH", "")
+ if prm_path:
+ try:
+ prm = ProcessRewardScorer(
+ model_name=prm_path,
+ device=device,
+ load_in_4bit=True,
+ )
+ logger.info("PRM loaded: %s", prm_path)
+ except Exception as exc:
+ logger.warning("PRM load failed (%s) — scoring uses SymPy only.", exc)
+
+ # ── Create CurriculumMathEnvironment in scoring-only mode ─────────
+ # policy_model=None + tokenizer=None is safe when only reward-computation
+ # methods are called (compute_grounded_reward, sample_instruction).
+ # Generation methods (generate_with_logging, format_solution_prompt)
+ # are NOT called from the server step path — the agent supplies solutions.
+ curriculum_dir = os.environ.get(
+ "AXIOMFORGE_CURRICULUM_DIR", "checkpoints/curriculum"
+ )
+ try:
+ self._math_env = CurriculumMathEnvironment(
+ policy_model=None,
+ value_model=None,
+ tokenizer=None,
+ reference_questions=[qa["question"] for qa in grounded_qa_pairs],
+ grounded_qa_pairs=grounded_qa_pairs,
+ prm_scorer=prm,
+ curriculum_checkpoint_dir=curriculum_dir,
+ device=device,
+ )
+ logger.info(
+ "CurriculumMathEnvironment ready (scoring-only, %d QA pairs, PRM=%s)",
+ len(grounded_qa_pairs),
+ "yes" if prm else "no",
+ )
+ except Exception as exc:
+ logger.warning(
+ "CurriculumMathEnvironment init failed (%s) — "
+ "falling back to validation mode.",
+ exc,
+ )
+ self._math_env = None
+
+ # ------------------------------------------------------------------
+ # OpenEnv interface
+ # ------------------------------------------------------------------
+
+ def reset(
+ self,
+ qa: Optional[Dict[str, str]] = None,
+ ) -> AxiomforgeaiObservation:
+ """
+ Reset the environment and begin a new episode.
+
+ Args:
+ qa: Optional ``{"question": str, "gold_final": str}`` dict.
+ When supplied the environment is seeded with this specific
+ question and gold answer — used by the training loop for
+ difficulty-sampled grounded episodes. When omitted the
+ environment draws from its internal grounded QA pool (if
+ configured) or falls back to the curriculum instruction.
+
+ Returns:
+ AxiomforgeaiObservation with the question populated; reward=0.0.
+ """
+ self._state = State(episode_id=str(uuid4()), step_count=0)
+
+ if qa is not None:
+ # Caller-supplied episode — honour it exactly.
+ self._current_question = qa.get("question", "").strip()
+ self._gold_final = qa.get("gold_final", "").strip()
+ self._current_topic = qa.get("topic", "grounded")
+ self._current_difficulty = float(qa.get("difficulty", 0.5))
+ elif self._math_env is not None:
+ try:
+ instruction, topic, difficulty = self._math_env.sample_instruction()
+ self._current_topic = topic
+ self._current_difficulty = float(difficulty)
+ if self._math_env.grounded_qa_pairs:
+ _qa = random.choice(self._math_env.grounded_qa_pairs)
+ self._current_question = _qa["question"]
+ self._gold_final = _qa["gold_final"]
+ else:
+ self._current_question = instruction
+ self._gold_final = ""
+ except Exception as exc:
+ logger.warning("sample_instruction failed, using fallback: %s", exc)
+ self._current_question = _VALIDATION_QUESTION
+ self._gold_final = _VALIDATION_GOLD
+ self._current_topic = _VALIDATION_TOPIC
+ self._current_difficulty = _VALIDATION_DIFFICULTY
+ else:
+ self._current_question = _VALIDATION_QUESTION
+ self._gold_final = _VALIDATION_GOLD
+ self._current_topic = _VALIDATION_TOPIC
+ self._current_difficulty = _VALIDATION_DIFFICULTY
+
+ return AxiomforgeaiObservation(
+ question=self._current_question,
+ topic=self._current_topic,
+ difficulty=self._current_difficulty,
+ feedback="",
+ done=False,
+ reward=0.0,
+ )
+
+ def step(self, action: AxiomforgeaiAction) -> AxiomforgeaiObservation: # type: ignore[override]
+ """
+ Score the agent's submitted solution.
+
+ Uses compute_grounded_reward from CurriculumMathEnvironment when
+ available (PRM + SymPy + format scoring). Falls back to numeric
+ answer extraction when the full RL stack is not loaded.
+
+ Args:
+ action: AxiomforgeaiAction containing the solution text.
+
+ Returns:
+ AxiomforgeaiObservation with reward, feedback, and metadata.
+ done=True — one question per episode.
+ """
+ self._state.step_count += 1
+ solution = action.solution
+
+ reward: float = 0.0
+ feedback: str = ""
+ metadata: Dict[str, Any] = {}
+
+ if self._math_env is not None and self._current_question:
+ try:
+ reward_result = self._math_env.compute_grounded_reward(
+ question=self._current_question,
+ solution=solution,
+ gold_final=self._gold_final,
+ )
+ reward = float(reward_result.get("combined_score", 0.0))
+ gt = reward_result.get("gt_match", False)
+ step_acc = reward_result.get("step_accuracy", 0.0)
+ lccp = reward_result.get("lccp", 0.0)
+ pred = reward_result.get("pred_final", "")
+ feedback = (
+ f"gt_match={gt} pred={pred!r} gold={self._gold_final!r} "
+ f"step_acc={step_acc:.2f} lccp={lccp:.2f}"
+ )
+ # Serialise reward breakdown into metadata; skip non-serialisable lists.
+ metadata = {
+ k: v
+ for k, v in reward_result.items()
+ if not isinstance(v, list)
+ }
+ except Exception as exc:
+ logger.warning("compute_grounded_reward failed: %s", exc)
+ reward, feedback, metadata = self._fallback_score(solution)
+ else:
+ reward, feedback, metadata = self._fallback_score(solution)
+
+ return AxiomforgeaiObservation(
+ question=self._current_question,
+ topic=self._current_topic,
+ difficulty=self._current_difficulty,
+ feedback=feedback,
+ done=True,
+ reward=reward,
+ metadata=metadata,
+ )
+
+ # ------------------------------------------------------------------
+ # Helpers
+ # ------------------------------------------------------------------
+
+ def _fallback_score(
+ self, solution: str
+ ) -> tuple[float, str, Dict[str, Any]]:
+ """Lightweight scoring used when the full RL stack is unavailable."""
+ pred: str = ""
+ if extract_final_answer_numeric_str is not None:
+ pred = extract_final_answer_numeric_str(solution) or ""
+ reward = 1.0 if pred and pred == self._gold_final else 0.0
+ feedback = f"pred={pred!r} gold={self._gold_final!r}"
+ return reward, feedback, {"pred_final": pred, "gold_final": self._gold_final}
+
+ def close(self) -> None:
+ """
+ Persist curriculum state and release resources.
+
+ Call once at the end of a training run so the CurriculumManager's
+ per-topic statistics are saved to disk and can be resumed on the
+ next run. Safe to call multiple times.
+ """
+ if self._math_env is not None:
+ try:
+ self._math_env.curriculum_manager.save_state(
+ iteration=self._math_env.curriculum_manager.current_iteration,
+ rollout=None,
+ )
+ logger.info(
+ "Curriculum state saved (iteration %d).",
+ self._math_env.curriculum_manager.current_iteration,
+ )
+ except Exception as exc:
+ logger.warning("close(): curriculum save failed — %s", exc)
+
+ @property
+ def state(self) -> State:
+ """Return the current episode state (episode_id + step_count)."""
+ return self._state
diff --git a/server/Dockerfile b/server/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f85750fecf5d59d1451586e72e3e1639c2ad4e1b
--- /dev/null
+++ b/server/Dockerfile
@@ -0,0 +1,121 @@
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# AxiomForgeAI OpenEnv server image
+# ─────────────────────────────────────────────────────────────────────────────
+# Hardware target : A100 PCIE 80 GB | AMD EPYC 7V13
+# CUDA driver : >= 13.0 (enforced at container start)
+# CUDA toolkit : 12.4.1 (backward-compatible with driver 13.x)
+# PyTorch : 2.5.1+cu124 (pinned in /requirements.txt)
+#
+# The server exposes the math RL environment over HTTP/WebSocket and supports
+# optional GPU-accelerated PRM scoring when AXIOMFORGE_PRM_PATH is set.
+#
+# ── Build ────────────────────────────────────────────────────────────────────
+# docker build -f server/Dockerfile -t axiomforgeai-server:latest .
+#
+# ── Run (CPU-only / validation) ───────────────────────────────────────────────
+# docker run -p 8000:8000 axiomforgeai-server:latest
+#
+# ── Run (GPU + grounded data + PRM) ──────────────────────────────────────────
+# docker run --gpus all \
+# -e AXIOMFORGE_DATA_PATH=/data/gsm8k_sft.jsonl \
+# -e AXIOMFORGE_PRM_PATH=Qwen/Qwen2.5-Math-PRM-7B \
+# -v /host/data:/data \
+# -p 8000:8000 \
+# axiomforgeai-server:latest
+
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+
+WORKDIR /app
+
+# git is required for VCS-based dependency installs
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends git curl && \
+ rm -rf /var/lib/apt/lists/*
+
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=AxiomForgeAI
+
+COPY . /app/env
+WORKDIR /app/env
+
+# Ensure uv is available
+RUN if ! command -v uv >/dev/null 2>&1; then \
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+ fi
+
+# Install openenv-core + server deps (pyproject.toml / server/requirements.txt)
+RUN --mount=type=cache,target=/root/.cache/uv \
+ if [ -f uv.lock ]; then \
+ uv sync --frozen --no-install-project --no-editable; \
+ else \
+ uv sync --no-install-project --no-editable; \
+ fi
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+ if [ -f uv.lock ]; then \
+ uv sync --frozen --no-editable; \
+ else \
+ uv sync --no-editable; \
+ fi
+
+# ── ML stack for optional GPU-based PRM scoring ───────────────────────────────
+# All versions are taken from the root requirements.txt so they stay in sync
+# with the training image. The cu124 extra-index is needed to resolve the
+# correct CUDA-linked torch wheel.
+COPY requirements.txt /tmp/ml-requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ .venv/bin/pip install --no-cache-dir \
+ --extra-index-url https://download.pytorch.org/whl/cu124 \
+ -r /tmp/ml-requirements.txt \
+ || true # non-fatal: server remains fully functional without the ML stack
+
+# ── Runtime stage ─────────────────────────────────────────────────────────────
+FROM ${BASE_IMAGE}
+
+WORKDIR /app
+
+COPY --from=builder /app/env/.venv /app/.venv
+COPY --from=builder /app/env /app/env
+
+ENV PATH="/app/.venv/bin:$PATH"
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+
+# HuggingFace model cache — mount a host path here to avoid re-downloading:
+# -v /host/hf_cache:/app/hf_cache
+ENV HF_HOME="/app/hf_cache"
+ENV TRANSFORMERS_CACHE="/app/hf_cache"
+
+# A100 CUDA tuning (only effective when --gpus is passed)
+ENV CUDA_DEVICE_MAX_CONNECTIONS=1
+ENV TORCH_CUDNN_V8_API_ENABLED=1
+
+# ── Runtime CUDA driver check (>= 13.0) ──────────────────────────────────────
+RUN printf '%s\n' \
+ '#!/bin/sh' \
+ 'if command -v nvidia-smi >/dev/null 2>&1; then' \
+ ' CUDA_VER=$(nvidia-smi 2>/dev/null | grep -oP "CUDA Version: \K[0-9.]+" || echo "0.0")' \
+ ' MAJOR=$(echo "$CUDA_VER" | cut -d. -f1)' \
+ ' echo "[AxiomForgeAI-server] CUDA driver reports toolkit: $CUDA_VER"' \
+ ' if [ "${MAJOR:-0}" -lt 13 ] 2>/dev/null; then' \
+ ' echo "[ERROR] CUDA driver >= 13.0 required; detected $CUDA_VER. Upgrade your NVIDIA driver."' \
+ ' exit 1' \
+ ' fi' \
+ 'fi' \
+ 'exec "$@"' \
+ > /usr/local/bin/entrypoint.sh \
+ && chmod +x /usr/local/bin/entrypoint.sh
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
+ CMD curl -f http://localhost:8000/health || exit 1
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
diff --git a/server/__init__.py b/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..737fb30d98910669a711796aea0dadca98bc47d6
--- /dev/null
+++ b/server/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Axiomforgeai environment server components."""
+
+from .AxiomForgeAI_environment import AxiomforgeaiEnvironment
+
+__all__ = ["AxiomforgeaiEnvironment"]
diff --git a/server/app.py b/server/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..c92b6718c6e25821dbebef031ae40bc63111c17a
--- /dev/null
+++ b/server/app.py
@@ -0,0 +1,79 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+FastAPI application for the Axiomforgeai Environment.
+
+This module creates an HTTP server that exposes the AxiomforgeaiEnvironment
+over HTTP and WebSocket endpoints, compatible with EnvClient.
+
+Endpoints:
+ - POST /reset: Reset the environment
+ - POST /step: Execute an action
+ - GET /state: Get current environment state
+ - GET /schema: Get action/observation schemas
+ - WS /ws: WebSocket endpoint for persistent sessions
+
+Usage:
+ # Development (with auto-reload):
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+
+ # Production:
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
+
+ # Or run directly:
+ python -m server.app
+"""
+
+try:
+ from openenv.core.env_server.http_server import create_app
+except Exception as e: # pragma: no cover
+ raise ImportError(
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
+ ) from e
+
+try:
+ from ..models import AxiomforgeaiAction, AxiomforgeaiObservation
+ from .AxiomForgeAI_environment import AxiomforgeaiEnvironment
+except ImportError:
+ from models import AxiomforgeaiAction, AxiomforgeaiObservation
+ from server.AxiomForgeAI_environment import AxiomforgeaiEnvironment
+
+
+# Create the app with web interface and README integration
+app = create_app(
+ AxiomforgeaiEnvironment,
+ AxiomforgeaiAction,
+ AxiomforgeaiObservation,
+ env_name="AxiomForgeAI",
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
+)
+
+
+def main(host: str = "0.0.0.0", port: int = 8000):
+ """
+ Entry point for direct execution via uv run or python -m.
+
+ This function enables running the server without Docker:
+ uv run --project . server
+ uv run --project . server --port 8001
+ python -m AxiomForgeAI.server.app
+
+ Args:
+ host: Host address to bind to (default: "0.0.0.0")
+ port: Port number to listen on (default: 8000)
+
+ For production deployments, consider using uvicorn directly with
+ multiple workers:
+ uvicorn AxiomForgeAI.server.app:app --workers 4
+ """
+ import uvicorn
+
+ uvicorn.run(app, host=host, port=port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/server/requirements.txt b/server/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65b1c22b3db715ed9d63b9ad06cd4afb0d9412c5
--- /dev/null
+++ b/server/requirements.txt
@@ -0,0 +1,6 @@
+openenv[core]>=0.2.0
+fastapi>=0.115.0
+uvicorn>=0.24.0
+
+
+
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..521670b4db9eb65bf8585e3bdef763ac10efa35d
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# src package
diff --git a/src/config/README.md b/src/config/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e25eaa764860349bd83e85b64499f455d478df9
--- /dev/null
+++ b/src/config/README.md
@@ -0,0 +1,59 @@
+# Centralized Prompt Configuration
+
+All prompts for question generation and solution generation are centralized in `src/config/prompts.py`.
+
+## Why Centralized?
+
+- **Consistency**: Same prompts across SFT training, GRPO, PPO, and inference
+- **Maintainability**: Single source of truth for all prompt text
+- **Flexibility**: Easy to tune prompts without hunting through multiple files
+
+## Usage
+
+### Question Generation
+
+```python
+from src.config.prompts import create_generator_messages
+
+instruction = "Generate a problem about fractions in a shopping context"
+messages = create_generator_messages(instruction)
+# Returns:
+# [
+# {"role": "system", "content": GENERATOR_SYSTEM_PROMPT},
+# {"role": "user", "content": "### Task: Generate Question\n{instruction}"}
+# ]
+```
+
+### Solution Generation
+
+```python
+from src.config.prompts import create_solver_messages
+
+question = "If John has 5 apples and gives 2 away, how many does he have?"
+messages = create_solver_messages(question)
+# Returns:
+# [
+# {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+# {"role": "user", "content": "### Task: Solve Problem\nProblem: {question}\nSolution:"}
+# ]
+```
+
+## Files Using Centralized Prompts
+
+- `scripts/run_grpo_training.py` - GRPO question generation
+- `scripts/dual_task_sft_pipeline.py` - SFT training
+- `scripts/create_dual_task_dataset.py` - Dataset creation
+- `src/rl/math_environment.py` - PPO environment
+- `src/rl/triple_verifier.py` - Consensus verification
+
+## Prompt Design Principles
+
+### Question Generation
+- **No explicit step constraints**: Let the model decide complexity naturally
+- Focus on **realistic scenarios** and **simple operations** (grade-school level)
+- Output **only the problem statement**, no solutions
+
+### Solution Generation
+- **Step-by-step format**: Each step on its own line starting with "Step N:"
+- **Final Answer format**: Line starting with "Final Answer:"
+- **Python/SymPy syntax**: All math expressions verifiable programmatically
diff --git a/src/config/__init__.py b/src/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..55dff30c3f4fd6a4aa90dc5518f2a94f4b3b20e1
--- /dev/null
+++ b/src/config/__init__.py
@@ -0,0 +1,23 @@
+"""Configuration package for prompts and other settings."""
+
+from src.config.prompts import (
+ SOLVE_TASK_PREFIX,
+ GENERATE_TASK_PREFIX,
+ SOLVER_SYSTEM_PROMPT,
+ GENERATOR_SYSTEM_PROMPT,
+ format_solver_user_message,
+ format_generator_user_message,
+ create_solver_messages,
+ create_generator_messages,
+)
+
+__all__ = [
+ "SOLVE_TASK_PREFIX",
+ "GENERATE_TASK_PREFIX",
+ "SOLVER_SYSTEM_PROMPT",
+ "GENERATOR_SYSTEM_PROMPT",
+ "format_solver_user_message",
+ "format_generator_user_message",
+ "create_solver_messages",
+ "create_generator_messages",
+]
diff --git a/src/config/prompts.py b/src/config/prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..519113e51842ba8aca7615b996a0b6fd32225440
--- /dev/null
+++ b/src/config/prompts.py
@@ -0,0 +1,57 @@
+"""
+Centralized prompt configuration for math problem generation and solving.
+
+All prompts used across SFT training, GRPO training, PPO training, and inference
+are defined here to ensure consistency.
+"""
+
+# Task prefixes used in dual-task training
+SOLVE_TASK_PREFIX = "### Task: Solve Problem\n"
+GENERATE_TASK_PREFIX = "### Task: Generate Question\n"
+
+
+# System prompts for solution generation
+SOLVER_SYSTEM_PROMPT = (
+ "You are a step-by-step math solver. "
+ "Solve the given problem one step at a time. "
+ "Each step must be on its own line, starting with 'Step N:'. "
+ "End with a line starting with 'Final Answer:'. "
+ "Write every mathematical expression in Python/SymPy syntax "
+ "so it can be verified programmatically."
+)
+
+
+# System prompts for question generation
+GENERATOR_SYSTEM_PROMPT = (
+ "You are a math problem generator. "
+ "Generate grade-school level math word problems. "
+ "Problems should involve realistic scenarios and use simple arithmetic, fractions, "
+ "percentages, or basic algebra. "
+ "Output ONLY the problem statement, no solutions or steps."
+)
+
+
+def format_solver_user_message(question: str) -> str:
+ """Format user message for solution generation."""
+ return f"{SOLVE_TASK_PREFIX}Problem: {question}\nSolution:"
+
+
+def format_generator_user_message(instruction: str) -> str:
+ """Format user message for question generation."""
+ return f"{GENERATE_TASK_PREFIX}{instruction}"
+
+
+def create_solver_messages(question: str) -> list[dict[str, str]]:
+ """Create chat messages for solution generation."""
+ return [
+ {"role": "system", "content": SOLVER_SYSTEM_PROMPT},
+ {"role": "user", "content": format_solver_user_message(question)},
+ ]
+
+
+def create_generator_messages(instruction: str) -> list[dict[str, str]]:
+ """Create chat messages for question generation."""
+ return [
+ {"role": "system", "content": GENERATOR_SYSTEM_PROMPT},
+ {"role": "user", "content": format_generator_user_message(instruction)},
+ ]
diff --git a/src/rl/__init__.py b/src/rl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a1060d551993f8792d5a5e616f6ff3810572076
--- /dev/null
+++ b/src/rl/__init__.py
@@ -0,0 +1,32 @@
+# GRPO RL components
+
+from src.rl.mdp_components import Action, State, Trajectory, Transition
+from src.rl.question_classifier import QuestionClassifier
+from src.rl.curriculum_manager import CurriculumManager
+from src.rl.question_quality_evaluator import QuestionQualityEvaluator
+from src.rl.expert_panel import SimulatedExpertPanel
+from src.rl.quality_filter import QualityFilter
+from src.rl.replay_buffer import GenerationalReplayBuffer
+
+# Optional heavy imports (require torch + transformers)
+try:
+ from src.rl.value_network import ValueHead
+ from src.rl.math_environment_curriculum import CurriculumMathEnvironment
+except ModuleNotFoundError: # pragma: no cover
+ ValueHead = None
+ CurriculumMathEnvironment = None
+
+__all__ = [
+ "State",
+ "Action",
+ "Transition",
+ "Trajectory",
+ "ValueHead",
+ "CurriculumMathEnvironment",
+ "QuestionClassifier",
+ "CurriculumManager",
+ "QuestionQualityEvaluator",
+ "SimulatedExpertPanel",
+ "QualityFilter",
+ "GenerationalReplayBuffer",
+]
diff --git a/src/rl/curriculum_manager.py b/src/rl/curriculum_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..5752f49cba08432d6e4dff822b8338171ef27e52
--- /dev/null
+++ b/src/rl/curriculum_manager.py
@@ -0,0 +1,702 @@
+"""
+Adaptive curriculum manager for dual-task math training.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import random
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+from src.rl.question_classifier import QuestionClassifier, TOPIC_LIST
+
+logger = logging.getLogger(__name__)
+
+# Maps dataset skill_id prefixes → canonical curriculum topic names.
+# Used during bootstrap so questions from NuminaMath / OpenMathInstruct
+# are credited to the right ZPD bucket instead of being misclassified.
+SKILL_ID_TO_TOPIC: dict[str, str] = {
+ # NuminaMath-CoT
+ "numina_algebra": "algebra",
+ "numina_prealgebra": "algebra",
+ "numina_number_theory": "number_theory",
+ "numina_geometry": "geometry",
+ "numina_combinatorics": "combinatorics",
+ "numina_calculus": "calculus",
+ "numina_statistics": "statistics",
+ "numina_synthetic": "multi_step_reasoning",
+ "numina_olympiad": "competition_math",
+ "numina_competition": "competition_math",
+ "numina_general": "multi_step_reasoning",
+ # OpenMathInstruct-2
+ "openmath_algebra": "algebra",
+ "openmath_prealgebra": "algebra",
+ "openmath_number_theory":"number_theory",
+ "openmath_geometry": "geometry",
+ "openmath_combinatorics":"combinatorics",
+ "openmath_calculus": "calculus",
+ "openmath_competition": "competition_math",
+ "openmath_synthetic": "multi_step_reasoning",
+ "openmath_general": "multi_step_reasoning",
+ # Legacy
+ "gsm8k_grade_school": "basic_arithmetic",
+ "aqua_rat_algebra": "algebra",
+ "question_generation": "multi_step_reasoning",
+}
+
+
+@dataclass
+class TopicState:
+ topic_name: str
+ total_attempts: int = 0
+ successes: int = 0
+ success_rate: float = 0.0
+ difficulty_target: float = 0.5
+ difficulty_history: List[float] = field(default_factory=list)
+ last_practiced: int = 0
+ first_attempted: int = 0
+ status: str = "untested"
+ mastered_at_iteration: Optional[int] = None
+ retention_tests_passed: int = 0
+ last_retention_score: float = 0.0
+ consecutive_failures: int = 0
+ failure_count_total: int = 0
+ history: List[Dict[str, float]] = field(default_factory=list)
+ current_iteration_attempts: int = 0 # Track attempts within current iteration
+
+
+class CurriculumManager:
+ """Goldilocks curriculum with adaptive topic selection."""
+
+ SWEET_SPOT_MIN = 0.4
+ SWEET_SPOT_MAX = 0.7
+ TARGET_SUCCESS = 0.55
+
+ CONTEXTS = ["bakery", "shopping", "school", "sports", "gardening", "travel"]
+ ACTIONS = ["uses", "sells", "shares", "loses", "earns", "mixes"]
+
+ TOPIC_TEMPLATES = {
+ "basic_arithmetic": [
+ "Generate a {context} word problem using addition/subtraction.",
+ ],
+ "single_step_word_problems": [
+ "Generate a simple one-idea word problem in a {context} setting.",
+ ],
+ "fractions": [
+ "Generate a fractions word problem where someone {action} part of a quantity in a {context} scenario.",
+ "Create a problem involving fraction operations in {context}.",
+ ],
+ "percentages": [
+ "Generate a percentage change or discount problem in {context}.",
+ ],
+ "ratios": [
+ "Generate a ratios/proportions word problem in {context}.",
+ ],
+ "money_problems": [
+ "Create a money and pricing problem in {context}.",
+ ],
+ "time_distance": [
+ "Generate a time/speed/distance problem in {context}.",
+ ],
+ "multi_step_reasoning": [
+ "Generate a multi-step reasoning problem in {context}.",
+ ],
+ "algebra": [
+ "Generate an algebra problem that solves for a variable in {context}.",
+ ],
+ "mixed_operations": [
+ "Generate a problem requiring mixed operations in {context}.",
+ ],
+ "comparison_problems": [
+ "Generate a comparison problem in {context} ('more than'/'less than').",
+ ],
+ "optimization_problems": [
+ "Generate a constrained optimization style word problem in {context}.",
+ ],
+ # ── AQuA-RAT additions ────────────────────────────────────────────
+ "number_theory": [
+ "Generate a number theory problem about divisibility, remainders, or prime factors in a {context} setting.",
+ "Create a problem involving multiples and factors where someone in {context} {action} items in groups.",
+ ],
+ "profit_loss": [
+ "Generate a profit and loss problem where someone in {context} {action} goods at cost price and selling price.",
+ "Create a problem about percentage profit or loss on a transaction in {context}.",
+ ],
+ "interest": [
+ "Generate a simple or compound interest problem involving a loan or investment in {context}.",
+ "Create a problem where someone in {context} {action} money at a given annual interest rate.",
+ ],
+ "sets": [
+ "Generate a set theory or Venn diagram problem where people in {context} belong to overlapping groups.",
+ "Create a problem using union and intersection of two groups in {context}.",
+ ],
+ "combinatorics": [
+ "Generate a combinatorics problem about arrangements or selections of objects in {context}.",
+ "Create a problem involving permutations or combinations where someone in {context} {action} items.",
+ ],
+ "sequences": [
+ "Generate an arithmetic or geometric sequence problem in {context}.",
+ "Create a problem where someone in {context} follows a pattern and must find the nth term.",
+ ],
+ "probability": [
+ "Generate a probability problem involving random selection or chance events in {context}.",
+ "Create a problem where someone in {context} {action} items from a group and asks for probability.",
+ ],
+ "work_time": [
+ "Generate a work-rate problem where two people in {context} complete a task together or alone.",
+ "Create a problem where workers in {context} {action} a job at different rates.",
+ ],
+ # ── NuminaMath / OpenMathInstruct additions ───────────────────────
+ "geometry": [
+ "Generate a geometry problem about area or perimeter of a shape encountered in {context}.",
+ "Create a problem involving triangles or circles where someone in {context} needs to find a missing length or angle.",
+ "Generate a coordinate geometry problem where points in a {context} layout form a geometric figure.",
+ "Create a problem involving volume or surface area of a 3D shape relevant to {context}.",
+ ],
+ "calculus": [
+ "Generate a rate-of-change problem where a quantity in {context} grows or shrinks over time.",
+ "Create an optimization problem where someone in {context} wants to maximise profit or minimise cost using calculus.",
+ "Generate a problem involving a function whose minimum or maximum value must be found in a {context} scenario.",
+ ],
+ "statistics": [
+ "Generate a statistics problem where someone in {context} collects data and must find the mean, median, or mode.",
+ "Create a problem involving standard deviation or variance of measurements taken in {context}.",
+ "Generate a problem where data from {context} is summarised and an outlier or expected value must be identified.",
+ ],
+ "competition_math": [
+ "Generate a number theory problem asking how many positive integers satisfy a divisibility condition.",
+ "Create a competition-style problem: find all integer solutions to an equation involving remainders or modular arithmetic.",
+ "Generate a counting problem asking in how many ways objects can be arranged or selected under a constraint.",
+ "Create a problem: given two integers relatively prime to each other, find their least common multiple or sum of digits.",
+ ],
+ }
+
+ TOPIC_PREREQUISITES = {
+ "fractions": ["basic_arithmetic"],
+ "percentages": ["fractions"],
+ "ratios": ["basic_arithmetic"],
+ "algebra": ["basic_arithmetic", "comparison_problems"],
+ "mixed_operations": ["basic_arithmetic", "fractions"],
+ "optimization_problems": ["comparison_problems", "algebra"],
+ # AQuA-RAT additions
+ "number_theory": ["basic_arithmetic"],
+ "profit_loss": ["percentages", "money_problems"],
+ "interest": ["percentages"],
+ "sets": ["basic_arithmetic"],
+ "combinatorics": ["basic_arithmetic"],
+ "sequences": ["basic_arithmetic", "algebra"],
+ "probability": ["fractions", "ratios"],
+ "work_time": ["ratios", "multi_step_reasoning"],
+ # NuminaMath / OpenMathInstruct additions
+ "geometry": ["basic_arithmetic"],
+ "calculus": ["algebra", "sequences"],
+ "statistics": ["ratios", "fractions"],
+ "competition_math": ["number_theory", "combinatorics", "algebra"],
+ }
+
+ def __init__(self, checkpoint_dir: str | Path):
+ self.checkpoint_dir = Path(checkpoint_dir)
+ self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
+ self.classifier = QuestionClassifier()
+
+ self.current_iteration = 0
+ self.recent_combined_rewards: List[float] = []
+ self.topics: Dict[str, TopicState] = {
+ topic: TopicState(topic_name=topic) for topic in TOPIC_LIST
+ }
+ self.current_focus_topics: List[str] = []
+ self.hyperparams = {
+ "sweet_spot_min": self.SWEET_SPOT_MIN,
+ "sweet_spot_max": self.SWEET_SPOT_MAX,
+ "target_success": self.TARGET_SUCCESS,
+ }
+
+ def initialize(self, bootstrap_questions: Optional[List[str]] = None) -> None:
+ """Initialize topic priors, optionally from GSM8K-style questions."""
+ if bootstrap_questions:
+ counts = {topic: 0 for topic in TOPIC_LIST}
+ for question in bootstrap_questions:
+ detected = self.classifier.classify_topic(question)
+ topic = str(detected["primary_topic"])
+ if topic in counts:
+ counts[topic] += 1
+
+ total = max(1, sum(counts.values()))
+ for topic, state in self.topics.items():
+ prevalence = counts[topic] / total
+ state.difficulty_target = float(max(0.3, min(0.75, 0.35 + 0.8 * prevalence)))
+ else:
+ for state in self.topics.values():
+ state.difficulty_target = 0.5
+
+ def initialize_from_dataset(
+ self,
+ records: List[Dict],
+ difficulty_field: str = "difficulty",
+ ) -> None:
+ """
+ Bootstrap curriculum topic priors directly from dataset skill_ids.
+
+ Much faster than the question-classifier bootstrap path — reads
+ skill_id and difficulty from each JSONL record rather than running
+ the keyword classifier on every question.
+
+ Args:
+ records: List of dataset records with 'skill_id' and
+ optionally 'difficulty' fields.
+ difficulty_field: Name of the difficulty field (default 'difficulty').
+ Values: 1=easy, 2=medium, 3=hard.
+ """
+ counts: Dict[str, int] = {topic: 0 for topic in TOPIC_LIST}
+ # Map difficulty 1/2/3 → difficulty_target 0.35/0.55/0.75
+ _diff_map = {1: 0.35, 2: 0.55, 3: 0.75}
+ topic_difficulties: Dict[str, List[float]] = {t: [] for t in TOPIC_LIST}
+
+ for rec in records:
+ skill_id = rec.get("skill_id", "")
+ topic = SKILL_ID_TO_TOPIC.get(skill_id)
+ if topic is None:
+ # Fall back to keyword classifier on the question text
+ msgs = rec.get("messages", [])
+ question = next(
+ (m.get("content", "") for m in msgs if m.get("role") == "user"), ""
+ )
+ if question:
+ detected = self.classifier.classify_topic(question)
+ topic = str(detected["primary_topic"])
+ else:
+ continue
+ if topic not in counts:
+ continue
+ counts[topic] += 1
+ raw_diff = rec.get(difficulty_field, 2)
+ topic_difficulties[topic].append(_diff_map.get(int(raw_diff), 0.55))
+
+ total = max(1, sum(counts.values()))
+ for topic, state in self.topics.items():
+ prevalence = counts[topic] / total
+ # Difficulty target: average of observed difficulties, biased up
+ # for rare topics (less data → start harder to find signal faster)
+ diffs = topic_difficulties[topic]
+ if diffs:
+ avg_diff = sum(diffs) / len(diffs)
+ else:
+ avg_diff = 0.50
+
+ # Blend prevalence-based prior with observed average difficulty
+ state.difficulty_target = float(
+ max(0.30, min(0.80, 0.4 * avg_diff + 0.6 * (0.35 + 0.8 * prevalence)))
+ )
+
+ logger.info(
+ "Curriculum bootstrapped from %d records across %d topics",
+ len(records),
+ sum(1 for c in counts.values() if c > 0),
+ )
+ for topic, cnt in sorted(counts.items(), key=lambda x: -x[1]):
+ if cnt > 0:
+ logger.debug(
+ " %-30s %4d samples target_difficulty=%.2f",
+ topic, cnt, self.topics[topic].difficulty_target,
+ )
+
+ def select_topic_and_difficulty(self) -> Tuple[str, float]:
+ probs = self._compute_topic_probabilities()
+ names = list(probs.keys())
+ dist = np.array([probs[name] for name in names], dtype=np.float64)
+ dist = dist / dist.sum()
+
+ # Log topic distribution at start of each iteration (rollout 0, 10, 20, etc.)
+ total_attempts = sum(t.current_iteration_attempts for t in self.topics.values())
+ if total_attempts % 20 == 0: # Every 20 rollouts
+ top_5 = sorted(probs.items(), key=lambda x: x[1], reverse=True)[:5]
+ logger.info(f"Topic probabilities (rollout {total_attempts}): {[(t, f'{p:.3f}') for t, p in top_5]}")
+
+ topic = str(np.random.choice(names, p=dist))
+ difficulty = self._get_difficulty_for_topic(topic)
+ self.current_focus_topics = [topic]
+ return topic, difficulty
+
+ def update_from_trajectory(
+ self,
+ topic: str,
+ question_reward: float,
+ solution_success: bool,
+ combined_reward: Optional[float] = None,
+ measured_difficulty: Optional[float] = None,
+ ) -> None:
+ # Grounded rollouts tag themselves with a synthetic topic name
+ # (``grounded_gsm8k``) that isn't part of the curriculum ontology.
+ # They must not pollute per-topic statistics — silently skip the
+ # update for any unknown topic instead of crashing. The combined
+ # reward is still recorded for plateau detection.
+ state = self.topics.get(topic)
+ if state is None:
+ logger.debug(
+ "Skipping curriculum update for out-of-ontology topic %r", topic
+ )
+ if combined_reward is not None:
+ self.recent_combined_rewards.append(float(combined_reward))
+ self.recent_combined_rewards = self.recent_combined_rewards[-30:]
+ return
+ state.total_attempts += 1
+ state.current_iteration_attempts += 1
+ state.successes += int(solution_success)
+ state.success_rate = state.successes / max(1, state.total_attempts)
+ state.last_practiced = self.current_iteration
+ if state.first_attempted == 0:
+ state.first_attempted = self.current_iteration
+
+ success_value = 1.0 if solution_success else 0.0
+ if solution_success:
+ state.consecutive_failures = 0
+ else:
+ state.consecutive_failures += 1
+ state.failure_count_total += 1
+
+ target = state.difficulty_target
+ # Only adjust difficulty if we have sufficient data
+ if state.total_attempts >= 5:
+ if state.success_rate > self.SWEET_SPOT_MAX:
+ # Increase difficulty gradually
+ state.difficulty_target = min(0.95, target + 0.03)
+ if state.status != "mastered" and state.success_rate >= 0.75:
+ state.status = "mastered"
+ state.mastered_at_iteration = self.current_iteration
+ elif state.success_rate < self.SWEET_SPOT_MIN:
+ # Decrease difficulty more conservatively to avoid getting stuck too low
+ state.difficulty_target = max(0.2, target - 0.04)
+ state.status = "active"
+ else:
+ state.status = "active"
+
+ if measured_difficulty is not None:
+ state.difficulty_history.append(float(measured_difficulty))
+ else:
+ state.difficulty_history.append(state.difficulty_target)
+
+ state.history.append(
+ {
+ "iteration": float(self.current_iteration),
+ "question_reward": float(question_reward),
+ "solution_success": float(success_value),
+ "success_rate": float(state.success_rate),
+ "difficulty_target": float(state.difficulty_target),
+ }
+ )
+
+ if combined_reward is not None:
+ self.recent_combined_rewards.append(float(combined_reward))
+ self.recent_combined_rewards = self.recent_combined_rewards[-30:]
+
+ self.handle_persistent_failure(topic)
+
+ def increment_iteration(self) -> None:
+ self.current_iteration += 1
+ # Reset within-iteration counters
+ for state in self.topics.values():
+ state.current_iteration_attempts = 0
+ self._run_retention_tests_if_due()
+
+ def generate_instruction(self, topic: str, target_difficulty: float) -> str:
+ templates = self.TOPIC_TEMPLATES.get(topic, self.TOPIC_TEMPLATES["multi_step_reasoning"])
+ template = random.choice(templates)
+ # Note: {steps} placeholder removed from templates to let model decide complexity
+ return template.format(
+ context=random.choice(self.CONTEXTS),
+ action=random.choice(self.ACTIONS),
+ )
+
+ def get_curriculum_stats(self) -> Dict[str, object]:
+ return {
+ "iteration": self.current_iteration,
+ "topics": {topic: asdict(state) for topic, state in self.topics.items()},
+ "sweet_spot_topics": self.get_sweet_spot_topics(),
+ "current_focus_topics": self.get_current_focus(),
+ "avg_recent_reward": float(np.mean(self.recent_combined_rewards)) if self.recent_combined_rewards else 0.0,
+ }
+
+ def get_sweet_spot_topics(self) -> List[str]:
+ return [
+ topic
+ for topic, state in self.topics.items()
+ if state.total_attempts > 0 and self.SWEET_SPOT_MIN <= state.success_rate <= self.SWEET_SPOT_MAX
+ ]
+
+ def get_current_focus(self) -> List[str]:
+ return list(self.current_focus_topics)
+
+ def save_state(self, iteration: int, rollout: Optional[int] = None) -> None:
+ if rollout is not None and rollout % 10 != 0:
+ return
+ filename = (
+ f"iteration_{iteration:03d}_final.json"
+ if rollout is None
+ else f"iteration_{iteration:03d}_rollout_{rollout:03d}.json"
+ )
+ path = self.checkpoint_dir / filename
+ state = {
+ "version": "1.0",
+ "timestamp": datetime.utcnow().isoformat(),
+ "iteration": iteration,
+ "rollout": rollout,
+ "current_iteration": self.current_iteration,
+ "recent_combined_rewards": self.recent_combined_rewards,
+ "topics": {topic: asdict(topic_state) for topic, topic_state in self.topics.items()},
+ "hyperparams": self.hyperparams,
+ }
+ path.write_text(json.dumps(state, indent=2), encoding="utf-8")
+
+ def load_checkpoint_safe(self) -> bool:
+ checkpoints = sorted(self.checkpoint_dir.glob("iteration_*_final.json"), reverse=True)
+ for checkpoint in checkpoints:
+ try:
+ data = json.loads(checkpoint.read_text(encoding="utf-8"))
+ topics = data["topics"]
+ if not isinstance(topics, dict):
+ raise ValueError("Invalid topics section")
+ rebuilt = {}
+ for topic in TOPIC_LIST:
+ values = topics.get(topic)
+ if values is None or "success_rate" not in values:
+ raise ValueError(f"Topic {topic} missing or malformed")
+ rebuilt[topic] = TopicState(**values)
+
+ self.topics = rebuilt
+ self.current_iteration = int(data.get("current_iteration", data.get("iteration", 0)))
+ self.recent_combined_rewards = list(data.get("recent_combined_rewards", []))
+ logger.info("Loaded curriculum state from %s", checkpoint)
+ return True
+ except Exception as exc: # pragma: no cover - defensive
+ logger.warning("Failed to load curriculum checkpoint %s: %s", checkpoint, exc)
+ return False
+
+ def _compute_topic_probabilities(self) -> Dict[str, float]:
+ all_states = list(self.topics.values())
+ sweet_spot = [t for t in all_states if self.SWEET_SPOT_MIN <= t.success_rate <= self.SWEET_SPOT_MAX and t.total_attempts > 0]
+ untested = [t for t in all_states if t.total_attempts == 0]
+ mastered = [t for t in all_states if t.status == "mastered"]
+
+ if self.current_iteration <= 3:
+ weights = {"sweet": 0.0, "explore": 1.0, "retention": 0.0}
+ elif self.current_iteration <= 10:
+ weights = {"sweet": 0.50, "explore": 0.35, "retention": 0.15}
+ else:
+ weights = {"sweet": 0.60, "explore": 0.25, "retention": 0.15}
+
+ if self._detect_plateau():
+ weights["explore"] = min(0.50, weights["explore"] + 0.2)
+ weights["sweet"] = max(0.2, weights["sweet"] - 0.2)
+
+ # Start with minimum allocation for ALL topics (5% split)
+ MIN_ALLOCATION = 0.05
+ probs: Dict[str, float] = {t.topic_name: MIN_ALLOCATION / len(all_states) for t in all_states}
+ remaining_mass = 1.0 - MIN_ALLOCATION
+
+ bonus_probs: Dict[str, float] = {}
+
+ # Allocate sweet spot budget with within-iteration diversity penalty
+ if sweet_spot:
+ # Apply diversity penalty based on current iteration attempts
+ staleness = {}
+ for t in sweet_spot:
+ # Strong penalty for topics sampled many times in current iteration
+ if t.current_iteration_attempts == 0:
+ # Not yet sampled this iteration - highest priority
+ staleness[t.topic_name] = 10.0
+ elif t.current_iteration_attempts <= 3:
+ # Sampled 1-3 times - moderate priority
+ staleness[t.topic_name] = 5.0 / t.current_iteration_attempts
+ else:
+ # Sampled 4+ times - heavily penalized
+ staleness[t.topic_name] = 1.0 / (t.current_iteration_attempts ** 1.5)
+
+ total_stale = sum(staleness.values())
+ if total_stale > 0:
+ for t in sweet_spot:
+ bonus_probs[t.topic_name] = bonus_probs.get(t.topic_name, 0.0) + (
+ remaining_mass * weights["sweet"] * (staleness[t.topic_name] / total_stale)
+ )
+
+ # Allocate explore budget - ensure we always explore something
+ explore_pool = untested if untested else self._get_diverse_exploration_pool(sweet_spot)
+ if explore_pool:
+ each = remaining_mass * weights["explore"] / len(explore_pool)
+ for t in explore_pool:
+ bonus_probs[t.topic_name] = bonus_probs.get(t.topic_name, 0.0) + each
+
+ # Allocate retention budget
+ retention_due = [t for t in mastered if self._schedule_retention_test(t) <= self.current_iteration]
+ if retention_due:
+ each = remaining_mass * weights["retention"] / len(retention_due)
+ for t in retention_due:
+ bonus_probs[t.topic_name] = bonus_probs.get(t.topic_name, 0.0) + each
+
+ # Add bonus to base minimum allocation
+ for topic, bonus in bonus_probs.items():
+ probs[topic] = probs.get(topic, 0.0) + bonus
+
+ # Normalize to ensure sum = 1.0
+ total = sum(probs.values())
+ if total <= 0:
+ each = 1.0 / len(all_states)
+ return {t.topic_name: each for t in all_states}
+
+ normalized = {topic: value / total for topic, value in probs.items()}
+
+ # Apply topic probability floor to prevent mode collapse
+ MIN_TOPIC_PROB = 0.02 # Every topic gets at least 2% chance
+ for topic in normalized:
+ if normalized[topic] < MIN_TOPIC_PROB:
+ normalized[topic] = MIN_TOPIC_PROB
+
+ # Re-normalize after applying floor
+ total = sum(normalized.values())
+ normalized = {topic: value / total for topic, value in normalized.items()}
+
+ # Log top 5 topics for debugging
+ top_topics = sorted(normalized.items(), key=lambda x: x[1], reverse=True)[:5]
+ logger.debug(f"Topic probabilities: {top_topics}")
+
+ return normalized
+
+ def _get_boundary_topics(self) -> List[TopicState]:
+ result = []
+ for state in self.topics.values():
+ if state.total_attempts == 0:
+ continue
+ near_low = abs(state.success_rate - self.SWEET_SPOT_MIN) <= 0.08
+ near_high = abs(state.success_rate - self.SWEET_SPOT_MAX) <= 0.08
+ if near_low or near_high:
+ result.append(state)
+ if not result:
+ result = sorted(self.topics.values(), key=lambda t: abs(t.success_rate - self.TARGET_SUCCESS))[:4]
+ return result
+
+ def _get_diverse_exploration_pool(self, exclude_sweet_spot: List[TopicState]) -> List[TopicState]:
+ """
+ Get topics for exploration that are NOT in sweet spot.
+
+ Prioritizes:
+ 1. Under-practiced topics (low attempt count)
+ 2. Topics with potential (success rate 0.2-0.4 or 0.7-0.9)
+ 3. Topics not recently attempted
+
+ Args:
+ exclude_sweet_spot: Topics already in sweet spot to exclude
+
+ Returns:
+ List of 3-5 topics for exploration
+ """
+ sweet_spot_names = {t.topic_name for t in exclude_sweet_spot}
+ candidates = [t for t in self.topics.values() if t.topic_name not in sweet_spot_names]
+
+ if not candidates:
+ # Fallback if somehow all topics are in sweet spot
+ return list(self.topics.values())[:3]
+
+ # Score each candidate
+ scored = []
+ for state in candidates:
+ # Factor 1: Under-practiced (inverse of attempts)
+ attempt_score = 1.0 / (1.0 + state.total_attempts / 10.0)
+
+ # Factor 2: Near sweet spot boundaries (could improve into sweet spot)
+ if 0.2 <= state.success_rate < self.SWEET_SPOT_MIN:
+ potential_score = 2.0 # Just below sweet spot - high potential
+ elif self.SWEET_SPOT_MAX < state.success_rate <= 0.9:
+ potential_score = 1.5 # Just above sweet spot - could be challenged more
+ elif state.total_attempts == 0:
+ potential_score = 3.0 # Untested - highest priority
+ else:
+ potential_score = 0.5 # Far from sweet spot
+
+ # Factor 3: Staleness (not practiced recently)
+ staleness_score = max(1, self.current_iteration - state.last_practiced) / 5.0
+
+ # Combined score
+ total_score = attempt_score + potential_score + staleness_score
+ scored.append((state, total_score))
+
+ # Return top 3-5 topics by score
+ scored.sort(key=lambda x: x[1], reverse=True)
+ num_explore = min(5, max(3, len(candidates) // 3))
+ return [state for state, _ in scored[:num_explore]]
+
+ def _get_difficulty_for_topic(self, topic: str) -> float:
+ state = self.topics[topic]
+ noise = random.uniform(-0.04, 0.04)
+ return max(0.1, min(0.95, state.difficulty_target + noise))
+
+ def _difficulty_to_step_range(self, difficulty: float) -> str:
+ if difficulty < 0.3:
+ return "1-2"
+ if difficulty < 0.6:
+ return "2-3"
+ return "3-4"
+
+ def _schedule_retention_test(self, state: TopicState) -> int:
+ if state.mastered_at_iteration is None:
+ return 10 ** 9
+ interval = min(2 ** max(0, state.retention_tests_passed), 32)
+ return state.mastered_at_iteration + interval
+
+ def _run_retention_tests_if_due(self) -> None:
+ for state in self.topics.values():
+ if state.status != "mastered":
+ continue
+ if self._schedule_retention_test(state) <= self.current_iteration:
+ # Retention test scheduling is represented by increasing pressure
+ # during topic selection rather than explicit immediate update.
+ logger.info("Topic %s is due for retention test", state.topic_name)
+
+ def handle_retention_test_result(self, topic: str, success_rate: float) -> None:
+ state = self.topics[topic]
+ if success_rate >= 0.7:
+ state.retention_tests_passed += 1
+ state.last_retention_score = success_rate
+ state.status = "mastered"
+ elif success_rate >= 0.4:
+ state.retention_tests_passed = 0
+ state.last_retention_score = success_rate
+ state.status = "active"
+ else:
+ state.retention_tests_passed = 0
+ state.last_retention_score = success_rate
+ state.status = "forgotten"
+ state.difficulty_target = max(0.15, state.difficulty_target * 0.7)
+
+ def handle_persistent_failure(self, topic: str) -> None:
+ state = self.topics[topic]
+ failures = state.consecutive_failures
+ if failures >= 3:
+ state.difficulty_target = max(0.1, state.difficulty_target * 0.6)
+ if failures >= 5:
+ state.status = "paused"
+ if failures >= 10:
+ hard_topics = [
+ t
+ for t in self.topics.values()
+ if t.total_attempts >= 10 and t.success_rate < 0.3
+ ]
+ if len(hard_topics) >= 3:
+ self._emergency_reset()
+
+ def _emergency_reset(self) -> None:
+ logger.warning("Emergency curriculum reset triggered")
+ for state in self.topics.values():
+ state.status = "active"
+ state.difficulty_target = min(0.45, max(0.2, state.difficulty_target))
+ state.consecutive_failures = 0
+
+ def _detect_plateau(self) -> bool:
+ if len(self.recent_combined_rewards) < 10:
+ return False
+ window = self.recent_combined_rewards[-10:]
+ return float(np.std(window)) < 0.05
diff --git a/src/rl/expert_panel.py b/src/rl/expert_panel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0f0f8389de1d516a7fdc355d7654f96903e3cd
--- /dev/null
+++ b/src/rl/expert_panel.py
@@ -0,0 +1,220 @@
+"""
+Simulated expert panel with shifting preferences across curriculum phases.
+
+Reward-shaping design notes
+---------------------------
+Historically this panel applied a **multiplicative** shaping step:
+
+ adjusted = clip01( base * (1 + modifier) ), modifier in [-0.3, +0.3]
+
+Two problems that analysis of 20 PPO iterations made obvious:
+
+1. Saturation. Any base >= 0.77 was clipped to exactly 1.0 with the
+ maximum boost, and a large fraction of self-play rollouts land in
+ that zone every iteration. After the rollout buffer whitens
+ advantages, a cluster of identical 1.0s flattens the policy
+ gradient — that's the "policy_loss ~ -0.004 across every
+ iteration" signature. Meanwhile the rare non-saturated outlier
+ produces a huge standardized advantage -> KL spikes -> early stop.
+
+2. PRM triple-counting. The panel used ``correctness`` and
+ ``consensus`` weights, and the caller wired both to ``PRM_mean``.
+ Combined with the PRM terms inside ``sol`` itself, a single frozen
+ PRM's opinion drove ~75% of the variance in ``combined``. The
+ policy can game that by finding text the PRM likes without the
+ answer being correct.
+
+The replacement here is:
+
+* **Additive** shaping with a tight bound (|modifier| <= 0.08 by
+ default). No multiplication, no clip-to-1. ``base`` stays in
+ [0, 1] as computed by the environment, and shaping only nudges it
+ a little — GAE + advantage normalization handle scale downstream.
+* The panel no longer consumes the PRM-correlated signals
+ (``correctness``, ``consensus_score``). Those already live inside
+ ``sol``. What the panel *does* add is curriculum-phase taste:
+ clarity, solvability, difficulty match, novelty, format
+ compliance.
+* A harder, one-sided format penalty: badly-formatted outputs get
+ penalized more than well-formatted ones get rewarded. Solutions
+ that don't even parse should not win ties over ones that do.
+
+Nothing about the public API changes — the returned dict still has
+``adjusted_reward``, ``reward_modifier``, ``raw_modifier``,
+``phase``/``description``, ``signals``, and ``feedback``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+# Tight additive bound. With base in [0, 1] this keeps the final reward
+# inside roughly [-0.08, 1.08]; the environment re-clips to [0, 1].
+# Lower than the old 0.3 on purpose — shaping is a flavor term, not the
+# main signal.
+MAX_MODIFIER = 0.08
+
+
+@dataclass(frozen=True)
+class ExpertPhase:
+ name: str
+ start_iteration: int
+ end_iteration: Optional[int]
+ clarity_weight: float
+ solvability_weight: float
+ difficulty_weight: float
+ novelty_weight: float
+ format_penalty_weight: float
+ description: str
+
+ def active_for(self, iteration: int) -> bool:
+ if iteration < self.start_iteration:
+ return False
+ if self.end_iteration is None:
+ return True
+ return iteration <= self.end_iteration
+
+
+class SimulatedExpertPanel:
+ """Applies phase-specific bounded **additive** reward shaping.
+
+ No more multiplication, no more clip-to-1, and crucially no more
+ ``correctness``/``consensus`` knobs (which used to double-count
+ PRM_mean on top of ``sol``). The panel now only shapes question
+ quality and format — the correctness signal lives in ``sol`` alone.
+ """
+
+ def __init__(self) -> None:
+ self._phases: List[ExpertPhase] = [
+ ExpertPhase(
+ name="pedagogy",
+ start_iteration=0,
+ end_iteration=3,
+ clarity_weight=0.30,
+ solvability_weight=0.25,
+ difficulty_weight=-0.10,
+ novelty_weight=0.00,
+ format_penalty_weight=0.40,
+ description="Prioritize clear, learnable, and solvable foundation tasks.",
+ ),
+ ExpertPhase(
+ name="accuracy",
+ start_iteration=4,
+ end_iteration=6,
+ clarity_weight=0.10,
+ solvability_weight=0.20,
+ difficulty_weight=0.00,
+ novelty_weight=0.00,
+ format_penalty_weight=0.70,
+ description="Prioritize arithmetic correctness and agreement stability.",
+ ),
+ ExpertPhase(
+ name="challenge",
+ start_iteration=7,
+ end_iteration=None,
+ clarity_weight=0.10,
+ solvability_weight=0.10,
+ difficulty_weight=0.30,
+ novelty_weight=0.20,
+ format_penalty_weight=0.30,
+ description="Prioritize challenging, novel, and diverse problems.",
+ ),
+ ]
+
+ def get_current_expert(self, iteration: int) -> ExpertPhase:
+ for phase in self._phases:
+ if phase.active_for(iteration):
+ return phase
+ return self._phases[-1]
+
+ def apply_expert_preferences(
+ self,
+ base_reward: float,
+ question_metrics: Dict[str, object],
+ solution_metrics: Dict[str, object],
+ iteration: int,
+ ) -> Dict[str, object]:
+ phase = self.get_current_expert(iteration)
+
+ clarity = float(question_metrics.get("clarity", 0.0))
+ solvability = float(question_metrics.get("solvability_score", 0.0))
+ difficulty = float(question_metrics.get("difficulty_score", 0.0))
+ novelty = float(question_metrics.get("novelty_combined", 0.0))
+ format_compliance = float(solution_metrics.get("format_compliance", 0.0))
+ format_penalty = 1.0 - format_compliance
+
+ # Centered versions keep the additive shaping close to zero when
+ # quality signals are average; only genuinely good (>0.5) or
+ # genuinely bad (<0.5) questions move the needle. Without this,
+ # every single rollout got a +0.15 bump just for producing a
+ # non-empty string.
+ clarity_c = clarity - 0.5
+ solvability_c = solvability - 0.5
+ difficulty_c = difficulty - 0.5
+ novelty_c = novelty - 0.5
+
+ raw_modifier = (
+ phase.clarity_weight * clarity_c
+ + phase.solvability_weight * solvability_c
+ + phase.difficulty_weight * difficulty_c
+ + phase.novelty_weight * novelty_c
+ - phase.format_penalty_weight * format_penalty
+ )
+ modifier = max(-MAX_MODIFIER, min(MAX_MODIFIER, raw_modifier))
+
+ # Additive, no multiplication. We leave the final [0, 1] clip to
+ # the caller (math_environment_curriculum) so it can combine the
+ # shaping with its own format-floor rule.
+ adjusted_reward = float(base_reward) + modifier
+ return {
+ "phase": phase.name,
+ "description": phase.description,
+ "phase_start_iteration": phase.start_iteration,
+ "phase_end_iteration": phase.end_iteration,
+ "base_reward": float(base_reward),
+ "adjusted_reward": adjusted_reward,
+ "reward_modifier": modifier,
+ "raw_modifier": raw_modifier,
+ "signals": {
+ "clarity": clarity,
+ "solvability": solvability,
+ "difficulty_score": difficulty,
+ "novelty": novelty,
+ "format_compliance": format_compliance,
+ },
+ "feedback": self.get_expert_feedback(
+ phase_name=phase.name,
+ reward_modifier=modifier,
+ signals={
+ "clarity": clarity,
+ "solvability": solvability,
+ "difficulty_score": difficulty,
+ "novelty": novelty,
+ "format_compliance": format_compliance,
+ },
+ ),
+ }
+
+ def get_expert_feedback(
+ self,
+ phase_name: str,
+ reward_modifier: float,
+ signals: Dict[str, float],
+ ) -> str:
+ direction = "boosted" if reward_modifier >= 0 else "penalized"
+ if phase_name == "pedagogy":
+ return (
+ f"Pedagogy expert {direction} reward; clarity={signals['clarity']:.2f}, "
+ f"solvability={signals['solvability']:.2f}, difficulty={signals['difficulty_score']:.2f}."
+ )
+ if phase_name == "accuracy":
+ return (
+ f"Accuracy expert {direction} reward; solvability={signals['solvability']:.2f}, "
+ f"format={signals['format_compliance']:.2f}."
+ )
+ return (
+ f"Challenge expert {direction} reward; difficulty={signals['difficulty_score']:.2f}, "
+ f"novelty={signals['novelty']:.2f}, format={signals['format_compliance']:.2f}."
+ )
diff --git a/src/rl/llm_question_classifier.py b/src/rl/llm_question_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..deaef4c5d21c9b9a7d67a964a4b5ab401045ed93
--- /dev/null
+++ b/src/rl/llm_question_classifier.py
@@ -0,0 +1,296 @@
+"""
+LLM-backed question classifier that replaces the keyword-regex approach.
+
+The already-loaded policy model (Qwen2.5-1.5B-Instruct) is used as the
+classifier brain via a short structured prompt. Inference runs under
+``torch.no_grad()`` so it does not affect training gradients.
+
+Interface is identical to ``QuestionClassifier``, so it is a drop-in
+replacement for the ``classifier`` argument of ``QuestionQualityEvaluator``.
+
+Fallback chain
+--------------
+ 1. Cache hit → instant (0 ms)
+ 2. LLM generation → ~60-120 ms (8 new tokens, greedy, 1.5B model)
+ 3. Regex fallback → ~1 ms (on any error or unparseable output)
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+import torch
+
+from src.rl.question_classifier import TOPIC_LIST, QuestionClassifier
+
+logger = logging.getLogger(__name__)
+
+# ── Prompt constants ─────────────────────────────────────────────────────────
+
+_TOPIC_CSV = "\n".join(f" {t}" for t in TOPIC_LIST)
+
+_SYSTEM_PROMPT = "You are a precise math topic classifier. Reply with exactly one topic name."
+
+_USER_TEMPLATE = (
+ "Classify the math problem below into EXACTLY ONE topic from this list:\n"
+ "{topics}\n\n"
+ "Problem:\n{problem}\n\n"
+ "Reply with only the topic name, nothing else."
+)
+
+_TOPIC_SET = set(TOPIC_LIST)
+
+# Normalise common LLM output variations → canonical topic names
+_ALIAS_MAP: Dict[str, str] = {
+ # spacing / dash variants
+ "competition math": "competition_math",
+ "competition-math": "competition_math",
+ "basic arithmetic": "basic_arithmetic",
+ "number theory": "number_theory",
+ "single step": "single_step_word_problems",
+ "single-step": "single_step_word_problems",
+ "word problems": "single_step_word_problems",
+ "word problem": "single_step_word_problems",
+ "multi step": "multi_step_reasoning",
+ "multi-step": "multi_step_reasoning",
+ "time distance": "time_distance",
+ "time-distance": "time_distance",
+ "money problems": "money_problems",
+ "profit loss": "profit_loss",
+ "profit and loss": "profit_loss",
+ "work time": "work_time",
+ "work rate": "work_time",
+ "mixed operations": "mixed_operations",
+ "mixed-operations": "mixed_operations",
+ "comparison problems": "comparison_problems",
+ "optimization problems": "optimization_problems",
+ # common shorthand
+ "geo": "geometry",
+ "calc": "calculus",
+ "stats": "statistics",
+ "stat": "statistics",
+ "arith": "basic_arithmetic",
+ "combi": "combinatorics",
+ "combo": "combinatorics",
+ "prob": "probability",
+ "seq": "sequences",
+ "percent": "percentages",
+ "alg": "algebra",
+}
+
+
+def _parse_topic(raw: str) -> Optional[str]:
+ """
+ Extract a canonical topic name from raw LLM output.
+
+ Returns None if the output cannot be mapped to any known topic.
+ """
+ text = raw.strip().lower()
+ # Take first line only (model sometimes adds explanation after newline)
+ first_line = text.split("\n")[0].strip()
+ # Remove surrounding quotes or punctuation
+ first_line = re.sub(r'^["\']|["\',.:;]$', "", first_line).strip()
+
+ if first_line in _TOPIC_SET:
+ return first_line
+
+ normalised = first_line.replace(" ", "_").replace("-", "_")
+ if normalised in _TOPIC_SET:
+ return normalised
+
+ if first_line in _ALIAS_MAP:
+ return _ALIAS_MAP[first_line]
+ if normalised in _ALIAS_MAP:
+ return _ALIAS_MAP[normalised]
+
+ # Substring scan: accept if exactly one topic is contained
+ matches = [t for t in TOPIC_LIST if t in first_line or first_line in t]
+ if len(matches) == 1:
+ return matches[0]
+
+ return None
+
+
+# ── LLM Classifier ────────────────────────────────────────────────────────────
+
+
+class LLMQuestionClassifier(QuestionClassifier):
+ """
+ Uses the loaded policy model to classify math problem topics.
+
+ Inherits all ``estimate_difficulty``, ``check_clarity``, and
+ ``_infer_topic_from_solution`` methods from ``QuestionClassifier`` —
+ only ``classify_topic`` is overridden with LLM inference.
+
+ Parameters
+ ----------
+ model : The loaded CausalLM policy model (already in VRAM).
+ tokenizer : Matching tokenizer.
+ device : torch.device or str.
+ cache_size : LRU-style cache capacity (number of questions).
+ max_retries : Number of greedy attempts before regex fallback.
+ """
+
+ def __init__(
+ self,
+ model: Any,
+ tokenizer: Any,
+ device: Any,
+ cache_size: int = 10_000,
+ max_retries: int = 1,
+ ) -> None:
+ super().__init__()
+ self._model = model
+ self._tokenizer = tokenizer
+ self._device = torch.device(device) if isinstance(device, str) else device
+ self._cache: Dict[str, Dict] = {}
+ self._cache_size = cache_size
+ self._max_retries = max_retries
+ self._stats = {"llm_hits": 0, "cache_hits": 0, "fallback_hits": 0}
+ logger.info(
+ "LLMQuestionClassifier ready (model=%s, cache=%d, topics=%d)",
+ type(model).__name__,
+ cache_size,
+ len(TOPIC_LIST),
+ )
+
+ # ------------------------------------------------------------------
+ # Public API (same signature as QuestionClassifier)
+ # ------------------------------------------------------------------
+
+ def classify_topic(
+ self,
+ question: str,
+ solution: Optional[str] = None,
+ ) -> Dict[str, object]:
+ """
+ Classify *question* into one of the 24 curriculum topics.
+
+ Uses the LLM for fresh questions and a cache for repeated ones.
+ Falls back to regex keyword matching on any error.
+ """
+ cache_key = (question or "")[:300]
+
+ if cache_key in self._cache:
+ self._stats["cache_hits"] += 1
+ return self._cache[cache_key]
+
+ result = self._classify_with_llm(question, solution)
+
+ # Evict oldest entry when cache is full (FIFO approximation)
+ if len(self._cache) >= self._cache_size:
+ self._cache.pop(next(iter(self._cache)))
+ self._cache[cache_key] = result
+ return result
+
+ # ------------------------------------------------------------------
+ # Internal helpers
+ # ------------------------------------------------------------------
+
+ def _classify_with_llm(
+ self,
+ question: str,
+ solution: Optional[str],
+ ) -> Dict[str, object]:
+ """Run LLM inference and parse the topic; fall back to regex."""
+ try:
+ topic = self._llm_infer_topic(question)
+ if topic is not None:
+ self._stats["llm_hits"] += 1
+ return {
+ "primary_topic": topic,
+ "secondary_topics": self._llm_secondary(topic, question, solution),
+ "confidence": 0.92,
+ "signals_used": ["llm"],
+ "keyword_scores": {topic: 0.92},
+ }
+ except Exception as exc:
+ logger.debug("LLM classifier error: %s — using regex fallback.", exc)
+
+ # Regex fallback (inherited from QuestionClassifier)
+ self._stats["fallback_hits"] += 1
+ return super().classify_topic(question, solution)
+
+ @torch.no_grad()
+ def _llm_infer_topic(self, question: str) -> Optional[str]:
+ """
+ Generate a topic prediction using the policy model (greedy, 8 tokens).
+
+ Returns a canonical topic string, or None if the output can't be parsed.
+ """
+ prompt_text = _USER_TEMPLATE.format(
+ topics=_TOPIC_CSV,
+ problem=(question or "")[:400], # truncate very long problems
+ )
+ messages = [
+ {"role": "system", "content": _SYSTEM_PROMPT},
+ {"role": "user", "content": prompt_text},
+ ]
+ input_text = self._tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ enc = self._tokenizer(
+ input_text,
+ return_tensors="pt",
+ truncation=True,
+ max_length=512,
+ ).to(self._device)
+ prompt_len = enc["input_ids"].shape[1]
+
+ out = self._model.generate(
+ **enc,
+ max_new_tokens=12,
+ do_sample=False,
+ temperature=1.0,
+ pad_token_id=self._tokenizer.eos_token_id,
+ eos_token_id=self._tokenizer.eos_token_id,
+ )
+
+ new_ids = out[0][prompt_len:]
+ raw = self._tokenizer.decode(new_ids, skip_special_tokens=True)
+ return _parse_topic(raw)
+
+ def _llm_secondary(
+ self,
+ primary: str,
+ question: str,
+ solution: Optional[str],
+ ) -> List[str]:
+ """
+ Cheap secondary topics via regex (not worth a second LLM call).
+ Re-uses the parent's keyword_scores to find runner-up topics.
+ """
+ text = (question or "").lower()
+ kw_scores = {
+ t: self._keyword_score(text, words)
+ for t, words in __import__(
+ "src.rl.question_classifier", fromlist=["TOPIC_KEYWORDS"]
+ ).TOPIC_KEYWORDS.items()
+ }
+ secondary = [
+ t for t, sc in sorted(kw_scores.items(), key=lambda x: x[1], reverse=True)
+ if t != primary and sc >= 0.2
+ ][:3]
+ return secondary
+
+ # ------------------------------------------------------------------
+ # Diagnostics
+ # ------------------------------------------------------------------
+
+ def get_stats(self) -> Dict[str, int]:
+ return dict(self._stats)
+
+ def log_stats(self) -> None:
+ total = sum(self._stats.values())
+ if total == 0:
+ return
+ logger.info(
+ "LLMClassifier cache=%.0f%% llm=%.0f%% fallback=%.0f%% (cache_size=%d/%d)",
+ 100 * self._stats["cache_hits"] / total,
+ 100 * self._stats["llm_hits"] / total,
+ 100 * self._stats["fallback_hits"] / total,
+ len(self._cache),
+ self._cache_size,
+ )
diff --git a/src/rl/math_environment_curriculum.py b/src/rl/math_environment_curriculum.py
new file mode 100644
index 0000000000000000000000000000000000000000..35eefe08782047e31d7752adb6d31d27261902da
--- /dev/null
+++ b/src/rl/math_environment_curriculum.py
@@ -0,0 +1,1223 @@
+"""
+Curriculum-aware math environment with dual reward signals.
+
+This file is deliberately minimal: a single ``collect_rollouts`` method is all
+the training loop needs. Rollouts and PPO updates run in the same process on
+a single GPU — no subprocesses, no RPC, no vLLM colocation.
+"""
+
+from __future__ import annotations
+
+import logging
+import random
+import re
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from sympy import simplify
+from sympy.parsing.sympy_parser import parse_expr
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from src.config.prompts import create_generator_messages, create_solver_messages
+from src.rl.curriculum_manager import CurriculumManager
+from src.rl.expert_panel import SimulatedExpertPanel
+from src.rl.mdp_components import Action, State, Trajectory, Transition
+from src.rl.prm_scorer import ProcessRewardScorer
+from src.rl.quality_filter import QualityFilter
+from src.rl.question_quality_evaluator import QuestionQualityEvaluator
+from src.rl.replay_buffer import GenerationalReplayBuffer
+from src.rl.value_network import ValueHead
+from src.sft.solution_format import extract_final_answer_numeric_str
+from src.sft.sympy_normalize import normalize_for_parse_expr
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TrajectoryMetadata:
+ curriculum_iteration: int
+ target_topic: str
+ target_difficulty: float
+ instruction: str
+ generated_question: str
+ generated_solution: str
+ question_length: int
+ solution_length: int
+ detected_topic: str
+ detected_secondary_topics: List[str]
+ topic_match_score: float
+ estimated_difficulty: float
+ clarity_score: float
+ novelty_scores: Dict[str, float]
+ consensus_achieved: bool
+ consensus_strength: float
+ answer_diversity: int
+ majority_answer: Optional[float]
+ primary_matches_majority: bool
+ sympy_verified: bool
+ steps_total: int
+ steps_verified_ok: int
+ steps_failed: int
+ final_answer_ok: bool
+ question_reward: float
+ solution_reward: float
+ pre_expert_reward: float
+ expert_reward_modifier: float
+ expert_phase: str
+ expert_feedback: str
+ replay_candidate: bool
+ replay_novelty: float
+ replay_added: bool
+ combined_reward: float
+ reward_breakdown: Dict[str, object]
+ topics_in_sweet_spot: List[str]
+ current_focus_topics: List[str]
+ curriculum_state_snapshot: Dict[str, object]
+
+
+class CurriculumMathEnvironment:
+ """Standalone curriculum environment with PRM-based rewards and GRPO training support."""
+
+ def __init__(
+ self,
+ policy_model: AutoModelForCausalLM,
+ value_model: Optional[ValueHead],
+ tokenizer: AutoTokenizer,
+ reference_questions: Optional[List[str]] = None,
+ grounded_qa_pairs: Optional[List[Dict[str, str]]] = None,
+ prm_scorer: Optional[ProcessRewardScorer] = None,
+ curriculum_checkpoint_dir: str = "checkpoints/curriculum",
+ max_question_tokens: int = 200,
+ max_solution_tokens: int = 500,
+ temperature: float = 0.7,
+ top_p: float = 0.9,
+ consensus_temperature: float = 0.7,
+ device: Optional[torch.device] = None,
+ unified_accuracy_calc: Optional[Any] = None,
+ ):
+ # ── Core model attributes (used by generation helpers) ───────────
+ self.policy = policy_model
+ self.value = value_model
+ self.tokenizer = tokenizer
+ self.max_question_tokens = max_question_tokens
+ self.max_solution_tokens = max_solution_tokens
+ self.temperature = temperature
+ self.top_p = top_p
+
+ if device is not None:
+ self.device = torch.device(device)
+ else:
+ try:
+ self.device = next(policy_model.parameters()).device
+ except StopIteration:
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ self.reference_questions = reference_questions or []
+ self.grounded_qa_pairs: List[Dict[str, str]] = [
+ qa for qa in (grounded_qa_pairs or [])
+ if qa.get("question") and qa.get("gold_final")
+ ]
+ self.consensus_temperature = consensus_temperature
+ self.curriculum_manager = CurriculumManager(checkpoint_dir=curriculum_checkpoint_dir)
+ self.curriculum_manager.initialize(bootstrap_questions=self.reference_questions)
+ self.curriculum_manager.load_checkpoint_safe()
+ self.question_evaluator = QuestionQualityEvaluator(
+ reference_questions=self.reference_questions
+ )
+ # PRM is the sole process-quality signal. Passing prm_scorer=None
+ # will cause compute_reward/compute_grounded_reward to raise at
+ # call time — GRPO training always supplies the PRM.
+ self.prm_scorer = prm_scorer
+ # Unified accuracy calculator — activated on Phase 2+ transition.
+ # When use_chain_scoring is True, chain_integrity_score from this
+ # calculator replaces PRM-based process_score in both grounded and
+ # self-play reward paths.
+ self.unified_accuracy_calc: Optional[Any] = unified_accuracy_calc
+ self.use_chain_scoring: bool = False
+ self.expert_panel = SimulatedExpertPanel()
+ self.replay_buffer = GenerationalReplayBuffer(max_size=500)
+ self.quality_filter = QualityFilter(novelty_threshold=0.5)
+ self.last_replay_ratio: float = 0.0
+ self.last_rollout_mix: Dict[str, int] = {
+ "fresh": 0,
+ "replay": 0,
+ "grounded": 0,
+ }
+ # Running counts for the most recent grounded batch, so the training
+ # script can log grounded accuracy per iteration without re-parsing
+ # trajectory metadata.
+ self.last_grounded_stats: Dict[str, float] = {
+ "count": 0,
+ "correct": 0,
+ "accuracy": 0.0,
+ "mean_reward": 0.0,
+ }
+
+ def sample_instruction(self) -> Tuple[str, str, float]:
+ topic, difficulty = self.curriculum_manager.select_topic_and_difficulty()
+ instruction = self.curriculum_manager.generate_instruction(
+ topic=topic, target_difficulty=difficulty
+ )
+ return instruction, topic, difficulty
+
+ def format_solution_prompt(self, question: str) -> str:
+ """Format a question into a chat-templated solver prompt."""
+ messages = create_solver_messages(question)
+ return self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+
+ def format_question_generation_prompt(self, instruction: str) -> str:
+ """Format a curriculum instruction into a chat-templated generator prompt."""
+ messages = create_generator_messages(instruction)
+ return self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+
+ def generate_with_logging(
+ self,
+ initial_prompt: str,
+ max_tokens: int,
+ phase: str,
+ ) -> Tuple[str, List[Transition]]:
+ """
+ Generate text with per-step PPO-grade transition logging.
+
+ Used by the PPO-compatible rollout methods (``collect_rollouts``,
+ ``rollout_trajectory``, ``rollout_grounded_trajectory``). The GRPO
+ training loop uses ``generate_solutions_batched`` instead.
+ """
+ import torch.nn.functional as F # local import to keep top-level clean
+
+ prompt_ids = self.tokenizer.encode(
+ initial_prompt, return_tensors="pt"
+ ).to(self.device)
+ prompt_length = prompt_ids.shape[1]
+ prompt_attn = torch.ones_like(prompt_ids)
+
+ temperature = float(self.temperature)
+ do_sample = temperature > 1e-4
+ eos_id = self.tokenizer.eos_token_id
+ pad_id = self.tokenizer.pad_token_id or eos_id
+
+ gen_kwargs: Dict[str, Any] = dict(
+ input_ids=prompt_ids,
+ attention_mask=prompt_attn,
+ max_new_tokens=max_tokens,
+ do_sample=do_sample,
+ use_cache=True,
+ output_logits=True,
+ return_dict_in_generate=True,
+ pad_token_id=pad_id,
+ eos_token_id=eos_id,
+ )
+ if do_sample:
+ gen_kwargs["temperature"] = max(temperature, 1e-6)
+ gen_kwargs["top_p"] = float(self.top_p)
+
+ with torch.no_grad():
+ gen_out = self.policy.generate(**gen_kwargs)
+
+ full_ids = gen_out.sequences # [1, P + T]
+ T_gen = int(full_ids.shape[1] - prompt_length)
+ if T_gen <= 0:
+ return "", []
+
+ raw_logits = torch.stack([lg[0] for lg in gen_out.logits], dim=0).float()
+ raw_log_probs = F.log_softmax(raw_logits, dim=-1)
+ sampled_tokens = full_ids[0, prompt_length:]
+ chosen_log_probs = raw_log_probs.gather(
+ 1, sampled_tokens.unsqueeze(1)
+ ).squeeze(1)
+ entropies = -(raw_log_probs.exp() * raw_log_probs).sum(dim=-1)
+
+ positions = torch.arange(
+ prompt_length - 1, prompt_length + T_gen - 1, device=self.device
+ )
+ full_attn = torch.ones_like(full_ids)
+ if self.value is not None:
+ values = self.value.values_at_positions(
+ input_ids=full_ids, positions=positions, attention_mask=full_attn
+ )
+ else:
+ values = torch.zeros(T_gen, device=self.device)
+
+ piece_by_piece: List[str] = self.tokenizer.batch_decode(
+ [[tok.item()] for tok in sampled_tokens], skip_special_tokens=False
+ )
+
+ transitions: List[Transition] = []
+ running_text = initial_prompt
+ for t in range(T_gen):
+ state_input_ids = full_ids[0, : prompt_length + t]
+ current_state = State(
+ text=running_text,
+ input_ids=state_input_ids,
+ attention_mask=torch.ones_like(state_input_ids),
+ phase=phase,
+ )
+ action_token = int(sampled_tokens[t].item())
+ action = Action(
+ token_id=action_token,
+ log_prob=float(chosen_log_probs[t].item()),
+ entropy=float(entropies[t].item()),
+ )
+ next_text = running_text + piece_by_piece[t]
+ next_input_ids = full_ids[0, : prompt_length + t + 1]
+ next_state = State(
+ text=next_text,
+ input_ids=next_input_ids,
+ attention_mask=torch.ones_like(next_input_ids),
+ phase=phase,
+ )
+ is_done = eos_id is not None and action_token == eos_id
+ transitions.append(
+ Transition(
+ state=current_state,
+ action=action,
+ reward=0.0,
+ next_state=next_state,
+ value=float(values[t].item()),
+ done=is_done,
+ )
+ )
+ running_text = next_text
+ if is_done:
+ break
+
+ generated_ids = full_ids[0, prompt_length : prompt_length + len(transitions)]
+ generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+ return generated_text, transitions
+
+ def _compute_format_score(self, solution: str) -> float:
+ """
+ Structural format score based purely on text patterns — no SymPy.
+
+ Checks:
+ - Presence of 'Step N:' lines (multi-step structure)
+ - Presence of 'Final Answer:' line (correct termination)
+ - Length: ≥2 step lines scores highest
+
+ Returns a score in [0, 1].
+ """
+ lines = solution.splitlines()
+ step_lines = [l for l in lines if re.match(r"^\s*Step\s+\d+\s*:", l)]
+ has_final = any(re.match(r"^\s*Final Answer\s*:", l, re.IGNORECASE) for l in lines)
+
+ n_steps = len(step_lines)
+ if n_steps >= 2:
+ length_bonus = 1.0
+ elif n_steps == 1:
+ length_bonus = 0.5
+ else:
+ length_bonus = 0.0
+
+ final_ok = 1.0 if has_final else 0.0
+ # 0.7 × step-structure + 0.3 × final-answer presence
+ return max(0.0, min(1.0, 0.7 * length_bonus + 0.3 * final_ok))
+
+ def compute_reward(
+ self,
+ question: str,
+ solution: str,
+ target_topic: str,
+ target_difficulty: float,
+ ) -> Dict[str, object]:
+ # With a PRM scorer plugged in we skip the expensive (and noisy)
+ # TripleVerifier consensus step. PRM gives per-step correctness
+ # against the actual question semantics, which is strictly better
+ # than "do 3 independent samples agree?"
+ if self.prm_scorer is not None:
+ return self._compute_reward_with_prm(
+ question=question,
+ solution=solution,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ )
+
+ raise RuntimeError(
+ "compute_reward called without a PRM scorer. "
+ "CurriculumMathEnvironment requires prm_scorer to be set. "
+ "Pass prm_scorer=ProcessRewardScorer(...) at construction time."
+ )
+
+ def _compute_reward_with_prm(
+ self,
+ question: str,
+ solution: str,
+ target_topic: str,
+ target_difficulty: float,
+ ) -> Dict[str, object]:
+ """
+ Self-play reward using Qwen2.5-Math-PRM as the semantic-correctness
+ signal. PRM gives per-step probabilities that each reasoning step
+ is correct *given the question* — exactly the signal consensus
+ voting was supposed to approximate but couldn't (three samples
+ from the same policy agree on wrong answers).
+
+ Solution reward (PRM path):
+ R_sol = 0.45·prm_final + 0.35·prm_mean + 0.20·lccp
+ R = 0.4·R_q + 0.6·R_sol (then expert-panel modifier)
+
+ * ``prm_final`` (final step score) is the strongest predictor of
+ overall answer correctness.
+ * ``prm_mean`` provides a smooth gradient over all steps.
+ * ``lccp`` (Longest Correct Consecutive Prefix) rewards chain
+ integrity — consecutive correct steps before the first failure.
+ * The 0.4/0.6 Q/Sol split boosts gradient to question-generation
+ without starving the solution-correctness signal.
+ """
+ assert self.prm_scorer is not None, "caller must check self.prm_scorer"
+
+ prm_result = self.prm_scorer.score_solution(
+ question=question, solution=solution
+ )
+ format_score = self._compute_format_score(solution)
+
+ prm_mean = float(prm_result.get("mean_score", 0.0))
+ prm_min = float(prm_result.get("min_score", 0.0))
+ prm_final = float(prm_result.get("final_score", 0.0))
+ prm_num_steps = int(prm_result.get("num_steps", 0))
+ prm_degraded = bool(prm_result.get("degraded", False))
+
+ # If the PRM degraded (empty solution, tokeniser mismatch, truncation),
+ # the output is effectively unparseable. Prior behavior was to fall
+ # back on SymPy+format, but the upstream ``base_combined_score`` also
+ # blends in the question reward — so the policy got a positive signal
+ # for producing a broken solution as long as the *question* looked
+ # fine. We now treat a degraded PRM as a hard zero on the solution
+ # reward; the question reward is gated below so the full combined
+ # score also collapses.
+ if prm_degraded or prm_num_steps == 0:
+ solution_reward = 0.0
+ _sp_lccp = 0.0
+ sol_valid = False
+ _sp_chain_integrity: Optional[float] = None
+ logger.info(
+ "PRM degraded (%s); sol_reward set to 0.0 (format=%.2f).",
+ prm_result.get("degraded_reason", "unknown"),
+ format_score,
+ )
+ else:
+ # LCCP for self-play: same chain-integrity measure as grounded path
+ _sp_step_scores = prm_result.get("step_scores", []) or []
+ if _sp_step_scores:
+ _first_fail = next(
+ (i for i, s in enumerate(_sp_step_scores) if s <= 0.5),
+ len(_sp_step_scores),
+ )
+ _sp_lccp = _first_fail / len(_sp_step_scores)
+ else:
+ _sp_lccp = 0.0
+
+ # Self-play solution: PRM-only reward blending mean, final & chain integrity.
+ # LCCP anchors the grade to *consecutive* correctness, not just bag-of-steps.
+ solution_reward = (
+ 0.45 * prm_final
+ + 0.35 * prm_mean
+ + 0.20 * _sp_lccp
+ )
+ # Phase 2+ chain scoring: replace PRM solution blend with unified
+ # chain integrity + dependency consistency. This also populates the
+ # question_score from the unified calculator so the Q/Sol weighting
+ # below uses chain-verified signals instead of PRM proxies.
+ _sp_chain_integrity = None
+ if self.use_chain_scoring and self.unified_accuracy_calc is not None:
+ try:
+ _sp_report = self.unified_accuracy_calc.compute(
+ solution=solution,
+ gold_answer=None,
+ question=question,
+ topic=target_topic,
+ phase="selfplay",
+ )
+ solution_reward = _sp_report.composite_accuracy
+ _sp_chain_integrity = _sp_report.chain_integrity_score
+ except Exception as _sp_exc:
+ logger.debug("Unified accuracy calc (self-play) failed: %s", _sp_exc)
+ sol_valid = True
+ solution_reward = max(0.0, min(1.0, solution_reward))
+
+ question_result = self.question_evaluator.evaluate(
+ question=question,
+ solution=solution,
+ # Synthesize a "consensus-equivalent" dict so the question
+ # evaluator keeps working unchanged. PRM mean score stands
+ # in for consensus strength since both are correctness proxies.
+ consensus_result={
+ "has_majority": prm_mean >= 0.5,
+ "consensus_strength": prm_mean,
+ "primary_matches_majority": prm_mean >= 0.5,
+ "answer_diversity": 0,
+ "majority_answer": None,
+ "primary_answer": None,
+ },
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ )
+ question_reward = float(question_result["overall_score"])
+
+ # Gate the question-quality bonus on having a parseable solution.
+ # A great-looking question with a broken solution is not progress
+ # toward self-improvement — it's the policy gaming whichever
+ # signal is easier to produce.
+ effective_question_reward = question_reward if sol_valid else 0.0
+
+ # Q/Sol = 0.4/0.6 — see note in compute_reward (non-PRM path).
+ base_combined_score = (
+ 0.4 * effective_question_reward + 0.6 * solution_reward
+ )
+
+ # Format floor: if the solution structure is broken (<0.5 format),
+ # cap the overall reward at 0.3 regardless of how much the PRM
+ # likes the prose. Previously we saw combined=0.83 with
+ # Format=0.30, i.e. the PRM "approved" an output that didn't have
+ # parseable Step/Final Answer lines — pure reward hacking.
+ format_floor_active = format_score < 0.5
+ format_cap = 0.3 if format_floor_active else 1.0
+ base_combined_score = min(base_combined_score, format_cap)
+
+ # Novelty gate: prevent template-copying reward hacking.
+ # If the model just generates "John has X apples..." with different numbers,
+ # n-gram similarity to the reference corpus is high → dataset_novelty is LOW.
+ # We cap the reward to discourage this without penalising genuinely novel questions.
+ # < 0.20: near-copy of a training question (template + new variables) → cap 0.35
+ # > 0.85: completely off-domain (not a real math problem style) → cap 0.55
+ # [0.20, 0.85]: Goldilocks zone → full reward (novelty_cap = 1.0)
+ _dataset_novelty = float(
+ question_result.get("novelty", {}).get("dataset_novelty", 0.5)
+ if isinstance(question_result.get("novelty"), dict)
+ else 0.5
+ )
+ if _dataset_novelty < 0.20:
+ _novelty_cap = 0.35
+ elif _dataset_novelty > 0.85:
+ _novelty_cap = 0.55
+ else:
+ _novelty_cap = 1.0
+ if _novelty_cap < 1.0:
+ base_combined_score = min(base_combined_score, _novelty_cap)
+ logger.debug(
+ "Novelty gate: dataset_novelty=%.2f → cap=%.2f (was %.3f → now %.3f)",
+ _dataset_novelty, _novelty_cap,
+ base_combined_score / _novelty_cap if _novelty_cap > 0 else 0,
+ base_combined_score,
+ )
+
+ expert_adjustment = self.expert_panel.apply_expert_preferences(
+ base_reward=base_combined_score,
+ question_metrics=question_result,
+ solution_metrics={
+ # Only format_compliance still influences shaping — the
+ # PRM/correctness signal lives inside ``solution_reward``
+ # already and must not be double-counted here.
+ "format_compliance": format_score,
+ },
+ iteration=self.curriculum_manager.current_iteration,
+ )
+ combined_score = float(expert_adjustment["adjusted_reward"])
+ # Re-clip after additive shaping + respect the format cap one more
+ # time so the shaping can't lift a badly-formatted solution back
+ # above the cap.
+ combined_score = max(0.0, min(format_cap, combined_score))
+
+ # Curriculum mastery: consider self-play solution "successful" when
+ # both the chain mean AND the final concluding step are above threshold.
+ # Using prm_final as a required condition prevents a solution that gets
+ # most steps right but fails the conclusion from being marked "mastered".
+ solution_success = (
+ (not prm_degraded)
+ and (prm_mean >= 0.65)
+ and (prm_final >= 0.50)
+ )
+ self.curriculum_manager.update_from_trajectory(
+ topic=target_topic,
+ question_reward=question_reward,
+ solution_success=solution_success,
+ combined_reward=combined_score,
+ measured_difficulty=float(question_result["measured_difficulty"]),
+ )
+
+ modifier_val = float(expert_adjustment.get("reward_modifier", 0.0))
+ floor_tag = " FLOOR" if format_floor_active else ""
+ valid_tag = "" if sol_valid else " [SOL_INVALID]"
+ logger.info(
+ "PRM reward%s: combined=%.3f = clip(base=%.3f + mod=%+.3f, cap=%.2f)%s "
+ "| Q=%.2f sol=%.3f novelty=%.2f | "
+ "sol=0.45*prm_final(%.2f)+0.35*prm_mean(%.2f)+0.20*lccp(%.2f) "
+ "| steps=%d",
+ valid_tag,
+ combined_score,
+ base_combined_score,
+ modifier_val,
+ format_cap,
+ floor_tag,
+ effective_question_reward,
+ solution_reward,
+ _dataset_novelty,
+ prm_final,
+ prm_mean,
+ _sp_lccp if sol_valid else 0.0,
+ prm_num_steps,
+ )
+
+ # Shape a consensus-style verification_details dict so downstream
+ # aggregation (which reads these keys) keeps working unchanged.
+ verification_details = {
+ "consensus": {
+ "has_majority": prm_mean >= 0.5,
+ "consensus_strength": prm_mean,
+ "primary_matches_majority": prm_mean >= 0.5,
+ "answer_diversity": 0,
+ "majority_answer": None,
+ "primary_answer": extract_final_answer_numeric_str(solution) or None,
+ "prm_mean_score": prm_mean,
+ "prm_min_score": prm_min,
+ "prm_final_score": prm_final,
+ "prm_step_scores": prm_result.get("step_scores", []),
+ "prm_num_steps": prm_num_steps,
+ "prm_degraded": prm_degraded,
+ },
+ }
+
+ return {
+ "combined_score": combined_score,
+ "base_combined_score": base_combined_score,
+ "effective_question_reward": effective_question_reward, # gated (0 when sol invalid)
+ "question_metrics": question_result,
+ "solution_metrics": {
+ "overall_score": solution_reward,
+ "correctness": prm_mean,
+ "format_compliance": format_score,
+ "efficiency": prm_mean, # legacy slot
+ "consensus_score": prm_mean, # legacy slot
+ "prm_mean_score": prm_mean,
+ "prm_min_score": prm_min,
+ "prm_final_score": prm_final,
+ "prm_step_scores": prm_result.get("step_scores", []),
+ "prm_num_steps": prm_num_steps,
+ "prm_degraded": prm_degraded,
+ "verification_details": verification_details,
+ },
+ "curriculum_metrics": {
+ "target_topic": target_topic,
+ "target_difficulty": target_difficulty,
+ "detected_topic": question_result["detected_topic"],
+ "measured_difficulty": question_result["measured_difficulty"],
+ },
+ "expert_metrics": expert_adjustment,
+ # Chain scoring metrics (Phase 2+; None when use_chain_scoring=False)
+ "sp_chain_integrity_score": _sp_chain_integrity,
+ }
+
+ # ------------------------------------------------------------------
+ # Grounded (GSM8K-anchored) rollouts
+ # ------------------------------------------------------------------
+ #
+ # Why this exists: self-play rewards are dominated by consensus voting
+ # between 3 same-model samples, which correlates poorly with GSM8K
+ # accuracy (all three samples can be wrong in the same way). For the
+ # grounded path we solve a known GSM8K problem and score the solution
+ # directly against the gold final answer, which is the only signal
+ # guaranteed to move the benchmark we actually evaluate on.
+ #
+ # The reward: R = 0.50·gt_match + 0.40·process(PRM) + 0.10·format
+ #
+ # * gt_match = 1.0 iff the model's Final Answer is mathematically
+ # equivalent to the GSM8K gold final (via sympy.simplify on the
+ # extracted numeric string).
+ # * process = 0.60·prm_final + 0.40·prm_mean (PRM step-level quality)
+ # * format rewards Step N: lines and a Final Answer: line.
+ #
+ # No TripleVerifier call on this path — ground truth obviates consensus.
+
+ @staticmethod
+ def _norm_expr_for_match(s: str) -> str:
+ s = (s or "").strip()
+ s = s.replace("^", "**")
+ s = re.sub(r"[,$€£\s]+", "", s)
+ return s
+
+ @classmethod
+ def _answers_equivalent(cls, pred: str, gold: str) -> bool:
+ """Return True iff ``pred`` and ``gold`` parse to the same number."""
+ if not pred or not gold:
+ return False
+ p = cls._norm_expr_for_match(pred)
+ g = cls._norm_expr_for_match(gold)
+ if p == g:
+ return True
+ try:
+ diff = simplify(
+ parse_expr(normalize_for_parse_expr(p))
+ - parse_expr(normalize_for_parse_expr(g))
+ )
+ return bool(diff == 0)
+ except Exception:
+ return False
+
+ def compute_grounded_reward(
+ self,
+ question: str,
+ solution: str,
+ gold_final: str,
+ ) -> Dict[str, object]:
+ """
+ Compute a ground-truth-anchored reward for a solution to a known
+ GSM8K problem. No TripleVerifier call — the gold final answer
+ replaces consensus voting as the semantic check.
+ """
+ format_score = self._compute_format_score(solution)
+
+ pred_final = extract_final_answer_numeric_str(solution) or ""
+ gt_match_bool = self._answers_equivalent(pred_final, gold_final)
+ if gt_match_bool:
+ gt_match = 1.0
+ else:
+ # Soft numeric proximity: reward near-misses rather than cliffing at 0.
+ # Gives partial credit proportional to how close the numeric answer is.
+ # Capped at 0.85 so an exact match (1.0) is always strictly better.
+ # Non-numeric wrong answers still get 0.0.
+ try:
+ _p = float(pred_final.replace(",", "").strip())
+ _g = float(gold_final.replace(",", "").strip())
+ _denom = max(abs(_g), 1.0)
+ gt_match = min(0.85, 1.0 / (1.0 + 2.0 * abs(_p - _g) / _denom))
+ except (ValueError, TypeError, AttributeError):
+ gt_match = 0.0
+
+ # Optional PRM step-level quality on grounded rollouts.
+ # prm_final (last step score) is the strongest single predictor of
+ # answer correctness. step_accuracy = fraction of steps the PRM
+ # considers correct — the direct measure of reasoning process quality.
+ prm_mean = 0.0
+ prm_final = 0.0
+ prm_step_scores: List[float] = []
+ prm_num_steps = 0
+ prm_degraded = True
+ if self.prm_scorer is not None:
+ prm_result = self.prm_scorer.score_solution(
+ question=question, solution=solution
+ )
+ prm_degraded = bool(prm_result.get("degraded", False))
+ if not prm_degraded:
+ prm_mean = float(prm_result.get("mean_score", 0.0))
+ prm_final = float(prm_result.get("final_score", 0.0))
+ prm_step_scores = list(prm_result.get("step_scores", []))
+ prm_num_steps = int(prm_result.get("num_steps", 0))
+
+ # Step accuracy: fraction of individual steps rated correct by PRM.
+ step_accuracy = (
+ sum(1.0 for s in prm_step_scores if s > 0.5) / len(prm_step_scores)
+ if prm_step_scores else 0.0
+ )
+
+ # Longest Correct Consecutive Prefix (LCCP): fraction of steps from
+ # the start that are ALL rated correct before the first failure.
+ # This captures chain integrity — a broken step 3 makes steps 4+ invalid
+ # regardless of their individual PRM scores.
+ # LCCP=1.0 means every step was correct (necessary condition for right answer).
+ # LCCP=0.0 means step 1 itself was wrong (model never had a valid chain).
+ if prm_step_scores:
+ first_fail = next(
+ (i for i, s in enumerate(prm_step_scores) if s <= 0.5), len(prm_step_scores)
+ )
+ lccp = first_fail / len(prm_step_scores)
+ else:
+ lccp = 0.0
+
+ if self.prm_scorer is not None and not prm_degraded:
+ # process_score: weight prm_final (conclusion step) more than mean
+ # — the final step is the most critical and most predictive.
+ process_score = 0.60 * prm_final + 0.40 * prm_mean
+ combined = (
+ 0.50 * gt_match
+ + 0.40 * process_score
+ + 0.10 * format_score
+ )
+ _gt_tag = "exact" if gt_match_bool else f"prox={gt_match:.2f}"
+ components_str = (
+ f"0.50×{gt_match:.2f}({_gt_tag}) + 0.40×proc({process_score:.3f}"
+ f"[fin={prm_final:.2f},mean={prm_mean:.2f}]) + "
+ f"0.10×fmt({format_score:.3f})"
+ )
+ else:
+ combined = 0.85 * gt_match + 0.15 * format_score
+ components_str = (
+ f"0.85×{gt_match:.2f} + 0.15×fmt({format_score:.3f})"
+ )
+
+ # Phase 2+ chain scoring: override process_score, step_accuracy, lccp,
+ # and combined with formally-verified chain integrity metrics.
+ # PRM is still called above so its scores remain logged for comparison.
+ _chain_report = None
+ if self.use_chain_scoring and self.unified_accuracy_calc is not None:
+ try:
+ _chain_report = self.unified_accuracy_calc.compute(
+ solution=solution,
+ gold_answer=gold_final,
+ topic="grounded",
+ phase="grounded",
+ )
+ process_score = _chain_report.chain_integrity_score
+ step_accuracy = _chain_report.step_arithmetic_score
+ lccp = _chain_report.lccp_score
+ combined = max(0.0, min(1.0,
+ 0.50 * gt_match + 0.30 * process_score + 0.20 * lccp
+ ))
+ components_str = (
+ f"0.50×{gt_match:.2f} + 0.30×chain({process_score:.3f}"
+ f"[arith={_chain_report.step_arithmetic_score:.2f},"
+ f"dep={_chain_report.step_dependency_score:.2f}]) + "
+ f"0.20×lccp({lccp:.3f})"
+ )
+ except Exception as _chain_exc:
+ logger.debug("Unified accuracy calc failed, keeping PRM scores: %s", _chain_exc)
+ else:
+ combined = max(0.0, min(1.0, combined))
+
+ # Hard negative mining: wrong-answer solutions still get a partial signal
+ # proportional to how far they got before the first error (LCCP).
+ # This prevents gradient starvation on hard problems where no solution in
+ # the group is fully correct — the model still learns "longer correct prefix
+ # is better" rather than receiving zero reward for all K samples.
+ if gt_match < 0.5 and lccp > 0.0 and self.prm_scorer is not None:
+ # Bonus = 0.15 × LCCP, capped so that a wrong answer (combined ≈ 0.40)
+ # can never exceed 0.55 — always well below a correct answer (≈ 0.90+).
+ _hnm_bonus = 0.15 * lccp
+ combined = min(combined + _hnm_bonus, 0.55)
+
+ _chain_depth = first_fail if prm_step_scores else 0
+ logger.info(
+ "Grounded reward: combined=%.3f = %s | pred=%r gold=%r | "
+ "step_acc=%.0f%% lccp=%.0f%% (chain=%d/%d ok_count=%d) n_steps=%d",
+ combined,
+ components_str,
+ pred_final,
+ gold_final,
+ 100 * step_accuracy,
+ 100 * lccp,
+ _chain_depth,
+ len(prm_step_scores),
+ sum(1 for s in prm_step_scores if s > 0.5),
+ prm_num_steps,
+ )
+
+ return {
+ "combined_score": combined,
+ "gt_match": gt_match_bool,
+ # process metrics
+ "step_accuracy": step_accuracy,
+ "lccp": lccp, # longest correct consecutive prefix ratio
+ "prm_mean_score": prm_mean,
+ "prm_final_score": prm_final,
+ "prm_step_scores": prm_step_scores,
+ "prm_num_steps": prm_num_steps,
+ "prm_degraded": prm_degraded,
+ # format / answer
+ "format_score": format_score,
+ "pred_final": pred_final,
+ "gold_final": gold_final,
+ # chain scoring metrics (populated in Phase 2+, None otherwise)
+ "chain_arith_score": _chain_report.step_arithmetic_score if _chain_report else None,
+ "chain_dep_score": _chain_report.step_dependency_score if _chain_report else None,
+ "chain_integrity_score": _chain_report.chain_integrity_score if _chain_report else None,
+ "first_failure_step": _chain_report.first_failure_step if _chain_report else None,
+ "final_consistent": _chain_report.final_answer_consistent if _chain_report else None,
+ }
+
+ def rollout_grounded_trajectory(self, qa_pair: Dict[str, str]) -> Trajectory:
+ """
+ Run a rollout on a known GSM8K (question, gold_final) pair.
+
+ The policy generates a solution to the real question; reward is
+ dominated by whether the model's final number matches the gold
+ final (ground-truth-anchored).
+ """
+ question = str(qa_pair["question"]).strip()
+ gold_final = str(qa_pair["gold_final"]).strip()
+
+ solution_prompt = self.format_solution_prompt(question)
+ generated_solution, solution_transitions = self.generate_with_logging(
+ initial_prompt=solution_prompt,
+ max_tokens=self.max_solution_tokens,
+ phase="grounded_solution",
+ )
+
+ reward_result = self.compute_grounded_reward(
+ question=question,
+ solution=generated_solution,
+ gold_final=gold_final,
+ )
+
+ terminal_reward = float(reward_result["combined_score"])
+ trajectory = Trajectory()
+ for idx, transition in enumerate(solution_transitions):
+ transition.reward = (
+ terminal_reward if idx == len(solution_transitions) - 1 else 0.0
+ )
+ trajectory.add(transition)
+
+ metadata = {
+ "rollout_source": "grounded",
+ "curriculum_iteration": self.curriculum_manager.current_iteration,
+ "target_topic": "grounded_gsm8k",
+ "target_difficulty": 0.5,
+ "instruction": "",
+ "generated_question": question,
+ "generated_solution": generated_solution,
+ "question_length": 0,
+ "solution_length": len(solution_transitions),
+ "detected_topic": "grounded_gsm8k",
+ "detected_secondary_topics": [],
+ "topic_match_score": 1.0,
+ "estimated_difficulty": 0.5,
+ "clarity_score": 1.0,
+ "novelty_scores": {"combined": 0.0},
+ "consensus_achieved": bool(reward_result["gt_match"]),
+ "consensus_strength": 1.0 if reward_result["gt_match"] else 0.0,
+ "answer_diversity": 0,
+ "majority_answer": None,
+ "primary_matches_majority": bool(reward_result["gt_match"]),
+ "question_reward": 0.0,
+ "solution_reward": terminal_reward,
+ "pre_expert_reward": terminal_reward,
+ "expert_reward_modifier": 0.0,
+ "expert_phase": "grounded",
+ "expert_feedback": "ground-truth anchored",
+ "replay_candidate": False,
+ "replay_novelty": 0.0,
+ "replay_added": False,
+ "combined_reward": terminal_reward,
+ "reward_breakdown": {
+ "grounded": True,
+ "gt_match": bool(reward_result["gt_match"]),
+ "format_score": float(reward_result["format_score"]),
+ "pred_final": reward_result["pred_final"],
+ "gold_final": reward_result["gold_final"],
+ "prm_mean_score": float(reward_result.get("prm_mean_score", 0.0)),
+ "prm_num_steps": int(reward_result.get("prm_num_steps", 0)),
+ "prm_step_scores": list(reward_result.get("prm_step_scores", [])),
+ "prm_degraded": bool(reward_result.get("prm_degraded", True)),
+ },
+ "topics_in_sweet_spot": self.curriculum_manager.get_sweet_spot_topics(),
+ "current_focus_topics": self.curriculum_manager.get_current_focus(),
+ "curriculum_state_snapshot": self.curriculum_manager.get_curriculum_stats(),
+ "grounded_gt_match": bool(reward_result["gt_match"]),
+ "grounded_pred_final": reward_result["pred_final"],
+ "grounded_gold_final": reward_result["gold_final"],
+ }
+ trajectory.metadata = metadata
+ return trajectory
+
+ def rollout_trajectory(self) -> Trajectory:
+ instruction, target_topic, target_difficulty = self.sample_instruction()
+ question_prompt = self.format_question_generation_prompt(instruction)
+ generated_question, question_transitions = self.generate_with_logging(
+ initial_prompt=question_prompt,
+ max_tokens=self.max_question_tokens,
+ phase="question_generation",
+ )
+ return self._build_trajectory_from_question(
+ instruction=instruction,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ generated_question=generated_question,
+ question_transitions=question_transitions,
+ )
+
+ def _build_trajectory_from_question(
+ self,
+ instruction: str,
+ target_topic: str,
+ target_difficulty: float,
+ generated_question: str,
+ question_transitions: Optional[List] = None,
+ ) -> Trajectory:
+ trajectory = Trajectory()
+ question_transitions = question_transitions or []
+
+ solution_prompt = self.format_solution_prompt(generated_question)
+ generated_solution, solution_transitions = self.generate_with_logging(
+ initial_prompt=solution_prompt,
+ max_tokens=self.max_solution_tokens,
+ phase="solution",
+ )
+
+ reward_result = self.compute_reward(
+ question=generated_question,
+ solution=generated_solution,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ )
+
+ terminal_reward = float(reward_result["combined_score"])
+ all_transitions = question_transitions + solution_transitions
+ # Terminal-only reward — gae_lambda=1.0 makes A_t = R - V(s_t) for all t.
+ for idx, transition in enumerate(all_transitions):
+ transition.reward = (
+ terminal_reward if idx == len(all_transitions) - 1 else 0.0
+ )
+ trajectory.add(transition)
+
+ verification = reward_result["solution_metrics"]["verification_details"]
+ consensus = verification["consensus"]
+ question_metrics = reward_result["question_metrics"]
+
+ metadata = TrajectoryMetadata(
+ curriculum_iteration=self.curriculum_manager.current_iteration,
+ target_topic=target_topic,
+ target_difficulty=target_difficulty,
+ instruction=instruction,
+ generated_question=generated_question,
+ generated_solution=generated_solution,
+ question_length=len(question_transitions),
+ solution_length=len(solution_transitions),
+ detected_topic=str(question_metrics["detected_topic"]["primary_topic"]),
+ detected_secondary_topics=[
+ str(x) for x in question_metrics["detected_topic"]["secondary_topics"]
+ ],
+ topic_match_score=float(question_metrics["topic_match"]),
+ estimated_difficulty=float(question_metrics["measured_difficulty"]),
+ clarity_score=float(question_metrics["clarity"]),
+ novelty_scores=dict(question_metrics["novelty"]),
+ consensus_achieved=bool(consensus["has_majority"]),
+ consensus_strength=float(consensus["consensus_strength"]),
+ answer_diversity=int(consensus["answer_diversity"]),
+ majority_answer=consensus.get("majority_answer"),
+ primary_matches_majority=bool(consensus["primary_matches_majority"]),
+ sympy_verified=True,
+ steps_total=int(consensus.get("prm_num_steps", 0)),
+ steps_verified_ok=int(consensus.get("prm_num_steps", 0)),
+ steps_failed=0,
+ final_answer_ok=bool(consensus.get("primary_matches_majority", False)),
+ question_reward=float(question_metrics["overall_score"]),
+ solution_reward=float(reward_result["solution_metrics"]["overall_score"]),
+ pre_expert_reward=float(reward_result["base_combined_score"]),
+ expert_reward_modifier=float(
+ reward_result["expert_metrics"]["reward_modifier"]
+ ),
+ expert_phase=str(reward_result["expert_metrics"]["phase"]),
+ expert_feedback=str(reward_result["expert_metrics"]["feedback"]),
+ replay_candidate=False,
+ replay_novelty=0.0,
+ replay_added=False,
+ combined_reward=terminal_reward,
+ reward_breakdown=reward_result,
+ topics_in_sweet_spot=self.curriculum_manager.get_sweet_spot_topics(),
+ current_focus_topics=self.curriculum_manager.get_current_focus(),
+ curriculum_state_snapshot=self.curriculum_manager.get_curriculum_stats(),
+ )
+ metadata_dict = asdict(metadata)
+ trajectory.metadata = metadata_dict
+
+ # Replay admission: requires trajectory.metadata to already exist
+ # because check_novelty reads metadata["generated_question"].
+ is_candidate, reason = self.quality_filter.meets_replay_criteria(metadata_dict)
+ metadata_dict["replay_candidate"] = is_candidate
+ if is_candidate:
+ novelty_score = self.quality_filter.check_novelty(
+ trajectory, self.replay_buffer.buffer
+ )
+ metadata_dict["replay_novelty"] = float(novelty_score)
+ if self.quality_filter.is_novel_enough(novelty_score):
+ quality_score = self.quality_filter.compute_quality_score(metadata_dict)
+ self.replay_buffer.add_trajectory(
+ trajectory=trajectory,
+ metadata=metadata_dict,
+ iteration=self.curriculum_manager.current_iteration,
+ quality_score=quality_score,
+ )
+ metadata_dict["replay_added"] = True
+ else:
+ metadata_dict["replay_added"] = False
+ else:
+ metadata_dict["replay_added"] = False
+ metadata_dict["replay_reject_reason"] = reason
+
+ trajectory.metadata = metadata_dict
+ return trajectory
+
+ def _get_adaptive_replay_ratio(self) -> float:
+ iteration = self.curriculum_manager.current_iteration
+ if iteration < 3:
+ return 0.0
+ if iteration < 5:
+ return 0.15
+
+ buffer_stats = self.replay_buffer.get_buffer_stats(current_iteration=iteration)
+ buffer_health = float(buffer_stats.get("buffer_health", 0.0))
+ if buffer_health >= 0.75:
+ return 0.3
+ if buffer_health >= 0.6:
+ return 0.25
+ return 0.2
+
+ def collect_rollouts(
+ self,
+ num_trajectories: int,
+ verbose: bool = True,
+ grounded_ratio: float = 0.0,
+ ) -> List[Trajectory]:
+ """
+ Generate ``num_trajectories`` episodes in-process on the current
+ device.
+
+ Mix:
+ * ``grounded_ratio`` of rollouts are GSM8K-anchored (real question,
+ reward scored against gold final answer). These give the policy
+ a clean gradient toward benchmark correctness and are also ~3x
+ faster than self-play rollouts (no TripleVerifier call).
+ * an adaptive fraction is drawn from the replay buffer when buffer
+ health is good (self-play only).
+ * the remainder are fresh self-play rollouts.
+ """
+ if num_trajectories <= 0:
+ return []
+
+ # Defensive .eval() on both policy and value before any generation.
+ # The first iteration runs rollouts right after model load (HF default
+ # is .train()). Qwen2.5 has zero dropout so this is currently cosmetic,
+ # but cheap insurance against any future model swap with stochastic layers.
+ if self.policy is not None:
+ self.policy.eval()
+ if self.value is not None:
+ self.value.eval()
+
+ # Grounded rollouts: only if we actually have QA pairs loaded.
+ if grounded_ratio > 0.0 and self.grounded_qa_pairs:
+ num_grounded = int(round(num_trajectories * grounded_ratio))
+ num_grounded = min(num_grounded, num_trajectories)
+ else:
+ num_grounded = 0
+ num_selfplay = num_trajectories - num_grounded
+
+ # Within the self-play half, the existing replay-buffer mix applies.
+ replay_ratio = self._get_adaptive_replay_ratio()
+ num_replay = int(num_selfplay * replay_ratio)
+ num_replay = min(num_replay, len(self.replay_buffer))
+ num_fresh = max(0, num_selfplay - num_replay)
+
+ # ---- Grounded rollouts (GSM8K-anchored) --------------------------
+ grounded_trajectories: List[Trajectory] = []
+ grounded_correct = 0
+ grounded_reward_sum = 0.0
+ if num_grounded > 0:
+ qa_sample = random.sample(
+ self.grounded_qa_pairs,
+ k=min(num_grounded, len(self.grounded_qa_pairs)),
+ )
+ # If we asked for more grounded rollouts than we have distinct
+ # pairs, pad by re-sampling with replacement.
+ while len(qa_sample) < num_grounded:
+ qa_sample.append(random.choice(self.grounded_qa_pairs))
+ pbar = tqdm(
+ qa_sample,
+ desc="Grounded rollouts",
+ unit="ep",
+ dynamic_ncols=True,
+ leave=False,
+ disable=not verbose,
+ )
+ for qa in pbar:
+ trajectory = self.rollout_grounded_trajectory(qa)
+ grounded_trajectories.append(trajectory)
+ r = float(trajectory.metadata.get("combined_reward", 0.0))
+ grounded_reward_sum += r
+ if bool(trajectory.metadata.get("grounded_gt_match", False)):
+ grounded_correct += 1
+ done = len(grounded_trajectories)
+ pbar.set_postfix(
+ acc=f"{grounded_correct / done:.1%}",
+ reward=f"{grounded_reward_sum / done:+.3f}",
+ refresh=False,
+ )
+
+ # ---- Fresh self-play rollouts ------------------------------------
+ fresh_trajectories: List[Trajectory] = []
+ pbar = tqdm(
+ range(num_fresh),
+ desc="Self-play rollouts",
+ unit="ep",
+ dynamic_ncols=True,
+ leave=False,
+ disable=not verbose,
+ )
+ running_reward = 0.0
+ running_ok = 0
+ for _ in pbar:
+ trajectory = self.rollout_trajectory()
+ trajectory.metadata["rollout_source"] = "fresh"
+ fresh_trajectories.append(trajectory)
+
+ running_reward += float(trajectory.metadata.get("combined_reward", 0.0))
+ if trajectory.metadata.get("final_answer_ok", False):
+ running_ok += 1
+ done = len(fresh_trajectories)
+ pbar.set_postfix(
+ reward=f"{running_reward / done:+.3f}",
+ ok=f"{running_ok}/{done}",
+ refresh=False,
+ )
+
+ # ---- Replay buffer draws -----------------------------------------
+ replay_trajectories = self.replay_buffer.sample_replay_batch(
+ num_replay, diversity_sample=True
+ )
+ for trajectory in replay_trajectories:
+ trajectory.metadata["rollout_source"] = "replay"
+
+ trajectories = (
+ grounded_trajectories + fresh_trajectories + replay_trajectories
+ )
+ random.shuffle(trajectories)
+
+ self.last_replay_ratio = replay_ratio
+ self.last_rollout_mix = {
+ "fresh": len(fresh_trajectories),
+ "replay": len(replay_trajectories),
+ "grounded": len(grounded_trajectories),
+ }
+ grounded_count = len(grounded_trajectories)
+ self.last_grounded_stats = {
+ "count": grounded_count,
+ "correct": grounded_correct,
+ "accuracy": (
+ grounded_correct / grounded_count if grounded_count > 0 else 0.0
+ ),
+ "mean_reward": (
+ grounded_reward_sum / grounded_count if grounded_count > 0 else 0.0
+ ),
+ }
+
+ if verbose:
+ buffer_stats = self.replay_buffer.get_buffer_stats(
+ current_iteration=self.curriculum_manager.current_iteration
+ )
+ logger.info(
+ "Rollout mix: %d grounded + %d fresh + %d replay "
+ "(grounded_ratio=%.2f, replay_ratio=%.2f, buffer_size=%d, health=%.3f)",
+ len(grounded_trajectories),
+ len(fresh_trajectories),
+ len(replay_trajectories),
+ grounded_ratio,
+ replay_ratio,
+ len(self.replay_buffer),
+ float(buffer_stats.get("buffer_health", 0.0)),
+ )
+ if grounded_count > 0:
+ logger.info(
+ "Grounded accuracy this iter: %d/%d = %.1f%% (mean reward %.3f)",
+ grounded_correct,
+ grounded_count,
+ 100.0 * grounded_correct / grounded_count,
+ grounded_reward_sum / grounded_count,
+ )
+
+ self.curriculum_manager.increment_iteration()
+ self.curriculum_manager.save_state(
+ iteration=self.curriculum_manager.current_iteration, rollout=None
+ )
+ return trajectories
diff --git a/src/rl/mdp_components.py b/src/rl/mdp_components.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecae98c9911f4ba39248c09a912594da25804130
--- /dev/null
+++ b/src/rl/mdp_components.py
@@ -0,0 +1,137 @@
+"""
+MDP data structures for PPO self-improvement loop.
+
+Defines the core components of the Markov Decision Process:
+ - State : text sequence at time t
+ - Action : token sampled from π_θ(·|s_t)
+ - Transition : (s_t, a_t, r_t, s_{t+1}, V(s_t), done)
+ - Trajectory : full episode τ = (s_0, a_0, r_0, …, s_T)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List
+
+import torch
+
+
+@dataclass
+class State:
+ """
+ Represents s_t = context token sequence at generation step t.
+
+ Attributes:
+ text : Decoded string (includes prompt).
+ input_ids : 1-D token-id tensor [seq_len].
+ attention_mask: 1-D mask tensor [seq_len].
+ phase : "question_generation" | "solution".
+ """
+
+ text: str
+ input_ids: torch.Tensor
+ attention_mask: torch.Tensor
+ phase: str
+
+
+@dataclass
+class Action:
+ """
+ Represents a_t = single token selected at step t.
+
+ Attributes:
+ token_id : Vocabulary index of the chosen token.
+ log_prob : log π_θ(a_t | s_t) (used for importance ratio).
+ entropy : H(π(·|s_t)) (used for entropy bonus).
+ """
+
+ token_id: int
+ log_prob: float
+ entropy: float
+
+
+@dataclass
+class Transition:
+ """
+ Single step in the MDP: (s_t, a_t, r_t, s_{t+1}, V(s_t), done).
+
+ Attributes:
+ state : s_t
+ action : a_t
+ reward : r_t (0.0 for non-terminal; sparse reward at episode end)
+ next_state: s_{t+1}
+ value : V(s_t) from critic at step t
+ done : Whether this is the terminal transition
+ """
+
+ state: State
+ action: Action
+ reward: float
+ next_state: State
+ value: float
+ done: bool
+
+
+class Trajectory:
+ """
+ Complete episode τ = (s_0, a_0, r_0, …, s_T).
+
+ Provides helpers for reward summation and iteration.
+ """
+
+ def __init__(self) -> None:
+ self.transitions: List[Transition] = []
+ self.metadata: Dict[str, Any] = {}
+
+ # ------------------------------------------------------------------
+ # Mutation
+ # ------------------------------------------------------------------
+
+ def add(self, transition: Transition) -> None:
+ """Append a transition to the episode."""
+ self.transitions.append(transition)
+
+ # ------------------------------------------------------------------
+ # Properties
+ # ------------------------------------------------------------------
+
+ @property
+ def total_reward(self) -> float:
+ """Sum of all rewards in the episode R(τ) = Σ r_t."""
+ return sum(t.reward for t in self.transitions)
+
+ @property
+ def rewards(self) -> List[float]:
+ return [t.reward for t in self.transitions]
+
+ @property
+ def values(self) -> List[float]:
+ return [t.value for t in self.transitions]
+
+ @property
+ def log_probs(self) -> List[float]:
+ return [t.action.log_prob for t in self.transitions]
+
+ @property
+ def entropies(self) -> List[float]:
+ return [t.action.entropy for t in self.transitions]
+
+ @property
+ def dones(self) -> List[bool]:
+ return [t.done for t in self.transitions]
+
+ # ------------------------------------------------------------------
+ # Dunder helpers
+ # ------------------------------------------------------------------
+
+ def __len__(self) -> int:
+ return len(self.transitions)
+
+ def __iter__(self) -> Iterator[Transition]:
+ return iter(self.transitions)
+
+ def __repr__(self) -> str: # pragma: no cover
+ return (
+ f"Trajectory(len={len(self)}, "
+ f"total_reward={self.total_reward:.3f})"
+ )
diff --git a/src/rl/prm_scorer.py b/src/rl/prm_scorer.py
new file mode 100644
index 0000000000000000000000000000000000000000..022ac34d5f7150ec9f68d4b698b0ce3c63732708
--- /dev/null
+++ b/src/rl/prm_scorer.py
@@ -0,0 +1,289 @@
+"""
+Process Reward Model (PRM) scorer for step-level correctness.
+
+Uses Qwen/Qwen2.5-Math-PRM-7B — a purpose-built process reward model that
+assigns each reasoning step a probability of being correct. This replaces
+the "consensus voting across three samples from the same policy" signal,
+which was groupthink (three samples agree because they share the same
+failure mode) and therefore uncorrelated with GSM8K accuracy.
+
+How PRM scoring works
+---------------------
+* The input is ``question`` + an assistant response where each reasoning
+ step is separated by the special token ```` (also appended
+ after the final step).
+* The model runs a single forward pass and emits a classification logit
+ (``[negative, positive]``) at every ```` position.
+* ``softmax`` → the positive-class probability is the per-step reward in
+ ``[0, 1]``.
+
+Training integration
+--------------------
+Loaded once at startup alongside the policy. Scored during rollout
+``compute_reward`` calls (no gradient flow). Quantise to 4-bit via
+``bitsandbytes`` to keep VRAM under ~5 GB so there is ample headroom for
+policy training on a single 80 GB A100.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+
+from src.sft.solution_format import _step_bodies, extract_final_answer_numeric_str
+from src.utils.attn_backend import select_attn_implementation
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_SYSTEM_PROMPT = (
+ "Please reason step by step, and put your final answer within \\boxed{}."
+)
+# Qwen PRM's step separator token. Hard-coded by the model; do not change.
+STEP_SEP_TOKEN = ""
+
+
+def extract_prm_steps(solution: str) -> List[str]:
+ """
+ Split a Qwen-style ``Step N:`` solution into the text fragments the PRM
+ expects — one element per reasoning step, with the final-answer line
+ appended as a closing step so it gets its own correctness score.
+
+ The ``Step N:`` prefix is stripped so we feed plain reasoning text
+ (matches PRM's training distribution, which was Qwen-Math-Instruct
+ paragraph-style outputs).
+ """
+ bodies = _step_bodies(solution)
+ steps: List[str] = [b.strip() for b in bodies if b.strip()]
+ final_raw = extract_final_answer_numeric_str(solution)
+ if final_raw:
+ steps.append(f"The answer is \\boxed{{{final_raw.strip()}}}")
+ return steps
+
+
+class ProcessRewardScorer:
+ """
+ Qwen2.5-Math-PRM-7B scorer. Memory-efficient: the model is held in
+ inference mode on the training device and runs in ``torch.no_grad``.
+ """
+
+ def __init__(
+ self,
+ model_name: str = "Qwen/Qwen2.5-Math-PRM-7B",
+ device: Optional[torch.device] = None,
+ load_in_4bit: bool = True,
+ dtype: torch.dtype = torch.bfloat16,
+ max_input_tokens: int = 4096,
+ ):
+ self.model_name = model_name
+ self.device = device or torch.device(
+ "cuda" if torch.cuda.is_available() else "cpu"
+ )
+ self.max_input_tokens = max_input_tokens
+
+ logger.info(
+ "Loading PRM %s (4-bit=%s, dtype=%s) on %s …",
+ model_name, load_in_4bit, dtype, self.device,
+ )
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ model_name, trust_remote_code=True
+ )
+
+ load_kwargs: Dict[str, Any] = {
+ "trust_remote_code": True,
+ "torch_dtype": dtype,
+ # PRM forward is eval-only but sequences can be 1-2k tokens
+ # when the policy writes a lot of steps; flash-attn 2 cuts the
+ # scoring forward by ~2x at those lengths. Falls back to SDPA.
+ "attn_implementation": select_attn_implementation(),
+ }
+ if load_in_4bit and torch.cuda.is_available():
+ try:
+ from transformers import BitsAndBytesConfig
+
+ load_kwargs["quantization_config"] = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=dtype,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_use_double_quant=True,
+ )
+ load_kwargs["device_map"] = {"": self.device}
+ except ImportError:
+ logger.warning(
+ "bitsandbytes not available; falling back to bf16 PRM load"
+ )
+ load_in_4bit = False
+ if not load_in_4bit:
+ load_kwargs["device_map"] = {"": self.device}
+
+ self.model = AutoModel.from_pretrained(model_name, **load_kwargs).eval()
+
+ # Cache separator token id so we don't re-tokenize it every call.
+ # encode() returns a list — PRM's step_sep is a single token.
+ sep_ids = self.tokenizer.encode(STEP_SEP_TOKEN, add_special_tokens=False)
+ if len(sep_ids) != 1:
+ raise RuntimeError(
+ f"PRM step separator {STEP_SEP_TOKEN!r} tokenized to "
+ f"{sep_ids} (expected a single id). Tokenizer mismatch."
+ )
+ self.step_sep_id = int(sep_ids[0])
+
+ if torch.cuda.is_available():
+ mem_alloc = torch.cuda.memory_allocated(self.device) / (1024 ** 3)
+ logger.info(
+ "PRM ready. GPU memory allocated: %.2f GB step_sep_id=%d",
+ mem_alloc, self.step_sep_id,
+ )
+
+ @torch.no_grad()
+ def score_solution(
+ self,
+ question: str,
+ solution: str,
+ system_prompt: str = DEFAULT_SYSTEM_PROMPT,
+ ) -> Dict[str, Any]:
+ """
+ Return per-step correctness probabilities for ``solution``.
+
+ Returns dict with:
+ step_scores : List[float] — per-step prob in [0, 1]
+ num_steps : int
+ mean_score : float — avg across steps
+ min_score : float — weakest step (error locator)
+ final_score : float — score on the answer-line step
+ degraded : bool — True if we returned a zero-length
+ score list (empty solution, etc.)
+ """
+ steps = extract_prm_steps(solution)
+ if not steps:
+ return {
+ "step_scores": [],
+ "num_steps": 0,
+ "mean_score": 0.0,
+ "min_score": 0.0,
+ "final_score": 0.0,
+ "degraded": True,
+ "degraded_reason": "no extractable steps",
+ }
+
+ assistant_body = STEP_SEP_TOKEN.join(steps) + STEP_SEP_TOKEN
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": question.strip()},
+ {"role": "assistant", "content": assistant_body},
+ ]
+ try:
+ prompt = self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=False
+ )
+ except Exception as exc:
+ logger.warning("PRM chat template failed: %s", exc)
+ return {
+ "step_scores": [],
+ "num_steps": len(steps),
+ "mean_score": 0.0,
+ "min_score": 0.0,
+ "final_score": 0.0,
+ "degraded": True,
+ "degraded_reason": f"chat template error: {exc}",
+ }
+
+ enc = self.tokenizer(
+ prompt,
+ return_tensors="pt",
+ truncation=True,
+ max_length=self.max_input_tokens,
+ )
+ input_ids = enc["input_ids"].to(self.device)
+ attention_mask = enc.get("attention_mask")
+ if attention_mask is not None:
+ attention_mask = attention_mask.to(self.device)
+
+ try:
+ outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+ except Exception as exc:
+ logger.warning("PRM forward pass failed: %s", exc)
+ return {
+ "step_scores": [],
+ "num_steps": len(steps),
+ "mean_score": 0.0,
+ "min_score": 0.0,
+ "final_score": 0.0,
+ "degraded": True,
+ "degraded_reason": f"forward error: {exc}",
+ }
+
+ logits = outputs[0] # [1, seq_len, 2]
+ token_mask = (input_ids == self.step_sep_id) # [1, seq_len] bool
+
+ # Follow the reference make_step_rewards routine. We softmax the
+ # logits, zero out non-separator positions, then read the positive
+ # class (index 1) at each separator.
+ probs = F.softmax(logits, dim=-1) # [1, seq_len, 2]
+ probs = probs * token_mask.unsqueeze(-1)
+ sample = probs[0] # [seq_len, 2]
+ positive_probs = sample[sample != 0].view(-1, 2)[:, 1]
+ step_scores: List[float] = positive_probs.float().cpu().tolist()
+
+ # Truncation may have dropped trailing separators. Align lengths
+ # conservatively by padding missing positions with the mean of what
+ # we did see. Log a warning so callers know the scores are partial.
+ if len(step_scores) < len(steps) and step_scores:
+ pad_val = float(sum(step_scores) / len(step_scores))
+ n_padded = len(steps) - len(step_scores)
+ step_scores = step_scores + [pad_val] * n_padded
+ logger.warning(
+ "PRM: %d/%d steps scored; %d tail step(s) padded with mean=%.3f "
+ "(sequence likely truncated at %d tokens).",
+ len(step_scores) - n_padded, len(steps), n_padded, pad_val,
+ self.max_input_tokens,
+ )
+ elif len(step_scores) > len(steps):
+ step_scores = step_scores[: len(steps)]
+
+ if not step_scores:
+ return {
+ "step_scores": [],
+ "num_steps": len(steps),
+ "mean_score": 0.0,
+ "min_score": 0.0,
+ "final_score": 0.0,
+ "degraded": True,
+ "degraded_reason": "no separator token in output (truncated?)",
+ }
+
+ mean_score = float(sum(step_scores) / len(step_scores))
+ min_score = float(min(step_scores))
+ final_score = float(step_scores[-1])
+
+ return {
+ "step_scores": [float(s) for s in step_scores],
+ "num_steps": len(step_scores),
+ "mean_score": mean_score,
+ "min_score": min_score,
+ "final_score": final_score,
+ "degraded": False,
+ "padded_steps": len(step_scores) < len(steps), # True if tail was padded
+ }
+
+ @torch.no_grad()
+ def score_batch(
+ self,
+ items: List[Dict[str, str]],
+ system_prompt: str = DEFAULT_SYSTEM_PROMPT,
+ ) -> List[Dict[str, Any]]:
+ """Score a list of ``{"question", "solution"}`` items sequentially.
+
+ A proper padded batch path would be ~2-3× faster but needs care to
+ handle variable separator counts. Sequential is simple, correct,
+ and a single PRM forward takes ~100-300 ms on an A100 — acceptable
+ overhead given self-play generation dominates rollout wall-time.
+ """
+ return [
+ self.score_solution(it["question"], it["solution"], system_prompt)
+ for it in items
+ ]
diff --git a/src/rl/quality_filter.py b/src/rl/quality_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..8249d45078d5d313912807fec0add27003b2d893
--- /dev/null
+++ b/src/rl/quality_filter.py
@@ -0,0 +1,131 @@
+"""
+Quality gating and novelty checks for replay admission.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Dict, Iterable, Set, Tuple
+
+from src.rl.mdp_components import Trajectory
+from src.rl.replay_buffer import StoredTrajectory
+
+
+class QualityFilter:
+ def __init__(self, novelty_threshold: float = 0.5) -> None:
+ """
+ Initialize quality filter with relaxed novelty threshold.
+
+ Args:
+ novelty_threshold: Minimum novelty score (0.5 = moderate diversity)
+ """
+ self.novelty_threshold = novelty_threshold
+
+ def meets_replay_criteria(self, metadata: Dict[str, object]) -> Tuple[bool, str]:
+ """
+ Three-tier quality filter for buffer admission.
+
+ Tier 1 (Gold): High reward + both verification signals
+ Tier 2 (Silver): Very high reward + at least one strong signal
+ Tier 3 (Platinum): Near-perfect trajectories bypass filters
+
+ Args:
+ metadata: Trajectory metadata dict
+
+ Returns:
+ (is_eligible, reason_or_tier)
+ """
+ combined_reward = float(metadata.get("combined_reward", 0.0))
+
+ # Tier 3: Platinum standard - near-perfect trajectories always get in
+ if combined_reward >= 0.95:
+ return True, "platinum_standard"
+
+ # Tier 1: Gold standard - high quality with both verification signals
+ if combined_reward >= 0.7:
+ has_consensus = (
+ bool(metadata.get("consensus_achieved", False)) and
+ bool(metadata.get("primary_matches_majority", False))
+ )
+ sympy_clean = bool(metadata.get("sympy_verified", False))
+
+ if has_consensus and sympy_clean:
+ if float(metadata.get("topic_match_score", 0.0)) >= 0.6:
+ return True, "gold_standard"
+
+ # Tier 2: Silver standard - very high reward with at least one strong signal
+ if combined_reward >= 0.75:
+ # Accept if EITHER perfect SymPy OR strong consensus
+ perfect_sympy = float(metadata.get("sympy_score", 0.0)) >= 0.95
+ strong_consensus = (
+ bool(metadata.get("consensus_achieved", False)) and
+ float(metadata.get("consensus_strength", 0.0)) >= 0.8
+ )
+
+ if perfect_sympy or strong_consensus:
+ if float(metadata.get("topic_match_score", 0.0)) >= 0.6:
+ return True, "silver_standard"
+
+ # Failed all tiers
+ if combined_reward < 0.7:
+ return False, f"reward_too_low_{combined_reward:.2f}"
+ elif combined_reward < 0.75:
+ return False, "reward_below_silver_tier"
+ else:
+ return False, "no_strong_verification_signal"
+
+ def compute_quality_score(self, metadata: Dict[str, object]) -> float:
+ return max(
+ 0.0,
+ min(
+ 1.0,
+ (
+ 0.4 * float(metadata.get("combined_reward", 0.0))
+ + 0.3 * (1.0 if bool(metadata.get("sympy_verified", False)) else 0.0)
+ + 0.2 * float(metadata.get("topic_match_score", 0.0))
+ + 0.1 * float(metadata.get("clarity_score", 0.0))
+ ),
+ ),
+ )
+
+ def check_novelty(
+ self,
+ trajectory: Trajectory,
+ existing: Iterable[StoredTrajectory],
+ ) -> float:
+ if trajectory.metadata is None:
+ return 0.0
+ question = str(trajectory.metadata.get("generated_question", ""))
+ new_ngrams = self._extract_ngrams(question.lower(), n=3)
+ if not new_ngrams:
+ return 0.0
+
+ max_similarity = 0.0
+ for stored in existing:
+ stored_q = str(stored.metadata.get("generated_question", ""))
+ existing_ngrams = self._extract_ngrams(stored_q.lower(), n=3)
+ similarity = self._jaccard(new_ngrams, existing_ngrams)
+ if similarity > max_similarity:
+ max_similarity = similarity
+ return 1.0 - max_similarity
+
+ def is_novel_enough(self, novelty_score: float) -> bool:
+ return novelty_score >= self.novelty_threshold
+
+ @staticmethod
+ def _extract_ngrams(text: str, n: int = 3) -> Set[str]:
+ normalized = re.sub(r"\s+", " ", text.strip())
+ if not normalized:
+ return set()
+ if len(normalized) < n:
+ return {normalized}
+ return {normalized[i : i + n] for i in range(len(normalized) - n + 1)}
+
+ @staticmethod
+ def _jaccard(left: Set[str], right: Set[str]) -> float:
+ if not left or not right:
+ return 0.0
+ union = left | right
+ if not union:
+ return 0.0
+ return len(left & right) / len(union)
diff --git a/src/rl/question_classifier.py b/src/rl/question_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..676b4502163bd475f57792b4254e6f9533d79201
--- /dev/null
+++ b/src/rl/question_classifier.py
@@ -0,0 +1,510 @@
+"""
+Question classification and difficulty estimation utilities.
+
+This module provides a deterministic, low-latency classifier for:
+- Primary/secondary topic detection
+- Post-hoc difficulty estimation from generated solutions
+- Basic question clarity checks
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+
+TOPIC_KEYWORDS = {
+ "basic_arithmetic": [
+ "add",
+ "sum",
+ "subtract",
+ "difference",
+ "total",
+ "altogether",
+ ],
+ "single_step_word_problems": [
+ "how many",
+ "left",
+ "remain",
+ "altogether",
+ ],
+ "fractions": [
+ "fraction",
+ "fractions",
+ "numerator",
+ "denominator",
+ "half",
+ "quarter",
+ "third",
+ "fourth",
+ "fifth",
+ ],
+ "percentages": [
+ "percent",
+ "percentage",
+ "% ",
+ "discount",
+ "tax",
+ "increase",
+ "decrease",
+ ],
+ "ratios": [
+ "ratio",
+ "proportion",
+ "per",
+ "for every",
+ "rate",
+ ],
+ "money_problems": [
+ "dollar",
+ "dollars",
+ "cents",
+ "$",
+ "price",
+ "cost",
+ "buy",
+ "sell",
+ ],
+ "time_distance": [
+ "hour",
+ "minute",
+ "second",
+ "km",
+ "mile",
+ "speed",
+ "distance",
+ "travel",
+ ],
+ "multi_step_reasoning": [
+ "then",
+ "after",
+ "before",
+ "remaining",
+ "each",
+ "twice",
+ "three times",
+ ],
+ "algebra": [
+ "solve for",
+ "equation",
+ "variable",
+ "x",
+ "y",
+ "unknown",
+ ],
+ "mixed_operations": [
+ "combined",
+ "multiple operations",
+ "in total",
+ ],
+ "comparison_problems": [
+ "more than",
+ "less than",
+ "difference",
+ "compared",
+ ],
+ "optimization_problems": [
+ "maximum",
+ "minimum",
+ "optimize",
+ "best",
+ ],
+ # ── AQuA-RAT additions ────────────────────────────────────────────────
+ "number_theory": [
+ "prime",
+ "divisible",
+ "remainder",
+ "factor",
+ "multiple",
+ "divisor",
+ "integer divisible",
+ "mod",
+ ],
+ "profit_loss": [
+ "profit",
+ "loss",
+ "cost price",
+ "selling price",
+ "markup",
+ "gain",
+ "cp",
+ "sp",
+ ],
+ "interest": [
+ "simple interest",
+ "compound interest",
+ "principal",
+ "rate of interest",
+ "annually",
+ "quarterly",
+ "semi-annually",
+ "p.a.",
+ ],
+ "sets": [
+ "neither",
+ "both",
+ "only one",
+ "union",
+ "intersection",
+ "venn",
+ "at least one",
+ ],
+ "combinatorics": [
+ "combination",
+ "permutation",
+ "arrangement",
+ "ways to select",
+ "ways to choose",
+ "how many ways",
+ "nCr",
+ "nPr",
+ ],
+ "sequences": [
+ "sequence",
+ "series",
+ "arithmetic progression",
+ "geometric progression",
+ "nth term",
+ "common difference",
+ "common ratio",
+ ],
+ "probability": [
+ "probability",
+ "chance",
+ "likely",
+ "favorable",
+ "event",
+ "random",
+ "draw",
+ ],
+ "work_time": [
+ "work together",
+ "working together",
+ "alone in",
+ "complete the job",
+ "working rate",
+ "finish the work",
+ "days to complete",
+ "rate of work",
+ ],
+ # ── NuminaMath / OpenMathInstruct additions ───────────────────────────
+ "geometry": [
+ "triangle",
+ "circle",
+ "rectangle",
+ "polygon",
+ "area",
+ "perimeter",
+ "angle",
+ "radius",
+ "diameter",
+ "hypotenuse",
+ "coordinate",
+ "tangent",
+ "bisector",
+ "congruent",
+ "similar",
+ "parallel",
+ "perpendicular",
+ "volume",
+ "surface area",
+ "right angle",
+ ],
+ "calculus": [
+ "derivative",
+ "differentiate",
+ "integrate",
+ "dy/dx",
+ "f'(x)",
+ "definite integral",
+ "indefinite integral",
+ "slope of the tangent",
+ "rate of change",
+ "inflection point",
+ ],
+ "statistics": [
+ "mean",
+ "median",
+ "mode",
+ "standard deviation",
+ "variance",
+ "average",
+ "data set",
+ "frequency",
+ "histogram",
+ "distribution",
+ "normal distribution",
+ "expected value",
+ "outlier",
+ "quartile",
+ "range of data",
+ ],
+ "competition_math": [
+ "positive integers",
+ "integer solutions",
+ "divisible by",
+ "remainder when",
+ "relatively prime",
+ "greatest common divisor",
+ "least common multiple",
+ "prove that",
+ "diophantine",
+ "congruent modulo",
+ "sum of digits",
+ ],
+}
+
+TOPIC_LIST = list(TOPIC_KEYWORDS.keys())
+
+
+@dataclass
+class TopicClassification:
+ primary_topic: str
+ secondary_topics: List[str]
+ confidence: float
+ signals_used: List[str]
+ keyword_scores: Dict[str, float]
+
+ def to_dict(self) -> Dict[str, object]:
+ return {
+ "primary_topic": self.primary_topic,
+ "secondary_topics": self.secondary_topics,
+ "confidence": self.confidence,
+ "signals_used": self.signals_used,
+ "keyword_scores": self.keyword_scores,
+ }
+
+
+class QuestionClassifier:
+ """Deterministic classifier for curriculum-guided question generation."""
+
+ _step_pattern = re.compile(r"^\s*step\s+\d+\s*:", re.IGNORECASE | re.MULTILINE)
+ _number_pattern = re.compile(r"-?\d+(?:\.\d+)?(?:/\d+)?")
+ _fraction_pattern = re.compile(r"\d+\s*/\s*\d+")
+ _nested_op_pattern = re.compile(r"\([^()]*[+\-*/][^()]*\)")
+
+ # High-confidence single-phrase signals that override the scoring formula.
+ # Ordered: more specific first. If ANY of these patterns match, the
+ # corresponding topic wins regardless of keyword counts.
+ _PRIORITY_SIGNALS: List[Tuple[re.Pattern, str]] = [
+ # Calculus — "integrate" before ratios can steal "rate" as a substring
+ (re.compile(r"\b(derivative|differentiate|integrate|d/dx|dy/dx|f'\s*\(|indefinite integral|definite integral|rate of change|inflection point)\b", re.I), "calculus"),
+ # Geometry
+ (re.compile(r"\b(triangle|rectangle|polygon|perimeter|circumference|hypotenuse|right angle|surface area|volume of|radius|diameter)\b", re.I), "geometry"),
+ # Statistics
+ (re.compile(r"\b(standard deviation|variance|median|normal distribution|expected value)\b", re.I), "statistics"),
+ # Competition math
+ (re.compile(r"\b(divisible by|remainder when|relatively prime|greatest common divisor|least common multiple|diophantine|congruent modulo|sum of digits)\b", re.I), "competition_math"),
+ (re.compile(r"\bpositive integers?\b.{0,40}\bdivisible\b", re.I), "competition_math"),
+ # Time-distance (speeds? covers plural; match across short gap)
+ (re.compile(r"\bspeeds?\b.{0,80}\b(meet|distance|time|arrive|travel)\b", re.I), "time_distance"),
+ (re.compile(r"\b(km/h|mph|miles per hour|km per hour)\b", re.I), "time_distance"),
+ # Combinatorics — "how many ways" beats single_step "how many"
+ (re.compile(r"\bhow many ways\b", re.I), "combinatorics"),
+ (re.compile(r"\b(arrangements?|permutations?|combinations?) of\b", re.I), "combinatorics"),
+ # Probability — "probability" contains "y" which would otherwise hit algebra
+ (re.compile(r"\b(probability|the chance that|likelihood of)\b", re.I), "probability"),
+ ]
+
+ def classify_topic(self, question: str, solution: Optional[str] = None) -> Dict[str, object]:
+ """Return primary/secondary topics with confidence."""
+ text = (question or "").lower()
+
+ # Fast path: high-confidence priority signals bypass scoring
+ for pattern, topic in self._PRIORITY_SIGNALS:
+ if pattern.search(text):
+ return TopicClassification(
+ primary_topic=topic,
+ secondary_topics=[],
+ confidence=0.95,
+ signals_used=["priority"],
+ keyword_scores={topic: 0.95},
+ ).to_dict()
+
+ keyword_scores = {topic: self._keyword_score(text, words) for topic, words in TOPIC_KEYWORDS.items()}
+
+ signals_used = ["keyword"]
+ primary_topic = max(keyword_scores, key=keyword_scores.get)
+ confidence = keyword_scores[primary_topic]
+
+ if self._fraction_pattern.search(text):
+ keyword_scores["fractions"] += 0.25
+ primary_topic = max(keyword_scores, key=keyword_scores.get)
+ confidence = max(confidence, min(1.0, keyword_scores[primary_topic]))
+ signals_used.append("pattern")
+
+ if "%" in text:
+ keyword_scores["percentages"] += 0.25
+ primary_topic = max(keyword_scores, key=keyword_scores.get)
+ confidence = max(confidence, min(1.0, keyword_scores[primary_topic]))
+ if "pattern" not in signals_used:
+ signals_used.append("pattern")
+
+ if solution:
+ op_topic = self._infer_topic_from_solution(solution)
+ if op_topic:
+ primary_topic = op_topic
+ confidence = max(confidence, 0.9)
+ signals_used.append("solution_ops")
+
+ secondary_topics = [
+ topic
+ for topic, score in sorted(keyword_scores.items(), key=lambda item: item[1], reverse=True)
+ if topic != primary_topic and score >= 0.2
+ ][:3]
+
+ return TopicClassification(
+ primary_topic=primary_topic,
+ secondary_topics=secondary_topics,
+ confidence=min(1.0, confidence),
+ signals_used=signals_used,
+ keyword_scores=keyword_scores,
+ ).to_dict()
+
+ def estimate_difficulty(
+ self,
+ question: str,
+ solution: str,
+ consensus_result: Optional[Dict[str, object]] = None,
+ ) -> float:
+ """
+ Estimate difficulty using post-solution signals.
+
+ 40%: step complexity
+ 30%: numeric complexity
+ 30%: consensus disagreement complexity
+ """
+ step_score = self._step_complexity(solution)
+ number_score = self._numeric_complexity(question, solution)
+ consensus_score = self._consensus_difficulty(consensus_result)
+ difficulty = 0.4 * step_score + 0.3 * number_score + 0.3 * consensus_score
+ return max(0.0, min(1.0, difficulty))
+
+ def check_clarity(self, question: str) -> float:
+ """Score question clarity in [0, 1] from low-cost heuristics."""
+ text = (question or "").strip()
+ if not text:
+ return 0.0
+
+ lower = text.lower()
+ has_numbers = 1.0 if self._number_pattern.search(lower) else 0.0
+ has_question = 1.0 if ("?" in lower or re.search(r"\b(find|calculate|how many|what is|determine|compute|evaluate|express|simplify|solve)\b", lower)) else 0.0
+ words = lower.split()
+ length_ok = 1.0 if 8 <= len(words) <= 120 else 0.3
+ contradiction = 1.0 if not re.search(r"\b(impossible|contradiction|undefined)\b", lower) else 0.0
+
+ return max(0.0, min(1.0, 0.3 * has_numbers + 0.3 * has_question + 0.2 * length_ok + 0.2 * contradiction))
+
+ def _keyword_score(self, text: str, keywords: List[str]) -> float:
+ if not keywords:
+ return 0.0
+ hits = 0
+ for kw in keywords:
+ if kw in text:
+ hits += 1
+ return min(1.0, hits / max(2.0, len(keywords) * 0.6))
+
+ def _infer_topic_from_solution(self, solution: str) -> Optional[str]:
+ text = (solution or "").lower()
+ if not text:
+ return None
+
+ has_fraction = bool(self._fraction_pattern.search(text))
+ has_percent = "%" in text or "percent" in text
+ has_variable = bool(re.search(r"\b[x-y]\b|\bsolve\b|\bequation\b", text))
+ has_division = "/" in text or "divide" in text
+ has_mul = "*" in text or "multiply" in text
+ has_add_sub = any(op in text for op in ["+", "-", "add", "subtract"])
+
+ # Higher-specificity signals come first
+ if any(kw in text for kw in ["derivative", "dy/dx", "f'(", "differentiat", "integrat"]):
+ return "calculus"
+ if any(kw in text for kw in ["triangle", "circle", "area =", "perimeter", "radius", "angle", "coordinate"]):
+ return "geometry"
+ if any(kw in text for kw in ["modulo", "gcd", "lcm", "divisible by", "remainder", "prime"]):
+ return "competition_math"
+ if any(kw in text for kw in ["mean =", "median", "standard deviation", "variance"]):
+ return "statistics"
+ if has_variable:
+ return "algebra"
+ if has_percent:
+ return "percentages"
+ if has_fraction:
+ return "fractions"
+ if has_division and ("km" in text or "mile" in text or "hour" in text):
+ return "time_distance"
+ if has_division and has_mul and has_add_sub:
+ return "mixed_operations"
+ if has_division or has_mul:
+ return "multi_step_reasoning"
+ return None
+
+ def _step_complexity(self, solution: str) -> float:
+ text = solution or ""
+ step_count = len(self._step_pattern.findall(text))
+ if step_count == 0:
+ step_count = max(1, text.count("\n") // 2)
+ step_score = min(1.0, step_count / 5.0)
+
+ lowered = text.lower()
+ op_score = 0.0
+ if any(token in lowered for token in ["+", "-", "add", "subtract"]):
+ op_score = max(op_score, 0.3)
+ if any(token in lowered for token in ["*", "multiply"]):
+ op_score = max(op_score, 0.55)
+ if any(token in lowered for token in ["/", "divide"]):
+ op_score = max(op_score, 0.7)
+ if self._nested_op_pattern.search(lowered):
+ op_score = max(op_score, 0.85)
+
+ return max(0.0, min(1.0, 0.6 * step_score + 0.4 * op_score))
+
+ def _numeric_complexity(self, question: str, solution: str) -> float:
+ text = f"{question or ''} {solution or ''}"
+ numbers = self._number_pattern.findall(text)
+ if not numbers:
+ return 0.0
+
+ max_abs = 0.0
+ has_decimal = False
+ has_fraction = False
+ for token in numbers:
+ if "/" in token:
+ has_fraction = True
+ parts = token.split("/")
+ if len(parts) == 2 and parts[1] != "0":
+ try:
+ value = abs(float(parts[0]) / float(parts[1]))
+ max_abs = max(max_abs, value)
+ except ValueError:
+ pass
+ else:
+ if "." in token:
+ has_decimal = True
+ try:
+ max_abs = max(max_abs, abs(float(token)))
+ except ValueError:
+ pass
+
+ magnitude_score = 0.2
+ if max_abs >= 1000:
+ magnitude_score = 0.8
+ elif max_abs >= 100:
+ magnitude_score = 0.6
+ elif max_abs >= 20:
+ magnitude_score = 0.4
+
+ numeric_bonus = 0.0
+ if has_decimal:
+ numeric_bonus += 0.15
+ if has_fraction:
+ numeric_bonus += 0.2
+
+ return max(0.0, min(1.0, magnitude_score + numeric_bonus))
+
+ def _consensus_difficulty(self, consensus_result: Optional[Dict[str, object]]) -> float:
+ if not consensus_result:
+ return 0.5
+ strength = float(consensus_result.get("consensus_strength", 0.0))
+ return max(0.0, min(1.0, 1.0 - strength))
diff --git a/src/rl/question_quality_evaluator.py b/src/rl/question_quality_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..686d5f41956fbbfdadb8e15e224e321140764ae2
--- /dev/null
+++ b/src/rl/question_quality_evaluator.py
@@ -0,0 +1,202 @@
+"""
+Question quality evaluator for curriculum-guided dual-task training.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+from src.rl.question_classifier import QuestionClassifier
+
+
+@dataclass
+class QuestionEvalResult:
+ overall_score: float
+ topic_match: float
+ difficulty_score: float
+ clarity: float
+ solvability_score: float
+ novelty_combined: float
+ measured_difficulty: float
+ detected_topic: Dict[str, object]
+ novelty: Dict[str, float]
+ solvability: Dict[str, object]
+
+ def to_dict(self) -> Dict[str, object]:
+ return {
+ "overall_score": self.overall_score,
+ "topic_match": self.topic_match,
+ "difficulty_score": self.difficulty_score,
+ "clarity": self.clarity,
+ "solvability_score": self.solvability_score,
+ "novelty_combined": self.novelty_combined,
+ "measured_difficulty": self.measured_difficulty,
+ "detected_topic": self.detected_topic,
+ "novelty": self.novelty,
+ "solvability": self.solvability,
+ }
+
+
+class QuestionQualityEvaluator:
+ """Evaluate generated question quality for curriculum reward shaping."""
+
+ def __init__(
+ self,
+ reference_questions: Optional[List[str]] = None,
+ classifier: Optional[QuestionClassifier] = None,
+ novelty_window_size: int = 500, # raised from 100: 5 SP/iter → fills in ~100 iters
+ ):
+ self.reference_questions = reference_questions or []
+ self.classifier = classifier or QuestionClassifier()
+ self.novelty_window_size = novelty_window_size
+ self.recent_questions: List[str] = []
+ # Pre-compute and cache reference n-gram sets once at init.
+ self._reference_ngrams = [self._extract_ngrams(q.lower()) for q in self.reference_questions]
+ # Rolling cache of n-gram sets for recent questions (avoids recomputing every call).
+ self._recent_ngrams: List[set] = []
+
+ def evaluate(
+ self,
+ question: str,
+ solution: str,
+ consensus_result: Optional[Dict[str, object]],
+ target_topic: str,
+ target_difficulty: float,
+ ) -> Dict[str, object]:
+ detected_topic = self.classifier.classify_topic(question=question, solution=solution)
+ topic_match = self._topic_match_score(detected_topic, target_topic)
+
+ measured_difficulty = self.classifier.estimate_difficulty(
+ question=question,
+ solution=solution,
+ consensus_result=consensus_result,
+ )
+ difficulty_score = max(0.0, 1.0 - 2.0 * abs(measured_difficulty - target_difficulty))
+
+ clarity = self.classifier.check_clarity(question)
+ novelty = self.compute_novelty_score(question)
+ solvability = self.assess_solvability(question, solution, consensus_result)
+
+ overall = (
+ 0.25 * topic_match
+ + 0.15 * difficulty_score
+ + 0.20 * clarity
+ + 0.20 * float(solvability["score"])
+ + 0.20 * novelty["combined"] # raised 0.10→0.20; taken from difficulty_score
+ )
+
+ return QuestionEvalResult(
+ overall_score=max(0.0, min(1.0, overall)),
+ topic_match=topic_match,
+ difficulty_score=difficulty_score,
+ clarity=clarity,
+ solvability_score=float(solvability["score"]),
+ novelty_combined=novelty["combined"],
+ measured_difficulty=measured_difficulty,
+ detected_topic=detected_topic,
+ novelty=novelty,
+ solvability=solvability,
+ ).to_dict()
+
+ def compute_novelty_score(self, question: str) -> Dict[str, float]:
+ dataset_novelty = self._novelty_against_reference(question, self._reference_ngrams)
+ # Use cached recent n-gram sets instead of recomputing from strings each call (O(n²)→O(n)).
+ session_novelty = self._novelty_against_reference(question, self._recent_ngrams)
+ # Weight dataset novelty higher (60%) — comparing against 8k GSM8K questions
+ # is a stable, meaningful signal. Session novelty (40%) guards against
+ # the model looping the same question template within a run.
+ combined = max(0.0, min(1.0, 0.60 * dataset_novelty + 0.40 * session_novelty))
+
+ self.recent_questions.append(question)
+ self.recent_questions = self.recent_questions[-self.novelty_window_size:]
+ # Keep n-gram cache in sync with the question window.
+ self._recent_ngrams.append(self._extract_ngrams(question.lower()))
+ self._recent_ngrams = self._recent_ngrams[-self.novelty_window_size:]
+
+ return {
+ "combined": combined,
+ "dataset_novelty": dataset_novelty,
+ "session_novelty": session_novelty,
+ }
+
+ def assess_solvability(
+ self,
+ question: str,
+ solution: str,
+ consensus_result: Optional[Dict[str, object]],
+ ) -> Dict[str, object]:
+ q_lower = (question or "").lower()
+ has_numbers = bool(re.search(r"\d", q_lower))
+ has_question = ("?" in q_lower) or bool(re.search(
+ r"\b(find|calculate|how many|what is|determine|compute|evaluate|express|simplify|solve)\b",
+ q_lower,
+ ))
+ length_ok = 8 <= len(q_lower.split()) <= 120
+ if not (has_numbers and has_question and length_ok):
+ return {"solvable": False, "reason": "syntactic_failure", "score": 0.0}
+
+ has_contradiction = bool(re.search(r"\b(impossible|cannot|undefined)\b", q_lower))
+ if has_contradiction:
+ return {"solvable": False, "reason": "semantic_failure", "score": 0.3}
+
+ # PRM-based arithmetic quality check (replaces SymPy step verification).
+ # consensus_strength = prm_mean: average PRM score across all reasoning steps.
+ # A low PRM mean means the model produced inconsistent or incorrect reasoning,
+ # which strongly signals the question is ambiguous, contradictory, or unsolvable.
+ # PRM understands full mathematical semantics — it catches errors that SymPy
+ # misses (e.g., wrong logic, incorrect setups) while not failing on valid prose.
+ if consensus_result:
+ confidence = float(consensus_result.get("consensus_strength", 0.5))
+ if confidence < 0.30:
+ # PRM rejects most steps → solution is invalid → question is likely unsolvable
+ return {"solvable": False, "reason": "low_prm_confidence", "score": 0.5}
+ if not bool(consensus_result.get("has_majority", False)):
+ # PRM is borderline (0.30–0.49) → uncertain solvability
+ return {"solvable": False, "reason": "no_consensus", "score": 0.6}
+ else:
+ confidence = 0.5
+
+ return {
+ "solvable": True,
+ "reason": "fully_solvable",
+ "score": 1.0,
+ "confidence": confidence,
+ }
+
+ @staticmethod
+ def _extract_ngrams(text: str, n: int = 3) -> set[str]:
+ normalized = re.sub(r"\s+", " ", (text or "").strip())
+ if len(normalized) < n:
+ return {normalized} if normalized else set()
+ return {normalized[i : i + n] for i in range(len(normalized) - n + 1)}
+
+ @staticmethod
+ def _jaccard_similarity(set1: set[str], set2: set[str]) -> float:
+ if not set1 or not set2:
+ return 0.0
+ union = set1 | set2
+ if not union:
+ return 0.0
+ return len(set1 & set2) / len(union)
+
+ def _novelty_against_reference(self, question: str, reference_sets: List[set[str]]) -> float:
+ if not reference_sets:
+ return 1.0
+ current = self._extract_ngrams((question or "").lower())
+ max_similarity = 0.0
+ for ref_set in reference_sets:
+ max_similarity = max(max_similarity, self._jaccard_similarity(current, ref_set))
+ return max(0.0, 1.0 - max_similarity)
+
+ @staticmethod
+ def _topic_match_score(detected_topic: Dict[str, object], target_topic: str) -> float:
+ primary = str(detected_topic.get("primary_topic", ""))
+ secondary = [str(x) for x in detected_topic.get("secondary_topics", [])]
+ confidence = float(detected_topic.get("confidence", 0.0))
+ if primary == target_topic:
+ return max(0.6, min(1.0, confidence))
+ if target_topic in secondary:
+ return max(0.4, min(0.8, confidence))
+ return min(0.35, confidence)
diff --git a/src/rl/replay_buffer.py b/src/rl/replay_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..49fab4b05551c52d85a7661f264b049ca66bb059
--- /dev/null
+++ b/src/rl/replay_buffer.py
@@ -0,0 +1,181 @@
+"""
+Generational replay buffer for recursive self-improvement.
+"""
+
+from __future__ import annotations
+
+import math
+import random
+from dataclasses import dataclass
+from typing import Dict, List
+
+import numpy as np
+
+from src.rl.mdp_components import Trajectory
+
+
+@dataclass
+class StoredTrajectory:
+ trajectory: Trajectory
+ metadata: Dict[str, object]
+ generation_iteration: int
+ reward: float
+ quality_score: float
+ topic: str
+
+
+class GenerationalReplayBuffer:
+ """Stores high-quality trajectories and samples diverse replays."""
+
+ def __init__(self, max_size: int = 500) -> None:
+ self.max_size = max_size
+ self.buffer: List[StoredTrajectory] = []
+ self.replayed_count = 0
+ self.total_sampled = 0
+ self.additions_since_prune = 0
+
+ def __len__(self) -> int:
+ return len(self.buffer)
+
+ def add_trajectory(
+ self,
+ trajectory: Trajectory,
+ metadata: Dict[str, object],
+ iteration: int,
+ quality_score: float,
+ ) -> bool:
+ stored = StoredTrajectory(
+ trajectory=trajectory,
+ metadata=metadata,
+ generation_iteration=iteration,
+ reward=float(metadata.get("combined_reward", trajectory.total_reward)),
+ quality_score=float(quality_score),
+ topic=str(metadata.get("target_topic", "unknown")),
+ )
+ self.buffer.append(stored)
+ self.additions_since_prune += 1
+
+ if len(self.buffer) > self.max_size:
+ self._prune_by_topic_capacity(per_topic_keep=50)
+ return True
+
+ def sample_replay_batch(self, n: int, diversity_sample: bool = True) -> List[Trajectory]:
+ if n <= 0 or not self.buffer:
+ return []
+ n = min(n, len(self.buffer))
+ self.total_sampled += n
+
+ if not diversity_sample:
+ chosen = random.sample(self.buffer, n)
+ self.replayed_count += len(chosen)
+ return [item.trajectory for item in chosen]
+
+ by_topic = self._group_by_topic()
+ topic_names = list(by_topic.keys())
+ topic_sizes = np.array([len(by_topic[t]) for t in topic_names], dtype=np.float64)
+ topic_probs = topic_sizes / topic_sizes.sum()
+
+ selected: List[StoredTrajectory] = []
+ used_ids: set[int] = set()
+ attempts = 0
+ max_attempts = n * 8
+ while len(selected) < n and attempts < max_attempts:
+ attempts += 1
+ topic_idx = int(np.random.choice(len(topic_names), p=topic_probs))
+ topic = topic_names[topic_idx]
+ candidates = by_topic[topic]
+
+ # Higher quality samples are preferred within a topic.
+ quality = np.array([max(1e-6, c.quality_score) for c in candidates], dtype=np.float64)
+ q_probs = quality / quality.sum()
+ candidate_idx = int(np.random.choice(len(candidates), p=q_probs))
+ candidate = candidates[candidate_idx]
+ candidate_key = id(candidate)
+ if candidate_key in used_ids:
+ continue
+ used_ids.add(candidate_key)
+ selected.append(candidate)
+
+ if len(selected) < n:
+ remainder = [x for x in self.buffer if id(x) not in used_ids]
+ random.shuffle(remainder)
+ selected.extend(remainder[: n - len(selected)])
+
+ self.replayed_count += len(selected)
+ return [item.trajectory for item in selected]
+
+ def get_buffer_stats(self, current_iteration: int | None = None) -> Dict[str, float]:
+ if not self.buffer:
+ return {
+ "buffer_size": 0.0,
+ "avg_quality": 0.0,
+ "quality_variance": 0.0,
+ "staleness": 0.0,
+ "topic_entropy": 0.0,
+ "replay_success_rate": 0.0,
+ "buffer_turnover_rate": 0.0,
+ "topics_in_buffer": 0.0,
+ "buffer_health": 0.0,
+ }
+
+ qualities = np.array([x.quality_score for x in self.buffer], dtype=np.float64)
+ max_iter = (
+ current_iteration
+ if current_iteration is not None
+ else max(x.generation_iteration for x in self.buffer)
+ )
+ staleness = np.array([max_iter - x.generation_iteration for x in self.buffer], dtype=np.float64)
+ topic_entropy = self._compute_topic_entropy()
+ replay_success = self.replayed_count / max(1, self.total_sampled)
+ turnover = self.additions_since_prune / max(1, len(self.buffer))
+
+ stats = {
+ "buffer_size": float(len(self.buffer)),
+ "avg_quality": float(qualities.mean()),
+ "quality_variance": float(qualities.var()),
+ "staleness": float(staleness.mean()),
+ "topic_entropy": float(topic_entropy),
+ "replay_success_rate": float(replay_success),
+ "buffer_turnover_rate": float(turnover),
+ "topics_in_buffer": float(len(self._group_by_topic())),
+ }
+ stats["buffer_health"] = float(self.compute_buffer_health(stats))
+ return stats
+
+ def compute_buffer_health(self, stats: Dict[str, float] | None = None) -> float:
+ if not self.buffer:
+ return 0.0
+ base = stats or self.get_buffer_stats()
+ avg_quality = base["avg_quality"]
+ # Use max_size as coarse normalization ceiling for topic entropy.
+ entropy_norm = max(1.0, math.log(max(2, len(self._group_by_topic()))))
+ topic_diversity = min(1.0, base["topic_entropy"] / entropy_norm)
+ staleness_penalty = max(0.0, 1.0 - min(1.0, base["staleness"] / 10.0))
+ health = 0.5 * avg_quality + 0.3 * topic_diversity + 0.2 * staleness_penalty
+ return float(max(0.0, min(1.0, health)))
+
+ def _group_by_topic(self) -> Dict[str, List[StoredTrajectory]]:
+ grouped: Dict[str, List[StoredTrajectory]] = {}
+ for item in self.buffer:
+ grouped.setdefault(item.topic, []).append(item)
+ return grouped
+
+ def _prune_by_topic_capacity(self, per_topic_keep: int) -> None:
+ grouped = self._group_by_topic()
+ pruned: List[StoredTrajectory] = []
+ for _, items in grouped.items():
+ items_sorted = sorted(items, key=lambda x: x.quality_score, reverse=True)
+ pruned.extend(items_sorted[:per_topic_keep])
+
+ if len(pruned) > self.max_size:
+ pruned = sorted(pruned, key=lambda x: x.quality_score, reverse=True)[: self.max_size]
+ self.buffer = pruned
+ self.additions_since_prune = 0
+
+ def _compute_topic_entropy(self) -> float:
+ grouped = self._group_by_topic()
+ if not grouped:
+ return 0.0
+ counts = np.array([len(v) for v in grouped.values()], dtype=np.float64)
+ probs = counts / counts.sum()
+ return float(-(probs * np.log(np.clip(probs, 1e-12, 1.0))).sum())
diff --git a/src/rl/unified_accuracy.py b/src/rl/unified_accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..3041d28cceb80d900d16d5894bb3a0c1b602c1a4
--- /dev/null
+++ b/src/rl/unified_accuracy.py
@@ -0,0 +1,706 @@
+"""
+Unified Accuracy Calculator for GRPO training.
+
+Replaces opaque PRM-based step scoring (Phase 1) with formally-grounded
+chain integrity scoring (Phase 2+) using a small LLM extractor plus
+eval()/SymPy for arithmetic verification and dependency consistency checks.
+
+Architecture:
+ Solution text
+ ↓
+ StepChainExtractor (small LLM, 4-bit; cache-first for grounded data)
+ ↓
+ ExtractionResult (steps + success flag)
+ ↓
+ _pal_eval / _sympy_eval (formal arithmetic verification)
+ _value_used_in_expr (dependency consistency check)
+ ↓
+ AccuracyReport (arith + dep + lccp + final + q_score)
+ ↓
+ UnifiedAccuracyCalculator.compute() → AccuracyReport
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import re
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Extraction prompt
+# ---------------------------------------------------------------------------
+
+CHAIN_EXTRACT_PROMPT = """\
+Extract ALL arithmetic claims from these math solution steps.
+Return ONLY a JSON array, no other text.
+
+Each element:
+{{
+ "step": ,
+ "expression": ,
+ "claimed": ,
+ "produces": ,
+ "uses": []
+}}
+
+Rules:
+- Replace x and × with *
+- Replace ÷ with /
+- Keep fractions as-is: (2/3) stays (2/3)
+- If a step has no arithmetic claim, still include it with expression=null and claimed=null
+- "uses" tracks which prior step's output feeds into this expression
+
+Steps:
+{steps}
+
+JSON array:"""
+
+# ---------------------------------------------------------------------------
+# Module-level helper functions
+# ---------------------------------------------------------------------------
+
+_FINAL_ANSWER_RE = re.compile(r"final answer[:\s]*([^\n]+)", re.IGNORECASE)
+_STEP_RE = re.compile(r"^\s*Step\s+\d+\s*:", re.IGNORECASE | re.MULTILINE)
+
+
+def _cache_key(question: str, solution: str) -> str:
+ """
+ Cache key on (question, solution) to prevent collisions when two
+ different problems share identical solution text (common in short
+ MATH Level 1–2 examples).
+ """
+ return hashlib.md5(
+ f"{question}\n{solution}".encode(), usedforsecurity=False
+ ).hexdigest()
+
+
+def _extract_final_answer(solution: str) -> Optional[str]:
+ """Return the text after 'Final Answer:' in a solution."""
+ m = _FINAL_ANSWER_RE.search(solution)
+ return m.group(1).strip() if m else None
+
+
+def _extract_step_bodies(solution: str) -> List[str]:
+ """Split solution into individual step text strings."""
+ parts = _STEP_RE.split(solution)
+ bodies: List[str] = []
+ for p in parts:
+ stripped = p.strip()
+ if stripped:
+ bodies.append(stripped)
+ return bodies
+
+
+def _pal_eval(answer_str: str) -> Optional[float]:
+ """
+ Tier 1: arithmetic / basic algebra via safe eval.
+ No builtins, no names — only numeric Python expressions.
+ """
+ try:
+ val = eval(answer_str, {"__builtins__": {}}, {}) # noqa: S307
+ f = float(val)
+ return None if f != f else f # NaN guard
+ except Exception:
+ return None
+
+
+def _sympy_eval(answer_str: str) -> Optional[float]:
+ """
+ Tier 2: symbolic evaluation via SymPy for algebra,
+ fractions, square roots, etc.
+ """
+ try:
+ from sympy import N as _N, sympify # type: ignore
+ f = float(_N(sympify(answer_str), 15))
+ return None if f != f else f # NaN guard
+ except Exception:
+ return None
+
+
+def _parse_value(raw: str) -> Optional[float]:
+ """Try PAL eval first, fall back to SymPy."""
+ return _pal_eval(raw) or _sympy_eval(raw)
+
+
+def _value_used_in_expr(expression: str, expected_value: float, tol: float = 1e-4) -> bool:
+ """
+ Check whether a prior step's actual value appears in the expression
+ that claims to use it — catches silent dependency breaks that PRM misses.
+
+ Three-pass check:
+ 1. Raw numeric literals ("0.6667" matches 0.6667)
+ 2. Sub-expressions ("(2/3)" evaluates to ≈0.6667 → matches)
+ 3. Full expression evaluation (whole expr IS the prior step's value)
+
+ Example — Roberto Step 7 "60 * (2/3)" where dep value = 0.6667:
+ Pass 1: literals [60, 2, 3] — none equal 0.6667 → no match yet
+ Pass 2: "(2/3)" evaluates to 0.6667 → MATCH ✓
+
+ Example — broken chain "60 * 0.5" where dep value = 0.6667:
+ Pass 1: literals [60, 0.5] — neither equals 0.6667
+ Pass 2: no sub-expressions
+ Pass 3: 60 * 0.5 = 30.0 ≠ 0.6667
+ → False ✓
+ """
+ # Pass 1: raw numeric literals
+ nums = re.findall(r"\d+\.?\d*", expression)
+ for n in nums:
+ try:
+ if abs(float(n) - expected_value) < tol:
+ return True
+ except ValueError:
+ pass
+
+ # Pass 2: evaluate sub-expressions like (2/3), (1+2), etc.
+ sub_exprs = re.findall(r"\([\d\s\+\-\*\/\.]+\)", expression)
+ for sub in sub_exprs:
+ try:
+ val = eval(sub, {"__builtins__": {}}, {}) # noqa: S307
+ if abs(float(val) - expected_value) < tol:
+ return True
+ except Exception:
+ pass
+
+ # Pass 3: evaluate the full expression and check if it equals the dep
+ try:
+ full_val = eval(expression, {"__builtins__": {}}, {}) # noqa: S307
+ if abs(float(full_val) - expected_value) < tol:
+ return True
+ except Exception:
+ pass
+
+ return False
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ExtractionResult:
+ """
+ Result of a step chain extraction attempt.
+
+ Distinguishing ``success=False`` (LLM call failed / JSON unparseable)
+ from ``steps=[]`` (no arithmetic claims found) is critical: the former
+ should penalise the chain score to 0.5 (neutral), NOT reward it with 1.0
+ (which would happen if the calculator sees an empty chain and defaults
+ to 'all steps correct').
+ """
+ steps: List[dict]
+ success: bool # False = LLM or JSON parse failed
+ n_steps_found: int # steps with non-null expression (arithmetic claims)
+
+
+@dataclass
+class ChainStep:
+ step: int
+ expression: Optional[str]
+ claimed: Optional[str]
+ produces: str
+ uses: List[str] = field(default_factory=list)
+ arithmetic_correct: Optional[bool] = None # None = no arithmetic claim
+ dependency_consistent: Optional[bool] = None # None = no deps to check
+ actual_value: Optional[float] = None
+
+
+@dataclass
+class AccuracyReport:
+ # Step chain integrity
+ step_arithmetic_score: float # fraction of steps with correct arithmetic
+ step_dependency_score: float # fraction of deps using correct prior values
+ chain_integrity_score: float # 0.6 * arith + 0.4 * dep
+ first_failure_step: Optional[int]
+ lccp_score: float # fraction of clean steps before first failure
+
+ # Final answer
+ final_answer_correct: bool # against gold (grounded) or own chain (self-play)
+ final_answer_consistent: bool # consistent with step chain
+
+ # Question quality
+ # Always float (0.0 default) so downstream averaging never hits TypeError.
+ # Check question_scored to know whether it was actually evaluated.
+ question_score: float = 0.0
+ question_scored: bool = False
+
+ # Extraction status
+ extraction_succeeded: bool = True # False when extractor returned failure
+
+ # Composite (replaces PRM-based combined_score in Phase 2+)
+ composite_accuracy: float = 0.0
+
+
+# ---------------------------------------------------------------------------
+# StepChainExtractor
+# ---------------------------------------------------------------------------
+
+
+class StepChainExtractor:
+ """
+ Extracts structured step chains from math solutions using a small LLM.
+
+ For grounded data (fixed GSM8K + MATH training set) the cache avoids
+ calling the LLM at training time — only novel self-play solutions
+ incur a forward pass.
+
+ Cache format: {"": {"steps": [...], "success": bool}}
+ Stores success status so failure entries are not retried and are correctly
+ penalised (not rewarded) by the calculator.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ device: str,
+ cache_path: Optional[str] = None,
+ ) -> None:
+ self.model_name = model_name
+ self.device = device
+ self.cache_path = cache_path
+ # Each entry: {"steps": List[dict], "success": bool}
+ self._cache: Dict[str, Dict[str, Any]] = {}
+ self._model: Any = None
+ self._tokenizer: Any = None
+ # Lifetime extraction counters for calibration reporting
+ self.n_extractions: int = 0
+ self.n_successful: int = 0
+
+ if cache_path:
+ self.load_cache()
+
+ # ── Model loading ────────────────────────────────────────────────────────
+
+ def _ensure_loaded(self) -> None:
+ """Load the small LLM. Call warmup() at startup for eager loading."""
+ if self._model is not None:
+ return
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import torch
+
+ logger.info("Loading step chain extractor: %s", self.model_name)
+ bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_use_double_quant=True,
+ bnb_4bit_quant_type="nf4",
+ )
+ self._tokenizer = AutoTokenizer.from_pretrained(
+ self.model_name, trust_remote_code=True
+ )
+ self._model = AutoModelForCausalLM.from_pretrained(
+ self.model_name,
+ quantization_config=bnb_config,
+ device_map={"": self.device},
+ trust_remote_code=True,
+ )
+ self._model.eval()
+ logger.info("Step chain extractor loaded")
+
+ def warmup(self) -> None:
+ """
+ Eagerly load the extractor model at startup.
+
+ Call this immediately after __init__ in the training script to avoid
+ a 30–60 second stall on the first iteration that triggers live extraction.
+ """
+ self._ensure_loaded()
+
+ # ── Core extraction ─────────────────────────────────────────────────────
+
+ def extract(self, solution: str, question: str = "") -> ExtractionResult:
+ """
+ Return an ExtractionResult for ``solution``.
+
+ Cache key is md5(question + solution) — keying on question prevents
+ collisions when two MATH problems share identical solution text.
+
+ Returns ExtractionResult with success=False on LLM/parse failure, so
+ the calculator can apply a neutral penalty (0.5) instead of incorrectly
+ rewarding the empty chain with score 1.0.
+ """
+ key = _cache_key(question, solution)
+ if key in self._cache:
+ entry = self._cache[key]
+ steps = entry.get("steps") or []
+ success = bool(entry.get("success", True))
+ n_claims = sum(1 for s in steps if s.get("expression") is not None)
+ return ExtractionResult(steps=steps, success=success, n_steps_found=n_claims)
+
+ result = self._call_extractor(solution)
+ self._cache[key] = {"steps": result.steps, "success": result.success}
+ self.n_extractions += 1
+ if result.success:
+ self.n_successful += 1
+ return result
+
+ def _call_extractor(self, solution: str) -> ExtractionResult:
+ """Run a forward pass of the small LLM to extract step chain JSON."""
+ step_bodies = _extract_step_bodies(solution)
+ if not step_bodies:
+ # No Step N: lines — treat as no arithmetic claims (not a failure)
+ return ExtractionResult(steps=[], success=True, n_steps_found=0)
+
+ try:
+ self._ensure_loaded()
+ import torch
+
+ steps_text = "\n".join(
+ f"Step {i + 1}: {body}" for i, body in enumerate(step_bodies)
+ )
+ prompt = CHAIN_EXTRACT_PROMPT.format(steps=steps_text)
+
+ if hasattr(self._tokenizer, "apply_chat_template"):
+ messages = [{"role": "user", "content": prompt}]
+ full_prompt = self._tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ else:
+ full_prompt = prompt
+
+ inputs = self._tokenizer(
+ full_prompt, return_tensors="pt", truncation=True, max_length=2048
+ ).to(self.device)
+
+ with torch.no_grad():
+ output_ids = self._model.generate(
+ **inputs,
+ max_new_tokens=512,
+ temperature=0.1,
+ do_sample=False,
+ pad_token_id=self._tokenizer.eos_token_id,
+ )
+
+ new_tokens = output_ids[0, inputs["input_ids"].shape[1]:]
+ raw_text = self._tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+ json_match = re.search(r"\[.*\]", raw_text, re.DOTALL)
+ if not json_match:
+ logger.debug("Extractor produced no JSON array; raw: %s", raw_text[:200])
+ return ExtractionResult(steps=[], success=False, n_steps_found=0)
+
+ chain = json.loads(json_match.group())
+ if not isinstance(chain, list):
+ return ExtractionResult(steps=[], success=False, n_steps_found=0)
+
+ n_claims = sum(1 for s in chain if isinstance(s, dict) and s.get("expression") is not None)
+ return ExtractionResult(steps=chain, success=True, n_steps_found=n_claims)
+
+ except Exception as exc:
+ logger.debug("StepChainExtractor._call_extractor failed: %s", exc)
+ return ExtractionResult(steps=[], success=False, n_steps_found=0)
+
+ # ── Cache management ────────────────────────────────────────────────────
+
+ def build_cache(self, qa_pairs: List[Tuple[str, str]]) -> None:
+ """
+ Pre-extract step chains for (question, solution) pairs.
+
+ Accepts a list of ``(question, solution)`` tuples. Keying on both
+ prevents cache collisions between MATH problems with identical
+ solution text.
+
+ Used by the offline preprocessing script to warm the cache before
+ training. Skips entries already in cache (resume support).
+ """
+ import tqdm as _tqdm
+ for question, solution in _tqdm.tqdm(qa_pairs, desc="Extracting step chains"):
+ key = _cache_key(question, solution)
+ if key not in self._cache:
+ result = self._call_extractor(solution)
+ self._cache[key] = {"steps": result.steps, "success": result.success}
+
+ def save_cache(self) -> None:
+ if not self.cache_path:
+ return
+ import pathlib
+ pathlib.Path(self.cache_path).parent.mkdir(parents=True, exist_ok=True)
+ with open(self.cache_path, "w", encoding="utf-8") as f:
+ json.dump(self._cache, f)
+ logger.info(
+ "Extraction cache saved: %d entries → %s", len(self._cache), self.cache_path
+ )
+
+ def load_cache(self) -> None:
+ if not self.cache_path:
+ return
+ try:
+ with open(self.cache_path, encoding="utf-8") as f:
+ raw = json.load(f)
+ # Migrate old format (plain list values) to new dict format
+ migrated = 0
+ for k, v in raw.items():
+ if isinstance(v, list):
+ raw[k] = {"steps": v, "success": True}
+ migrated += 1
+ self._cache = raw
+ logger.info(
+ "Extraction cache loaded: %d entries from %s%s",
+ len(self._cache),
+ self.cache_path,
+ f" ({migrated} migrated from old format)" if migrated else "",
+ )
+ except FileNotFoundError:
+ logger.info(
+ "Extraction cache not found at %s — will build on first use",
+ self.cache_path,
+ )
+ except Exception as exc:
+ logger.warning("Failed to load extraction cache: %s", exc)
+
+
+# ---------------------------------------------------------------------------
+# UnifiedAccuracyCalculator
+# ---------------------------------------------------------------------------
+
+
+class UnifiedAccuracyCalculator:
+ """
+ Compute an AccuracyReport for a given solution.
+
+ Phase-gated: activated when math_env.use_chain_scoring is True (Phase 2+).
+ During Phase 2 SELFPLAY_RAMP the calculator also runs in shadow mode
+ (computing scores without affecting rewards) to build calibration data
+ for the data-driven chain-vs-PRM correlation check.
+ """
+
+ def __init__(
+ self,
+ extractor: StepChainExtractor,
+ question_evaluator: Any = None,
+ ) -> None:
+ self.extractor = extractor
+ self.question_evaluator = question_evaluator
+
+ def compute(
+ self,
+ solution: str,
+ gold_answer: Optional[str],
+ question: Optional[str] = None,
+ topic: str = "arithmetic",
+ phase: str = "grounded", # "grounded" or "selfplay"
+ ) -> AccuracyReport:
+ """
+ Compute a unified AccuracyReport for one solution.
+
+ Parameters
+ ----------
+ solution : Full model-generated solution text.
+ gold_answer : Known correct answer (grounded) or None (self-play).
+ question : Question text — also used as cache key discriminator.
+ topic : Problem type tag (passed through for future routing).
+ phase : "grounded" uses gold for correctness; "selfplay" uses chain consistency.
+ """
+ # ── 1. Extract step chain ──────────────────────────────────────────
+ extraction = self.extractor.extract(solution, question=question or "")
+
+ # Handle extraction failure: apply neutral penalty (0.5) rather than
+ # rewarding the empty chain with the default 1.0 score.
+ if not extraction.success:
+ return AccuracyReport(
+ step_arithmetic_score=0.5,
+ step_dependency_score=0.5,
+ chain_integrity_score=0.5,
+ first_failure_step=None,
+ lccp_score=0.0,
+ final_answer_correct=False,
+ final_answer_consistent=False,
+ question_score=0.0,
+ question_scored=False,
+ extraction_succeeded=False,
+ composite_accuracy=0.25, # penalised for unverifiable chain
+ )
+
+ # Handle genuine "no arithmetic claims" (no Step N: lines, or all
+ # narrative steps): treat as neutral, not perfect or failed.
+ if extraction.n_steps_found == 0:
+ arith_score = 0.5
+ dep_score = 0.5
+ lccp = 1.0 # no steps → no failures in prefix
+ else:
+ arith_score, dep_score, lccp, _ = self._verify_chain(
+ extraction.steps
+ )
+
+ chain_steps_parsed = self._parse_chain(extraction.steps)
+ first_failure = self._find_first_failure(chain_steps_parsed)
+ chain_score = 0.6 * arith_score + 0.4 * dep_score
+
+ # ── 2. Final answer ────────────────────────────────────────────────
+ final_raw = _extract_final_answer(solution)
+ final_val = _parse_value(final_raw) if final_raw else None
+
+ # Reconstruct value_registry for consistency check
+ value_registry: Dict[str, float] = {}
+ for sr in chain_steps_parsed:
+ if sr.actual_value is not None:
+ value_registry[sr.produces] = sr.actual_value
+
+ chain_final: Optional[float] = (
+ list(value_registry.values())[-1] if value_registry else None
+ )
+ final_consistent = (
+ abs(final_val - chain_final) < 1e-4
+ if final_val is not None and chain_final is not None
+ else False
+ )
+
+ if phase == "grounded" and gold_answer is not None and final_val is not None:
+ gold_val = _parse_value(gold_answer)
+ final_correct = (
+ abs(final_val - gold_val) < 1e-4
+ if gold_val is not None else False
+ )
+ else:
+ final_correct = final_consistent
+
+ # ── 3. Question quality (self-play only) ──────────────────────────
+ q_score: float = 0.0
+ q_scored: bool = False
+ if phase == "selfplay" and question and self.question_evaluator is not None:
+ try:
+ q_result = self.question_evaluator.evaluate(
+ question=question,
+ solution=solution,
+ consensus_result={
+ "has_majority": final_correct,
+ "consensus_strength": float(chain_score),
+ "primary_matches_majority": final_correct,
+ "answer_diversity": 0,
+ "majority_answer": None,
+ "primary_answer": None,
+ },
+ target_topic=topic,
+ target_difficulty=2.0,
+ )
+ q_score = float(q_result.get("overall_score", 0.0))
+ q_scored = True
+ except Exception as exc:
+ logger.debug("question_evaluator failed in unified calc: %s", exc)
+
+ # ── 4. Composite accuracy ─────────────────────────────────────────
+ if phase == "grounded":
+ composite = (
+ 0.50 * float(final_correct)
+ + 0.30 * chain_score
+ + 0.20 * lccp
+ )
+ else: # selfplay
+ composite = (
+ 0.35 * float(final_correct)
+ + 0.30 * chain_score
+ + 0.15 * lccp
+ + 0.20 * q_score
+ )
+ composite = max(0.0, min(1.0, composite))
+
+ return AccuracyReport(
+ step_arithmetic_score=arith_score,
+ step_dependency_score=dep_score,
+ chain_integrity_score=chain_score,
+ first_failure_step=first_failure,
+ lccp_score=lccp,
+ final_answer_correct=final_correct,
+ final_answer_consistent=final_consistent,
+ question_score=q_score,
+ question_scored=q_scored,
+ extraction_succeeded=True,
+ composite_accuracy=composite,
+ )
+
+ # ── Internal helpers ────────────────────────────────────────────────────
+
+ def _verify_chain(
+ self, raw_chain: List[dict]
+ ) -> Tuple[float, float, float, List[ChainStep]]:
+ """
+ Verify arithmetic and dependencies for a parsed chain.
+
+ Returns (arith_score, dep_score, lccp, chain_steps).
+ """
+ chain_steps = self._parse_chain(raw_chain)
+ value_registry: Dict[str, float] = {}
+ first_failure: Optional[int] = None
+
+ for sr in chain_steps:
+ if sr.expression is None:
+ sr.arithmetic_correct = None
+ continue
+
+ actual = _parse_value(sr.expression)
+ claimed = _parse_value(sr.claimed) if sr.claimed else None
+
+ if actual is not None and claimed is not None:
+ sr.arithmetic_correct = abs(actual - claimed) < 1e-4
+ sr.actual_value = actual
+ else:
+ sr.arithmetic_correct = None
+
+ if sr.uses and actual is not None:
+ dep_ok = True
+ for dep_name in sr.uses:
+ if dep_name in value_registry:
+ dep_ok = dep_ok and _value_used_in_expr(
+ sr.expression, value_registry[dep_name]
+ )
+ sr.dependency_consistent = dep_ok
+
+ if actual is not None:
+ value_registry[sr.produces] = actual
+
+ if sr.arithmetic_correct is False and first_failure is None:
+ first_failure = sr.step
+
+ checked = [s for s in chain_steps if s.arithmetic_correct is not None]
+ dep_checked = [s for s in chain_steps if s.dependency_consistent is not None]
+
+ arith_score = (
+ sum(1.0 for s in checked if s.arithmetic_correct) / len(checked)
+ if checked else 0.5
+ )
+ dep_score = (
+ sum(1.0 for s in dep_checked if s.dependency_consistent) / len(dep_checked)
+ if dep_checked else 0.5
+ )
+
+ lccp = (
+ (first_failure - 1) / len(chain_steps)
+ if first_failure is not None and chain_steps
+ else 1.0
+ )
+ lccp = max(0.0, min(1.0, lccp))
+
+ return arith_score, dep_score, lccp, chain_steps
+
+ @staticmethod
+ def _find_first_failure(chain_steps: List[ChainStep]) -> Optional[int]:
+ for sr in chain_steps:
+ if sr.arithmetic_correct is False:
+ return sr.step
+ return None
+
+ @staticmethod
+ def _parse_chain(raw_chain: List[dict]) -> List[ChainStep]:
+ """Convert raw JSON dicts from the extractor into ChainStep objects."""
+ steps: List[ChainStep] = []
+ for item in raw_chain:
+ if not isinstance(item, dict):
+ continue
+ try:
+ steps.append(ChainStep(
+ step=int(item.get("step", len(steps) + 1)),
+ expression=item.get("expression"),
+ claimed=item.get("claimed"),
+ produces=str(
+ item.get("produces") or f"step_{len(steps) + 1}_result"
+ ),
+ uses=list(item.get("uses") or []),
+ ))
+ except Exception:
+ continue
+ return steps
diff --git a/src/rl/value_network.py b/src/rl/value_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..37ee337865c75d57c7193300daf2bbf843473513
--- /dev/null
+++ b/src/rl/value_network.py
@@ -0,0 +1,189 @@
+"""
+Value Network (Critic) for PPO.
+
+ValueHead wraps a frozen copy of the base language model backbone and
+appends a small MLP to regress a scalar value V(s_t) ∈ ℝ.
+
+Design notes
+------------
+- The backbone is loaded once with bfloat16 to fit on GPU.
+- Only the MLP head (value_head) is updated during training; the
+ backbone can optionally be unfrozen for fine-grained critic learning.
+- The forward pass returns a 1-D tensor of shape (batch_size,) so the
+ caller can do .item() for single inputs.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModel
+
+from src.utils.attn_backend import select_attn_implementation
+
+logger = logging.getLogger(__name__)
+
+
+class ValueHead(nn.Module):
+ """
+ Critic network V_φ(s).
+
+ Architecture
+ ------------
+ backbone (LM encoder, frozen by default)
+ ↓ last-token hidden state [hidden_size]
+ Linear(hidden_size, 256) + ReLU
+ ↓
+ Linear(256, 1)
+ ↓ squeeze → scalar V(s)
+
+ Args:
+ base_model_path : HuggingFace model id or local checkpoint path.
+ freeze_backbone : If True, backbone weights are not updated.
+ Defaults to True (only head is trained).
+ hidden_size : Override backbone hidden size (auto-detected
+ from config when None).
+ """
+
+ def __init__(
+ self,
+ base_model_path: str,
+ freeze_backbone: bool = True,
+ hidden_size: Optional[int] = None,
+ model_device_map: Optional[Any] = "auto",
+ max_memory: Optional[dict] = None,
+ ) -> None:
+ super().__init__()
+
+ logger.info(f"Loading ValueHead backbone from {base_model_path}")
+
+ config = AutoConfig.from_pretrained(
+ base_model_path, trust_remote_code=True
+ )
+ h = hidden_size or config.hidden_size
+
+ # Always load on CPU first to avoid 90% GPU allocation
+ # The caller will move to GPU if needed
+ load_kwargs = {
+ "torch_dtype": torch.bfloat16,
+ "device_map": model_device_map,
+ "low_cpu_mem_usage": True,
+ "trust_remote_code": True,
+ "attn_implementation": select_attn_implementation(),
+ }
+
+ self.backbone = AutoModel.from_pretrained(
+ base_model_path,
+ **load_kwargs,
+ )
+
+ if freeze_backbone:
+ for param in self.backbone.parameters():
+ param.requires_grad_(False)
+ logger.info("Backbone frozen; only ValueHead MLP will be trained.")
+
+ self.value_head = nn.Sequential(
+ nn.Linear(h, 256),
+ nn.ReLU(),
+ nn.Linear(256, 1),
+ )
+
+ # ------------------------------------------------------------------
+ # Forward
+ # ------------------------------------------------------------------
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ """
+ Compute V(s) for a batch of states.
+
+ Args:
+ input_ids : [batch, seq_len]
+ attention_mask : [batch, seq_len] (ones if None)
+
+ Returns:
+ values : [batch] — scalar value estimate per sequence
+ """
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+
+ outputs = self.backbone(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ )
+
+ # Last *non-pad* token (right-padded batches: last valid index per row)
+ last_hidden = outputs.last_hidden_state # [B, T, H]
+ last_idx = attention_mask.long().sum(dim=1) - 1
+ last_idx = last_idx.clamp(min=0)
+ b = torch.arange(last_hidden.size(0), device=last_hidden.device)
+ cls_hidden = last_hidden[b, last_idx].to(self.value_head[0].weight.dtype)
+
+ values = self.value_head(cls_hidden).squeeze(-1) # [B]
+ return values
+
+ @torch.no_grad()
+ def values_at_positions(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ """
+ Compute V(s_t) for many states in a SINGLE backbone forward pass.
+
+ The naive rollout loop calls ``self.value(...)`` once per generated
+ token, which does one full backbone forward over the growing
+ sequence each step — that's O(T^2) work for T tokens. This helper
+ lets the caller run the backbone exactly once on the full
+ trajectory and then pluck hidden states at the positions that
+ correspond to each state s_t.
+
+ For a trajectory with prompt length P and T generated tokens,
+ state s_t (= prompt + generated[:t], t=0..T-1) is a "last token"
+ at position P + t - 1 in the full sequence, so callers pass
+ ``positions = torch.arange(P - 1, P + T - 1)``.
+
+ Args:
+ input_ids:
+ [1, L] full trajectory (prompt + generated). A single
+ un-padded sequence — callers that need batched different-
+ length trajectories should loop over them (cheap because
+ each call is O(L), not O(L^2)).
+ positions:
+ [N] long tensor of indices into the L-axis. Hidden states
+ at these positions will be fed through the value MLP.
+ attention_mask:
+ Optional [1, L] mask. Defaults to all-ones.
+
+ Returns:
+ values: [N] scalar value estimates, one per requested position,
+ on the same device as ``input_ids`` and already in float32
+ (so callers can safely ``.tolist()`` them for the buffer).
+ """
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+
+ outputs = self.backbone(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ )
+ hidden = outputs.last_hidden_state # [1, L, H]
+
+ positions = positions.to(device=hidden.device, dtype=torch.long)
+ # Clamp just in case the caller requests an out-of-range position
+ # (e.g. T=0 edge cases). clamp is a no-op for valid indices.
+ positions = positions.clamp(min=0, max=hidden.size(1) - 1)
+
+ # Gather → [N, H]. Cast to the value_head's weight dtype so
+ # bf16 backbone + fp32 head works regardless of how torch
+ # autocast is configured on the caller side.
+ gathered = hidden[0, positions].to(self.value_head[0].weight.dtype)
+ values = self.value_head(gathered).squeeze(-1).float() # [N]
+ return values
diff --git a/src/sft/solution_format.py b/src/sft/solution_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..432dcbe5c3eb55a01aae745186d61a7cf8614ca0
--- /dev/null
+++ b/src/sft/solution_format.py
@@ -0,0 +1,146 @@
+"""
+Utilities for SymPy-oriented solver output: validation and GSM8K trace cleanup.
+
+Aligned with ``src.agent.math_agent.SOLVER_SYSTEM_PROMPT`` (Step N: / Final Answer:).
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import List, Optional
+
+from sympy.parsing.sympy_parser import parse_expr
+
+from src.sft.sympy_normalize import normalize_for_parse_expr
+
+
+STEP_RE = re.compile(r"^Step\s+(\d+)\s*:", re.IGNORECASE | re.MULTILINE)
+FINAL_RE = re.compile(r"(?im)^Final\s*Answer\s*:\s*([^\n]+?)\s*$")
+
+
+@dataclass
+class FormatCheckResult:
+ ok: bool
+ step_count: int
+ has_final_line: bool
+ final_answer_raw: str
+ sympy_parseable_steps: int
+ sympy_parseable_final: bool
+ errors: List[str]
+
+
+def strip_gsm8k_scratchpads(text: str) -> str:
+ """Remove GSM8K ``<<...>>`` calculator traces; collapse extra spaces."""
+ s = re.sub(r"<<[^>]*>>", "", text)
+ s = re.sub(r"[ \t]+", " ", s)
+ s = re.sub(r"\n{3,}", "\n\n", s)
+ return s.strip()
+
+
+def _step_bodies(text: str) -> List[str]:
+ """Text after each 'Step N:' up to next Step or Final Answer (best-effort)."""
+ lines = text.splitlines()
+ bodies: List[str] = []
+ cur: List[str] = []
+ in_step = False
+ for line in lines:
+ if re.match(r"^\s*Step\s+\d+\s*:", line, re.I):
+ if cur:
+ bodies.append("\n".join(cur).strip())
+ cur = [re.sub(r"^\s*Step\s+\d+\s*:\s*", "", line, flags=re.I)]
+ in_step = True
+ elif re.match(r"^\s*Final\s*Answer\s*:", line, re.I):
+ if cur:
+ bodies.append("\n".join(cur).strip())
+ cur = []
+ in_step = False
+ break
+ elif in_step:
+ cur.append(line)
+ if cur:
+ bodies.append("\n".join(cur).strip())
+ return [b for b in bodies if b]
+
+
+def _sympy_can_parse_fragment(s: str) -> bool:
+ s = s.strip()
+ if not s:
+ return False
+ # Normalize using shared normalizer (handles ^, currency, etc.)
+ s = normalize_for_parse_expr(s)
+ # Take first line or expression-ish segment after last '='
+ chunk = s
+ if "=" in s and "==" not in s:
+ chunk = s.split("=")[-1].strip()
+ chunk = chunk.split()[0] if chunk.split() else chunk
+ try:
+ parse_expr(chunk)
+ return True
+ except Exception:
+ try:
+ parse_expr(s[:200])
+ return True
+ except Exception:
+ return False
+
+
+def validate_sympy_solution_format(
+ text: str,
+ *,
+ require_step_prefix: bool = True,
+ require_final_answer: bool = True,
+ min_steps: int = 1,
+) -> FormatCheckResult:
+ """
+ Check solution text for structural compliance and loose SymPy parseability.
+
+ Steps: at least ``min_steps`` lines starting with ``Step N:``.
+ Final: a line ``Final Answer: ...`` where the RHS should parse with SymPy
+ (integers and simple rationals usually succeed).
+ """
+ errors: List[str] = []
+ steps = STEP_RE.findall(text)
+ step_count = len(steps)
+
+ if require_step_prefix and step_count < min_steps:
+ errors.append(f"expected at least {min_steps} Step N: line(s), found {step_count}")
+
+ m_final = None
+ for m in FINAL_RE.finditer(text):
+ m_final = m
+ has_final = m_final is not None
+ final_raw = m_final.group(1).strip() if m_final else ""
+
+ if require_final_answer and not has_final:
+ errors.append("missing 'Final Answer:' line")
+
+ sympy_final = False
+ if final_raw:
+ try:
+ parse_expr(normalize_for_parse_expr(final_raw))
+ sympy_final = True
+ except Exception:
+ errors.append(f"final answer does not parse as SymPy expr: {final_raw!r}")
+
+ bodies = _step_bodies(text)
+ sympy_parseable_steps = len([b for b in bodies if _sympy_can_parse_fragment(b)])
+
+ ok = len(errors) == 0
+ return FormatCheckResult(
+ ok=ok,
+ step_count=step_count,
+ has_final_line=has_final,
+ final_answer_raw=final_raw,
+ sympy_parseable_steps=sympy_parseable_steps,
+ sympy_parseable_final=sympy_final,
+ errors=errors,
+ )
+
+
+def extract_final_answer_numeric_str(text: str) -> Optional[str]:
+ """Return substring after 'Final Answer:' if present."""
+ m = list(FINAL_RE.finditer(text))
+ if not m:
+ return None
+ return m[-1].group(1).strip()
diff --git a/src/sft/sympy_normalize.py b/src/sft/sympy_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ff3f7da4713980ca2030ff238152b0c4a8cb2fd
--- /dev/null
+++ b/src/sft/sympy_normalize.py
@@ -0,0 +1,168 @@
+"""
+Normalization layer for LLM outputs before SymPy parsing.
+
+This module provides a single, well-tested function to convert common LLM output
+patterns (Unicode operators, currency symbols, implicit styles) into SymPy-friendly
+ASCII Python-like expressions suitable for `sympy.parsing.sympy_parser.parse_expr`.
+
+## Why normalize instead of controlling LLM output?
+
+LLMs generate diverse textual math notation (^, ×, π, commas in numbers, etc.) that
+cannot be reliably controlled at the token level. A deterministic preprocessing layer
+is more robust than trying to force specific character-level outputs during training.
+
+## SymPy parsing context
+
+SymPy's `parse_expr` (docs: https://docs.sympy.org/latest/modules/parsing.html):
+- Uses Python-like expression syntax as the base grammar.
+- Applies **transformations** (token rewrites) before evaluation.
+- Notable transformations:
+ - `standard_transformations`: auto symbol/number conversion, factorial notation.
+ - `convert_xor`: treats `^` as power (not bitwise XOR).
+ - `implicit_multiplication_application`: relaxes syntax (implicit mult, split symbols).
+ - LaTeX is a **separate path** via `sympy.parsing.latex.parse_latex` (experimental).
+
+**Security note:** `parse_expr` uses `eval` internally. Treat LLM outputs as untrusted;
+this module helps but does not sandbox.
+
+## Normalization mapping (categories)
+
+| Category | LLM output | Normalized | Notes |
+|--------------------|----------------------|-------------------|----------------------------------------|
+| Power | `^` | `**` | Python power operator |
+| Multiplication | `×`, `·`, `•` | `*` | Unicode operators → ASCII |
+| Division | `÷` | `/` | Unicode division sign → ASCII |
+| Minus sign | `−` (U+2212) | `-` | Typography minus → ASCII hyphen-minus |
+| Comparisons | `≤`, `≥`, `≠` | `<=`, `>=`, `!=` | Relational operators (if parsing them) |
+| Constants | `π` | `pi` | Greek letter → SymPy symbol name |
+| Thousands sep | `80,000` | `80000` | Remove commas in numeric literals |
+| Currency | `$`, `€`, `£` | (removed) | Strip before parsing numeric tails |
+| Extra whitespace | multiple spaces/tabs | single space | Collapse for cleaner parsing |
+
+Not handled (by design):
+- **LaTeX** (`\\frac`, `\\sqrt`, etc.): route to `parse_latex` separately if needed.
+- **Natural language prefix** ("Janet sells 16-3-4=9 eggs"): caller extracts math tail first.
+- **Grouping `[` `]`**: context-dependent; avoid substituting without semantic analysis.
+
+Version lock: sympy==1.14.0 (line 84 in requirements.txt at time of writing).
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def normalize_for_parse_expr(text: str) -> str:
+ """
+ Normalize LLM-generated math text for SymPy's `parse_expr`.
+
+ Converts common Unicode operators, currency symbols, and formatting quirks
+ into ASCII Python-like syntax. This is the single source of truth for
+ string preprocessing before SymPy parsing in this project.
+
+ Parameters
+ ----------
+ text : str
+ Raw string (potentially mixed prose and math from LLM).
+
+ Returns
+ -------
+ str
+ Normalized ASCII expression.
+
+ Examples
+ --------
+ >>> normalize_for_parse_expr("2^3")
+ '2**3'
+ >>> normalize_for_parse_expr("16 × 3 − 4")
+ '16 * 3 - 4'
+ >>> normalize_for_parse_expr("$2,500")
+ '2500'
+ >>> normalize_for_parse_expr("π/2")
+ 'pi/2'
+ """
+ s = text.strip()
+
+ # Power: ^ → **
+ s = s.replace("^", "**")
+
+ # Multiplication: Unicode operators → *
+ s = s.replace("×", "*")
+ s = s.replace("·", "*")
+ s = s.replace("•", "*")
+ s = s.replace("\u00d7", "*") # U+00D7 MULTIPLICATION SIGN (×)
+ s = s.replace("\u22c5", "*") # U+22C5 DOT OPERATOR (⋅)
+ s = s.replace("\u2022", "*") # U+2022 BULLET (•)
+
+ # Division: Unicode ÷ → /
+ s = s.replace("÷", "/")
+ s = s.replace("\u00f7", "/") # U+00F7 DIVISION SIGN
+
+ # Minus: typography minus (U+2212) → ASCII hyphen-minus
+ s = s.replace("\u2212", "-") # U+2212 MINUS SIGN (−)
+
+ # Comparison operators (if ever parsing relations)
+ s = s.replace("≤", "<=")
+ s = s.replace("≥", ">=")
+ s = s.replace("≠", "!=")
+ s = s.replace("\u2264", "<=") # U+2264 LESS-THAN OR EQUAL TO
+ s = s.replace("\u2265", ">=") # U+2265 GREATER-THAN OR EQUAL TO
+ s = s.replace("\u2260", "!=") # U+2260 NOT EQUAL TO
+
+ # Greek constants: π → pi (SymPy symbol name)
+ s = s.replace("π", "pi")
+ s = s.replace("\u03c0", "pi") # U+03C0 GREEK SMALL LETTER PI
+
+ # Currency symbols: remove (caller typically strips or segments numeric tails)
+ s = re.sub(r"[$€£¥₹]", "", s)
+
+ # Thousands separators in numbers: 80,000 → 80000
+ # Match comma only between digits in a numeric context
+ s = re.sub(r"(?<=\d),(?=\d{3}\b)", "", s)
+
+ # Spoken "times" with ASCII letter x (grade-school / LLM): "4 x 90" must not
+ # become 4*x*90 in SymPy (x parsed as a symbol → false failures on chains).
+ # Only between digit and digit or digit and '('.
+ s = re.sub(r"(?<=\d)\s+[xX]\s+(?=\d|\()", "*", s)
+
+ # Collapse multiple spaces/tabs to single space
+ s = re.sub(r"[ \t]+", " ", s)
+
+ # Collapse excessive newlines (keep at most double)
+ s = re.sub(r"\n{3,}", "\n\n", s)
+
+ return s.strip()
+
+
+def prefer_arithmetic_tail(text: str) -> str:
+ """
+ Return substring starting from the first digit (if present), else full text.
+
+ Useful when LLM outputs mix natural language with equations, e.g.:
+ "Janet sells 16 - 3 - 4 = 9 eggs every day"
+ This heuristic extracts "16 - 3 - 4 = 9 eggs every day" (digit onward),
+ reducing risk of English words being parsed as symbols when implicit
+ multiplication transformations are enabled.
+
+ Parameters
+ ----------
+ text : str
+ Potentially mixed prose + math.
+
+ Returns
+ -------
+ str
+ Substring from first digit onward, or original if no digit.
+
+ Examples
+ --------
+ >>> prefer_arithmetic_tail("Janet sells 16-3-4=9")
+ '16-3-4=9'
+ >>> prefer_arithmetic_tail("no digits here")
+ 'no digits here'
+ """
+ s = normalize_for_parse_expr(text)
+ m = re.search(r"\d", s)
+ if m:
+ return s[m.start() :].strip()
+ return s.strip()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..adcd2f15a3811d507ff1c4ac5a6de08ab4cb0dab
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,5 @@
+"""Utility modules for the project."""
+
+from .csv_logger import CSVLogger
+
+__all__ = ["CSVLogger"]
diff --git a/src/utils/attn_backend.py b/src/utils/attn_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..ede363ccbed54fadfec0842cabc7de5f270c50b6
--- /dev/null
+++ b/src/utils/attn_backend.py
@@ -0,0 +1,122 @@
+"""
+Attention-backend selection helper.
+
+Picks the fastest attention implementation available at runtime, with a
+safe fallback ladder:
+
+ flash_attention_2 (package `flash-attn`)
+ ↓ not installed / incompatible
+ sdpa (torch.nn.functional.scaled_dot_product_attention)
+ ↓ not supported by this model
+ eager (stock HF implementation, slowest)
+
+Flash-Attn 2 is a big deal for this codebase:
+
+* Training (PPO backward pass):
+ - Turns attention activation memory from O(T^2) to O(T) per layer.
+ For B=8, T=500, H=12, 28 layers of bf16 Qwen2 that is a ~1.3 GB
+ saving on the backward graph, and it scales quadratically with T.
+ - Fused forward+backward is 1.5-2.5x faster than SDPA on Ampere+.
+
+* Rollouts (KV-cached `.generate()`):
+ - Each generation step does an incremental attention over the full
+ KV cache. Flash is faster here too, and its lower memory footprint
+ lets us keep larger prompts cached.
+
+The helper memoizes its answer so we only probe the `flash_attn` import
+once per process.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+# Module-level cache. Set to None the first time select_attn_implementation
+# runs; subsequent calls return the cached string.
+_SELECTED: Optional[str] = None
+
+
+def select_attn_implementation(
+ prefer: Optional[str] = None,
+ log_once: bool = True,
+) -> str:
+ """
+ Pick the best attention backend string for
+ `AutoModel{,ForCausalLM}.from_pretrained(..., attn_implementation=...)`.
+
+ Args:
+ prefer:
+ If set, try this backend first. Useful for forcing "sdpa"
+ in environments where flash-attn is installed but broken
+ (rare, but we've seen it on some vast.ai images).
+ log_once:
+ When True, emit one INFO log line the first time we pick,
+ then be silent on subsequent calls.
+
+ Returns:
+ One of: "flash_attention_2", "sdpa", "eager".
+ """
+ global _SELECTED
+
+ if _SELECTED is not None:
+ return _SELECTED
+
+ candidates = []
+ if prefer is not None:
+ candidates.append(prefer)
+ # Canonical preference order.
+ for name in ("flash_attention_2", "sdpa", "eager"):
+ if name not in candidates:
+ candidates.append(name)
+
+ chosen = "eager"
+ for name in candidates:
+ if name == "flash_attention_2":
+ if _flash_attention_2_available():
+ chosen = "flash_attention_2"
+ break
+ elif name == "sdpa":
+ # SDPA ships with torch >= 2.0 and is always importable.
+ # The HF model class may still reject it for non-supported
+ # architectures, but every modern Llama/Qwen supports it.
+ chosen = "sdpa"
+ break
+ elif name == "eager":
+ chosen = "eager"
+ break
+
+ _SELECTED = chosen
+ if log_once:
+ logger.info(
+ "Attention backend selected: %s%s",
+ chosen,
+ "" if chosen == "flash_attention_2" else
+ " (flash-attn not available — install `flash-attn` for "
+ "~1.5-2.5x faster attention and O(T) memory)",
+ )
+ return chosen
+
+
+def _flash_attention_2_available() -> bool:
+ """
+ Return True iff `flash_attn` is importable and its version is >=2.0.
+
+ We don't run a functional test; HF will raise a clear error at
+ model-load time if the installed build is incompatible with the
+ model's head dim / dtype, and we'd rather surface that than silently
+ fall back and waste hours of training at 1x speed.
+ """
+ try:
+ import flash_attn # noqa: F401
+ except Exception:
+ return False
+ version = getattr(flash_attn, "__version__", "0.0")
+ try:
+ major = int(str(version).split(".", 1)[0])
+ except ValueError:
+ return False
+ return major >= 2
diff --git a/src/utils/csv_logger.py b/src/utils/csv_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0d77ec17d557cfc05b030c80c0a831cc2175d4
--- /dev/null
+++ b/src/utils/csv_logger.py
@@ -0,0 +1,217 @@
+"""
+CSV Logger for training metrics.
+
+Replaces wandb logging with simple CSV files that can be viewed later.
+"""
+
+import csv
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+
+class CSVLogger:
+ """
+ Logger that writes metrics to CSV files for easy viewing and analysis.
+
+ Each run creates a timestamped directory with:
+ - metrics.csv: Main training metrics (key metrics only)
+ - detailed_metrics/: Detailed metrics per iteration (JSON)
+ - config.json: Configuration parameters
+ - summary.json: Final summary statistics
+ """
+
+ # Define which metrics to include in the main CSV (keep it concise)
+ KEY_METRICS = {
+ "iteration", "step", "timestamp",
+ # Training metrics
+ "train/policy_loss", "train/value_loss", "train/entropy",
+ "train/approx_kl", "train/clip_fraction",
+ # Evaluation metrics
+ "eval/accuracy", "eval/correct", "eval/total",
+ # Buffer/rollout metrics
+ "rollout/mean_reward", "rollout/num_trajectories", "rollout/mean_length",
+ # Curriculum metrics (high-level)
+ "curriculum/topic_diversity", "curriculum/avg_difficulty",
+ "curriculum/avg_novelty", "curriculum/replay_ratio",
+ # Performance metrics
+ "perf/rollout_time", "perf/train_time", "perf/total_time",
+ "perf/tokens_per_second",
+ # Consensus metrics
+ "consensus/rate", "consensus/answer_diversity",
+ # Disk/resource metrics
+ "system/disk_free_gb", "system/gpu_util_percent",
+ }
+
+ def __init__(
+ self,
+ project: str = "training",
+ run_name: Optional[str] = None,
+ log_dir: str = "logs",
+ config: Optional[Dict[str, Any]] = None,
+ log_detailed: bool = True,
+ ):
+ """
+ Initialize CSV logger.
+
+ Args:
+ project: Project name (used as subdirectory)
+ run_name: Optional run name, defaults to timestamp
+ log_dir: Base directory for logs
+ config: Optional configuration dict to save
+ log_detailed: If True, save full metrics as JSON per iteration
+ """
+ self.project = project
+ self.run_name = run_name or f"run_{datetime.now():%Y%m%d_%H%M%S}"
+ self.log_detailed = log_detailed
+
+ # Create log directory
+ self.log_path = Path(log_dir) / project / self.run_name
+ self.log_path.mkdir(parents=True, exist_ok=True)
+
+ if self.log_detailed:
+ self.detailed_path = self.log_path / "detailed_metrics"
+ self.detailed_path.mkdir(exist_ok=True)
+
+ # Initialize metrics file
+ self.metrics_file = self.log_path / "metrics.csv"
+ self.metrics_writer = None
+ self.metrics_handle = None
+ self.fieldnames: List[str] = []
+ self.step_count = 0
+
+ # Save config
+ if config:
+ config_file = self.log_path / "config.json"
+ with open(config_file, "w") as f:
+ json.dump(config, f, indent=2, default=str)
+
+ print(f"CSV Logger initialized: {self.log_path}")
+
+ def log(self, metrics: Dict[str, Any], step: Optional[int] = None):
+ """
+ Log metrics to CSV file (only key metrics) and optionally full metrics to JSON.
+
+ Args:
+ metrics: Dictionary of metric names and values
+ step: Optional step/iteration number
+ """
+ if step is None:
+ step = self.step_count
+ self.step_count += 1
+
+ # Save full detailed metrics to JSON if enabled
+ if self.log_detailed:
+ detailed_file = self.detailed_path / f"step_{step:04d}.json"
+ with open(detailed_file, "w") as f:
+ json.dump(metrics, f, indent=2, default=str)
+
+ # Flatten nested dicts
+ flat_metrics = self._flatten_dict(metrics)
+ flat_metrics["step"] = step
+ flat_metrics["timestamp"] = datetime.now().isoformat()
+
+ # Filter to only key metrics for CSV
+ csv_metrics = {k: v for k, v in flat_metrics.items()
+ if k in self.KEY_METRICS or any(k.startswith(prefix) for prefix in ["iteration"])}
+
+ # Initialize CSV writer if needed
+ if self.metrics_writer is None:
+ # Determine initial fieldnames from key metrics
+ self.fieldnames = ["step", "timestamp"] + sorted(
+ [k for k in csv_metrics.keys() if k not in ["step", "timestamp"]]
+ )
+ self.metrics_handle = open(self.metrics_file, "w", newline="")
+ self.metrics_writer = csv.DictWriter(
+ self.metrics_handle,
+ fieldnames=self.fieldnames,
+ extrasaction="ignore"
+ )
+ self.metrics_writer.writeheader()
+
+ # Add any new fields that match our key metrics
+ new_fields = [k for k in csv_metrics.keys() if k not in self.fieldnames]
+ if new_fields:
+ self._add_columns(new_fields)
+
+ # Write row
+ self.metrics_writer.writerow(csv_metrics)
+ self.metrics_handle.flush()
+
+ def _flatten_dict(self, d: Dict[str, Any], parent_key: str = "", sep: str = "/") -> Dict[str, Any]:
+ """
+ Flatten nested dictionary using separator.
+
+ Example: {"train": {"loss": 0.5}} -> {"train/loss": 0.5}
+ """
+ items = []
+ for k, v in d.items():
+ new_key = f"{parent_key}{sep}{k}" if parent_key else k
+ if isinstance(v, dict):
+ items.extend(self._flatten_dict(v, new_key, sep=sep).items())
+ else:
+ # Convert to JSON string if not a simple type
+ if isinstance(v, (list, tuple)):
+ v = json.dumps(v)
+ elif not isinstance(v, (str, int, float, bool, type(None))):
+ v = str(v)
+ items.append((new_key, v))
+ return dict(items)
+
+ def _add_columns(self, new_fields: List[str]):
+ """Add new columns to existing CSV by rewriting it."""
+ self.fieldnames.extend(new_fields)
+
+ # Read existing data
+ self.metrics_handle.close()
+ existing_data = []
+ if self.metrics_file.exists():
+ with open(self.metrics_file, "r") as f:
+ reader = csv.DictReader(f)
+ existing_data = list(reader)
+
+ # Rewrite with new fieldnames
+ self.metrics_handle = open(self.metrics_file, "w", newline="")
+ self.metrics_writer = csv.DictWriter(
+ self.metrics_handle,
+ fieldnames=self.fieldnames,
+ extrasaction="ignore"
+ )
+ self.metrics_writer.writeheader()
+ for row in existing_data:
+ self.metrics_writer.writerow(row)
+
+ def save_summary(self, summary: Dict[str, Any]):
+ """
+ Save a summary dictionary to JSON.
+
+ Args:
+ summary: Summary statistics or final results
+ """
+ summary_file = self.log_path / "summary.json"
+ with open(summary_file, "w") as f:
+ json.dump(summary, f, indent=2, default=str)
+
+ def save_artifact(self, name: str, data: Any):
+ """
+ Save arbitrary data as JSON artifact.
+
+ Args:
+ name: Artifact name (will be used as filename)
+ data: Data to save (must be JSON serializable)
+ """
+ artifact_file = self.log_path / f"{name}.json"
+ with open(artifact_file, "w") as f:
+ json.dump(data, f, indent=2, default=str)
+
+ def finish(self):
+ """Close logger and clean up resources."""
+ if self.metrics_handle:
+ self.metrics_handle.close()
+ print(f"Logs saved to: {self.log_path}")
+
+ def __del__(self):
+ """Ensure file handle is closed."""
+ if self.metrics_handle and not self.metrics_handle.closed:
+ self.metrics_handle.close()
diff --git a/train_grpo.ipynb b/train_grpo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..667d787bde4a72eaf3208044393a1929accc5339
--- /dev/null
+++ b/train_grpo.ipynb
@@ -0,0 +1,1153 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "md-header",
+ "metadata": {},
+ "source": [
+ "# AxiomForgeAI — GRPO Training\n",
+ "\n",
+ "Training loop structured around the classic RL interface:\n",
+ "\n",
+ "```\n",
+ "env.reset(qa) → start episode, receive question\n",
+ "env.step(action)→ submit solution, receive reward + feedback\n",
+ "env.state → inspect episode metadata\n",
+ "env.close() → persist curriculum, release resources\n",
+ "```\n",
+ "\n",
+ "All scoring, curriculum management, and reward computation are handled inside\n",
+ "`AxiomforgeaiEnvironment`. The notebook owns model loading, solution generation,\n",
+ "GRPO loss, and optimisation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-imports",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Standard library ──────────────────────────────────────────────────────────\n",
+ "from __future__ import annotations\n",
+ "\n",
+ "import argparse, copy, csv, hashlib, json, logging, random, re\n",
+ "import shutil, sys, time, types\n",
+ "from collections import defaultdict\n",
+ "from datetime import datetime\n",
+ "from enum import Enum, auto as _auto\n",
+ "from pathlib import Path\n",
+ "from typing import Any, Dict, List, Optional, Tuple\n",
+ "\n",
+ "# ── Third-party ───────────────────────────────────────────────────────────────\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import torch.nn.functional as F\n",
+ "from peft import PeftModel\n",
+ "from tqdm.auto import tqdm\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+ "\n",
+ "# Ensure the repo root is always on sys.path regardless of the kernel's cwd.\n",
+ "_REPO_ROOT = Path(__file__).resolve().parent if \"__file__\" in dir() else Path.cwd()\n",
+ "if str(_REPO_ROOT) not in sys.path:\n",
+ " sys.path.insert(0, str(_REPO_ROOT))\n",
+ "\n",
+ "# ── RL Environment (reset / step / state / close) ───────────────────────────\n",
+ "from server.AxiomForgeAI_environment import AxiomforgeaiEnvironment\n",
+ "from models import AxiomforgeaiAction\n",
+ "\n",
+ "# ── Existing utilities from scripts/ and src/ ────────────────────────────────\n",
+ "from scripts.convert_gsm8k_to_sft import parse_gsm8k_answer\n",
+ "from scripts.eval_sft_inference import evaluate_gsm8k\n",
+ "from src.rl.prm_scorer import ProcessRewardScorer\n",
+ "from src.rl.math_environment_curriculum import CurriculumMathEnvironment\n",
+ "from src.rl.unified_accuracy import StepChainExtractor, UnifiedAccuracyCalculator\n",
+ "from src.rl.llm_question_classifier import LLMQuestionClassifier\n",
+ "from src.config.prompts import create_generator_messages\n",
+ "from src.sft.solution_format import extract_final_answer_numeric_str\n",
+ "from src.utils.attn_backend import select_attn_implementation\n",
+ "from src.utils.csv_logger import CSVLogger\n",
+ "\n",
+ "logging.basicConfig(\n",
+ " level=logging.INFO,\n",
+ " format=\"%(asctime)s %(levelname)-8s %(name)s - %(message)s\",\n",
+ ")\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "if torch.cuda.is_available():\n",
+ " torch.set_float32_matmul_precision(\"high\")\n",
+ " torch.backends.cuda.matmul.allow_tf32 = True\n",
+ " torch.backends.cudnn.allow_tf32 = True\n",
+ " torch.backends.cudnn.benchmark = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-config",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Training configuration ────────────────────────────────────────────────────\n",
+ "# Edit these values before running. Every key matches the corresponding\n",
+ "# CLI flag in scripts/run_grpo_training.py for compatibility.\n",
+ "\n",
+ "args = argparse.Namespace(\n",
+ " # ── Paths ─────────────────────────────────────────────────────────────────\n",
+ " base_model = \"checkpoints/dual_task_v1\",\n",
+ " output_dir = \"checkpoints/grpo\",\n",
+ " gsm8k_data = \"data/sft/gsm8k_sft.jsonl\",\n",
+ " eval_data_path = \"data/sft/gsm8k_test.jsonl\",\n",
+ " math_data = None,\n",
+ " extraction_cache = \"data/extraction_cache.json\",\n",
+ " run_name = None, # auto-set to grpo_\n",
+ "\n",
+ " # ── Training scale ────────────────────────────────────────────────────────\n",
+ " num_iterations = 60,\n",
+ " questions_per_iter = 20,\n",
+ " group_size = 10, # K solutions per question (GRPO group)\n",
+ " q_group_size = 2, # K_q question candidates for two-phase self-play\n",
+ "\n",
+ " # ── Optimiser ─────────────────────────────────────────────────────────────\n",
+ " learning_rate = 5e-6,\n",
+ " max_grad_norm = 0.5,\n",
+ " kl_coef = 0.06,\n",
+ " clip_eps = 0.2,\n",
+ " warmup_iters = 8,\n",
+ " min_lr_ratio = 0.1,\n",
+ "\n",
+ " # ── Generation ────────────────────────────────────────────────────────────\n",
+ " max_new_tokens = 1000,\n",
+ " temperature = 0.8,\n",
+ " overlong_filter = True,\n",
+ "\n",
+ " # ── Dataset mixing (GSM8K → MATH curriculum ramp) ─────────────────────────\n",
+ " math_mix_ratio = 0.30, # MATH fraction at ramp start\n",
+ " math_mix_ratio_late = 0.50, # MATH fraction after ramp\n",
+ " math_ramp_start = 18, # iteration at which MATH mix starts increasing\n",
+ " math_max_difficulty = 3,\n",
+ " difficulty_alpha = 3.5, # Zipf-style sampling; higher → more hard questions\n",
+ "\n",
+ " # ── Evaluation ────────────────────────────────────────────────────────────\n",
+ " eval_every = 5,\n",
+ " eval_max_samples = 150,\n",
+ " eval_max_new_tokens = 1000,\n",
+ " eval_pass_at_k = 0,\n",
+ " skip_initial_eval = False,\n",
+ "\n",
+ " # ── PRM (Process Reward Model) ────────────────────────────────────────────\n",
+ " use_prm = True,\n",
+ " prm_model = \"Qwen/Qwen2.5-Math-PRM-7B\",\n",
+ "\n",
+ " # ── Chain / unified accuracy extractor ───────────────────────────────────\n",
+ " extractor_model = \"Qwen/Qwen2.5-0.5B-Instruct\",\n",
+ "\n",
+ " # ── Checkpointing ─────────────────────────────────────────────────────────\n",
+ " save_every = 5,\n",
+ " keep_last = 4,\n",
+ "\n",
+ " # ── Self-play phase curriculum ────────────────────────────────────────────\n",
+ " # Phase 1 (GROUNDED_ONLY): grounded-only until min_warmup iters pass AND\n",
+ " # grounded accuracy ≥ selfplay_gt_thresh AND step accuracy ≥ selfplay_step_thresh\n",
+ " # Phase 2 (SELFPLAY_RAMP): linearly ramp self-play over selfplay_ramp_iters\n",
+ " # Phase 3 (CONTINUOUS): stable mix; falls back to grounded if quality drops\n",
+ " self_play_ratio = 0.70, # target self-play fraction in Phase 3\n",
+ " min_warmup = 12, # minimum grounded-only iterations before SP\n",
+ " selfplay_gt_thresh = 0.65, # gt_match_rate required to unlock self-play\n",
+ " selfplay_grounded_thresh= 0.65, # grounded accuracy required to unlock self-play\n",
+ " selfplay_step_thresh = 0.68, # step-level accuracy threshold\n",
+ " selfplay_ramp_iters = 28, # iterations to ramp from 0 → self_play_ratio\n",
+ " grounded_floor = 0.55, # below this grounded acc → suspend self-play\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-infra",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Run identity + directory layout ──────────────────────────────────────────\n",
+ "run_name = args.run_name or f\"grpo_{datetime.now():%Y%m%d_%H%M%S}\"\n",
+ "out_dir = Path(args.output_dir) / run_name\n",
+ "log_dir = Path(\"logs\") / \"grpo\" / run_name\n",
+ "out_dir.mkdir(parents=True, exist_ok=True)\n",
+ "log_dir.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "# ── Console mirror (TeeStream) ────────────────────────────────────────────────\n",
+ "class TeeStream:\n",
+ " \"\"\"Mirrors every write to a terminal stream into a log file.\"\"\"\n",
+ " def __init__(self, primary, secondary):\n",
+ " self.primary, self.secondary = primary, secondary\n",
+ " def write(self, data):\n",
+ " self.primary.write(data); self.secondary.write(data); return len(data)\n",
+ " def flush(self):\n",
+ " self.primary.flush(); self.secondary.flush()\n",
+ " def isatty(self):\n",
+ " return getattr(self.primary, \"isatty\", lambda: False)()\n",
+ " def fileno(self):\n",
+ " return self.primary.fileno()\n",
+ "\n",
+ "console_log_path = log_dir / \"console_output.log\"\n",
+ "_console_log_file = console_log_path.open(\"a\", encoding=\"utf-8\", buffering=1)\n",
+ "\n",
+ "def _add_file_logging(path: Path) -> logging.FileHandler:\n",
+ " fh = logging.FileHandler(path, mode=\"a\", encoding=\"utf-8\")\n",
+ " fh.setLevel(logging.DEBUG)\n",
+ " fh.setFormatter(logging.Formatter(\"%(asctime)s %(levelname)-8s %(name)s - %(message)s\"))\n",
+ " logging.getLogger().addHandler(fh)\n",
+ " return fh\n",
+ "\n",
+ "_file_handler = _add_file_logging(console_log_path)\n",
+ "_orig_stdout = sys.stdout\n",
+ "_orig_stderr = sys.stderr\n",
+ "sys.stdout = TeeStream(_orig_stdout, _console_log_file)\n",
+ "sys.stderr = TeeStream(_orig_stderr, _console_log_file)\n",
+ "\n",
+ "# ── Live CSV metrics writer (via CSVLogger) ───────────────────────────────────\n",
+ "# CSVLogger writes key metrics to metrics.csv and full metrics as per-step JSON\n",
+ "# under logs/grpo//detailed_metrics/step_NNNN.json\n",
+ "_csv_logger = CSVLogger(\n",
+ " project=\"grpo\",\n",
+ " run_name=run_name,\n",
+ " log_dir=\"logs\",\n",
+ " config=vars(args),\n",
+ " log_detailed=True,\n",
+ ")\n",
+ "\n",
+ "def _append_metrics_csv(row: Dict[str, Any], step: Optional[int] = None) -> None:\n",
+ " \"\"\"Write one row of metrics via CSVLogger (key metrics → CSV, all → JSON).\"\"\"\n",
+ " _csv_logger.log(row, step=step)\n",
+ "\n",
+ "# ── Teardown (atexit + explicit) ──────────────────────────────────────────────\n",
+ "def _teardown() -> None:\n",
+ " sys.stdout = _orig_stdout\n",
+ " sys.stderr = _orig_stderr\n",
+ " logging.getLogger().removeHandler(_file_handler)\n",
+ " if not getattr(_file_handler.stream, \"closed\", False): _file_handler.close()\n",
+ " if not _console_log_file.closed: _console_log_file.close()\n",
+ " _csv_logger.finish()\n",
+ "\n",
+ "import atexit; atexit.register(_teardown)\n",
+ "\n",
+ "random.seed(42); np.random.seed(42); torch.manual_seed(42)\n",
+ "\n",
+ "logger.info(\"Run: %s | out=%s | log=%s\", run_name, out_dir, log_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-model",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Device + attention backend ────────────────────────────────────────────────\n",
+ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
+ "attn_impl = select_attn_implementation()\n",
+ "logger.info(\"Device: %s | attn: %s\", device, attn_impl)\n",
+ "if torch.cuda.is_available():\n",
+ " _g = torch.cuda.get_device_properties(0)\n",
+ " logger.info(\"GPU: %s | %.1f GB | sm_%d%d\", _g.name, _g.total_memory/1e9, _g.major, _g.minor)\n",
+ "\n",
+ "# ── Policy model ──────────────────────────────────────────────────────────────\n",
+ "logger.info(\"Loading model from %s ...\", args.base_model)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)\n",
+ "if tokenizer.pad_token is None:\n",
+ " tokenizer.pad_token = tokenizer.eos_token\n",
+ "tokenizer.padding_side = \"right\"\n",
+ "\n",
+ "# Patch missing chat_template (common in SFT adapter checkpoints)\n",
+ "if tokenizer.chat_template is None:\n",
+ " _base_name = \"Qwen/Qwen2.5-Math-1.5B-Instruct\"\n",
+ " _meta = Path(args.base_model) / \"pipeline_meta.json\"\n",
+ " if _meta.exists():\n",
+ " _base_name = json.loads(_meta.read_text(encoding=\"utf-8\")).get(\"base_model\", _base_name)\n",
+ " try:\n",
+ " _bt = AutoTokenizer.from_pretrained(_base_name, trust_remote_code=True)\n",
+ " if _bt.chat_template: tokenizer.chat_template = _bt.chat_template\n",
+ " logger.info(\"Chat template loaded from %s\", _base_name)\n",
+ " except Exception as _e:\n",
+ " logger.warning(\"Could not load chat template: %s\", _e)\n",
+ "\n",
+ "# Patch missing tensor_parallel shim (PEFT ≤ 0.12)\n",
+ "if \"transformers.integrations.tensor_parallel\" not in sys.modules:\n",
+ " sys.modules[\"transformers.integrations.tensor_parallel\"] = types.ModuleType(\"tensor_parallel\")\n",
+ "\n",
+ "load_kwargs = dict(\n",
+ " torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,\n",
+ " device_map={\"\":device}, trust_remote_code=True, attn_implementation=attn_impl)\n",
+ "\n",
+ "model_path = Path(args.base_model)\n",
+ "if (model_path / \"adapter_config.json\").exists():\n",
+ " _meta_p = model_path / \"pipeline_meta.json\"\n",
+ " _base_w = \"Qwen/Qwen2.5-Math-1.5B-Instruct\"\n",
+ " if _meta_p.exists():\n",
+ " _base_w = json.loads(_meta_p.read_text(encoding=\"utf-8\")).get(\"base_model\", _base_w)\n",
+ " logger.info(\"PEFT adapter — loading base %s then merging %s\", _base_w, args.base_model)\n",
+ " _base = AutoModelForCausalLM.from_pretrained(_base_w, **load_kwargs)\n",
+ " model = PeftModel.from_pretrained(_base, args.base_model).merge_and_unload().to(device)\n",
+ "else:\n",
+ " model = AutoModelForCausalLM.from_pretrained(args.base_model, **load_kwargs)\n",
+ "\n",
+ "for p in model.parameters(): p.requires_grad_(True)\n",
+ "\n",
+ "# Flash-Attn 2 makes gradient checkpointing redundant (same O(T) memory)\n",
+ "if attn_impl != \"flash_attention_2\":\n",
+ " model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={\"use_reentrant\": False})\n",
+ " if hasattr(model, \"config\"): model.config.use_cache = False\n",
+ " logger.info(\"Gradient checkpointing enabled.\")\n",
+ "\n",
+ "n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+ "n_total = sum(p.numel() for p in model.parameters())\n",
+ "logger.info(\"Parameters: %s / %s trainable (%.1f%%)\",\n",
+ " f\"{n_trainable:,}\", f\"{n_total:,}\", 100*n_trainable/max(n_total,1))\n",
+ "\n",
+ "# ── Frozen reference policy for KL penalty ────────────────────────────────────\n",
+ "ref_model: Optional[AutoModelForCausalLM] = None\n",
+ "if args.kl_coef > 0.0:\n",
+ " ref_model = copy.deepcopy(model)\n",
+ " ref_model.requires_grad_(False).eval()\n",
+ " logger.info(\"Reference policy ready (kl_coef=%.4f).\", args.kl_coef)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-env",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Load training data ────────────────────────────────────────────────────────\n",
+ "def _load_jsonl_qa(path: str) -> List[Dict[str, str]]:\n",
+ " \"\"\"Load {question, gold_final} pairs from a JSONL file.\"\"\"\n",
+ " pairs: List[Dict[str, str]] = []\n",
+ " p = Path(path)\n",
+ " if not p.exists():\n",
+ " logger.warning(\"Data file not found: %s\", path); return pairs\n",
+ " with p.open(encoding=\"utf-8\") as f:\n",
+ " for line in f:\n",
+ " line = line.strip()\n",
+ " if not line: continue\n",
+ " try: rec = json.loads(line)\n",
+ " except json.JSONDecodeError: continue\n",
+ " if \"question\" in rec and \"answer\" in rec:\n",
+ " q = rec[\"question\"].strip()\n",
+ " _, g = parse_gsm8k_answer(str(rec[\"answer\"]))\n",
+ " elif \"messages\" in rec:\n",
+ " q, asst = \"\", \"\"\n",
+ " for msg in rec[\"messages\"]:\n",
+ " if msg.get(\"role\") == \"user\" and not q: q = msg.get(\"content\",\"\").strip()\n",
+ " if msg.get(\"role\") == \"assistant\" and not asst: asst = msg.get(\"content\",\"\")\n",
+ " if \"Problem:\" in q: q = q.split(\"Problem:\",1)[1].strip()\n",
+ " g = (extract_final_answer_numeric_str(asst) or \"\").strip()\n",
+ " else:\n",
+ " continue\n",
+ " if q and g: pairs.append({\"question\": q, \"gold_final\": g})\n",
+ " logger.info(\"Loaded %d QA pairs from %s\", len(pairs), path)\n",
+ " return pairs\n",
+ "\n",
+ "def _load_math_dataset(\n",
+ " local_path: Optional[str] = None,\n",
+ " cache: str = \"data/math/math_numeric.jsonl\",\n",
+ " max_diff: int = 3,\n",
+ ") -> List[Dict[str, str]]:\n",
+ " \"\"\"Load MATH competition dataset (numeric answers, difficulty ≤ max_diff).\"\"\"\n",
+ " for src in filter(None, [local_path, cache]):\n",
+ " p = Path(src)\n",
+ " if p.exists():\n",
+ " items = [json.loads(l) for l in p.read_text(encoding=\"utf-8\").splitlines() if l.strip()]\n",
+ " if items: logger.info(\"Loaded %d MATH pairs from %s\", len(items), p); return items\n",
+ " try:\n",
+ " from datasets import load_dataset\n",
+ " ds = load_dataset(\"qwedsacf/competition_math\", split=\"train\", trust_remote_code=True)\n",
+ " except Exception as e:\n",
+ " logger.warning(\"MATH download failed (%s) — GSM8K only.\", e); return []\n",
+ " pairs, _box = [], re.compile(r\"\\\\\\\\boxed\\\\{([^}]*)\\\\}\")\n",
+ " for item in ds:\n",
+ " lvl = item.get(\"level\",\"Level 5\")\n",
+ " try:\n",
+ " if int(lvl.split()[-1]) > max_diff: continue\n",
+ " except (ValueError, IndexError): continue\n",
+ " m = _box.search(item.get(\"solution\",\"\"))\n",
+ " if not m: continue\n",
+ " raw = m.group(1).strip()\n",
+ " try: num = str(int(raw))\n",
+ " except ValueError:\n",
+ " try: v=float(raw); num=str(int(v)) if v==int(v) else f\"{v:.4f}\"\n",
+ " except ValueError: continue\n",
+ " pairs.append({\"question\": item.get(\"problem\",\"\").strip(), \"gold_final\": num})\n",
+ " if pairs:\n",
+ " Path(cache).parent.mkdir(parents=True,exist_ok=True)\n",
+ " Path(cache).write_text(\"\\n\".join(json.dumps(p) for p in pairs), encoding=\"utf-8\")\n",
+ " logger.info(\"Cached %d MATH pairs → %s\", len(pairs), cache)\n",
+ " return pairs\n",
+ "\n",
+ "gsm8k_pairs = _load_jsonl_qa(args.gsm8k_data)\n",
+ "if not gsm8k_pairs:\n",
+ " raise SystemExit(f\"No training data at {args.gsm8k_data}\")\n",
+ "\n",
+ "math_pairs: List[Dict[str, str]] = []\n",
+ "if args.math_mix_ratio > 0:\n",
+ " math_pairs = _load_math_dataset(args.math_data, max_diff=args.math_max_difficulty)\n",
+ " if math_pairs:\n",
+ " logger.info(\"MATH mix: %.0f%% MATH (%d) + %.0f%% GSM8K (%d)\",\n",
+ " 100*args.math_mix_ratio, len(math_pairs),\n",
+ " 100*(1-args.math_mix_ratio), len(gsm8k_pairs))\n",
+ "\n",
+ "# ── PRM scorer ────────────────────────────────────────────────────────────────\n",
+ "prm: Optional[ProcessRewardScorer] = None\n",
+ "if args.use_prm:\n",
+ " try:\n",
+ " prm = ProcessRewardScorer(model_name=args.prm_model, device=device, load_in_4bit=True)\n",
+ " logger.info(\"PRM loaded: %s (4-bit)\", args.prm_model)\n",
+ " except Exception as e:\n",
+ " logger.warning(\"PRM load failed (%s) — no PRM scoring.\", e)\n",
+ "\n",
+ "# ── Unified accuracy calculator (step-chain scoring, Phase 2+) ────────────────\n",
+ "_extractor = StepChainExtractor(model_name=args.extractor_model, device=str(device),\n",
+ " cache_path=args.extraction_cache)\n",
+ "_unified_calc = UnifiedAccuracyCalculator(extractor=_extractor, question_evaluator=None)\n",
+ "logger.info(\"Warming up step-chain extractor ...\")\n",
+ "_extractor.warmup()\n",
+ "logger.info(\"Extractor ready.\")\n",
+ "\n",
+ "# ── CurriculumMathEnvironment (full model — generates + scores) ───────────────\n",
+ "math_env = CurriculumMathEnvironment(\n",
+ " policy_model=model,\n",
+ " value_model=None,\n",
+ " tokenizer=tokenizer,\n",
+ " reference_questions=[p[\"question\"] for p in gsm8k_pairs],\n",
+ " grounded_qa_pairs=gsm8k_pairs,\n",
+ " prm_scorer=prm,\n",
+ " max_solution_tokens=args.max_new_tokens,\n",
+ " device=device,\n",
+ " unified_accuracy_calc=_unified_calc,\n",
+ ")\n",
+ "_unified_calc.question_evaluator = math_env.question_evaluator\n",
+ "\n",
+ "# LLM-backed question classifier (uses the already-loaded policy)\n",
+ "_llm_cls = LLMQuestionClassifier(model=model, tokenizer=tokenizer,\n",
+ " device=device, cache_size=10_000)\n",
+ "math_env.question_evaluator.classifier = _llm_cls\n",
+ "\n",
+ "# Bootstrap curriculum from structured dataset (NuminaMath / OpenMathInstruct)\n",
+ "_raw = [json.loads(l) for l in Path(args.gsm8k_data).read_text(encoding=\"utf-8\").splitlines() if l.strip()]\n",
+ "if any(\"skill_id\" in r for r in _raw[:20]):\n",
+ " math_env.curriculum_manager.initialize_from_dataset(_raw)\n",
+ " logger.info(\"Curriculum bootstrapped from skill_ids.\")\n",
+ "else:\n",
+ " logger.info(\"Plain dataset — keyword-classifier bootstrap.\")\n",
+ "\n",
+ "# ── RL Environment — wraps math_env with reset / step / state / close ─────────\n",
+ "env = AxiomforgeaiEnvironment()\n",
+ "env._math_env = math_env # inject the training-configured math_env\n",
+ "logger.info(\"RL environment ready — reset / step / state / close.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-optim",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Optimiser + LR schedule ───────────────────────────────────────────────────\n",
+ "optimizer = torch.optim.AdamW(\n",
+ " [p for p in model.parameters() if p.requires_grad],\n",
+ " lr=args.learning_rate,\n",
+ " fused=torch.cuda.is_available(),\n",
+ ")\n",
+ "\n",
+ "from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR\n",
+ "_nw = max(1, args.warmup_iters)\n",
+ "_nt = max(1, args.num_iterations)\n",
+ "_nd = max(1, _nt - _nw)\n",
+ "scheduler = SequentialLR(\n",
+ " optimizer,\n",
+ " schedulers=[\n",
+ " LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=_nw),\n",
+ " CosineAnnealingLR(optimizer, T_max=_nd, eta_min=args.learning_rate * args.min_lr_ratio),\n",
+ " ],\n",
+ " milestones=[_nw],\n",
+ ")\n",
+ "logger.info(\"LR: %.1e warmup=%d cosine=%d min=%.1e\",\n",
+ " args.learning_rate, _nw, _nd, args.learning_rate * args.min_lr_ratio)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-utils",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── GRPO utilities ────────────────────────────────────────────────────────────\n",
+ "# These functions live in the notebook because they depend on live model\n",
+ "# objects and are tightly coupled to the GRPO update step.\n",
+ "\n",
+ "def _stop_ids(tok: AutoTokenizer) -> Optional[List[int]]:\n",
+ " ids = []\n",
+ " if tok.eos_token_id is not None: ids.append(tok.eos_token_id)\n",
+ " im = tok.convert_tokens_to_ids(\"<|im_end|>\")\n",
+ " if isinstance(im, int) and im not in ids: ids.append(im)\n",
+ " return ids or None\n",
+ "\n",
+ "\n",
+ "@torch.no_grad()\n",
+ "def generate_questions_batched(\n",
+ " model, tokenizer, instruction: str, K_q: int,\n",
+ " max_new_tokens: int, temperature: float, device,\n",
+ ") -> Tuple[List[str], List, List, List]:\n",
+ " \"\"\"Generate K_q candidate question strings from a curriculum instruction.\"\"\"\n",
+ " messages = create_generator_messages(instruction)\n",
+ " try: prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
+ " except: prompt = f\"{messages[0]['content']}\\n\\n{instruction}\\n\"\n",
+ " pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id\n",
+ " enc = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=512).to(device)\n",
+ " plen = enc[\"input_ids\"].shape[1]\n",
+ " out = model.generate(\n",
+ " input_ids=enc[\"input_ids\"].expand(K_q,-1).contiguous(),\n",
+ " attention_mask=enc[\"attention_mask\"].expand(K_q,-1).contiguous(),\n",
+ " max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature,\n",
+ " top_p=0.95, pad_token_id=pad_id, eos_token_id=_stop_ids(tokenizer), use_cache=True)\n",
+ " pad_t = torch.tensor(pad_id, device=device, dtype=out.dtype)\n",
+ " questions, ids_list, masks_list, olps_list = [], [], [], []\n",
+ " attn_lp = (out != pad_t); attn_lp[:,:plen] = True\n",
+ " batch_logits = model(input_ids=out, attention_mask=attn_lp.long(),\n",
+ " use_cache=False, return_dict=True).logits\n",
+ " for i in range(K_q):\n",
+ " full = out[i]; resp = full[plen:]\n",
+ " mask = torch.zeros(full.shape[0], dtype=torch.bool, device=device)\n",
+ " mask[plen:] = resp != pad_t\n",
+ " questions.append(tokenizer.decode(resp, skip_special_tokens=True).strip())\n",
+ " ids_list.append(full); masks_list.append(mask)\n",
+ " sl = batch_logits[i,:-1]; lb = full[1:]; sm = mask[1:]\n",
+ " lp = F.log_softmax(sl,dim=-1)[torch.arange(sl.size(0),device=device), lb]\n",
+ " resp_lp = lp[sm]\n",
+ " olps_list.append(resp_lp.sum().detach() if resp_lp.numel()>0 else torch.tensor(0.,device=device))\n",
+ " return questions, ids_list, masks_list, olps_list\n",
+ "\n",
+ "\n",
+ "def generate_solutions_batched(\n",
+ " model, tokenizer, prompt: str, K: int,\n",
+ " max_new_tokens: int, temperature: float, device,\n",
+ ") -> Tuple[List[str], List, List, List]:\n",
+ " \"\"\"Generate K solution strings and their per-sequence log-probs.\"\"\"\n",
+ " pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id\n",
+ " enc = tokenizer(prompt, return_tensors=\"pt\", padding=False,\n",
+ " truncation=True, max_length=1024).to(device)\n",
+ " plen = enc[\"input_ids\"].shape[1]\n",
+ " model.eval()\n",
+ " with torch.no_grad():\n",
+ " out = model.generate(\n",
+ " input_ids=enc[\"input_ids\"].expand(K,-1).contiguous(),\n",
+ " attention_mask=enc[\"attention_mask\"].expand(K,-1).contiguous(),\n",
+ " max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature,\n",
+ " top_p=0.9, pad_token_id=pad_id, eos_token_id=_stop_ids(tokenizer), use_cache=True)\n",
+ " pad_t = torch.tensor(pad_id, device=device, dtype=out.dtype)\n",
+ " solutions, ids_list, masks_list, olps_list = [], [], [], []\n",
+ " with torch.no_grad():\n",
+ " attn_lp = (out != pad_t); attn_lp[:,:plen] = True\n",
+ " batch_logits = model(input_ids=out, attention_mask=attn_lp.long(),\n",
+ " use_cache=False, return_dict=True).logits\n",
+ " for i in range(K):\n",
+ " full = out[i]; resp = full[plen:]\n",
+ " mask = torch.zeros(full.shape[0], dtype=torch.bool, device=device)\n",
+ " mask[plen:] = resp != pad_t\n",
+ " solutions.append(tokenizer.decode(resp, skip_special_tokens=True))\n",
+ " ids_list.append(full); masks_list.append(mask)\n",
+ " sl = batch_logits[i,:-1]; lb = full[1:]; sm = mask[1:]\n",
+ " lp = F.log_softmax(sl,dim=-1)[torch.arange(sl.size(0),device=device), lb]\n",
+ " resp_lp = lp[sm]\n",
+ " olps_list.append(resp_lp.sum().detach() if resp_lp.numel()>0 else torch.tensor(0.,device=device))\n",
+ " return solutions, ids_list, masks_list, olps_list\n",
+ "\n",
+ "\n",
+ "def compute_sequence_log_prob(model, input_ids, response_mask) -> torch.Tensor:\n",
+ " \"\"\"Forward pass → sum of log-probs over the response tokens.\"\"\"\n",
+ " logits = model(input_ids=input_ids.unsqueeze(0), use_cache=False, return_dict=True).logits[0]\n",
+ " lp = F.log_softmax(logits[:-1], dim=-1)\n",
+ " token_lp = lp[torch.arange(lp.size(0), device=lp.device), input_ids[1:]]\n",
+ " resp = token_lp[response_mask[1:]]\n",
+ " return resp.sum() if resp.numel() > 0 else torch.tensor(0., requires_grad=True, device=input_ids.device)\n",
+ "\n",
+ "\n",
+ "def grpo_loss_for_group(\n",
+ " model, ids_list, masks_list, rewards: List[float], old_lps,\n",
+ " clip_eps: float = 0.2, kl_coef: float = 0.0, ref_model=None, eps: float = 1e-8,\n",
+ ") -> Optional[torch.Tensor]:\n",
+ " \"\"\"GRPO policy loss for one question group (K solutions).\"\"\"\n",
+ " r = np.array(rewards, dtype=np.float32)\n",
+ " if r.std() < eps: return None\n",
+ " advantages = np.clip((r - r.mean()) / (r.std() + eps), -5., 5.)\n",
+ " dev = next(model.parameters()).device\n",
+ " loss = torch.tensor(0., device=dev); n = 0\n",
+ " model.train()\n",
+ " for ids, mask, adv, olp in zip(ids_list, masks_list, advantages, old_lps):\n",
+ " n_resp = int(mask[1:].sum().item())\n",
+ " if n_resp == 0: continue\n",
+ " new_lp = compute_sequence_log_prob(model, ids, mask)\n",
+ " adv_t = torch.tensor(adv, dtype=new_lp.dtype, device=dev)\n",
+ " if clip_eps > 0:\n",
+ " ratio = torch.exp(new_lp - olp.to(dev).detach())\n",
+ " li = -torch.min(ratio * adv_t, torch.clamp(ratio,1-clip_eps,1+clip_eps) * adv_t) / n_resp\n",
+ " else:\n",
+ " li = -(adv_t * new_lp / n_resp)\n",
+ " if kl_coef > 0 and ref_model is not None:\n",
+ " with torch.no_grad(): ref_lp = compute_sequence_log_prob(ref_model, ids, mask)\n",
+ " li = li + kl_coef * (new_lp - ref_lp.to(dev).detach()) / n_resp\n",
+ " loss = loss + li; n += 1\n",
+ " return loss / n if n > 0 else None\n",
+ "\n",
+ "\n",
+ "def compute_self_play_reward(\n",
+ " question: str, solution: str, topic: str, difficulty: float, math_env,\n",
+ ") -> Tuple[float, float, float, Dict]:\n",
+ " \"\"\"Self-play reward via math_env.compute_reward (no gold answer).\"\"\"\n",
+ " result = math_env.compute_reward(question=question, solution=solution,\n",
+ " target_topic=topic, target_difficulty=difficulty)\n",
+ " combined = float(result[\"combined_score\"])\n",
+ " sol_m = result.get(\"solution_metrics\") or {}\n",
+ " s_rew = float(sol_m.get(\"overall_score\", 0.)) if isinstance(sol_m, dict) else 0.\n",
+ " q_raw = result.get(\"question_metrics\") or {}\n",
+ " q_rew = float(result.get(\"effective_question_reward\", q_raw.get(\"overall_score\", 0.)))\n",
+ " q_met: Dict = {\n",
+ " \"overall_score\": q_rew,\n",
+ " \"topic_match\": float(q_raw.get(\"topic_match\", 0.)),\n",
+ " \"difficulty_fit\": float(q_raw.get(\"difficulty_score\", 0.)),\n",
+ " \"clarity\": float(q_raw.get(\"clarity\", 0.)),\n",
+ " \"novelty\": float(q_raw.get(\"novelty_combined\", 0.)),\n",
+ " \"solvability\": float(q_raw.get(\"solvability_score\",0.)),\n",
+ " \"sp_chain_integrity_score\": result.get(\"sp_chain_integrity_score\"),\n",
+ " }\n",
+ " return combined, q_rew, s_rew, q_met\n",
+ "\n",
+ "\n",
+ "def _verify_sp_answer(solutions: List[str], topic: str, difficulty: float) -> bool:\n",
+ " \"\"\"Consensus check: majority of K solutions agree on a numeric answer.\"\"\"\n",
+ " t = topic.lower().replace(\" \",\"_\")\n",
+ " if t in {\"geometry\"} or difficulty >= 4.: return False\n",
+ " answers: List[float] = []\n",
+ " for sol in solutions:\n",
+ " m = re.search(r\"final answer[:\\s]*([^\\n]+)\", sol, re.I)\n",
+ " if not m: continue\n",
+ " raw = m.group(1).strip()\n",
+ " for fn in (lambda s: float(eval(s, {\"__builtins__\":{}}, {})),\n",
+ " lambda s: float(__import__(\"sympy\").N(__import__(\"sympy\").sympify(s), 15))):\n",
+ " try: v = fn(raw); answers.append(round(v, 6)); break\n",
+ " except: pass\n",
+ " if not answers: return False\n",
+ " maj = max(set(answers), key=answers.count)\n",
+ " return answers.count(maj) >= max(1, len(solutions)//2)\n",
+ "\n",
+ "\n",
+ "def evaluate_policy(\n",
+ " model, tokenizer, data_path: str, max_samples: int,\n",
+ " max_new_tokens: int, math_env=None, pass_at_k: int = 4,\n",
+ ") -> Dict[str, Any]:\n",
+ " \"\"\"Run evaluation on held-out data; returns combined_score and related metrics.\"\"\"\n",
+ " if not Path(data_path).exists(): return {\"accuracy\": 0., \"combined_score\": 0., \"total\": 0}\n",
+ " model.eval()\n",
+ " reward_fn = None\n",
+ " if math_env is not None:\n",
+ " import logging as _lm\n",
+ " _ml = _lm.getLogger(\"src.rl.math_environment_curriculum\")\n",
+ " _pl = _lm.getLogger(\"src.rl.prm_scorer\")\n",
+ " def reward_fn(q, s, g):\n",
+ " _ml.setLevel(_lm.WARNING); _pl.setLevel(_lm.WARNING)\n",
+ " try: return math_env.compute_grounded_reward(q, s, g)\n",
+ " finally: _ml.setLevel(_lm.INFO); _pl.setLevel(_lm.INFO)\n",
+ " stem = Path(data_path).stem.lower()\n",
+ " ds_name = \"AQuA-RAT\" if \"aqua\" in stem else \"MATH\" if \"math\" in stem else \"GSM8K\"\n",
+ " results = evaluate_gsm8k(model=model, tokenizer=tokenizer, data_path=data_path,\n",
+ " max_samples=max_samples, max_new_tokens=max_new_tokens,\n",
+ " reward_fn=reward_fn, pass_at_k=pass_at_k, dataset_name=ds_name)\n",
+ " model.train()\n",
+ " return results\n",
+ "\n",
+ "\n",
+ "# ── Difficulty-adaptive question sampling ─────────────────────────────────────\n",
+ "_q_wins: Dict[str, int] = defaultdict(int)\n",
+ "_q_attempts: Dict[str, int] = defaultdict(int)\n",
+ "\n",
+ "def _qkey(q: str) -> str:\n",
+ " return hashlib.md5(q.encode(), usedforsecurity=False).hexdigest()\n",
+ "\n",
+ "def _sample_by_difficulty(pool: List[Dict], n: int, alpha: float) -> List[Dict]:\n",
+ " \"\"\"Weight questions by how informative they are (win-rate close to 50%).\"\"\"\n",
+ " if alpha <= 0: return random.sample(pool, min(n, len(pool)))\n",
+ " weights = []\n",
+ " for qa in pool:\n",
+ " att = _q_attempts[_qkey(qa[\"question\"])]\n",
+ " w = 0.75 if att == 0 else max(\n",
+ " (1. - abs(_q_wins[_qkey(qa[\"question\"])]/att - 0.5)*2.) ** alpha, 0.05)\n",
+ " weights.append(w)\n",
+ " tw = sum(weights)\n",
+ " probs = [w/tw for w in weights]\n",
+ " return [pool[i] for i in np.random.choice(len(pool), size=min(n,len(pool)), replace=False, p=probs)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-init-eval",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ── Optional initial evaluation (Iteration 0 baseline) ───────────────────────\n",
+ "metrics_log: List[Dict] = []\n",
+ "best_combined = best_prm_mean = best_accuracy = 0.\n",
+ "\n",
+ "if not args.skip_initial_eval:\n",
+ " logger.info(\"=\" * 70)\n",
+ " logger.info(\"INITIAL EVALUATION (Iteration 0)\")\n",
+ " logger.info(\"=\" * 70)\n",
+ " _init = evaluate_policy(model, tokenizer, args.eval_data_path,\n",
+ " args.eval_max_samples, args.eval_max_new_tokens,\n",
+ " math_env=math_env, pass_at_k=args.eval_pass_at_k)\n",
+ " best_combined = best_accuracy = float(_init.get(\"combined_score\", 0.))\n",
+ " best_prm_mean = float(_init.get(\"prm_mean\", 0.))\n",
+ " logger.info(\"Baseline combined_score=%.4f correct=%.1f%%\",\n",
+ " best_combined, 100*float(_init.get(\"correct_rate\", 0.)))\n",
+ " metrics_log.append({\"iteration\": 0, **_init})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-train",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ══════════════════════════════════════════════════════════════════════════════\n",
+ "# GRPO Training Loop — reset → step → state → close\n",
+ "# ══════════════════════════════════════════════════════════════════════════════\n",
+ "\n",
+ "# ── Phase curriculum state ────────────────────────────────────────────────────\n",
+ "class _Phase(Enum):\n",
+ " GROUNDED_ONLY = _auto() # grounded only until model is ready\n",
+ " SELFPLAY_RAMP = _auto() # ramp self-play ratio up from 0\n",
+ " CONTINUOUS = _auto() # steady-state mixed training\n",
+ "\n",
+ "_phase: _Phase = _Phase.GROUNDED_ONLY\n",
+ "_sp_iters: int = 0\n",
+ "_sp_suspended: bool = False\n",
+ "_eff_sp: float = 0.\n",
+ "_use_chain: bool = False\n",
+ "_chain_corr: float = 0.\n",
+ "_extract_rate: float = 0.\n",
+ "_chain_buf: List[float] = []\n",
+ "_prm_buf: List[float] = []\n",
+ "_succ_buf: List[int] = []\n",
+ "_CWIN, _CMAX, _SHDW = 50, 200, 4\n",
+ "_shadow_ctr: int = 0\n",
+ "\n",
+ "for iteration in range(1, args.num_iterations + 1):\n",
+ " iter_start = time.perf_counter()\n",
+ " logger.info(\"=\" * 70)\n",
+ " logger.info(\"GRPO ITERATION %d / %d [phase=%s]\", iteration, args.num_iterations, _phase.name)\n",
+ " logger.info(\"=\" * 70)\n",
+ "\n",
+ " # ── Dataset batch (with MATH ramp) ────────────────────────────────────────\n",
+ " _eff_math = args.math_mix_ratio\n",
+ " if args.math_mix_ratio_late and iteration > args.math_ramp_start:\n",
+ " _r = min(1., (iteration - args.math_ramp_start) / 10.)\n",
+ " _eff_math = args.math_mix_ratio + _r * (args.math_mix_ratio_late - args.math_mix_ratio)\n",
+ " if math_pairs and _eff_math > 0:\n",
+ " nm = max(1, round(args.questions_per_iter * _eff_math))\n",
+ " ng = max(1, args.questions_per_iter - nm)\n",
+ " batch = (_sample_by_difficulty(math_pairs, nm, args.difficulty_alpha) +\n",
+ " _sample_by_difficulty(gsm8k_pairs, ng, args.difficulty_alpha))\n",
+ " random.shuffle(batch)\n",
+ " else:\n",
+ " batch = _sample_by_difficulty(gsm8k_pairs, args.questions_per_iter, args.difficulty_alpha)\n",
+ "\n",
+ " # Temperature annealing: 0.8 → 0.4 over the full run\n",
+ " _ann = min(1., (iteration-1) / max(1, args.num_iterations-1))\n",
+ " _temp = args.temperature * (1. - 0.5 * _ann)\n",
+ "\n",
+ " # ── Effective self-play ratio (phase-dependent) ────────────────────────────\n",
+ " if _phase == _Phase.GROUNDED_ONLY or _sp_suspended: _eff_sp = 0.\n",
+ " elif _phase == _Phase.SELFPLAY_RAMP:\n",
+ " _eff_sp = 1. - max(0.30, 1. - _sp_iters / max(1, args.selfplay_ramp_iters))\n",
+ " else: _eff_sp = args.self_play_ratio\n",
+ "\n",
+ " _sp_idx = set(random.sample(range(len(batch)), int(round(len(batch)*_eff_sp))))\n",
+ "\n",
+ " # ── Per-iteration metric accumulators ─────────────────────────────────────\n",
+ " all_r, all_qr = [], []\n",
+ " gr_r, sp_r = [], []\n",
+ " gr_step, gr_lccp, gr_gt = [], [], []\n",
+ " ch_arith, ch_dep, ch_int, sp_ch = [], [], [], []\n",
+ " qc = dict(topic=[], diff=[], clarity=[], novelty=[], solvab=[])\n",
+ " skipped = n_grps = n_sp = q_att = q_val = q_good = 0\n",
+ " skip0var = 0; total_loss = 0.\n",
+ "\n",
+ " optimizer.zero_grad()\n",
+ "\n",
+ " pbar = tqdm(batch, desc=f\"Iter {iteration}\", unit=\"q\")\n",
+ " for _idx, qa in enumerate(pbar):\n",
+ " is_sp = _idx in _sp_idx\n",
+ "\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " # RESET — start a new episode\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " if is_sp:\n",
+ " # Self-play: curriculum provides the instruction\n",
+ " instr, topic, diff = env._math_env.sample_instruction()\n",
+ " if diff >= 4.: skipped += 1; continue\n",
+ " q_att += 1\n",
+ "\n",
+ " if args.q_group_size > 1:\n",
+ " # Two-phase SP: generate K_q questions, then K solutions per question\n",
+ " _qt = min(0.90, _temp + 0.05)\n",
+ " qcands, qids, qmasks, qolps = generate_questions_batched(\n",
+ " model, tokenizer, instr, args.q_group_size, 128, _qt, device)\n",
+ " vq = [(q,i,m,o) for q,i,m,o in zip(qcands,qids,qmasks,qolps) if len(q.strip())>=10]\n",
+ " if not vq: skipped += 1; continue\n",
+ " q_val += 1; n_sp += 1\n",
+ " qagg: List[float] = []\n",
+ " for _qt2, _qi, _qm, _qo in vq:\n",
+ " sols, sids, smasks, solps = generate_solutions_batched(\n",
+ " model, tokenizer, math_env.format_solution_prompt(_qt2),\n",
+ " args.group_size, args.max_new_tokens, _temp, device)\n",
+ " if args.overlong_filter:\n",
+ " vf = [(s,i,m,o) for s,i,m,o in zip(sols,sids,smasks,solps)\n",
+ " if int(m.sum())0.5 for r in qagg): q_good+=1\n",
+ " pbar.set_postfix(loss=f\"{total_loss/max(1,n_grps):.4f}\",\n",
+ " sp_r=f\"{np.mean(sp_r or [0]):.3f}\",skip=skipped)\n",
+ " continue\n",
+ "\n",
+ " # Single-question self-play\n",
+ " from src.config.prompts import create_generator_messages as _cgm\n",
+ " _msgs = _cgm(instr)\n",
+ " try: _pr = tokenizer.apply_chat_template(_msgs,tokenize=False,add_generation_prompt=True)\n",
+ " except: _pr = f\"{_msgs[0]['content']}\\n\\n{instr}\\n\"\n",
+ " _enc = tokenizer(_pr,return_tensors=\"pt\",truncation=True,max_length=512).to(device)\n",
+ " _plen = _enc[\"input_ids\"].shape[1]\n",
+ " with torch.no_grad():\n",
+ " _out = model.generate(\n",
+ " **_enc, max_new_tokens=128, do_sample=True,\n",
+ " temperature=min(0.90,_temp+0.05), top_p=0.95,\n",
+ " pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,\n",
+ " eos_token_id=_stop_ids(tokenizer), use_cache=True)\n",
+ " question = tokenizer.decode(_out[0][_plen:], skip_special_tokens=True).strip()\n",
+ " if len(question.strip()) < 10: skipped+=1; continue\n",
+ " q_val+=1; n_sp+=1\n",
+ " else:\n",
+ " # ── RESET (grounded): inject difficulty-sampled QA pair ────────────\n",
+ " obs = env.reset(qa=qa) # state: question from dataset\n",
+ " question = obs.question # the grounded math question\n",
+ " topic, diff = \"grounded\", 0.5\n",
+ "\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " # GENERATE — policy produces K candidate solutions\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " solutions, ids_list, masks_list, lps_list = generate_solutions_batched(\n",
+ " model, tokenizer, math_env.format_solution_prompt(question),\n",
+ " args.group_size, args.max_new_tokens, _temp, device)\n",
+ " if args.overlong_filter:\n",
+ " vf = [(s,i,m,o) for s,i,m,o in zip(solutions,ids_list,masks_list,lps_list)\n",
+ " if int(m.sum()) < args.max_new_tokens]\n",
+ " if vf: solutions,ids_list,masks_list,lps_list = map(list, zip(*vf))\n",
+ " else: skipped+=1; continue\n",
+ "\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " # STEP — score each solution with the RL environment\n",
+ " # ════════════════════════════════════════════════════════════════════\n",
+ " rewards: List[float] = []\n",
+ " _sp_qr: List[float] = []\n",
+ " for sol in solutions:\n",
+ " if is_sp:\n",
+ " # Self-play: env.compute_reward (no gold answer)\n",
+ " r, qr, _, qm = compute_self_play_reward(question, sol, topic, diff, math_env)\n",
+ " _sp_qr.append(qr); all_qr.append(qr)\n",
+ " qc[\"topic\"].append(qm[\"topic_match\"]); qc[\"diff\"].append(qm[\"difficulty_fit\"])\n",
+ " qc[\"clarity\"].append(qm[\"clarity\"]); qc[\"novelty\"].append(qm[\"novelty\"])\n",
+ " qc[\"solvab\"].append(qm[\"solvability\"])\n",
+ " _sc = qm.get(\"sp_chain_integrity_score\")\n",
+ " if _sc is not None: sp_ch.append(float(_sc))\n",
+ " else:\n",
+ " # Grounded: env.step → compute_grounded_reward internally\n",
+ " step_obs = env.step(AxiomforgeaiAction(solution=sol))\n",
+ " r = step_obs.reward\n",
+ " m = step_obs.metadata or {}\n",
+ " gr_step.append(float(m.get(\"step_accuracy\", 0.)))\n",
+ " gr_lccp.append(float(m.get(\"lccp\", 0.)))\n",
+ " gr_gt.append(bool(m.get(\"gt_match\", False)))\n",
+ " if m.get(\"chain_arith_score\") is not None: ch_arith.append(float(m[\"chain_arith_score\"]))\n",
+ " if m.get(\"chain_dep_score\") is not None: ch_dep.append(float(m[\"chain_dep_score\"]))\n",
+ " if m.get(\"chain_integrity_score\") is not None: ch_int.append(float(m[\"chain_integrity_score\"]))\n",
+ " # Shadow chain extraction for Phase 2 calibration\n",
+ " _shadow_ctr += 1\n",
+ " if (_phase == _Phase.SELFPLAY_RAMP and not _use_chain\n",
+ " and _unified_calc and _shadow_ctr % _SHDW == 0):\n",
+ " _pps = 0.60*m.get(\"prm_final_score\",0.) + 0.40*m.get(\"prm_mean_score\",0.)\n",
+ " try:\n",
+ " _sh = _unified_calc.compute(solution=sol,gold_answer=qa[\"gold_final\"],\n",
+ " question=question,topic=\"grounded\",phase=\"grounded\")\n",
+ " _chain_buf.append(_sh.chain_integrity_score)\n",
+ " _prm_buf.append(_pps)\n",
+ " _succ_buf.append(1 if _sh.extraction_succeeded else 0)\n",
+ " except Exception: _succ_buf.append(0)\n",
+ " rewards.append(r)\n",
+ "\n",
+ " all_r.extend(rewards)\n",
+ " if is_sp: sp_r.extend(rewards)\n",
+ " else: gr_r.extend(rewards)\n",
+ "\n",
+ " if is_sp:\n",
+ " if _sp_qr and np.mean(_sp_qr) > 0.5: q_good += 1\n",
+ " if not _verify_sp_answer(solutions, topic, diff): skipped+=1; continue\n",
+ " else:\n",
+ " k = _qkey(question)\n",
+ " _q_attempts[k] += len(solutions)\n",
+ " _q_wins[k] += sum(1 for r in rewards if r > float(np.median(rewards)))\n",
+ "\n",
+ " # Zero-variance guard\n",
+ " if np.std(rewards) < 0.02:\n",
+ " skipped+=1; skip0var+=1\n",
+ " pbar.set_postfix(mean_r=f\"{np.mean(rewards):.3f}\",skip=skipped,loss=\"0var\"); continue\n",
+ "\n",
+ " # GRPO loss\n",
+ " g_loss = grpo_loss_for_group(model, ids_list, masks_list, rewards, lps_list,\n",
+ " args.clip_eps, args.kl_coef, ref_model)\n",
+ " if g_loss is None:\n",
+ " skipped+=1; skip0var+=1\n",
+ " pbar.set_postfix(mean_r=f\"{np.mean(rewards):.3f}\",skip=skipped,loss=\"skip\"); continue\n",
+ "\n",
+ " g_loss.backward()\n",
+ " total_loss += g_loss.item(); n_grps += 1\n",
+ " pbar.set_postfix(mean_r=f\"{np.mean(rewards):.3f}\",\n",
+ " loss=f\"{g_loss.item():.4f}\", skip=skipped)\n",
+ "\n",
+ " # ── Optimiser step ────────────────────────────────────────────────────────\n",
+ " if n_grps > 0:\n",
+ " if n_grps > 1:\n",
+ " for p in model.parameters():\n",
+ " if p.grad is not None: p.grad.div_(n_grps)\n",
+ " torch.nn.utils.clip_grad_norm_(\n",
+ " [p for p in model.parameters() if p.requires_grad], args.max_grad_norm)\n",
+ " optimizer.step()\n",
+ " loss_val = total_loss / n_grps\n",
+ " else:\n",
+ " loss_val = 0.\n",
+ " scheduler.step()\n",
+ "\n",
+ " # ════════════════════════════════════════════════════════════════════════\n",
+ " # STATE — collect iteration metrics + phase transitions\n",
+ " # ════════════════════════════════════════════════════════════════════════\n",
+ " _epi_state = env.state # episode_id + step_count for the last episode\n",
+ " iter_time = time.perf_counter() - iter_start\n",
+ " mean_r = float(np.mean(all_r)) if all_r else 0.\n",
+ " std_r = float(np.std(all_r)) if all_r else 0.\n",
+ " acc_r = float(np.mean([r>0.5 for r in all_r])) if all_r else 0.\n",
+ " gr_acc = float(np.mean([r>0.5 for r in gr_r])) if gr_r else 0.\n",
+ " step_a = float(np.mean(gr_step)) if gr_step else 0.\n",
+ " lccp_a = float(np.mean(gr_lccp)) if gr_lccp else 0.\n",
+ " mean_qr = float(np.mean(all_qr)) if all_qr else 0.\n",
+ " gt_rate = float(sum(gr_gt)/len(gr_gt)) if gr_gt else 0.\n",
+ " cur_lr = optimizer.param_groups[0][\"lr\"]\n",
+ "\n",
+ " # Phase transition logic\n",
+ " if _phase == _Phase.GROUNDED_ONLY:\n",
+ " if (gt_rate >= args.selfplay_gt_thresh\n",
+ " and gr_acc >= args.selfplay_grounded_thresh\n",
+ " and step_a >= args.selfplay_step_thresh\n",
+ " and iteration >= args.min_warmup):\n",
+ " _phase = _Phase.SELFPLAY_RAMP\n",
+ " logger.info(\"PHASE → SELFPLAY_RAMP at iter %d (gt=%.2f acc=%.2f step=%.2f)\",\n",
+ " iteration, gt_rate, gr_acc, step_a)\n",
+ " elif _phase in (_Phase.SELFPLAY_RAMP, _Phase.CONTINUOUS):\n",
+ " _sp_iters += 1\n",
+ " if _phase == _Phase.SELFPLAY_RAMP and _sp_iters >= args.selfplay_ramp_iters:\n",
+ " _phase = _Phase.CONTINUOUS\n",
+ " logger.info(\"PHASE → CONTINUOUS at iter %d\", iteration)\n",
+ " # Chain scoring calibration\n",
+ " if len(_chain_buf) > _CMAX:\n",
+ " _chain_buf[:] = _chain_buf[-_CMAX:]\n",
+ " _prm_buf[:] = _prm_buf[-_CMAX:]\n",
+ " _succ_buf[:] = _succ_buf[-_CMAX:]\n",
+ " if not _use_chain and len(_chain_buf) >= _CWIN:\n",
+ " try:\n",
+ " from scipy.stats import pearsonr\n",
+ " _r2, _ = pearsonr(_chain_buf[-_CWIN:], _prm_buf[-_CWIN:])\n",
+ " _chain_corr = float(_r2)\n",
+ " except Exception: _chain_corr = 0.\n",
+ " _n = len(_succ_buf[-_CWIN:])\n",
+ " _extract_rate = sum(_succ_buf[-_CWIN:])/_n if _n else 0.\n",
+ " if _chain_corr >= 0.70 and _extract_rate >= 0.80:\n",
+ " _use_chain = True; math_env.use_chain_scoring = True\n",
+ " logger.info(\"CHAIN PRIMARY activated iter %d: corr=%.2f rate=%.2f\",\n",
+ " iteration, _chain_corr, _extract_rate)\n",
+ " _prev_susp = _sp_suspended\n",
+ " _sp_suspended = bool(gr_gt) and gt_rate < args.grounded_floor\n",
+ " if _sp_suspended and not _prev_susp:\n",
+ " logger.warning(\"GROUNDED FLOOR: self-play suspended (gt=%.2f)\", gt_rate)\n",
+ " elif not _sp_suspended and _prev_susp:\n",
+ " logger.info(\"GROUNDED FLOOR: self-play resumed (gt=%.2f)\", gt_rate)\n",
+ "\n",
+ " # ── Logging ───────────────────────────────────────────────────────────────\n",
+ " logger.info(\n",
+ " \"Iter %d | loss=%.4f | r=%.3f±%.3f | gt=%.1f%% | gr_acc=%.1f%% | \"\n",
+ " \"step=%.1f%% | lccp=%.1f%% | phase=%s sp=%.0f%% | \"\n",
+ " \"grps=%d skip=%d | lr=%.2e | %.1fs\",\n",
+ " iteration, loss_val, mean_r, std_r,\n",
+ " 100*gt_rate, 100*gr_acc, 100*step_a, 100*lccp_a,\n",
+ " _phase.name, 100*_eff_sp, n_grps, skipped, cur_lr, iter_time)\n",
+ " if (n_grps+skipped) > 0 and skip0var/(n_grps+skipped) > 0.30:\n",
+ " logger.warning(\"STARVATION: %.0f%% zero-var groups — curriculum %s\",\n",
+ " 100*skip0var/(n_grps+skipped),\n",
+ " \"too easy\" if gr_acc>0.75 else \"too hard\")\n",
+ "\n",
+ " # ── Evaluation (every eval_every iterations) ───────────────────────────────\n",
+ " iter_metrics: Dict[str, Any] = {\n",
+ " \"iteration\": iteration, \"loss\": loss_val, \"mean_reward\": mean_r,\n",
+ " \"std_reward\": std_r, \"batch_accuracy\": acc_r, \"grounded_accuracy\": gr_acc,\n",
+ " \"gt_match_rate\": round(gt_rate,4), \"step_accuracy\": step_a, \"lccp\": lccp_a,\n",
+ " \"n_groups\": n_grps, \"skipped_groups\": skipped, \"learning_rate\": cur_lr,\n",
+ " \"iter_time_s\": iter_time, \"training_phase\": _phase.name,\n",
+ " \"effective_sp_ratio\": round(_eff_sp,3), \"selfplay_suspended\": int(_sp_suspended),\n",
+ " \"chain_prm_corr\": round(_chain_corr,3), \"chain_scoring_active\": int(_use_chain),\n",
+ " \"n_sp_groups\": n_sp, \"mean_q_reward\": round(mean_qr,4),\n",
+ " \"q_gen_valid_rate\": round(q_val/q_att if q_att>0 else 0,4),\n",
+ " \"episode_id\": _epi_state.episode_id, # from env.state\n",
+ " \"episode_steps\": _epi_state.step_count, # from env.state\n",
+ " }\n",
+ "\n",
+ " if iteration % args.eval_every == 0:\n",
+ " logger.info(\"Evaluating (%d samples) ...\", args.eval_max_samples)\n",
+ " eval_res = evaluate_policy(model, tokenizer, args.eval_data_path,\n",
+ " args.eval_max_samples, args.eval_max_new_tokens,\n",
+ " math_env=math_env, pass_at_k=args.eval_pass_at_k)\n",
+ " cur_comb = float(eval_res.get(\"combined_score\", best_combined))\n",
+ " logger.info(\"Eval combined=%.4f correct=%.1f%% best=%.4f\",\n",
+ " cur_comb, 100*float(eval_res.get(\"correct_rate\",0.)), best_combined)\n",
+ " if cur_comb > best_combined + 1e-4:\n",
+ " best_combined = cur_comb\n",
+ " best_prm_mean = max(best_prm_mean, float(eval_res.get(\"prm_mean\",0.)))\n",
+ " model.save_pretrained(str(out_dir/\"best_policy\"))\n",
+ " tokenizer.save_pretrained(str(out_dir/\"best_policy\"))\n",
+ " logger.info(\"New best → %s\", out_dir/\"best_policy\")\n",
+ " iter_metrics.update(eval_res)\n",
+ "\n",
+ " # ── Checkpoint ────────────────────────────────────────────────────────────\n",
+ " if iteration == args.num_iterations or (args.save_every>0 and iteration%args.save_every==0):\n",
+ " ck = out_dir / f\"iter_{iteration:04d}\"\n",
+ " ck.mkdir(exist_ok=True)\n",
+ " model.save_pretrained(str(ck)); tokenizer.save_pretrained(str(ck))\n",
+ " if args.keep_last and args.keep_last > 0:\n",
+ " old = sorted(p for p in out_dir.iterdir() if p.is_dir() and p.name.startswith(\"iter_\"))\n",
+ " for o in old[:-args.keep_last]:\n",
+ " try: shutil.rmtree(o); logger.info(\"Pruned: %s\", o.name)\n",
+ " except OSError as e: logger.warning(\"Could not prune %s: %s\", o.name, e)\n",
+ "\n",
+ " # ── Write metrics ─────────────────────────────────────────────────────────\n",
+ " metrics_log.append(iter_metrics)\n",
+ " (out_dir/\"metrics.jsonl\").write_text(\n",
+ " \"\\n\".join(json.dumps(m) for m in metrics_log), encoding=\"utf-8\")\n",
+ " _append_metrics_csv({\n",
+ " \"iteration\": iter_metrics[\"iteration\"],\n",
+ " \"timestamp\": datetime.now().isoformat(timespec=\"seconds\"),\n",
+ " \"loss\": iter_metrics.get(\"loss\",0.),\n",
+ " \"mean_reward\": iter_metrics.get(\"mean_reward\",0.),\n",
+ " \"batch_acc\": iter_metrics.get(\"batch_accuracy\",0.),\n",
+ " \"grounded_acc\": iter_metrics.get(\"grounded_accuracy\",0.),\n",
+ " \"gt_match\": iter_metrics.get(\"gt_match_rate\",0.),\n",
+ " \"step_acc\": iter_metrics.get(\"step_accuracy\",0.),\n",
+ " \"lccp\": iter_metrics.get(\"lccp\",0.),\n",
+ " \"n_groups\": iter_metrics.get(\"n_groups\",0),\n",
+ " \"skipped\": iter_metrics.get(\"skipped_groups\",0),\n",
+ " \"sp_ratio\": iter_metrics.get(\"effective_sp_ratio\",0.),\n",
+ " \"phase\": iter_metrics.get(\"training_phase\",\"\"),\n",
+ " \"lr\": iter_metrics.get(\"learning_rate\",0.),\n",
+ " \"iter_s\": iter_metrics.get(\"iter_time_s\",0.),\n",
+ " \"eval_combined\":iter_metrics.get(\"combined_score\",\"\") if \"combined_score\" in iter_metrics else \"\",\n",
+ " \"eval_correct\": iter_metrics.get(\"correct_rate\",\"\") if \"combined_score\" in iter_metrics else \"\",\n",
+ " \"eval_prm\": iter_metrics.get(\"prm_mean\",\"\") if \"combined_score\" in iter_metrics else \"\",\n",
+ " }, step=iter_metrics[\"iteration\"])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-close",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ════════════════════════════════════════════════════════════════════════════\n",
+ "# CLOSE — persist curriculum state and finalise run\n",
+ "# ════════════════════════════════════════════════════════════════════════════\n",
+ "env.close() # saves CurriculumManager state to checkpoints/curriculum/\n",
+ "_teardown() # restore stdout/stderr, flush CSV and log files\n",
+ "\n",
+ "logger.info(\"=\" * 70)\n",
+ "logger.info(\"GRPO training complete.\")\n",
+ "logger.info(\"Best combined score : %.4f\", best_combined)\n",
+ "logger.info(\"Best PRM mean : %.3f\", best_prm_mean)\n",
+ "logger.info(\"Checkpoints : %s\", out_dir)\n",
+ "logger.info(\"Logs : %s\", log_dir)\n",
+ "logger.info(\"=\" * 70)\n",
+ "\n",
+ "summary = {\n",
+ " \"run_name\": run_name,\n",
+ " \"best_combined\": best_combined,\n",
+ " \"best_prm_mean\": best_prm_mean,\n",
+ " \"total_iters\": args.num_iterations,\n",
+ " \"checkpoints\": str(out_dir),\n",
+ " \"log_dir\": str(log_dir),\n",
+ " \"metrics_csv\": str(_csv_logger.metrics_file),\n",
+ " \"metrics_json\": str(_csv_logger.log_path / \"detailed_metrics\"),\n",
+ "}\n",
+ "_csv_logger.save_summary(summary)\n",
+ "logger.info(\"Summary → %s\", _csv_logger.log_path / \"summary.json\")\n",
+ "\n",
+ "# Auto-generate training plots if matplotlib is available\n",
+ "_jsonl = out_dir / \"metrics.jsonl\"\n",
+ "if _jsonl.exists():\n",
+ " try:\n",
+ " from scripts.plot_grpo_run import generate_plots\n",
+ " _pdir = generate_plots(_jsonl)\n",
+ " logger.info(\"Plots → %s\", _pdir)\n",
+ " except Exception as _pe:\n",
+ " logger.warning(\"Plot generation skipped (%s). Run manually: \"\n",
+ " \"python scripts/plot_grpo_run.py %s\", _pe, _jsonl)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000000000000000000000000000000000000..68c5276c4060b090aa912fbe044d38bf17ea913f
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,2992 @@
+version = 1
+revision = 3
+requires-python = ">=3.10"
+resolution-markers = [
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+ "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version < '3.11'",
+]
+
+[[package]]
+name = "aiofile"
+version = "3.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "caio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/67/e2/d7cb819de8df6b5c1968a2756c3cb4122d4fa2b8fc768b53b7c9e5edb646/aiofile-3.9.0.tar.gz", hash = "sha256:e5ad718bb148b265b6df1b3752c4d1d83024b93da9bd599df74b9d9ffcf7919b", size = 17943, upload-time = "2024-10-08T10:39:35.846Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/50/25/da1f0b4dd970e52bf5a36c204c107e11a0c6d3ed195eba0bfbc664c312b2/aiofile-3.9.0-py3-none-any.whl", hash = "sha256:ce2f6c1571538cbdfa0143b04e16b208ecb0e9cb4148e528af8a640ed51cc8aa", size = 19539, upload-time = "2024-10-08T10:39:32.955Z" },
+]
+
+[[package]]
+name = "annotated-doc"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" },
+]
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
+[[package]]
+name = "anyio"
+version = "4.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+ { name = "idna" },
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" },
+]
+
+[[package]]
+name = "attrs"
+version = "26.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" },
+]
+
+[[package]]
+name = "audioop-lts"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/38/53/946db57842a50b2da2e0c1e34bd37f36f5aadba1a929a3971c5d7841dbca/audioop_lts-0.2.2.tar.gz", hash = "sha256:64d0c62d88e67b98a1a5e71987b7aa7b5bcffc7dcee65b635823dbdd0a8dbbd0", size = 30686, upload-time = "2025-08-05T16:43:17.409Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/de/d4/94d277ca941de5a507b07f0b592f199c22454eeaec8f008a286b3fbbacd6/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd3d4602dc64914d462924a08c1a9816435a2155d74f325853c1f1ac3b2d9800", size = 46523, upload-time = "2025-08-05T16:42:20.836Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/5a/656d1c2da4b555920ce4177167bfeb8623d98765594af59702c8873f60ec/audioop_lts-0.2.2-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:550c114a8df0aafe9a05442a1162dfc8fec37e9af1d625ae6060fed6e756f303", size = 27455, upload-time = "2025-08-05T16:42:22.283Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/83/ea581e364ce7b0d41456fb79d6ee0ad482beda61faf0cab20cbd4c63a541/audioop_lts-0.2.2-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:9a13dc409f2564de15dd68be65b462ba0dde01b19663720c68c1140c782d1d75", size = 26997, upload-time = "2025-08-05T16:42:23.849Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/3b/e8964210b5e216e5041593b7d33e97ee65967f17c282e8510d19c666dab4/audioop_lts-0.2.2-cp313-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:51c916108c56aa6e426ce611946f901badac950ee2ddaf302b7ed35d9958970d", size = 85844, upload-time = "2025-08-05T16:42:25.208Z" },
+ { url = "https://files.pythonhosted.org/packages/c7/2e/0a1c52faf10d51def20531a59ce4c706cb7952323b11709e10de324d6493/audioop_lts-0.2.2-cp313-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:47eba38322370347b1c47024defbd36374a211e8dd5b0dcbce7b34fdb6f8847b", size = 85056, upload-time = "2025-08-05T16:42:26.559Z" },
+ { url = "https://files.pythonhosted.org/packages/75/e8/cd95eef479656cb75ab05dfece8c1f8c395d17a7c651d88f8e6e291a63ab/audioop_lts-0.2.2-cp313-abi3-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba7c3a7e5f23e215cb271516197030c32aef2e754252c4c70a50aaff7031a2c8", size = 93892, upload-time = "2025-08-05T16:42:27.902Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/1e/a0c42570b74f83efa5cca34905b3eef03f7ab09fe5637015df538a7f3345/audioop_lts-0.2.2-cp313-abi3-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:def246fe9e180626731b26e89816e79aae2276f825420a07b4a647abaa84becc", size = 96660, upload-time = "2025-08-05T16:42:28.9Z" },
+ { url = "https://files.pythonhosted.org/packages/50/d5/8a0ae607ca07dbb34027bac8db805498ee7bfecc05fd2c148cc1ed7646e7/audioop_lts-0.2.2-cp313-abi3-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e160bf9df356d841bb6c180eeeea1834085464626dc1b68fa4e1d59070affdc3", size = 79143, upload-time = "2025-08-05T16:42:29.929Z" },
+ { url = "https://files.pythonhosted.org/packages/12/17/0d28c46179e7910bfb0bb62760ccb33edb5de973052cb2230b662c14ca2e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4b4cd51a57b698b2d06cb9993b7ac8dfe89a3b2878e96bc7948e9f19ff51dba6", size = 84313, upload-time = "2025-08-05T16:42:30.949Z" },
+ { url = "https://files.pythonhosted.org/packages/84/ba/bd5d3806641564f2024e97ca98ea8f8811d4e01d9b9f9831474bc9e14f9e/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_ppc64le.whl", hash = "sha256:4a53aa7c16a60a6857e6b0b165261436396ef7293f8b5c9c828a3a203147ed4a", size = 93044, upload-time = "2025-08-05T16:42:31.959Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/5e/435ce8d5642f1f7679540d1e73c1c42d933331c0976eb397d1717d7f01a3/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_riscv64.whl", hash = "sha256:3fc38008969796f0f689f1453722a0f463da1b8a6fbee11987830bfbb664f623", size = 78766, upload-time = "2025-08-05T16:42:33.302Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/3b/b909e76b606cbfd53875693ec8c156e93e15a1366a012f0b7e4fb52d3c34/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_s390x.whl", hash = "sha256:15ab25dd3e620790f40e9ead897f91e79c0d3ce65fe193c8ed6c26cffdd24be7", size = 87640, upload-time = "2025-08-05T16:42:34.854Z" },
+ { url = "https://files.pythonhosted.org/packages/30/e7/8f1603b4572d79b775f2140d7952f200f5e6c62904585d08a01f0a70393a/audioop_lts-0.2.2-cp313-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:03f061a1915538fd96272bac9551841859dbb2e3bf73ebe4a23ef043766f5449", size = 86052, upload-time = "2025-08-05T16:42:35.839Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/96/c37846df657ccdda62ba1ae2b6534fa90e2e1b1742ca8dcf8ebd38c53801/audioop_lts-0.2.2-cp313-abi3-win32.whl", hash = "sha256:3bcddaaf6cc5935a300a8387c99f7a7fbbe212a11568ec6cf6e4bc458c048636", size = 26185, upload-time = "2025-08-05T16:42:37.04Z" },
+ { url = "https://files.pythonhosted.org/packages/34/a5/9d78fdb5b844a83da8a71226c7bdae7cc638861085fff7a1d707cb4823fa/audioop_lts-0.2.2-cp313-abi3-win_amd64.whl", hash = "sha256:a2c2a947fae7d1062ef08c4e369e0ba2086049a5e598fda41122535557012e9e", size = 30503, upload-time = "2025-08-05T16:42:38.427Z" },
+ { url = "https://files.pythonhosted.org/packages/34/25/20d8fde083123e90c61b51afb547bb0ea7e77bab50d98c0ab243d02a0e43/audioop_lts-0.2.2-cp313-abi3-win_arm64.whl", hash = "sha256:5f93a5db13927a37d2d09637ccca4b2b6b48c19cd9eda7b17a2e9f77edee6a6f", size = 24173, upload-time = "2025-08-05T16:42:39.704Z" },
+ { url = "https://files.pythonhosted.org/packages/58/a7/0a764f77b5c4ac58dc13c01a580f5d32ae8c74c92020b961556a43e26d02/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:73f80bf4cd5d2ca7814da30a120de1f9408ee0619cc75da87d0641273d202a09", size = 47096, upload-time = "2025-08-05T16:42:40.684Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/ed/ebebedde1a18848b085ad0fa54b66ceb95f1f94a3fc04f1cd1b5ccb0ed42/audioop_lts-0.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:106753a83a25ee4d6f473f2be6b0966fc1c9af7e0017192f5531a3e7463dce58", size = 27748, upload-time = "2025-08-05T16:42:41.992Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/6e/11ca8c21af79f15dbb1c7f8017952ee8c810c438ce4e2b25638dfef2b02c/audioop_lts-0.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fbdd522624141e40948ab3e8cdae6e04c748d78710e9f0f8d4dae2750831de19", size = 27329, upload-time = "2025-08-05T16:42:42.987Z" },
+ { url = "https://files.pythonhosted.org/packages/84/52/0022f93d56d85eec5da6b9da6a958a1ef09e80c39f2cc0a590c6af81dcbb/audioop_lts-0.2.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:143fad0311e8209ece30a8dbddab3b65ab419cbe8c0dde6e8828da25999be911", size = 92407, upload-time = "2025-08-05T16:42:44.336Z" },
+ { url = "https://files.pythonhosted.org/packages/87/1d/48a889855e67be8718adbc7a01f3c01d5743c325453a5e81cf3717664aad/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfbbc74ec68a0fd08cfec1f4b5e8cca3d3cd7de5501b01c4b5d209995033cde9", size = 91811, upload-time = "2025-08-05T16:42:45.325Z" },
+ { url = "https://files.pythonhosted.org/packages/98/a6/94b7213190e8077547ffae75e13ed05edc488653c85aa5c41472c297d295/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cfcac6aa6f42397471e4943e0feb2244549db5c5d01efcd02725b96af417f3fe", size = 100470, upload-time = "2025-08-05T16:42:46.468Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/e9/78450d7cb921ede0cfc33426d3a8023a3bda755883c95c868ee36db8d48d/audioop_lts-0.2.2-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:752d76472d9804ac60f0078c79cdae8b956f293177acd2316cd1e15149aee132", size = 103878, upload-time = "2025-08-05T16:42:47.576Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/e2/cd5439aad4f3e34ae1ee852025dc6aa8f67a82b97641e390bf7bd9891d3e/audioop_lts-0.2.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:83c381767e2cc10e93e40281a04852facc4cd9334550e0f392f72d1c0a9c5753", size = 84867, upload-time = "2025-08-05T16:42:49.003Z" },
+ { url = "https://files.pythonhosted.org/packages/68/4b/9d853e9076c43ebba0d411e8d2aa19061083349ac695a7d082540bad64d0/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c0022283e9556e0f3643b7c3c03f05063ca72b3063291834cca43234f20c60bb", size = 90001, upload-time = "2025-08-05T16:42:50.038Z" },
+ { url = "https://files.pythonhosted.org/packages/58/26/4bae7f9d2f116ed5593989d0e521d679b0d583973d203384679323d8fa85/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a2d4f1513d63c795e82948e1305f31a6d530626e5f9f2605408b300ae6095093", size = 99046, upload-time = "2025-08-05T16:42:51.111Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/67/a9f4fb3e250dda9e9046f8866e9fa7d52664f8985e445c6b4ad6dfb55641/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:c9c8e68d8b4a56fda8c025e538e639f8c5953f5073886b596c93ec9b620055e7", size = 84788, upload-time = "2025-08-05T16:42:52.198Z" },
+ { url = "https://files.pythonhosted.org/packages/70/f7/3de86562db0121956148bcb0fe5b506615e3bcf6e63c4357a612b910765a/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:96f19de485a2925314f5020e85911fb447ff5fbef56e8c7c6927851b95533a1c", size = 94472, upload-time = "2025-08-05T16:42:53.59Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/32/fd772bf9078ae1001207d2df1eef3da05bea611a87dd0e8217989b2848fa/audioop_lts-0.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e541c3ef484852ef36545f66209444c48b28661e864ccadb29daddb6a4b8e5f5", size = 92279, upload-time = "2025-08-05T16:42:54.632Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/41/affea7181592ab0ab560044632571a38edaf9130b84928177823fbf3176a/audioop_lts-0.2.2-cp313-cp313t-win32.whl", hash = "sha256:d5e73fa573e273e4f2e5ff96f9043858a5e9311e94ffefd88a3186a910c70917", size = 26568, upload-time = "2025-08-05T16:42:55.627Z" },
+ { url = "https://files.pythonhosted.org/packages/28/2b/0372842877016641db8fc54d5c88596b542eec2f8f6c20a36fb6612bf9ee/audioop_lts-0.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9191d68659eda01e448188f60364c7763a7ca6653ed3f87ebb165822153a8547", size = 30942, upload-time = "2025-08-05T16:42:56.674Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/ca/baf2b9cc7e96c179bb4a54f30fcd83e6ecb340031bde68f486403f943768/audioop_lts-0.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:c174e322bb5783c099aaf87faeb240c8d210686b04bd61dfd05a8e5a83d88969", size = 24603, upload-time = "2025-08-05T16:42:57.571Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/73/413b5a2804091e2c7d5def1d618e4837f1cb82464e230f827226278556b7/audioop_lts-0.2.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:f9ee9b52f5f857fbaf9d605a360884f034c92c1c23021fb90b2e39b8e64bede6", size = 47104, upload-time = "2025-08-05T16:42:58.518Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/8c/daa3308dc6593944410c2c68306a5e217f5c05b70a12e70228e7dd42dc5c/audioop_lts-0.2.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:49ee1a41738a23e98d98b937a0638357a2477bc99e61b0f768a8f654f45d9b7a", size = 27754, upload-time = "2025-08-05T16:43:00.132Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/86/c2e0f627168fcf61781a8f72cab06b228fe1da4b9fa4ab39cfb791b5836b/audioop_lts-0.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5b00be98ccd0fc123dcfad31d50030d25fcf31488cde9e61692029cd7394733b", size = 27332, upload-time = "2025-08-05T16:43:01.666Z" },
+ { url = "https://files.pythonhosted.org/packages/c7/bd/35dce665255434f54e5307de39e31912a6f902d4572da7c37582809de14f/audioop_lts-0.2.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a6d2e0f9f7a69403e388894d4ca5ada5c47230716a03f2847cfc7bd1ecb589d6", size = 92396, upload-time = "2025-08-05T16:43:02.991Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/d2/deeb9f51def1437b3afa35aeb729d577c04bcd89394cb56f9239a9f50b6f/audioop_lts-0.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9b0b8a03ef474f56d1a842af1a2e01398b8f7654009823c6d9e0ecff4d5cfbf", size = 91811, upload-time = "2025-08-05T16:43:04.096Z" },
+ { url = "https://files.pythonhosted.org/packages/76/3b/09f8b35b227cee28cc8231e296a82759ed80c1a08e349811d69773c48426/audioop_lts-0.2.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2b267b70747d82125f1a021506565bdc5609a2b24bcb4773c16d79d2bb260bbd", size = 100483, upload-time = "2025-08-05T16:43:05.085Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/15/05b48a935cf3b130c248bfdbdea71ce6437f5394ee8533e0edd7cfd93d5e/audioop_lts-0.2.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0337d658f9b81f4cd0fdb1f47635070cc084871a3d4646d9de74fdf4e7c3d24a", size = 103885, upload-time = "2025-08-05T16:43:06.197Z" },
+ { url = "https://files.pythonhosted.org/packages/83/80/186b7fce6d35b68d3d739f228dc31d60b3412105854edb975aa155a58339/audioop_lts-0.2.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:167d3b62586faef8b6b2275c3218796b12621a60e43f7e9d5845d627b9c9b80e", size = 84899, upload-time = "2025-08-05T16:43:07.291Z" },
+ { url = "https://files.pythonhosted.org/packages/49/89/c78cc5ac6cb5828f17514fb12966e299c850bc885e80f8ad94e38d450886/audioop_lts-0.2.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0d9385e96f9f6da847f4d571ce3cb15b5091140edf3db97276872647ce37efd7", size = 89998, upload-time = "2025-08-05T16:43:08.335Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/4b/6401888d0c010e586c2ca50fce4c903d70a6bb55928b16cfbdfd957a13da/audioop_lts-0.2.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:48159d96962674eccdca9a3df280e864e8ac75e40a577cc97c5c42667ffabfc5", size = 99046, upload-time = "2025-08-05T16:43:09.367Z" },
+ { url = "https://files.pythonhosted.org/packages/de/f8/c874ca9bb447dae0e2ef2e231f6c4c2b0c39e31ae684d2420b0f9e97ee68/audioop_lts-0.2.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:8fefe5868cd082db1186f2837d64cfbfa78b548ea0d0543e9b28935ccce81ce9", size = 84843, upload-time = "2025-08-05T16:43:10.749Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/c0/0323e66f3daebc13fd46b36b30c3be47e3fc4257eae44f1e77eb828c703f/audioop_lts-0.2.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:58cf54380c3884fb49fdd37dfb7a772632b6701d28edd3e2904743c5e1773602", size = 94490, upload-time = "2025-08-05T16:43:12.131Z" },
+ { url = "https://files.pythonhosted.org/packages/98/6b/acc7734ac02d95ab791c10c3f17ffa3584ccb9ac5c18fd771c638ed6d1f5/audioop_lts-0.2.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:088327f00488cdeed296edd9215ca159f3a5a5034741465789cad403fcf4bec0", size = 92297, upload-time = "2025-08-05T16:43:13.139Z" },
+ { url = "https://files.pythonhosted.org/packages/13/c3/c3dc3f564ce6877ecd2a05f8d751b9b27a8c320c2533a98b0c86349778d0/audioop_lts-0.2.2-cp314-cp314t-win32.whl", hash = "sha256:068aa17a38b4e0e7de771c62c60bbca2455924b67a8814f3b0dee92b5820c0b3", size = 27331, upload-time = "2025-08-05T16:43:14.19Z" },
+ { url = "https://files.pythonhosted.org/packages/72/bb/b4608537e9ffcb86449091939d52d24a055216a36a8bf66b936af8c3e7ac/audioop_lts-0.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:a5bf613e96f49712073de86f20dbdd4014ca18efd4d34ed18c75bd808337851b", size = 31697, upload-time = "2025-08-05T16:43:15.193Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/22/91616fe707a5c5510de2cac9b046a30defe7007ba8a0c04f9c08f27df312/audioop_lts-0.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:b492c3b040153e68b9fdaff5913305aaaba5bb433d8a7f73d5cf6a64ed3cc1dd", size = 25206, upload-time = "2025-08-05T16:43:16.444Z" },
+]
+
+[[package]]
+name = "authlib"
+version = "1.7.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cryptography" },
+ { name = "joserfc" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/82/4d0603f30c1b4629b1f091bb266b0d7986434891d6940a8c87f8098db24e/authlib-1.7.0.tar.gz", hash = "sha256:b3e326c9aa9cc3ea95fe7d89fd880722d3608da4d00e8a27e061e64b48d801d5", size = 175890, upload-time = "2026-04-18T11:00:28.559Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ca/48/c954218b2a250e23f178f10167c4173fecb5a75d2c206f0a67ba58006c26/authlib-1.7.0-py2.py3-none-any.whl", hash = "sha256:e36817afb02f6f0b6bf55f150782499ddd6ddf44b402bb055d3263cc65ac9ae0", size = 258779, upload-time = "2026-04-18T11:00:26.64Z" },
+]
+
+[[package]]
+name = "backports-tarfile"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/86/72/cd9b395f25e290e633655a100af28cb253e4393396264a98bd5f5951d50f/backports_tarfile-1.2.0.tar.gz", hash = "sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991", size = 86406, upload-time = "2024-05-28T17:01:54.731Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b9/fa/123043af240e49752f1c4bd24da5053b6bd00cad78c2be53c0d1e8b975bc/backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34", size = 30181, upload-time = "2024-05-28T17:01:53.112Z" },
+]
+
+[[package]]
+name = "beartype"
+version = "0.22.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/94/1009e248bbfbab11397abca7193bea6626806be9a327d399810d523a07cb/beartype-0.22.9.tar.gz", hash = "sha256:8f82b54aa723a2848a56008d18875f91c1db02c32ef6a62319a002e3e25a975f", size = 1608866, upload-time = "2025-12-13T06:50:30.72Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/71/cc/18245721fa7747065ab478316c7fea7c74777d07f37ae60db2e84f8172e8/beartype-0.22.9-py3-none-any.whl", hash = "sha256:d16c9bbc61ea14637596c5f6fbff2ee99cbe3573e46a716401734ef50c3060c2", size = 1333658, upload-time = "2025-12-13T06:50:28.266Z" },
+]
+
+[[package]]
+name = "brotli"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f7/16/c92ca344d646e71a43b8bb353f0a6490d7f6e06210f8554c8f874e454285/brotli-1.2.0.tar.gz", hash = "sha256:e310f77e41941c13340a95976fe66a8a95b01e783d430eeaf7a2f87e0a57dd0a", size = 7388632, upload-time = "2025-11-05T18:39:42.86Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/64/10/a090475284fc4a71aed40a96f32e44a7fe5bda39687353dd977720b211b6/brotli-1.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b90b767916ac44e93a8e28ce6adf8d551e43affb512f2377c732d486ac6514e", size = 863089, upload-time = "2025-11-05T18:38:01.181Z" },
+ { url = "https://files.pythonhosted.org/packages/03/41/17416630e46c07ac21e378c3464815dd2e120b441e641bc516ac32cc51d2/brotli-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6be67c19e0b0c56365c6a76e393b932fb0e78b3b56b711d180dd7013cb1fd984", size = 445442, upload-time = "2025-11-05T18:38:02.434Z" },
+ { url = "https://files.pythonhosted.org/packages/24/31/90cc06584deb5d4fcafc0985e37741fc6b9717926a78674bbb3ce018957e/brotli-1.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0bbd5b5ccd157ae7913750476d48099aaf507a79841c0d04a9db4415b14842de", size = 1532658, upload-time = "2025-11-05T18:38:03.588Z" },
+ { url = "https://files.pythonhosted.org/packages/62/17/33bf0c83bcbc96756dfd712201d87342732fad70bb3472c27e833a44a4f9/brotli-1.2.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3f3c908bcc404c90c77d5a073e55271a0a498f4e0756e48127c35d91cf155947", size = 1631241, upload-time = "2025-11-05T18:38:04.582Z" },
+ { url = "https://files.pythonhosted.org/packages/48/10/f47854a1917b62efe29bc98ac18e5d4f71df03f629184575b862ef2e743b/brotli-1.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b557b29782a643420e08d75aea889462a4a8796e9a6cf5621ab05a3f7da8ef2", size = 1424307, upload-time = "2025-11-05T18:38:05.587Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/b7/f88eb461719259c17483484ea8456925ee057897f8e64487d76e24e5e38d/brotli-1.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81da1b229b1889f25adadc929aeb9dbc4e922bd18561b65b08dd9343cfccca84", size = 1488208, upload-time = "2025-11-05T18:38:06.613Z" },
+ { url = "https://files.pythonhosted.org/packages/26/59/41bbcb983a0c48b0b8004203e74706c6b6e99a04f3c7ca6f4f41f364db50/brotli-1.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ff09cd8c5eec3b9d02d2408db41be150d8891c5566addce57513bf546e3d6c6d", size = 1597574, upload-time = "2025-11-05T18:38:07.838Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/e6/8c89c3bdabbe802febb4c5c6ca224a395e97913b5df0dff11b54f23c1788/brotli-1.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a1778532b978d2536e79c05dac2d8cd857f6c55cd0c95ace5b03740824e0e2f1", size = 1492109, upload-time = "2025-11-05T18:38:08.816Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/9a/4b19d4310b2dbd545c0c33f176b0528fa68c3cd0754e34b2f2bcf56548ae/brotli-1.2.0-cp310-cp310-win32.whl", hash = "sha256:b232029d100d393ae3c603c8ffd7e3fe6f798c5e28ddca5feabb8e8fdb732997", size = 334461, upload-time = "2025-11-05T18:38:10.729Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/39/70981d9f47705e3c2b95c0847dfa3e7a37aa3b7c6030aedc4873081ed005/brotli-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:ef87b8ab2704da227e83a246356a2b179ef826f550f794b2c52cddb4efbd0196", size = 369035, upload-time = "2025-11-05T18:38:11.827Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/ef/f285668811a9e1ddb47a18cb0b437d5fc2760d537a2fe8a57875ad6f8448/brotli-1.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:15b33fe93cedc4caaff8a0bd1eb7e3dab1c61bb22a0bf5bdfdfd97cd7da79744", size = 863110, upload-time = "2025-11-05T18:38:12.978Z" },
+ { url = "https://files.pythonhosted.org/packages/50/62/a3b77593587010c789a9d6eaa527c79e0848b7b860402cc64bc0bc28a86c/brotli-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:898be2be399c221d2671d29eed26b6b2713a02c2119168ed914e7d00ceadb56f", size = 445438, upload-time = "2025-11-05T18:38:14.208Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/e1/7fadd47f40ce5549dc44493877db40292277db373da5053aff181656e16e/brotli-1.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350c8348f0e76fff0a0fd6c26755d2653863279d086d3aa2c290a6a7251135dd", size = 1534420, upload-time = "2025-11-05T18:38:15.111Z" },
+ { url = "https://files.pythonhosted.org/packages/12/8b/1ed2f64054a5a008a4ccd2f271dbba7a5fb1a3067a99f5ceadedd4c1d5a7/brotli-1.2.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e1ad3fda65ae0d93fec742a128d72e145c9c7a99ee2fcd667785d99eb25a7fe", size = 1632619, upload-time = "2025-11-05T18:38:16.094Z" },
+ { url = "https://files.pythonhosted.org/packages/89/5a/7071a621eb2d052d64efd5da2ef55ecdac7c3b0c6e4f9d519e9c66d987ef/brotli-1.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:40d918bce2b427a0c4ba189df7a006ac0c7277c180aee4617d99e9ccaaf59e6a", size = 1426014, upload-time = "2025-11-05T18:38:17.177Z" },
+ { url = "https://files.pythonhosted.org/packages/26/6d/0971a8ea435af5156acaaccec1a505f981c9c80227633851f2810abd252a/brotli-1.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2a7f1d03727130fc875448b65b127a9ec5d06d19d0148e7554384229706f9d1b", size = 1489661, upload-time = "2025-11-05T18:38:18.41Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/75/c1baca8b4ec6c96a03ef8230fab2a785e35297632f402ebb1e78a1e39116/brotli-1.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9c79f57faa25d97900bfb119480806d783fba83cd09ee0b33c17623935b05fa3", size = 1599150, upload-time = "2025-11-05T18:38:19.792Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/1a/23fcfee1c324fd48a63d7ebf4bac3a4115bdb1b00e600f80f727d850b1ae/brotli-1.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:844a8ceb8483fefafc412f85c14f2aae2fb69567bf2a0de53cdb88b73e7c43ae", size = 1493505, upload-time = "2025-11-05T18:38:20.913Z" },
+ { url = "https://files.pythonhosted.org/packages/36/e5/12904bbd36afeef53d45a84881a4810ae8810ad7e328a971ebbfd760a0b3/brotli-1.2.0-cp311-cp311-win32.whl", hash = "sha256:aa47441fa3026543513139cb8926a92a8e305ee9c71a6209ef7a97d91640ea03", size = 334451, upload-time = "2025-11-05T18:38:21.94Z" },
+ { url = "https://files.pythonhosted.org/packages/02/8b/ecb5761b989629a4758c394b9301607a5880de61ee2ee5fe104b87149ebc/brotli-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:022426c9e99fd65d9475dce5c195526f04bb8be8907607e27e747893f6ee3e24", size = 369035, upload-time = "2025-11-05T18:38:22.941Z" },
+ { url = "https://files.pythonhosted.org/packages/11/ee/b0a11ab2315c69bb9b45a2aaed022499c9c24a205c3a49c3513b541a7967/brotli-1.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:35d382625778834a7f3061b15423919aa03e4f5da34ac8e02c074e4b75ab4f84", size = 861543, upload-time = "2025-11-05T18:38:24.183Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/2f/29c1459513cd35828e25531ebfcbf3e92a5e49f560b1777a9af7203eb46e/brotli-1.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7a61c06b334bd99bc5ae84f1eeb36bfe01400264b3c352f968c6e30a10f9d08b", size = 444288, upload-time = "2025-11-05T18:38:25.139Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/6f/feba03130d5fceadfa3a1bb102cb14650798c848b1df2a808356f939bb16/brotli-1.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:acec55bb7c90f1dfc476126f9711a8e81c9af7fb617409a9ee2953115343f08d", size = 1528071, upload-time = "2025-11-05T18:38:26.081Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/38/f3abb554eee089bd15471057ba85f47e53a44a462cfce265d9bf7088eb09/brotli-1.2.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:260d3692396e1895c5034f204f0db022c056f9e2ac841593a4cf9426e2a3faca", size = 1626913, upload-time = "2025-11-05T18:38:27.284Z" },
+ { url = "https://files.pythonhosted.org/packages/03/a7/03aa61fbc3c5cbf99b44d158665f9b0dd3d8059be16c460208d9e385c837/brotli-1.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:072e7624b1fc4d601036ab3f4f27942ef772887e876beff0301d261210bca97f", size = 1419762, upload-time = "2025-11-05T18:38:28.295Z" },
+ { url = "https://files.pythonhosted.org/packages/21/1b/0374a89ee27d152a5069c356c96b93afd1b94eae83f1e004b57eb6ce2f10/brotli-1.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adedc4a67e15327dfdd04884873c6d5a01d3e3b6f61406f99b1ed4865a2f6d28", size = 1484494, upload-time = "2025-11-05T18:38:29.29Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/57/69d4fe84a67aef4f524dcd075c6eee868d7850e85bf01d778a857d8dbe0a/brotli-1.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7a47ce5c2288702e09dc22a44d0ee6152f2c7eda97b3c8482d826a1f3cfc7da7", size = 1593302, upload-time = "2025-11-05T18:38:30.639Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/3b/39e13ce78a8e9a621c5df3aeb5fd181fcc8caba8c48a194cd629771f6828/brotli-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:af43b8711a8264bb4e7d6d9a6d004c3a2019c04c01127a868709ec29962b6036", size = 1487913, upload-time = "2025-11-05T18:38:31.618Z" },
+ { url = "https://files.pythonhosted.org/packages/62/28/4d00cb9bd76a6357a66fcd54b4b6d70288385584063f4b07884c1e7286ac/brotli-1.2.0-cp312-cp312-win32.whl", hash = "sha256:e99befa0b48f3cd293dafeacdd0d191804d105d279e0b387a32054c1180f3161", size = 334362, upload-time = "2025-11-05T18:38:32.939Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/4e/bc1dcac9498859d5e353c9b153627a3752868a9d5f05ce8dedd81a2354ab/brotli-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:b35c13ce241abdd44cb8ca70683f20c0c079728a36a996297adb5334adfc1c44", size = 369115, upload-time = "2025-11-05T18:38:33.765Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/d4/4ad5432ac98c73096159d9ce7ffeb82d151c2ac84adcc6168e476bb54674/brotli-1.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9e5825ba2c9998375530504578fd4d5d1059d09621a02065d1b6bfc41a8e05ab", size = 861523, upload-time = "2025-11-05T18:38:34.67Z" },
+ { url = "https://files.pythonhosted.org/packages/91/9f/9cc5bd03ee68a85dc4bc89114f7067c056a3c14b3d95f171918c088bf88d/brotli-1.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0cf8c3b8ba93d496b2fae778039e2f5ecc7cff99df84df337ca31d8f2252896c", size = 444289, upload-time = "2025-11-05T18:38:35.6Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/b6/fe84227c56a865d16a6614e2c4722864b380cb14b13f3e6bef441e73a85a/brotli-1.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8565e3cdc1808b1a34714b553b262c5de5fbda202285782173ec137fd13709f", size = 1528076, upload-time = "2025-11-05T18:38:36.639Z" },
+ { url = "https://files.pythonhosted.org/packages/55/de/de4ae0aaca06c790371cf6e7ee93a024f6b4bb0568727da8c3de112e726c/brotli-1.2.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:26e8d3ecb0ee458a9804f47f21b74845cc823fd1bb19f02272be70774f56e2a6", size = 1626880, upload-time = "2025-11-05T18:38:37.623Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/16/a1b22cbea436642e071adcaf8d4b350a2ad02f5e0ad0da879a1be16188a0/brotli-1.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67a91c5187e1eec76a61625c77a6c8c785650f5b576ca732bd33ef58b0dff49c", size = 1419737, upload-time = "2025-11-05T18:38:38.729Z" },
+ { url = "https://files.pythonhosted.org/packages/46/63/c968a97cbb3bdbf7f974ef5a6ab467a2879b82afbc5ffb65b8acbb744f95/brotli-1.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4ecdb3b6dc36e6d6e14d3a1bdc6c1057c8cbf80db04031d566eb6080ce283a48", size = 1484440, upload-time = "2025-11-05T18:38:39.916Z" },
+ { url = "https://files.pythonhosted.org/packages/06/9d/102c67ea5c9fc171f423e8399e585dabea29b5bc79b05572891e70013cdd/brotli-1.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3e1b35d56856f3ed326b140d3c6d9db91740f22e14b06e840fe4bb1923439a18", size = 1593313, upload-time = "2025-11-05T18:38:41.24Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/4a/9526d14fa6b87bc827ba1755a8440e214ff90de03095cacd78a64abe2b7d/brotli-1.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54a50a9dad16b32136b2241ddea9e4df159b41247b2ce6aac0b3276a66a8f1e5", size = 1487945, upload-time = "2025-11-05T18:38:42.277Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/e8/3fe1ffed70cbef83c5236166acaed7bb9c766509b157854c80e2f766b38c/brotli-1.2.0-cp313-cp313-win32.whl", hash = "sha256:1b1d6a4efedd53671c793be6dd760fcf2107da3a52331ad9ea429edf0902f27a", size = 334368, upload-time = "2025-11-05T18:38:43.345Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/91/e739587be970a113b37b821eae8097aac5a48e5f0eca438c22e4c7dd8648/brotli-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:b63daa43d82f0cdabf98dee215b375b4058cce72871fd07934f179885aad16e8", size = 369116, upload-time = "2025-11-05T18:38:44.609Z" },
+ { url = "https://files.pythonhosted.org/packages/17/e1/298c2ddf786bb7347a1cd71d63a347a79e5712a7c0cba9e3c3458ebd976f/brotli-1.2.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:6c12dad5cd04530323e723787ff762bac749a7b256a5bece32b2243dd5c27b21", size = 863080, upload-time = "2025-11-05T18:38:45.503Z" },
+ { url = "https://files.pythonhosted.org/packages/84/0c/aac98e286ba66868b2b3b50338ffbd85a35c7122e9531a73a37a29763d38/brotli-1.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3219bd9e69868e57183316ee19c84e03e8f8b5a1d1f2667e1aa8c2f91cb061ac", size = 445453, upload-time = "2025-11-05T18:38:46.433Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/f1/0ca1f3f99ae300372635ab3fe2f7a79fa335fee3d874fa7f9e68575e0e62/brotli-1.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:963a08f3bebd8b75ac57661045402da15991468a621f014be54e50f53a58d19e", size = 1528168, upload-time = "2025-11-05T18:38:47.371Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/a6/2ebfc8f766d46df8d3e65b880a2e220732395e6d7dc312c1e1244b0f074a/brotli-1.2.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9322b9f8656782414b37e6af884146869d46ab85158201d82bab9abbcb971dc7", size = 1627098, upload-time = "2025-11-05T18:38:48.385Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/2f/0976d5b097ff8a22163b10617f76b2557f15f0f39d6a0fe1f02b1a53e92b/brotli-1.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cf9cba6f5b78a2071ec6fb1e7bd39acf35071d90a81231d67e92d637776a6a63", size = 1419861, upload-time = "2025-11-05T18:38:49.372Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/97/d76df7176a2ce7616ff94c1fb72d307c9a30d2189fe877f3dd99af00ea5a/brotli-1.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7547369c4392b47d30a3467fe8c3330b4f2e0f7730e45e3103d7d636678a808b", size = 1484594, upload-time = "2025-11-05T18:38:50.655Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/93/14cf0b1216f43df5609f5b272050b0abd219e0b54ea80b47cef9867b45e7/brotli-1.2.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1530af5c3c275b8524f2e24841cbe2599d74462455e9bae5109e9ff42e9361", size = 1593455, upload-time = "2025-11-05T18:38:51.624Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/73/3183c9e41ca755713bdf2cc1d0810df742c09484e2e1ddd693bee53877c1/brotli-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d2d085ded05278d1c7f65560aae97b3160aeb2ea2c0b3e26204856beccb60888", size = 1488164, upload-time = "2025-11-05T18:38:53.079Z" },
+ { url = "https://files.pythonhosted.org/packages/64/6a/0c78d8f3a582859236482fd9fa86a65a60328a00983006bcf6d83b7b2253/brotli-1.2.0-cp314-cp314-win32.whl", hash = "sha256:832c115a020e463c2f67664560449a7bea26b0c1fdd690352addad6d0a08714d", size = 339280, upload-time = "2025-11-05T18:38:54.02Z" },
+ { url = "https://files.pythonhosted.org/packages/f5/10/56978295c14794b2c12007b07f3e41ba26acda9257457d7085b0bb3bb90c/brotli-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:e7c0af964e0b4e3412a0ebf341ea26ec767fa0b4cf81abb5e897c9338b5ad6a3", size = 375639, upload-time = "2025-11-05T18:38:55.67Z" },
+]
+
+[[package]]
+name = "cachetools"
+version = "7.0.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/76/7b/1755ed2c6bfabd1d98b37ae73152f8dcf94aa40fee119d163c19ed484704/cachetools-7.0.6.tar.gz", hash = "sha256:e5d524d36d65703a87243a26ff08ad84f73352adbeafb1cde81e207b456aaf24", size = 37526, upload-time = "2026-04-20T19:02:23.289Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/fe/c4/cf76242a5da1410917107ff14551764aa405a5fd10cd10cf9a5ca8fa77f4/cachetools-7.0.6-py3-none-any.whl", hash = "sha256:4e94956cfdd3086f12042cdd29318f5ced3893014f7d0d059bf3ead3f85b7f8b", size = 13976, upload-time = "2026-04-20T19:02:21.187Z" },
+]
+
+[[package]]
+name = "caio"
+version = "0.9.25"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/92/88/b8527e1b00c1811db339a1df8bd1ae49d146fcea9d6a5c40e3a80aaeb38d/caio-0.9.25.tar.gz", hash = "sha256:16498e7f81d1d0f5a4c0ad3f2540e65fe25691376e0a5bd367f558067113ed10", size = 26781, upload-time = "2025-12-26T15:21:36.501Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/6a/80/ea4ead0c5d52a9828692e7df20f0eafe8d26e671ce4883a0a146bb91049e/caio-0.9.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ca6c8ecda611478b6016cb94d23fd3eb7124852b985bdec7ecaad9f3116b9619", size = 36836, upload-time = "2025-12-26T15:22:04.662Z" },
+ { url = "https://files.pythonhosted.org/packages/17/b9/36715c97c873649d1029001578f901b50250916295e3dddf20c865438865/caio-0.9.25-cp310-cp310-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db9b5681e4af8176159f0d6598e73b2279bb661e718c7ac23342c550bd78c241", size = 79695, upload-time = "2025-12-26T15:22:18.818Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/ab/07080ecb1adb55a02cbd8ec0126aa8e43af343ffabb6a71125b42670e9a1/caio-0.9.25-cp310-cp310-manylinux_2_34_aarch64.whl", hash = "sha256:bf61d7d0c4fd10ffdd98ca47f7e8db4d7408e74649ffaf4bef40b029ada3c21b", size = 79457, upload-time = "2026-03-04T22:08:16.024Z" },
+ { url = "https://files.pythonhosted.org/packages/88/95/dd55757bb671eb4c376e006c04e83beb413486821f517792ea603ef216e9/caio-0.9.25-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:ab52e5b643f8bbd64a0605d9412796cd3464cb8ca88593b13e95a0f0b10508ae", size = 77705, upload-time = "2026-03-04T22:08:17.202Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/90/543f556fcfcfa270713eef906b6352ab048e1e557afec12925c991dc93c2/caio-0.9.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d6956d9e4a27021c8bd6c9677f3a59eb1d820cc32d0343cea7961a03b1371965", size = 36839, upload-time = "2025-12-26T15:21:40.267Z" },
+ { url = "https://files.pythonhosted.org/packages/51/3b/36f3e8ec38dafe8de4831decd2e44c69303d2a3892d16ceda42afed44e1b/caio-0.9.25-cp311-cp311-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bf84bfa039f25ad91f4f52944452a5f6f405e8afab4d445450978cd6241d1478", size = 80255, upload-time = "2025-12-26T15:22:20.271Z" },
+ { url = "https://files.pythonhosted.org/packages/df/ce/65e64867d928e6aff1b4f0e12dba0ef6d5bf412c240dc1df9d421ac10573/caio-0.9.25-cp311-cp311-manylinux_2_34_aarch64.whl", hash = "sha256:ae3d62587332bce600f861a8de6256b1014d6485cfd25d68c15caf1611dd1f7c", size = 80052, upload-time = "2026-03-04T22:08:20.402Z" },
+ { url = "https://files.pythonhosted.org/packages/46/90/e278863c47e14ec58309aa2e38a45882fbe67b4cc29ec9bc8f65852d3e45/caio-0.9.25-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:fc220b8533dcf0f238a6b1a4a937f92024c71e7b10b5a2dfc1c73604a25709bc", size = 78273, upload-time = "2026-03-04T22:08:21.368Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/25/79c98ebe12df31548ba4eaf44db11b7cad6b3e7b4203718335620939083c/caio-0.9.25-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fb7ff95af4c31ad3f03179149aab61097a71fd85e05f89b4786de0359dffd044", size = 36983, upload-time = "2025-12-26T15:21:36.075Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/2b/21288691f16d479945968a0a4f2856818c1c5be56881d51d4dac9b255d26/caio-0.9.25-cp312-cp312-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:97084e4e30dfa598449d874c4d8e0c8d5ea17d2f752ef5e48e150ff9d240cd64", size = 82012, upload-time = "2025-12-26T15:22:20.983Z" },
+ { url = "https://files.pythonhosted.org/packages/03/c4/8a1b580875303500a9c12b9e0af58cb82e47f5bcf888c2457742a138273c/caio-0.9.25-cp312-cp312-manylinux_2_34_aarch64.whl", hash = "sha256:4fa69eba47e0f041b9d4f336e2ad40740681c43e686b18b191b6c5f4c5544bfb", size = 81502, upload-time = "2026-03-04T22:08:22.381Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/1c/0fe770b8ffc8362c48134d1592d653a81a3d8748d764bec33864db36319d/caio-0.9.25-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:6bebf6f079f1341d19f7386db9b8b1f07e8cc15ae13bfdaff573371ba0575d69", size = 80200, upload-time = "2026-03-04T22:08:23.382Z" },
+ { url = "https://files.pythonhosted.org/packages/31/57/5e6ff127e6f62c9f15d989560435c642144aa4210882f9494204bc892305/caio-0.9.25-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d6c2a3411af97762a2b03840c3cec2f7f728921ff8adda53d7ea2315a8563451", size = 36979, upload-time = "2025-12-26T15:21:35.484Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/9f/f21af50e72117eb528c422d4276cbac11fb941b1b812b182e0a9c70d19c5/caio-0.9.25-cp313-cp313-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0998210a4d5cd5cb565b32ccfe4e53d67303f868a76f212e002a8554692870e6", size = 81900, upload-time = "2025-12-26T15:22:21.919Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/12/c39ae2a4037cb10ad5eb3578eb4d5f8c1a2575c62bba675f3406b7ef0824/caio-0.9.25-cp313-cp313-manylinux_2_34_aarch64.whl", hash = "sha256:1a177d4777141b96f175fe2c37a3d96dec7911ed9ad5f02bac38aaa1c936611f", size = 81523, upload-time = "2026-03-04T22:08:25.187Z" },
+ { url = "https://files.pythonhosted.org/packages/22/59/f8f2e950eb4f1a5a3883e198dca514b9d475415cb6cd7b78b9213a0dd45a/caio-0.9.25-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:9ed3cfb28c0e99fec5e208c934e5c157d0866aa9c32aa4dc5e9b6034af6286b7", size = 80243, upload-time = "2026-03-04T22:08:26.449Z" },
+ { url = "https://files.pythonhosted.org/packages/69/ca/a08fdc7efdcc24e6a6131a93c85be1f204d41c58f474c42b0670af8c016b/caio-0.9.25-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fab6078b9348e883c80a5e14b382e6ad6aabbc4429ca034e76e730cf464269db", size = 36978, upload-time = "2025-12-26T15:21:41.055Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/6c/d4d24f65e690213c097174d26eda6831f45f4734d9d036d81790a27e7b78/caio-0.9.25-cp314-cp314-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:44a6b58e52d488c75cfaa5ecaa404b2b41cc965e6c417e03251e868ecd5b6d77", size = 81832, upload-time = "2025-12-26T15:22:22.757Z" },
+ { url = "https://files.pythonhosted.org/packages/87/a4/e534cf7d2d0e8d880e25dd61e8d921ffcfe15bd696734589826f5a2df727/caio-0.9.25-cp314-cp314-manylinux_2_34_aarch64.whl", hash = "sha256:628a630eb7fb22381dd8e3c8ab7f59e854b9c806639811fc3f4310c6bd711d79", size = 81565, upload-time = "2026-03-04T22:08:27.483Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/ed/bf81aeac1d290017e5e5ac3e880fd56ee15e50a6d0353986799d1bc5cfd5/caio-0.9.25-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:0ba16aa605ccb174665357fc729cf500679c2d94d5f1458a6f0d5ca48f2060a7", size = 80071, upload-time = "2026-03-04T22:08:28.751Z" },
+ { url = "https://files.pythonhosted.org/packages/86/93/1f76c8d1bafe3b0614e06b2195784a3765bbf7b0a067661af9e2dd47fc33/caio-0.9.25-py3-none-any.whl", hash = "sha256:06c0bb02d6b929119b1cfbe1ca403c768b2013a369e2db46bfa2a5761cf82e40", size = 19087, upload-time = "2025-12-26T15:22:00.221Z" },
+]
+
+[[package]]
+name = "certifi"
+version = "2026.4.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" },
+]
+
+[[package]]
+name = "cffi"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pycparser", marker = "implementation_name != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/93/d7/516d984057745a6cd96575eea814fe1edd6646ee6efd552fb7b0921dec83/cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44", size = 184283, upload-time = "2025-09-08T23:22:08.01Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/84/ad6a0b408daa859246f57c03efd28e5dd1b33c21737c2db84cae8c237aa5/cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49", size = 180504, upload-time = "2025-09-08T23:22:10.637Z" },
+ { url = "https://files.pythonhosted.org/packages/50/bd/b1a6362b80628111e6653c961f987faa55262b4002fcec42308cad1db680/cffi-2.0.0-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c", size = 208811, upload-time = "2025-09-08T23:22:12.267Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/27/6933a8b2562d7bd1fb595074cf99cc81fc3789f6a6c05cdabb46284a3188/cffi-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb", size = 216402, upload-time = "2025-09-08T23:22:13.455Z" },
+ { url = "https://files.pythonhosted.org/packages/05/eb/b86f2a2645b62adcfff53b0dd97e8dfafb5c8aa864bd0d9a2c2049a0d551/cffi-2.0.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0", size = 203217, upload-time = "2025-09-08T23:22:14.596Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/e0/6cbe77a53acf5acc7c08cc186c9928864bd7c005f9efd0d126884858a5fe/cffi-2.0.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4", size = 203079, upload-time = "2025-09-08T23:22:15.769Z" },
+ { url = "https://files.pythonhosted.org/packages/98/29/9b366e70e243eb3d14a5cb488dfd3a0b6b2f1fb001a203f653b93ccfac88/cffi-2.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453", size = 216475, upload-time = "2025-09-08T23:22:17.427Z" },
+ { url = "https://files.pythonhosted.org/packages/21/7a/13b24e70d2f90a322f2900c5d8e1f14fa7e2a6b3332b7309ba7b2ba51a5a/cffi-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495", size = 218829, upload-time = "2025-09-08T23:22:19.069Z" },
+ { url = "https://files.pythonhosted.org/packages/60/99/c9dc110974c59cc981b1f5b66e1d8af8af764e00f0293266824d9c4254bc/cffi-2.0.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5", size = 211211, upload-time = "2025-09-08T23:22:20.588Z" },
+ { url = "https://files.pythonhosted.org/packages/49/72/ff2d12dbf21aca1b32a40ed792ee6b40f6dc3a9cf1644bd7ef6e95e0ac5e/cffi-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb", size = 218036, upload-time = "2025-09-08T23:22:22.143Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/cc/027d7fb82e58c48ea717149b03bcadcbdc293553edb283af792bd4bcbb3f/cffi-2.0.0-cp310-cp310-win32.whl", hash = "sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a", size = 172184, upload-time = "2025-09-08T23:22:23.328Z" },
+ { url = "https://files.pythonhosted.org/packages/33/fa/072dd15ae27fbb4e06b437eb6e944e75b068deb09e2a2826039e49ee2045/cffi-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739", size = 182790, upload-time = "2025-09-08T23:22:24.752Z" },
+ { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" },
+ { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" },
+ { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" },
+ { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
+ { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
+ { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
+ { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
+ { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
+ { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
+ { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
+ { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
+ { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
+ { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
+ { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
+ { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
+ { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
+ { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
+ { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
+ { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
+ { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.4.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/26/08/0f303cb0b529e456bb116f2d50565a482694fbb94340bf56d44677e7ed03/charset_normalizer-3.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cdd68a1fb318e290a2077696b7eb7a21a49163c455979c639bf5a5dcdc46617d", size = 315182, upload-time = "2026-04-02T09:25:40.673Z" },
+ { url = "https://files.pythonhosted.org/packages/24/47/b192933e94b546f1b1fe4df9cc1f84fcdbf2359f8d1081d46dd029b50207/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e17b8d5d6a8c47c85e68ca8379def1303fd360c3e22093a807cd34a71cd082b8", size = 209329, upload-time = "2026-04-02T09:25:42.354Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/b4/01fa81c5ca6141024d89a8fc15968002b71da7f825dd14113207113fabbd/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:511ef87c8aec0783e08ac18565a16d435372bc1ac25a91e6ac7f5ef2b0bff790", size = 231230, upload-time = "2026-04-02T09:25:44.281Z" },
+ { url = "https://files.pythonhosted.org/packages/20/f7/7b991776844dfa058017e600e6e55ff01984a063290ca5622c0b63162f68/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:007d05ec7321d12a40227aae9e2bc6dca73f3cb21058999a1df9e193555a9dcc", size = 225890, upload-time = "2026-04-02T09:25:45.475Z" },
+ { url = "https://files.pythonhosted.org/packages/20/e7/bed0024a0f4ab0c8a9c64d4445f39b30c99bd1acd228291959e3de664247/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf29836da5119f3c8a8a70667b0ef5fdca3bb12f80fd06487cfa575b3909b393", size = 216930, upload-time = "2026-04-02T09:25:46.58Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/ab/b18f0ab31cdd7b3ddb8bb76c4a414aeb8160c9810fdf1bc62f269a539d87/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:12d8baf840cc7889b37c7c770f478adea7adce3dcb3944d02ec87508e2dcf153", size = 202109, upload-time = "2026-04-02T09:25:48.031Z" },
+ { url = "https://files.pythonhosted.org/packages/82/e5/7e9440768a06dfb3075936490cb82dbf0ee20a133bf0dd8551fa096914ec/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d560742f3c0d62afaccf9f41fe485ed69bd7661a241f86a3ef0f0fb8b1a397af", size = 214684, upload-time = "2026-04-02T09:25:49.245Z" },
+ { url = "https://files.pythonhosted.org/packages/71/94/8c61d8da9f062fdf457c80acfa25060ec22bf1d34bbeaca4350f13bcfd07/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b14b2d9dac08e28bb8046a1a0434b1750eb221c8f5b87a68f4fa11a6f97b5e34", size = 212785, upload-time = "2026-04-02T09:25:50.671Z" },
+ { url = "https://files.pythonhosted.org/packages/66/cd/6e9889c648e72c0ab2e5967528bb83508f354d706637bc7097190c874e13/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:bc17a677b21b3502a21f66a8cc64f5bfad4df8a0b8434d661666f8ce90ac3af1", size = 203055, upload-time = "2026-04-02T09:25:51.802Z" },
+ { url = "https://files.pythonhosted.org/packages/92/2e/7a951d6a08aefb7eb8e1b54cdfb580b1365afdd9dd484dc4bee9e5d8f258/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:750e02e074872a3fad7f233b47734166440af3cdea0add3e95163110816d6752", size = 232502, upload-time = "2026-04-02T09:25:53.388Z" },
+ { url = "https://files.pythonhosted.org/packages/58/d5/abcf2d83bf8e0a1286df55cd0dc1d49af0da4282aa77e986df343e7de124/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:4e5163c14bffd570ef2affbfdd77bba66383890797df43dc8b4cc7d6f500bf53", size = 214295, upload-time = "2026-04-02T09:25:54.765Z" },
+ { url = "https://files.pythonhosted.org/packages/47/3a/7d4cd7ed54be99973a0dc176032cba5cb1f258082c31fa6df35cff46acfc/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6ed74185b2db44f41ef35fd1617c5888e59792da9bbc9190d6c7300617182616", size = 227145, upload-time = "2026-04-02T09:25:55.904Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/98/3a45bf8247889cf28262ebd3d0872edff11565b2a1e3064ccb132db3fbb0/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94e1885b270625a9a828c9793b4d52a64445299baa1fea5a173bf1d3dd9a1a5a", size = 218884, upload-time = "2026-04-02T09:25:57.074Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/80/2e8b7f8915ed5c9ef13aa828d82738e33888c485b65ebf744d615040c7ea/charset_normalizer-3.4.7-cp310-cp310-win32.whl", hash = "sha256:6785f414ae0f3c733c437e0f3929197934f526d19dfaa75e18fdb4f94c6fb374", size = 148343, upload-time = "2026-04-02T09:25:58.199Z" },
+ { url = "https://files.pythonhosted.org/packages/35/1b/3b8c8c77184af465ee9ad88b5aea46ea6b2e1f7b9dc9502891e37af21e30/charset_normalizer-3.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:6696b7688f54f5af4462118f0bfa7c1621eeb87154f77fa04b9295ce7a8f2943", size = 159174, upload-time = "2026-04-02T09:25:59.322Z" },
+ { url = "https://files.pythonhosted.org/packages/be/c1/feb40dca40dbb21e0a908801782d9288c64fc8d8e562c2098e9994c8c21b/charset_normalizer-3.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:66671f93accb62ed07da56613636f3641f1a12c13046ce91ffc923721f23c008", size = 147805, upload-time = "2026-04-02T09:26:00.756Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/d7/b5b7020a0565c2e9fa8c09f4b5fa6232feb326b8c20081ccded47ea368fd/charset_normalizer-3.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7", size = 309705, upload-time = "2026-04-02T09:26:02.191Z" },
+ { url = "https://files.pythonhosted.org/packages/5a/53/58c29116c340e5456724ecd2fff4196d236b98f3da97b404bc5e51ac3493/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7", size = 206419, upload-time = "2026-04-02T09:26:03.583Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/02/e8146dc6591a37a00e5144c63f29fb7c97a734ea8a111190783c0e60ab63/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e", size = 227901, upload-time = "2026-04-02T09:26:04.738Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/73/77486c4cd58f1267bf17db420e930c9afa1b3be3fe8c8b8ebbebc9624359/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c", size = 222742, upload-time = "2026-04-02T09:26:06.36Z" },
+ { url = "https://files.pythonhosted.org/packages/a1/fa/f74eb381a7d94ded44739e9d94de18dc5edc9c17fb8c11f0a6890696c0a9/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df", size = 214061, upload-time = "2026-04-02T09:26:08.347Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/92/42bd3cefcf7687253fb86694b45f37b733c97f59af3724f356fa92b8c344/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265", size = 199239, upload-time = "2026-04-02T09:26:09.823Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/3d/069e7184e2aa3b3cddc700e3dd267413dc259854adc3380421c805c6a17d/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4", size = 210173, upload-time = "2026-04-02T09:26:10.953Z" },
+ { url = "https://files.pythonhosted.org/packages/62/51/9d56feb5f2e7074c46f93e0ebdbe61f0848ee246e2f0d89f8e20b89ebb8f/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e", size = 209841, upload-time = "2026-04-02T09:26:12.142Z" },
+ { url = "https://files.pythonhosted.org/packages/d2/59/893d8f99cc4c837dda1fe2f1139079703deb9f321aabcb032355de13b6c7/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38", size = 200304, upload-time = "2026-04-02T09:26:13.711Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/1d/ee6f3be3464247578d1ed5c46de545ccc3d3ff933695395c402c21fa6b77/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c", size = 229455, upload-time = "2026-04-02T09:26:14.941Z" },
+ { url = "https://files.pythonhosted.org/packages/54/bb/8fb0a946296ea96a488928bdce8ef99023998c48e4713af533e9bb98ef07/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b", size = 210036, upload-time = "2026-04-02T09:26:16.478Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/bc/015b2387f913749f82afd4fcba07846d05b6d784dd16123cb66860e0237d/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c", size = 224739, upload-time = "2026-04-02T09:26:17.751Z" },
+ { url = "https://files.pythonhosted.org/packages/17/ab/63133691f56baae417493cba6b7c641571a2130eb7bceba6773367ab9ec5/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d", size = 216277, upload-time = "2026-04-02T09:26:18.981Z" },
+ { url = "https://files.pythonhosted.org/packages/06/6d/3be70e827977f20db77c12a97e6a9f973631a45b8d186c084527e53e77a4/charset_normalizer-3.4.7-cp311-cp311-win32.whl", hash = "sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad", size = 147819, upload-time = "2026-04-02T09:26:20.295Z" },
+ { url = "https://files.pythonhosted.org/packages/20/d9/5f67790f06b735d7c7637171bbfd89882ad67201891b7275e51116ed8207/charset_normalizer-3.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00", size = 159281, upload-time = "2026-04-02T09:26:21.74Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/83/6413f36c5a34afead88ce6f66684d943d91f233d76dd083798f9602b75ae/charset_normalizer-3.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1", size = 147843, upload-time = "2026-04-02T09:26:22.901Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328, upload-time = "2026-04-02T09:26:24.331Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061, upload-time = "2026-04-02T09:26:25.568Z" },
+ { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031, upload-time = "2026-04-02T09:26:26.865Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239, upload-time = "2026-04-02T09:26:28.044Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589, upload-time = "2026-04-02T09:26:29.239Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733, upload-time = "2026-04-02T09:26:30.5Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652, upload-time = "2026-04-02T09:26:31.709Z" },
+ { url = "https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229, upload-time = "2026-04-02T09:26:33.282Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552, upload-time = "2026-04-02T09:26:34.845Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806, upload-time = "2026-04-02T09:26:36.152Z" },
+ { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316, upload-time = "2026-04-02T09:26:37.672Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274, upload-time = "2026-04-02T09:26:38.93Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468, upload-time = "2026-04-02T09:26:40.17Z" },
+ { url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460, upload-time = "2026-04-02T09:26:41.416Z" },
+ { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330, upload-time = "2026-04-02T09:26:42.554Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828, upload-time = "2026-04-02T09:26:44.075Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627, upload-time = "2026-04-02T09:26:45.198Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008, upload-time = "2026-04-02T09:26:46.824Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303, upload-time = "2026-04-02T09:26:48.397Z" },
+ { url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282, upload-time = "2026-04-02T09:26:49.684Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595, upload-time = "2026-04-02T09:26:50.915Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986, upload-time = "2026-04-02T09:26:52.197Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711, upload-time = "2026-04-02T09:26:53.49Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036, upload-time = "2026-04-02T09:26:54.975Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998, upload-time = "2026-04-02T09:26:56.303Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056, upload-time = "2026-04-02T09:26:57.554Z" },
+ { url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537, upload-time = "2026-04-02T09:26:58.843Z" },
+ { url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176, upload-time = "2026-04-02T09:27:00.437Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723, upload-time = "2026-04-02T09:27:02.021Z" },
+ { url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085, upload-time = "2026-04-02T09:27:03.192Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819, upload-time = "2026-04-02T09:27:04.454Z" },
+ { url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915, upload-time = "2026-04-02T09:27:05.971Z" },
+ { url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234, upload-time = "2026-04-02T09:27:07.194Z" },
+ { url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042, upload-time = "2026-04-02T09:27:08.749Z" },
+ { url = "https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706, upload-time = "2026-04-02T09:27:09.951Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727, upload-time = "2026-04-02T09:27:11.175Z" },
+ { url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882, upload-time = "2026-04-02T09:27:12.446Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860, upload-time = "2026-04-02T09:27:13.721Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564, upload-time = "2026-04-02T09:27:15.272Z" },
+ { url = "https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276, upload-time = "2026-04-02T09:27:16.834Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238, upload-time = "2026-04-02T09:27:18.229Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189, upload-time = "2026-04-02T09:27:19.445Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352, upload-time = "2026-04-02T09:27:20.79Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024, upload-time = "2026-04-02T09:27:22.063Z" },
+ { url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869, upload-time = "2026-04-02T09:27:23.486Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", size = 148541, upload-time = "2026-04-02T09:27:25.146Z" },
+ { url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634, upload-time = "2026-04-02T09:27:26.642Z" },
+ { url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384, upload-time = "2026-04-02T09:27:28.271Z" },
+ { url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133, upload-time = "2026-04-02T09:27:29.474Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257, upload-time = "2026-04-02T09:27:30.793Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851, upload-time = "2026-04-02T09:27:32.44Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393, upload-time = "2026-04-02T09:27:34.03Z" },
+ { url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251, upload-time = "2026-04-02T09:27:35.369Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609, upload-time = "2026-04-02T09:27:36.661Z" },
+ { url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014, upload-time = "2026-04-02T09:27:38.019Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979, upload-time = "2026-04-02T09:27:39.37Z" },
+ { url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238, upload-time = "2026-04-02T09:27:40.722Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110, upload-time = "2026-04-02T09:27:42.33Z" },
+ { url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824, upload-time = "2026-04-02T09:27:43.924Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103, upload-time = "2026-04-02T09:27:45.348Z" },
+ { url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194, upload-time = "2026-04-02T09:27:46.706Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827, upload-time = "2026-04-02T09:27:48.053Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168, upload-time = "2026-04-02T09:27:49.795Z" },
+ { url = "https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018, upload-time = "2026-04-02T09:27:51.116Z" },
+ { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" },
+]
+
+[[package]]
+name = "click"
+version = "8.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bb/63/f9e1ea081ce35720d8b92acde70daaedace594dc93b693c869e0d5910718/click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2", size = 328061, upload-time = "2026-04-22T15:11:27.506Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ae/44/c1221527f6a71a01ec6fbad7fa78f1d50dfa02217385cf0fa3eec7087d59/click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613", size = 110502, upload-time = "2026-04-22T15:11:25.044Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "coverage"
+version = "7.13.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/69/33/e8c48488c29a73fd089f9d71f9653c1be7478f2ad6b5bc870db11a55d23d/coverage-7.13.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0723d2c96324561b9aa76fb982406e11d93cdb388a7a7da2b16e04719cf7ca5", size = 219255, upload-time = "2026-03-17T10:29:51.081Z" },
+ { url = "https://files.pythonhosted.org/packages/da/bd/b0ebe9f677d7f4b74a3e115eec7ddd4bcf892074963a00d91e8b164a6386/coverage-7.13.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52f444e86475992506b32d4e5ca55c24fc88d73bcbda0e9745095b28ef4dc0cf", size = 219772, upload-time = "2026-03-17T10:29:52.867Z" },
+ { url = "https://files.pythonhosted.org/packages/48/cc/5cb9502f4e01972f54eedd48218bb203fe81e294be606a2bc93970208013/coverage-7.13.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:704de6328e3d612a8f6c07000a878ff38181ec3263d5a11da1db294fa6a9bdf8", size = 246532, upload-time = "2026-03-17T10:29:54.688Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/d8/3217636d86c7e7b12e126e4f30ef1581047da73140614523af7495ed5f2d/coverage-7.13.5-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a1a6d79a14e1ec1832cabc833898636ad5f3754a678ef8bb4908515208bf84f4", size = 248333, upload-time = "2026-03-17T10:29:56.221Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/30/2002ac6729ba2d4357438e2ed3c447ad8562866c8c63fc16f6dfc33afe56/coverage-7.13.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79060214983769c7ba3f0cee10b54c97609dca4d478fa1aa32b914480fd5738d", size = 250211, upload-time = "2026-03-17T10:29:57.938Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/85/552496626d6b9359eb0e2f86f920037c9cbfba09b24d914c6e1528155f7d/coverage-7.13.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:356e76b46783a98c2a2fe81ec79df4883a1e62895ea952968fb253c114e7f930", size = 252125, upload-time = "2026-03-17T10:29:59.388Z" },
+ { url = "https://files.pythonhosted.org/packages/44/21/40256eabdcbccdb6acf6b381b3016a154399a75fe39d406f790ae84d1f3c/coverage-7.13.5-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0cef0cdec915d11254a7f549c1170afecce708d30610c6abdded1f74e581666d", size = 247219, upload-time = "2026-03-17T10:30:01.199Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/e8/96e2a6c3f21a0ea77d7830b254a1542d0328acc8d7bdf6a284ba7e529f77/coverage-7.13.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dc022073d063b25a402454e5712ef9e007113e3a676b96c5f29b2bda29352f40", size = 248248, upload-time = "2026-03-17T10:30:03.317Z" },
+ { url = "https://files.pythonhosted.org/packages/da/ba/8477f549e554827da390ec659f3c38e4b6d95470f4daafc2d8ff94eaa9c2/coverage-7.13.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9b74db26dfea4f4e50d48a4602207cd1e78be33182bc9cbf22da94f332f99878", size = 246254, upload-time = "2026-03-17T10:30:04.832Z" },
+ { url = "https://files.pythonhosted.org/packages/55/59/bc22aef0e6aa179d5b1b001e8b3654785e9adf27ef24c93dc4228ebd5d68/coverage-7.13.5-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ad146744ca4fd09b50c482650e3c1b1f4dfa1d4792e0a04a369c7f23336f0400", size = 250067, upload-time = "2026-03-17T10:30:06.535Z" },
+ { url = "https://files.pythonhosted.org/packages/de/1b/c6a023a160806a5137dca53468fd97530d6acad24a22003b1578a9c2e429/coverage-7.13.5-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:c555b48be1853fe3997c11c4bd521cdd9a9612352de01fa4508f16ec341e6fe0", size = 246521, upload-time = "2026-03-17T10:30:08.486Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/3f/3532c85a55aa2f899fa17c186f831cfa1aa434d88ff792a709636f64130e/coverage-7.13.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7034b5c56a58ae5e85f23949d52c14aca2cfc6848a31764995b7de88f13a1ea0", size = 247126, upload-time = "2026-03-17T10:30:09.966Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/2e/b9d56af4a24ef45dfbcda88e06870cb7d57b2b0bfa3a888d79b4c8debd76/coverage-7.13.5-cp310-cp310-win32.whl", hash = "sha256:eb7fdf1ef130660e7415e0253a01a7d5a88c9c4d158bcf75cbbd922fd65a5b58", size = 221860, upload-time = "2026-03-17T10:30:11.393Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/cc/d938417e7a4d7f0433ad4edee8bb2acdc60dc7ac5af19e2a07a048ecbee3/coverage-7.13.5-cp310-cp310-win_amd64.whl", hash = "sha256:3e1bb5f6c78feeb1be3475789b14a0f0a5b47d505bfc7267126ccbd50289999e", size = 222788, upload-time = "2026-03-17T10:30:12.886Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/37/d24c8f8220ff07b839b2c043ea4903a33b0f455abe673ae3c03bbdb7f212/coverage-7.13.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66a80c616f80181f4d643b0f9e709d97bcea413ecd9631e1dedc7401c8e6695d", size = 219381, upload-time = "2026-03-17T10:30:14.68Z" },
+ { url = "https://files.pythonhosted.org/packages/35/8b/cd129b0ca4afe886a6ce9d183c44d8301acbd4ef248622e7c49a23145605/coverage-7.13.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:145ede53ccbafb297c1c9287f788d1bc3efd6c900da23bf6931b09eafc931587", size = 219880, upload-time = "2026-03-17T10:30:16.231Z" },
+ { url = "https://files.pythonhosted.org/packages/55/2f/e0e5b237bffdb5d6c530ce87cc1d413a5b7d7dfd60fb067ad6d254c35c76/coverage-7.13.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0672854dc733c342fa3e957e0605256d2bf5934feeac328da9e0b5449634a642", size = 250303, upload-time = "2026-03-17T10:30:17.748Z" },
+ { url = "https://files.pythonhosted.org/packages/92/be/b1afb692be85b947f3401375851484496134c5554e67e822c35f28bf2fbc/coverage-7.13.5-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ec10e2a42b41c923c2209b846126c6582db5e43a33157e9870ba9fb70dc7854b", size = 252218, upload-time = "2026-03-17T10:30:19.804Z" },
+ { url = "https://files.pythonhosted.org/packages/da/69/2f47bb6fa1b8d1e3e5d0c4be8ccb4313c63d742476a619418f85740d597b/coverage-7.13.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be3d4bbad9d4b037791794ddeedd7d64a56f5933a2c1373e18e9e568b9141686", size = 254326, upload-time = "2026-03-17T10:30:21.321Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/d0/79db81da58965bd29dabc8f4ad2a2af70611a57cba9d1ec006f072f30a54/coverage-7.13.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4d2afbc5cc54d286bfb54541aa50b64cdb07a718227168c87b9e2fb8f25e1743", size = 256267, upload-time = "2026-03-17T10:30:23.094Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/32/d0d7cc8168f91ddab44c0ce4806b969df5f5fdfdbb568eaca2dbc2a04936/coverage-7.13.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3ad050321264c49c2fa67bb599100456fc51d004b82534f379d16445da40fb75", size = 250430, upload-time = "2026-03-17T10:30:25.311Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/06/a055311d891ddbe231cd69fdd20ea4be6e3603ffebddf8704b8ca8e10a3c/coverage-7.13.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7300c8a6d13335b29bb76d7651c66af6bd8658517c43499f110ddc6717bfc209", size = 252017, upload-time = "2026-03-17T10:30:27.284Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/f6/d0fd2d21e29a657b5f77a2fe7082e1568158340dceb941954f776dce1b7b/coverage-7.13.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:eb07647a5738b89baab047f14edd18ded523de60f3b30e75c2acc826f79c839a", size = 250080, upload-time = "2026-03-17T10:30:29.481Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/ab/0d7fb2efc2e9a5eb7ddcc6e722f834a69b454b7e6e5888c3a8567ecffb31/coverage-7.13.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9adb6688e3b53adffefd4a52d72cbd8b02602bfb8f74dcd862337182fd4d1a4e", size = 253843, upload-time = "2026-03-17T10:30:31.301Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/6f/7467b917bbf5408610178f62a49c0ed4377bb16c1657f689cc61470da8ce/coverage-7.13.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7c8d4bc913dd70b93488d6c496c77f3aff5ea99a07e36a18f865bca55adef8bd", size = 249802, upload-time = "2026-03-17T10:30:33.358Z" },
+ { url = "https://files.pythonhosted.org/packages/75/2c/1172fb689df92135f5bfbbd69fc83017a76d24ea2e2f3a1154007e2fb9f8/coverage-7.13.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0e3c426ffc4cd952f54ee9ffbdd10345709ecc78a3ecfd796a57236bfad0b9b8", size = 250707, upload-time = "2026-03-17T10:30:35.2Z" },
+ { url = "https://files.pythonhosted.org/packages/67/21/9ac389377380a07884e3b48ba7a620fcd9dbfaf1d40565facdc6b36ec9ef/coverage-7.13.5-cp311-cp311-win32.whl", hash = "sha256:259b69bb83ad9894c4b25be2528139eecba9a82646ebdda2d9db1ba28424a6bf", size = 221880, upload-time = "2026-03-17T10:30:36.775Z" },
+ { url = "https://files.pythonhosted.org/packages/af/7f/4cd8a92531253f9d7c1bbecd9fa1b472907fb54446ca768c59b531248dc5/coverage-7.13.5-cp311-cp311-win_amd64.whl", hash = "sha256:258354455f4e86e3e9d0d17571d522e13b4e1e19bf0f8596bcf9476d61e7d8a9", size = 222816, upload-time = "2026-03-17T10:30:38.891Z" },
+ { url = "https://files.pythonhosted.org/packages/12/a6/1d3f6155fb0010ca68eba7fe48ca6c9da7385058b77a95848710ecf189b1/coverage-7.13.5-cp311-cp311-win_arm64.whl", hash = "sha256:bff95879c33ec8da99fc9b6fe345ddb5be6414b41d6d1ad1c8f188d26f36e028", size = 221483, upload-time = "2026-03-17T10:30:40.463Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" },
+ { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159, upload-time = "2026-03-17T10:30:47.204Z" },
+ { url = "https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270, upload-time = "2026-03-17T10:30:48.812Z" },
+ { url = "https://files.pythonhosted.org/packages/93/89/7ffc4ba0f5d0a55c1e84ea7cee39c9fc06af7b170513d83fbf3bbefce280/coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256", size = 257538, upload-time = "2026-03-17T10:30:50.77Z" },
+ { url = "https://files.pythonhosted.org/packages/81/bd/73ddf85f93f7e6fa83e77ccecb6162d9415c79007b4bc124008a4995e4a7/coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c", size = 251821, upload-time = "2026-03-17T10:30:52.5Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/81/278aff4e8dec4926a0bcb9486320752811f543a3ce5b602cc7a29978d073/coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5", size = 253191, upload-time = "2026-03-17T10:30:54.543Z" },
+ { url = "https://files.pythonhosted.org/packages/70/ee/fe1621488e2e0a58d7e94c4800f0d96f79671553488d401a612bebae324b/coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09", size = 251337, upload-time = "2026-03-17T10:30:56.663Z" },
+ { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404, upload-time = "2026-03-17T10:30:58.427Z" },
+ { url = "https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903, upload-time = "2026-03-17T10:31:00.093Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780, upload-time = "2026-03-17T10:31:01.916Z" },
+ { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093, upload-time = "2026-03-17T10:31:03.642Z" },
+ { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900, upload-time = "2026-03-17T10:31:05.651Z" },
+ { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515, upload-time = "2026-03-17T10:31:07.293Z" },
+ { url = "https://files.pythonhosted.org/packages/74/8c/74fedc9663dcf168b0a059d4ea756ecae4da77a489048f94b5f512a8d0b3/coverage-7.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ec4af212df513e399cf11610cc27063f1586419e814755ab362e50a85ea69c1", size = 219576, upload-time = "2026-03-17T10:31:09.045Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/c9/44fb661c55062f0818a6ffd2685c67aa30816200d5f2817543717d4b92eb/coverage-7.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:941617e518602e2d64942c88ec8499f7fbd49d3f6c4327d3a71d43a1973032f3", size = 219942, upload-time = "2026-03-17T10:31:10.708Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/13/93419671cee82b780bab7ea96b67c8ef448f5f295f36bf5031154ec9a790/coverage-7.13.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:da305e9937617ee95c2e39d8ff9f040e0487cbf1ac174f777ed5eddd7a7c1f26", size = 250935, upload-time = "2026-03-17T10:31:12.392Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/68/1666e3a4462f8202d836920114fa7a5ee9275d1fa45366d336c551a162dd/coverage-7.13.5-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:78e696e1cc714e57e8b25760b33a8b1026b7048d270140d25dafe1b0a1ee05a3", size = 253541, upload-time = "2026-03-17T10:31:14.247Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/5e/3ee3b835647be646dcf3c65a7c6c18f87c27326a858f72ab22c12730773d/coverage-7.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02ca0eed225b2ff301c474aeeeae27d26e2537942aa0f87491d3e147e784a82b", size = 254780, upload-time = "2026-03-17T10:31:16.193Z" },
+ { url = "https://files.pythonhosted.org/packages/44/b3/cb5bd1a04cfcc49ede6cd8409d80bee17661167686741e041abc7ee1b9a9/coverage-7.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:04690832cbea4e4663d9149e05dba142546ca05cb1848816760e7f58285c970a", size = 256912, upload-time = "2026-03-17T10:31:17.89Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/66/c1dceb7b9714473800b075f5c8a84f4588f887a90eb8645282031676e242/coverage-7.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0590e44dd2745c696a778f7bab6aa95256de2cbc8b8cff4f7db8ff09813d6969", size = 251165, upload-time = "2026-03-17T10:31:19.605Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/62/5502b73b97aa2e53ea22a39cf8649ff44827bef76d90bf638777daa27a9d/coverage-7.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d7cfad2d6d81dd298ab6b89fe72c3b7b05ec7544bdda3b707ddaecff8d25c161", size = 252908, upload-time = "2026-03-17T10:31:21.312Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/37/7792c2d69854397ca77a55c4646e5897c467928b0e27f2d235d83b5d08c6/coverage-7.13.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e092b9499de38ae0fbfbc603a74660eb6ff3e869e507b50d85a13b6db9863e15", size = 250873, upload-time = "2026-03-17T10:31:23.565Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/23/bc866fb6163be52a8a9e5d708ba0d3b1283c12158cefca0a8bbb6e247a43/coverage-7.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:48c39bc4a04d983a54a705a6389512883d4a3b9862991b3617d547940e9f52b1", size = 255030, upload-time = "2026-03-17T10:31:25.58Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/8b/ef67e1c222ef49860701d346b8bbb70881bef283bd5f6cbba68a39a086c7/coverage-7.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2d3807015f138ffea1ed9afeeb8624fd781703f2858b62a8dd8da5a0994c57b6", size = 250694, upload-time = "2026-03-17T10:31:27.316Z" },
+ { url = "https://files.pythonhosted.org/packages/46/0d/866d1f74f0acddbb906db212e096dee77a8e2158ca5e6bb44729f9d93298/coverage-7.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee2aa19e03161671ec964004fb74b2257805d9710bf14a5c704558b9d8dbaf17", size = 252469, upload-time = "2026-03-17T10:31:29.472Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/f5/be742fec31118f02ce42b21c6af187ad6a344fed546b56ca60caacc6a9a0/coverage-7.13.5-cp313-cp313-win32.whl", hash = "sha256:ce1998c0483007608c8382f4ff50164bfc5bd07a2246dd272aa4043b75e61e85", size = 222112, upload-time = "2026-03-17T10:31:31.526Z" },
+ { url = "https://files.pythonhosted.org/packages/66/40/7732d648ab9d069a46e686043241f01206348e2bbf128daea85be4d6414b/coverage-7.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:631efb83f01569670a5e866ceb80fe483e7c159fac6f167e6571522636104a0b", size = 222923, upload-time = "2026-03-17T10:31:33.633Z" },
+ { url = "https://files.pythonhosted.org/packages/48/af/fea819c12a095781f6ccd504890aaddaf88b8fab263c4940e82c7b770124/coverage-7.13.5-cp313-cp313-win_arm64.whl", hash = "sha256:f4cd16206ad171cbc2470dbea9103cf9a7607d5fe8c242fdf1edf36174020664", size = 221540, upload-time = "2026-03-17T10:31:35.445Z" },
+ { url = "https://files.pythonhosted.org/packages/23/d2/17879af479df7fbbd44bd528a31692a48f6b25055d16482fdf5cdb633805/coverage-7.13.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0428cbef5783ad91fe240f673cc1f76b25e74bbfe1a13115e4aa30d3f538162d", size = 220262, upload-time = "2026-03-17T10:31:37.184Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/4c/d20e554f988c8f91d6a02c5118f9abbbf73a8768a3048cb4962230d5743f/coverage-7.13.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e0b216a19534b2427cc201a26c25da4a48633f29a487c61258643e89d28200c0", size = 220617, upload-time = "2026-03-17T10:31:39.245Z" },
+ { url = "https://files.pythonhosted.org/packages/29/9c/f9f5277b95184f764b24e7231e166dfdb5780a46d408a2ac665969416d61/coverage-7.13.5-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:972a9cd27894afe4bc2b1480107054e062df08e671df7c2f18c205e805ccd806", size = 261912, upload-time = "2026-03-17T10:31:41.324Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/f6/7f1ab39393eeb50cfe4747ae8ef0e4fc564b989225aa1152e13a180d74f8/coverage-7.13.5-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4b59148601efcd2bac8c4dbf1f0ad6391693ccf7a74b8205781751637076aee3", size = 263987, upload-time = "2026-03-17T10:31:43.724Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/d7/62c084fb489ed9c6fbdf57e006752e7c516ea46fd690e5ed8b8617c7d52e/coverage-7.13.5-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:505d7083c8b0c87a8fa8c07370c285847c1f77739b22e299ad75a6af6c32c5c9", size = 266416, upload-time = "2026-03-17T10:31:45.769Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/f6/df63d8660e1a0bff6125947afda112a0502736f470d62ca68b288ea762d8/coverage-7.13.5-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:60365289c3741e4db327e7baff2a4aaacf22f788e80fa4683393891b70a89fbd", size = 267558, upload-time = "2026-03-17T10:31:48.293Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/02/353ca81d36779bd108f6d384425f7139ac3c58c750dcfaafe5d0bee6436b/coverage-7.13.5-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b88c69c8ef5d4b6fe7dea66d6636056a0f6a7527c440e890cf9259011f5e606", size = 261163, upload-time = "2026-03-17T10:31:50.125Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/16/2e79106d5749bcaf3aee6d309123548e3276517cd7851faa8da213bc61bf/coverage-7.13.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5b13955d31d1633cf9376908089b7cebe7d15ddad7aeaabcbe969a595a97e95e", size = 263981, upload-time = "2026-03-17T10:31:51.961Z" },
+ { url = "https://files.pythonhosted.org/packages/29/c7/c29e0c59ffa6942030ae6f50b88ae49988e7e8da06de7ecdbf49c6d4feae/coverage-7.13.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f70c9ab2595c56f81a89620e22899eea8b212a4041bd728ac6f4a28bf5d3ddd0", size = 261604, upload-time = "2026-03-17T10:31:53.872Z" },
+ { url = "https://files.pythonhosted.org/packages/40/48/097cdc3db342f34006a308ab41c3a7c11c3f0d84750d340f45d88a782e00/coverage-7.13.5-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:084b84a8c63e8d6fc7e3931b316a9bcafca1458d753c539db82d31ed20091a87", size = 265321, upload-time = "2026-03-17T10:31:55.997Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/1f/4994af354689e14fd03a75f8ec85a9a68d94e0188bbdab3fc1516b55e512/coverage-7.13.5-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad14385487393e386e2ea988b09d62dd42c397662ac2dabc3832d71253eee479", size = 260502, upload-time = "2026-03-17T10:31:58.308Z" },
+ { url = "https://files.pythonhosted.org/packages/22/c6/9bb9ef55903e628033560885f5c31aa227e46878118b63ab15dc7ba87797/coverage-7.13.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f2c47b36fe7709a6e83bfadf4eefb90bd25fbe4014d715224c4316f808e59a2", size = 262688, upload-time = "2026-03-17T10:32:00.141Z" },
+ { url = "https://files.pythonhosted.org/packages/14/4f/f5df9007e50b15e53e01edea486814783a7f019893733d9e4d6caad75557/coverage-7.13.5-cp313-cp313t-win32.whl", hash = "sha256:67e9bc5449801fad0e5dff329499fb090ba4c5800b86805c80617b4e29809b2a", size = 222788, upload-time = "2026-03-17T10:32:02.246Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/98/aa7fccaa97d0f3192bec013c4e6fd6d294a6ed44b640e6bb61f479e00ed5/coverage-7.13.5-cp313-cp313t-win_amd64.whl", hash = "sha256:da86cdcf10d2519e10cabb8ac2de03da1bcb6e4853790b7fbd48523332e3a819", size = 223851, upload-time = "2026-03-17T10:32:04.416Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/8b/e5c469f7352651e5f013198e9e21f97510b23de957dd06a84071683b4b60/coverage-7.13.5-cp313-cp313t-win_arm64.whl", hash = "sha256:0ecf12ecb326fe2c339d93fc131816f3a7367d223db37817208905c89bded911", size = 222104, upload-time = "2026-03-17T10:32:06.65Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/77/39703f0d1d4b478bfd30191d3c14f53caf596fac00efb3f8f6ee23646439/coverage-7.13.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fbabfaceaeb587e16f7008f7795cd80d20ec548dc7f94fbb0d4ec2e038ce563f", size = 219621, upload-time = "2026-03-17T10:32:08.589Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/3e/51dff36d99ae14639a133d9b164d63e628532e2974d8b1edb99dd1ebc733/coverage-7.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9bb2a28101a443669a423b665939381084412b81c3f8c0fcfbac57f4e30b5b8e", size = 219953, upload-time = "2026-03-17T10:32:10.507Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/6c/1f1917b01eb647c2f2adc9962bd66c79eb978951cab61bdc1acab3290c07/coverage-7.13.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bd3a2fbc1c6cccb3c5106140d87cc6a8715110373ef42b63cf5aea29df8c217a", size = 250992, upload-time = "2026-03-17T10:32:12.41Z" },
+ { url = "https://files.pythonhosted.org/packages/22/e5/06b1f88f42a5a99df42ce61208bdec3bddb3d261412874280a19796fc09c/coverage-7.13.5-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6c36ddb64ed9d7e496028d1d00dfec3e428e0aabf4006583bb1839958d280510", size = 253503, upload-time = "2026-03-17T10:32:14.449Z" },
+ { url = "https://files.pythonhosted.org/packages/80/28/2a148a51e5907e504fa7b85490277734e6771d8844ebcc48764a15e28155/coverage-7.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:380e8e9084d8eb38db3a9176a1a4f3c0082c3806fa0dc882d1d87abc3c789247", size = 254852, upload-time = "2026-03-17T10:32:16.56Z" },
+ { url = "https://files.pythonhosted.org/packages/61/77/50e8d3d85cc0b7ebe09f30f151d670e302c7ff4a1bf6243f71dd8b0981fa/coverage-7.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e808af52a0513762df4d945ea164a24b37f2f518cbe97e03deaa0ee66139b4d6", size = 257161, upload-time = "2026-03-17T10:32:19.004Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/c4/b5fd1d4b7bf8d0e75d997afd3925c59ba629fc8616f1b3aae7605132e256/coverage-7.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e301d30dd7e95ae068671d746ba8c34e945a82682e62918e41b2679acd2051a0", size = 251021, upload-time = "2026-03-17T10:32:21.344Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/66/6ea21f910e92d69ef0b1c3346ea5922a51bad4446c9126db2ae96ee24c4c/coverage-7.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:800bc829053c80d240a687ceeb927a94fd108bbdc68dfbe505d0d75ab578a882", size = 252858, upload-time = "2026-03-17T10:32:23.506Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/ea/879c83cb5d61aa2a35fb80e72715e92672daef8191b84911a643f533840c/coverage-7.13.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:0b67af5492adb31940ee418a5a655c28e48165da5afab8c7fa6fd72a142f8740", size = 250823, upload-time = "2026-03-17T10:32:25.516Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/fb/616d95d3adb88b9803b275580bdeee8bd1b69a886d057652521f83d7322f/coverage-7.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c9136ff29c3a91e25b1d1552b5308e53a1e0653a23e53b6366d7c2dcbbaf8a16", size = 255099, upload-time = "2026-03-17T10:32:27.944Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/93/25e6917c90ec1c9a56b0b26f6cad6408e5f13bb6b35d484a0d75c9cf000d/coverage-7.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:cff784eef7f0b8f6cb28804fbddcfa99f89efe4cc35fb5627e3ac58f91ed3ac0", size = 250638, upload-time = "2026-03-17T10:32:29.914Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/7b/dc1776b0464145a929deed214aef9fb1493f159b59ff3c7eeeedf91eddd0/coverage-7.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:68a4953be99b17ac3c23b6efbc8a38330d99680c9458927491d18700ef23ded0", size = 252295, upload-time = "2026-03-17T10:32:31.981Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/fb/99cbbc56a26e07762a2740713f3c8f9f3f3106e3a3dd8cc4474954bccd34/coverage-7.13.5-cp314-cp314-win32.whl", hash = "sha256:35a31f2b1578185fbe6aa2e74cea1b1d0bbf4c552774247d9160d29b80ed56cc", size = 222360, upload-time = "2026-03-17T10:32:34.233Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/b7/4758d4f73fb536347cc5e4ad63662f9d60ba9118cb6785e9616b2ce5d7fa/coverage-7.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:2aa055ae1857258f9e0045be26a6d62bdb47a72448b62d7b55f4820f361a2633", size = 223174, upload-time = "2026-03-17T10:32:36.369Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/f2/24d84e1dfe70f8ac9fdf30d338239860d0d1d5da0bda528959d0ebc9da28/coverage-7.13.5-cp314-cp314-win_arm64.whl", hash = "sha256:1b11eef33edeae9d142f9b4358edb76273b3bfd30bc3df9a4f95d0e49caf94e8", size = 221739, upload-time = "2026-03-17T10:32:38.736Z" },
+ { url = "https://files.pythonhosted.org/packages/60/5b/4a168591057b3668c2428bff25dd3ebc21b629d666d90bcdfa0217940e84/coverage-7.13.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10a0c37f0b646eaff7cce1874c31d1f1ccb297688d4c747291f4f4c70741cc8b", size = 220351, upload-time = "2026-03-17T10:32:41.196Z" },
+ { url = "https://files.pythonhosted.org/packages/f5/21/1fd5c4dbfe4a58b6b99649125635df46decdfd4a784c3cd6d410d303e370/coverage-7.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b5db73ba3c41c7008037fa731ad5459fc3944cb7452fc0aa9f822ad3533c583c", size = 220612, upload-time = "2026-03-17T10:32:43.204Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/fe/2a924b3055a5e7e4512655a9d4609781b0d62334fa0140c3e742926834e2/coverage-7.13.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:750db93a81e3e5a9831b534be7b1229df848b2e125a604fe6651e48aa070e5f9", size = 261985, upload-time = "2026-03-17T10:32:45.514Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/0d/c8928f2bd518c45990fe1a2ab8db42e914ef9b726c975facc4282578c3eb/coverage-7.13.5-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9ddb4f4a5479f2539644be484da179b653273bca1a323947d48ab107b3ed1f29", size = 264107, upload-time = "2026-03-17T10:32:47.971Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/ae/4ae35bbd9a0af9d820362751f0766582833c211224b38665c0f8de3d487f/coverage-7.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8a7a2049c14f413163e2bdabd37e41179b1d1ccb10ffc6ccc4b7a718429c607", size = 266513, upload-time = "2026-03-17T10:32:50.1Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/20/d326174c55af36f74eac6ae781612d9492f060ce8244b570bb9d50d9d609/coverage-7.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1c85e0b6c05c592ea6d8768a66a254bfb3874b53774b12d4c89c481eb78cb90", size = 267650, upload-time = "2026-03-17T10:32:52.391Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/5e/31484d62cbd0eabd3412e30d74386ece4a0837d4f6c3040a653878bfc019/coverage-7.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:777c4d1eff1b67876139d24288aaf1817f6c03d6bae9c5cc8d27b83bcfe38fe3", size = 261089, upload-time = "2026-03-17T10:32:54.544Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/d8/49a72d6de146eebb0b7e48cc0f4bc2c0dd858e3d4790ab2b39a2872b62bd/coverage-7.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6697e29b93707167687543480a40f0db8f356e86d9f67ddf2e37e2dfd91a9dab", size = 263982, upload-time = "2026-03-17T10:32:56.803Z" },
+ { url = "https://files.pythonhosted.org/packages/06/3b/0351f1bd566e6e4dd39e978efe7958bde1d32f879e85589de147654f57bb/coverage-7.13.5-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8fdf453a942c3e4d99bd80088141c4c6960bb232c409d9c3558e2dbaa3998562", size = 261579, upload-time = "2026-03-17T10:32:59.466Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/ce/796a2a2f4017f554d7810f5c573449b35b1e46788424a548d4d19201b222/coverage-7.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:32ca0c0114c9834a43f045a87dcebd69d108d8ffb666957ea65aa132f50332e2", size = 265316, upload-time = "2026-03-17T10:33:01.847Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/16/d5ae91455541d1a78bc90abf495be600588aff8f6db5c8b0dae739fa39c9/coverage-7.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:8769751c10f339021e2638cd354e13adeac54004d1941119b2c96fe5276d45ea", size = 260427, upload-time = "2026-03-17T10:33:03.945Z" },
+ { url = "https://files.pythonhosted.org/packages/48/11/07f413dba62db21fb3fad5d0de013a50e073cc4e2dc4306e770360f6dfc8/coverage-7.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cec2d83125531bd153175354055cdb7a09987af08a9430bd173c937c6d0fba2a", size = 262745, upload-time = "2026-03-17T10:33:06.285Z" },
+ { url = "https://files.pythonhosted.org/packages/91/15/d792371332eb4663115becf4bad47e047d16234b1aff687b1b18c58d60ae/coverage-7.13.5-cp314-cp314t-win32.whl", hash = "sha256:0cd9ed7a8b181775459296e402ca4fb27db1279740a24e93b3b41942ebe4b215", size = 223146, upload-time = "2026-03-17T10:33:08.756Z" },
+ { url = "https://files.pythonhosted.org/packages/db/51/37221f59a111dca5e85be7dbf09696323b5b9f13ff65e0641d535ed06ea8/coverage-7.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:301e3b7dfefecaca37c9f1aa6f0049b7d4ab8dd933742b607765d757aca77d43", size = 224254, upload-time = "2026-03-17T10:33:11.174Z" },
+ { url = "https://files.pythonhosted.org/packages/54/83/6acacc889de8987441aa7d5adfbdbf33d288dad28704a67e574f1df9bcbb/coverage-7.13.5-cp314-cp314t-win_arm64.whl", hash = "sha256:9dacc2ad679b292709e0f5fc1ac74a6d4d5562e424058962c7bb0c658ad25e45", size = 222276, upload-time = "2026-03-17T10:33:13.466Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" },
+]
+
+[package.optional-dependencies]
+toml = [
+ { name = "tomli", marker = "python_full_version <= '3.11'" },
+]
+
+[[package]]
+name = "cryptography"
+version = "47.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ef/b2/7ffa7fe8207a8c42147ffe70c3e360b228160c1d85dc3faff16aaa3244c0/cryptography-47.0.0.tar.gz", hash = "sha256:9f8e55fe4e63613a5e1cc5819030f27b97742d720203a087802ce4ce9ceb52bb", size = 830863, upload-time = "2026-04-24T19:54:57.056Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a4/98/40dfe932134bdcae4f6ab5927c87488754bf9eb79297d7e0070b78dd58e9/cryptography-47.0.0-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:160ad728f128972d362e714054f6ba0067cab7fb350c5202a9ae8ae4ce3ef1a0", size = 7912214, upload-time = "2026-04-24T19:53:03.864Z" },
+ { url = "https://files.pythonhosted.org/packages/34/c6/2733531243fba725f58611b918056b277692f1033373dcc8bd01af1c05d4/cryptography-47.0.0-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b9a8943e359b7615db1a3ba587994618e094ff3d6fa5a390c73d079ce18b3973", size = 4644617, upload-time = "2026-04-24T19:53:06.909Z" },
+ { url = "https://files.pythonhosted.org/packages/00/e3/b27be1a670a9b87f855d211cf0e1174a5d721216b7616bd52d8581d912ed/cryptography-47.0.0-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f5c15764f261394b22aef6b00252f5195f46f2ca300bec57149474e2538b31f8", size = 4668186, upload-time = "2026-04-24T19:53:09.053Z" },
+ { url = "https://files.pythonhosted.org/packages/81/b9/8443cfe5d17d482d348cee7048acf502bb89a51b6382f06240fd290d4ca3/cryptography-47.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9c59ab0e0fa3a180a5a9c59f3a5abe3ef90d474bc56d7fadfbe80359491b615b", size = 4651244, upload-time = "2026-04-24T19:53:11.217Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/5e/13ed0cdd0eb88ba159d6dd5ebfece8cb901dbcf1ae5ac4072e28b55d3153/cryptography-47.0.0-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:34b4358b925a5ea3e14384ca781a2c0ef7ac219b57bb9eacc4457078e2b19f92", size = 5252906, upload-time = "2026-04-24T19:53:13.532Z" },
+ { url = "https://files.pythonhosted.org/packages/64/16/ed058e1df0f33d440217cd120d41d5dda9dd215a80b8187f68483185af82/cryptography-47.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0024b87d47ae2399165a6bfb20d24888881eeab83ae2566d62467c5ff0030ce7", size = 4701842, upload-time = "2026-04-24T19:53:15.618Z" },
+ { url = "https://files.pythonhosted.org/packages/02/e0/3d30986b30fdbd9e969abbdf8ba00ed0618615144341faeb57f395a084fe/cryptography-47.0.0-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:1e47422b5557bb82d3fff997e8d92cff4e28b9789576984f08c248d2b3535d93", size = 4289313, upload-time = "2026-04-24T19:53:17.755Z" },
+ { url = "https://files.pythonhosted.org/packages/df/fd/32db38e3ad0cb331f0691cb4c7a8a6f176f679124dee746b3af6633db4d9/cryptography-47.0.0-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:6f29f36582e6151d9686235e586dd35bb67491f024767d10b842e520dc6a07ac", size = 4650964, upload-time = "2026-04-24T19:53:20.062Z" },
+ { url = "https://files.pythonhosted.org/packages/86/53/5395d944dfd48cb1f67917f533c609c34347185ef15eb4308024c876f274/cryptography-47.0.0-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:a9b761f012a943b7de0e828843c5688d0de94a0578d44d6c85a1bae32f87791f", size = 5207817, upload-time = "2026-04-24T19:53:22.498Z" },
+ { url = "https://files.pythonhosted.org/packages/34/4f/e5711b28e1901f7d480a2b1b688b645aa4c77c73f10731ed17e7f7db3f0d/cryptography-47.0.0-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4e1de79e047e25d6e9f8cea71c86b4a53aced64134f0f003bbcbf3655fd172c8", size = 4701544, upload-time = "2026-04-24T19:53:24.356Z" },
+ { url = "https://files.pythonhosted.org/packages/22/22/c8ddc25de3010fc8da447648f5a092c40e7a8fadf01dd6d255d9c0b9373d/cryptography-47.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef6b3634087f18d2155b1e8ce264e5345a753da2c5fa9815e7d41315c90f8318", size = 4783536, upload-time = "2026-04-24T19:53:26.665Z" },
+ { url = "https://files.pythonhosted.org/packages/66/b6/d4a68f4ea999c6d89e8498579cba1c5fcba4276284de7773b17e4fa69293/cryptography-47.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:11dbb9f50a0f1bb9757b3d8c27c1101780efb8f0bdecfb12439c22a74d64c001", size = 4926106, upload-time = "2026-04-24T19:53:28.686Z" },
+ { url = "https://files.pythonhosted.org/packages/54/ed/5f524db1fade9c013aa618e1c99c6ed05e8ffc9ceee6cda22fed22dda3f4/cryptography-47.0.0-cp311-abi3-win32.whl", hash = "sha256:7fda2f02c9015db3f42bb8a22324a454516ed10a8c29ca6ece6cdbb5efe2a203", size = 3258581, upload-time = "2026-04-24T19:53:31.058Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/dc/1b901990b174786569029f67542b3edf72ac068b6c3c8683c17e6a2f5363/cryptography-47.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:f5c3296dab66202f1b18a91fa266be93d6aa0c2806ea3d67762c69f60adc71aa", size = 3775309, upload-time = "2026-04-24T19:53:33.054Z" },
+ { url = "https://files.pythonhosted.org/packages/14/88/7aa18ad9c11bc87689affa5ce4368d884b517502d75739d475fc6f4a03c7/cryptography-47.0.0-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:be12cb6a204f77ed968bcefe68086eb061695b540a3dd05edac507a3111b25f0", size = 7904299, upload-time = "2026-04-24T19:53:35.003Z" },
+ { url = "https://files.pythonhosted.org/packages/07/55/c18f75724544872f234678fdedc871391722cb34a2aee19faa9f63100bb2/cryptography-47.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2ebd84adf0728c039a3be2700289378e1c164afc6748df1a5ed456767bef9ba7", size = 4631180, upload-time = "2026-04-24T19:53:37.517Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/65/31a5cc0eaca99cec5bafffe155d407115d96136bb161e8b49e0ef73f09a7/cryptography-47.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f68d6fbc7fbbcfb0939fea72c3b96a9f9a6edfc0e1b1d29778a2066030418b1", size = 4653529, upload-time = "2026-04-24T19:53:39.775Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/bc/641c0519a495f3bfd0421b48d7cd325c4336578523ccd76ea322b6c29c7a/cryptography-47.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:6651d32eff255423503aa276739da98c30f26c40cbeffcc6048e0d54ef704c0c", size = 4638570, upload-time = "2026-04-24T19:53:42.129Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/f2/300327b0a47f6dc94dd8b71b57052aefe178bb51745073d73d80604f11ab/cryptography-47.0.0-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:3fb8fa48075fad7193f2e5496135c6a76ac4b2aa5a38433df0a539296b377829", size = 5238019, upload-time = "2026-04-24T19:53:44.577Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/5a/5b5cf994391d4bf9d9c7efd4c66aabe4d95227256627f8fea6cff7dfadbd/cryptography-47.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:11438c7518132d95f354fa01a4aa2f806d172a061a7bed18cf18cbdacdb204d7", size = 4686832, upload-time = "2026-04-24T19:53:47.015Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/2c/ae950e28fd6475c852fc21a44db3e6b5bcc1261d1e370f2b6e42fa800fef/cryptography-47.0.0-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:8c1a736bbb3288005796c3f7ccb9453360d7fed483b13b9f468aea5171432923", size = 4269301, upload-time = "2026-04-24T19:53:48.97Z" },
+ { url = "https://files.pythonhosted.org/packages/67/fb/6a39782e150ffe5cc1b0018cb6ddc48bf7ca62b498d7539ffc8a758e977d/cryptography-47.0.0-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:f1557695e5c2b86e204f6ce9470497848634100787935ab7adc5397c54abd7ab", size = 4638110, upload-time = "2026-04-24T19:53:51.011Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/d7/0b3c71090a76e5c203164a47688b697635ece006dcd2499ab3a4dbd3f0bd/cryptography-47.0.0-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:f9a034b642b960767fb343766ae5ba6ad653f2e890ddd82955aef288ffea8736", size = 5194988, upload-time = "2026-04-24T19:53:52.962Z" },
+ { url = "https://files.pythonhosted.org/packages/63/33/63a961498a9df51721ab578c5a2622661411fc520e00bd83b0cc64eb20c4/cryptography-47.0.0-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:b1c76fca783aa7698eb21eb14f9c4aa09452248ee54a627d125025a43f83e7a7", size = 4686563, upload-time = "2026-04-24T19:53:55.274Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/bf/5ee5b145248f92250de86145d1c1d6edebbd57a7fe7caa4dedb5d4cf06a1/cryptography-47.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4f7722c97826770bab8ae92959a2e7b20a5e9e9bf4deae68fd86c3ca457bab52", size = 4770094, upload-time = "2026-04-24T19:53:57.753Z" },
+ { url = "https://files.pythonhosted.org/packages/92/43/21d220b2da5d517773894dacdcdb5c682c28d3fffce65548cb06e87d5501/cryptography-47.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:09f6d7bf6724f8db8b32f11eccf23efc8e759924bc5603800335cf8859a3ddbd", size = 4913811, upload-time = "2026-04-24T19:54:00.236Z" },
+ { url = "https://files.pythonhosted.org/packages/31/98/dc4ad376ac5f1a1a7d4a83f7b0c6f2bcad36b5d2d8f30aeb482d3a7d9582/cryptography-47.0.0-cp314-cp314t-win32.whl", hash = "sha256:6eebcaf0df1d21ce1f90605c9b432dd2c4f4ab665ac29a40d5e3fc68f51b5e63", size = 3237158, upload-time = "2026-04-24T19:54:02.606Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/da/97f62d18306b5133468bc3f8cc73a3111e8cdc8cf8d3e69474d6e5fd2d1b/cryptography-47.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:51c9313e90bd1690ec5a75ed047c27c0b8e6c570029712943d6116ef9a90620b", size = 3758706, upload-time = "2026-04-24T19:54:04.433Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/34/a4fae8ae7c3bc227460c9ae43f56abf1b911da0ec29e0ebac53bb0a4b6b7/cryptography-47.0.0-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:14432c8a9bcb37009784f9594a62fae211a2ae9543e96c92b2a8e4c3cd5cd0c4", size = 7904072, upload-time = "2026-04-24T19:54:06.411Z" },
+ { url = "https://files.pythonhosted.org/packages/01/64/d7b1e54fdb69f22d24a64bb3e88dc718b31c7fb10ef0b9691a3cf7eeea6e/cryptography-47.0.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:07efe86201817e7d3c18781ca9770bc0db04e1e48c994be384e4602bc38f8f27", size = 4635767, upload-time = "2026-04-24T19:54:08.519Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/7b/cca826391fb2a94efdcdfe4631eb69306ee1cff0b22f664a412c90713877/cryptography-47.0.0-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b45761c6ec22b7c726d6a829558777e32d0f1c8be7c3f3480f9c912d5ee8a10", size = 4654350, upload-time = "2026-04-24T19:54:10.795Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/65/4b57bcc823f42a991627c51c2f68c9fd6eb1393c1756aac876cba2accae2/cryptography-47.0.0-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:edd4da498015da5b9f26d38d3bfc2e90257bfa9cbed1f6767c282a0025ae649b", size = 4643394, upload-time = "2026-04-24T19:54:13.275Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/c4/2c5fbeea70adbbca2bbae865e1d605d6a4a7f8dbd9d33eaf69645087f06c/cryptography-47.0.0-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:9af828c0d5a65c70ec729cd7495a4bf1a67ecb66417b8f02ff125ab8a6326a74", size = 5225777, upload-time = "2026-04-24T19:54:15.18Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/b8/ac57107ef32749d2b244e36069bb688792a363aaaa3acc9e3cf84c130315/cryptography-47.0.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:256d07c78a04d6b276f5df935a9923275f53bd1522f214447fdf365494e2d515", size = 4688771, upload-time = "2026-04-24T19:54:17.835Z" },
+ { url = "https://files.pythonhosted.org/packages/56/fc/9f1de22ff8be99d991f240a46863c52d475404c408886c5a38d2b5c3bb26/cryptography-47.0.0-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:5d0e362ff51041b0c0d219cc7d6924d7b8996f57ce5712bdcef71eb3c65a59cc", size = 4270753, upload-time = "2026-04-24T19:54:19.963Z" },
+ { url = "https://files.pythonhosted.org/packages/00/68/d70c852797aa68e8e48d12e5a87170c43f67bb4a59403627259dd57d15de/cryptography-47.0.0-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:1581aef4219f7ca2849d0250edaa3866212fb74bf5667284f46aa92f9e65c1ca", size = 4642911, upload-time = "2026-04-24T19:54:21.818Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/51/661cbee74f594c5d97ff82d34f10d5551c085ca4668645f4606ebd22bd5d/cryptography-47.0.0-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:a49a3eb5341b9503fa3000a9a0db033161db90d47285291f53c2a9d2cd1b7f76", size = 5181411, upload-time = "2026-04-24T19:54:24.376Z" },
+ { url = "https://files.pythonhosted.org/packages/94/87/f2b6c374a82cf076cfa1416992ac8e8ec94d79facc37aec87c1a5cb72352/cryptography-47.0.0-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:2207a498b03275d0051589e326b79d4cf59985c99031b05bb292ac52631c37fe", size = 4688262, upload-time = "2026-04-24T19:54:26.946Z" },
+ { url = "https://files.pythonhosted.org/packages/14/e2/8b7462f4acf21ec509616f0245018bb197194ab0b65c2ea21a0bdd53c0eb/cryptography-47.0.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7a02675e2fabd0c0fc04c868b8781863cbf1967691543c22f5470500ff840b31", size = 4775506, upload-time = "2026-04-24T19:54:28.926Z" },
+ { url = "https://files.pythonhosted.org/packages/70/75/158e494e4c08dc05e039da5bb48553826bd26c23930cf8d3cd5f21fa8921/cryptography-47.0.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80887c5cbd1774683cb126f0ab4184567f080071d5acf62205acb354b4b753b7", size = 4912060, upload-time = "2026-04-24T19:54:30.869Z" },
+ { url = "https://files.pythonhosted.org/packages/06/bd/0a9d3edbf5eadbac926d7b9b3cd0c4be584eeeae4a003d24d9eda4affbbd/cryptography-47.0.0-cp38-abi3-win32.whl", hash = "sha256:ed67ea4e0cfb5faa5bc7ecb6e2b8838f3807a03758eec239d6c21c8769355310", size = 3248487, upload-time = "2026-04-24T19:54:33.494Z" },
+ { url = "https://files.pythonhosted.org/packages/60/80/5681af756d0da3a599b7bdb586fac5a1540f1bcefd2717a20e611ddade45/cryptography-47.0.0-cp38-abi3-win_amd64.whl", hash = "sha256:835d2d7f47cdc53b3224e90810fb1d36ca94ea29cc1801fb4c1bc43876735769", size = 3755737, upload-time = "2026-04-24T19:54:35.408Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/a0/928c9ce0d120a40a81aa99e3ba383e87337b9ac9ef9f6db02e4d7822424d/cryptography-47.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:7f1207974a904e005f762869996cf620e9bf79ecb4622f148550bb48e0eb35a7", size = 3909893, upload-time = "2026-04-24T19:54:38.334Z" },
+ { url = "https://files.pythonhosted.org/packages/81/75/d691e284750df5d9569f2b1ce4a00a71e1d79566da83b2b3e5549c84917f/cryptography-47.0.0-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:1a405c08857258c11016777e11c02bacbe7ef596faf259305d282272a3a05cbe", size = 4587867, upload-time = "2026-04-24T19:54:40.619Z" },
+ { url = "https://files.pythonhosted.org/packages/07/d6/1b90f1a4e453009730b4545286f0b39bb348d805c11181fc31544e4f9a65/cryptography-47.0.0-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:20fdbe3e38fb67c385d233c89371fa27f9909f6ebca1cecc20c13518dae65475", size = 4627192, upload-time = "2026-04-24T19:54:42.849Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/53/cb358a80e9e359529f496870dd08c102aa8a4b5b9f9064f00f0d6ed5b527/cryptography-47.0.0-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f7db373287273d8af1414cf95dc4118b13ffdc62be521997b0f2b270771fef50", size = 4587486, upload-time = "2026-04-24T19:54:44.908Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/57/aaa3d53876467a226f9a7a82fd14dd48058ad2de1948493442dfa16e2ffd/cryptography-47.0.0-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:9fe6b7c64926c765f9dff301f9c1b867febcda5768868ca084e18589113732ab", size = 4626327, upload-time = "2026-04-24T19:54:47.813Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/9c/51f28c3550276bcf35660703ba0ab829a90b88be8cd98a71ef23c2413913/cryptography-47.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:cffbba3392df0fa8629bb7f43454ee2925059ee158e23c54620b9063912b86c8", size = 3698916, upload-time = "2026-04-24T19:54:49.782Z" },
+]
+
+[[package]]
+name = "cyclopts"
+version = "4.11.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "attrs" },
+ { name = "docstring-parser" },
+ { name = "rich" },
+ { name = "rich-rst" },
+ { name = "tomli", marker = "python_full_version < '3.11'" },
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f9/fa/eff8f1abae783bade9b5e9bafafd0040d4dbf51988f9384bfdc0326ba1fc/cyclopts-4.11.0.tar.gz", hash = "sha256:1ffcb9990dbd56b90da19980d31596de9e99019980a215a5d76cf88fe452e94d", size = 170690, upload-time = "2026-04-23T00:23:36.858Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7c/37/197db187c260d24d4be1f09d427f59f3fb9a89bcf1354e23865c7bff7607/cyclopts-4.11.0-py3-none-any.whl", hash = "sha256:34318e3823b44b5baa754a5e37ec70a5c17dc81c65e4295ed70e17bc1aeae50d", size = 208494, upload-time = "2026-04-23T00:23:34.948Z" },
+]
+
+[[package]]
+name = "distro"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
+]
+
+[[package]]
+name = "dnspython"
+version = "2.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/57666417c0f90f08bcafa776861060426765fdb422eb10212086fb811d26/dnspython-2.8.0.tar.gz", hash = "sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f", size = 368251, upload-time = "2025-09-07T18:58:00.022Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" },
+]
+
+[[package]]
+name = "docstring-parser"
+version = "0.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
+]
+
+[[package]]
+name = "docutils"
+version = "0.22.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
+]
+
+[[package]]
+name = "email-validator"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "dnspython" },
+ { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f5/22/900cb125c76b7aaa450ce02fd727f452243f2e91a61af068b40adba60ea9/email_validator-2.3.0.tar.gz", hash = "sha256:9fc05c37f2f6cf439ff414f8fc46d917929974a82244c20eb10231ba60c54426", size = 51238, upload-time = "2025-08-26T13:09:06.831Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" },
+]
+
+[[package]]
+name = "exceptiongroup"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" },
+]
+
+[[package]]
+name = "fastapi"
+version = "0.136.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "annotated-doc" },
+ { name = "pydantic" },
+ { name = "starlette" },
+ { name = "typing-extensions" },
+ { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5d/45/c130091c2dfa061bbfe3150f2a5091ef1adf149f2a8d2ae769ecaf6e99a2/fastapi-0.136.1.tar.gz", hash = "sha256:7af665ad7acfa0a3baf8983d393b6b471b9da10ede59c60045f49fbc89a0fa7f", size = 397448, upload-time = "2026-04-23T16:49:44.046Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5a/ff/2e4eca3ade2c22fe1dea7043b8ee9dabe47753349eb1b56a202de8af6349/fastapi-0.136.1-py3-none-any.whl", hash = "sha256:a6e9d7eeada96c93a4d69cb03836b44fa34e2854accb7244a1ece36cd4781c3f", size = 117683, upload-time = "2026-04-23T16:49:42.437Z" },
+]
+
+[[package]]
+name = "fastmcp"
+version = "3.2.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "authlib" },
+ { name = "cyclopts" },
+ { name = "exceptiongroup" },
+ { name = "griffelib" },
+ { name = "httpx" },
+ { name = "jsonref" },
+ { name = "jsonschema-path" },
+ { name = "mcp" },
+ { name = "openapi-pydantic" },
+ { name = "opentelemetry-api" },
+ { name = "packaging" },
+ { name = "platformdirs" },
+ { name = "py-key-value-aio", extra = ["filetree", "keyring", "memory"] },
+ { name = "pydantic", extra = ["email"] },
+ { name = "pyperclip" },
+ { name = "python-dotenv" },
+ { name = "pyyaml" },
+ { name = "rich" },
+ { name = "uncalled-for" },
+ { name = "uvicorn" },
+ { name = "watchfiles" },
+ { name = "websockets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9c/13/29544fbc6dfe45ea38046af0067311e0bad7acc7d1f2ad38bb08f2409fe2/fastmcp-3.2.4.tar.gz", hash = "sha256:083ecb75b44a4169e7fc0f632f94b781bdb0ff877c6b35b9877cbb566fd4d4d1", size = 28746127, upload-time = "2026-04-14T01:42:24.174Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cf/76/b310d52fa0e30d39bd937eb58ec2c1f1ea1b5f519f0575e9dd9612f01deb/fastmcp-3.2.4-py3-none-any.whl", hash = "sha256:e6c9c429171041455e47ab94bb3f83c4657622a0ec28922f6940053959bd58a9", size = 728599, upload-time = "2026-04-14T01:42:26.85Z" },
+]
+
+[[package]]
+name = "filelock"
+version = "3.29.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571, upload-time = "2026-04-19T15:39:10.068Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" },
+]
+
+[[package]]
+name = "fsspec"
+version = "2026.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e1/cf/b50ddf667c15276a9ab15a70ef5f257564de271957933ffea49d2cdbcdfb/fsspec-2026.3.0.tar.gz", hash = "sha256:1ee6a0e28677557f8c2f994e3eea77db6392b4de9cd1f5d7a9e87a0ae9d01b41", size = 313547, upload-time = "2026-03-27T19:11:14.892Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
+]
+
+[[package]]
+name = "gradio"
+version = "6.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "audioop-lts", marker = "python_full_version >= '3.13'" },
+ { name = "brotli" },
+ { name = "fastapi" },
+ { name = "gradio-client" },
+ { name = "groovy" },
+ { name = "hf-gradio" },
+ { name = "httpx" },
+ { name = "huggingface-hub" },
+ { name = "jinja2" },
+ { name = "markupsafe" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "orjson" },
+ { name = "packaging" },
+ { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "pillow" },
+ { name = "pydantic" },
+ { name = "pydub" },
+ { name = "python-multipart" },
+ { name = "pytz" },
+ { name = "pyyaml" },
+ { name = "safehttpx" },
+ { name = "semantic-version" },
+ { name = "starlette" },
+ { name = "tomlkit" },
+ { name = "typer" },
+ { name = "typing-extensions" },
+ { name = "uvicorn" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/93/022b6cae8b566424683a80c21ca04c364f9b88120f08a9ba2b93c6b7c8e3/gradio-6.13.0.tar.gz", hash = "sha256:23457dde02202d97f636a5c170967a846297e20f40c3152b41aa4c3460245e3b", size = 36016802, upload-time = "2026-04-20T23:16:10.057Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3f/95/0ad40fb92ba3e6fe36182f722f81d69842a1e93cab1d9c6171256ef55418/gradio-6.13.0-py3-none-any.whl", hash = "sha256:46953f88aad36db9bc369ad2d1d6c4f200274da28f232b54842b2d4942a24f8f", size = 19684382, upload-time = "2026-04-20T23:16:06.298Z" },
+]
+
+[[package]]
+name = "gradio-client"
+version = "2.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "fsspec" },
+ { name = "httpx" },
+ { name = "huggingface-hub" },
+ { name = "packaging" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e8/e6/6b6029f5fe2ad7f1211105d530e34d991014c2cae463f9223033031cfc4f/gradio_client-2.5.0.tar.gz", hash = "sha256:4cde99bad62149595c30c90876ca2e405e3a13687ecf895474f3412cb476673d", size = 59013, upload-time = "2026-04-20T23:16:21.518Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/78/81/0a861b8e1ff42960139c6cd4c7dd591292fa09ea1ae2d87677441cba4c00/gradio_client-2.5.0-py3-none-any.whl", hash = "sha256:d43e2179c29076292a76485ad7ed2e6eaa19d14ac58283bd7f5beabfe4ca958c", size = 59952, upload-time = "2026-04-20T23:16:20.186Z" },
+]
+
+[[package]]
+name = "griffelib"
+version = "2.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" },
+]
+
+[[package]]
+name = "groovy"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/52/36/bbdede67400277bef33d3ec0e6a31750da972c469f75966b4930c753218f/groovy-0.1.2.tar.gz", hash = "sha256:25c1dc09b3f9d7e292458aa762c6beb96ea037071bf5e917fc81fb78d2231083", size = 17325, upload-time = "2025-02-28T20:24:56.068Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/28/27/3d6dcadc8a3214d8522c1e7f6a19554e33659be44546d44a2f7572ac7d2a/groovy-0.1.2-py3-none-any.whl", hash = "sha256:7f7975bab18c729a257a8b1ae9dcd70b7cafb1720481beae47719af57c35fa64", size = 14090, upload-time = "2025-02-28T20:24:55.152Z" },
+]
+
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
+]
+
+[[package]]
+name = "hf-gradio"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "gradio-client" },
+ { name = "typer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ce/86/c9694b7cfada5780e75769e60dc161a161f4dd7fc91b61db5e3a3338bef9/hf_gradio-0.4.1.tar.gz", hash = "sha256:a017d942618f0d495a58ee4563047fa04bef614c00e0cb789a9a6d0633cffa7b", size = 6560, upload-time = "2026-04-22T14:01:32.334Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/30/2d/afff2ee87e75d8eb85c92bb8cf0e15b05c23c2ebd8fd8dec781d8601ed7f/hf_gradio-0.4.1-py3-none-any.whl", hash = "sha256:76b8cb8be6abe62d74c1ad2d35b42f0629db89aa9e1a8d033cecfe7c856eeab3", size = 4482, upload-time = "2026-04-17T19:53:31.827Z" },
+]
+
+[[package]]
+name = "hf-xet"
+version = "1.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/53/92/ec9ad04d0b5728dca387a45af7bc98fbb0d73b2118759f5f6038b61a57e8/hf_xet-1.4.3.tar.gz", hash = "sha256:8ddedb73c8c08928c793df2f3401ec26f95be7f7e516a7bee2fbb546f6676113", size = 670477, upload-time = "2026-03-31T22:40:07.874Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/72/43/724d307b34e353da0abd476e02f72f735cdd2bc86082dee1b32ea0bfee1d/hf_xet-1.4.3-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7551659ba4f1e1074e9623996f28c3873682530aee0a846b7f2f066239228144", size = 3800935, upload-time = "2026-03-31T22:39:49.618Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/d2/8bee5996b699262edb87dbb54118d287c0e1b2fc78af7cdc41857ba5e3c4/hf_xet-1.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:bee693ada985e7045997f05f081d0e12c4c08bd7626dc397f8a7c487e6c04f7f", size = 3558942, upload-time = "2026-03-31T22:39:47.938Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/a1/e993d09cbe251196fb60812b09a58901c468127b7259d2bf0f68bf6088eb/hf_xet-1.4.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21644b404bb0100fe3857892f752c4d09642586fd988e61501c95bbf44b393a3", size = 4207657, upload-time = "2026-03-31T22:39:39.69Z" },
+ { url = "https://files.pythonhosted.org/packages/64/44/9eb6d21e5c34c63e5e399803a6932fa983cabdf47c0ecbcfe7ea97684b8c/hf_xet-1.4.3-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:987f09cfe418237812896a6736b81b1af02a3a6dcb4b4944425c4c4fca7a7cf8", size = 3986765, upload-time = "2026-03-31T22:39:37.936Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/7b/8ad6f16fdb82f5f7284a34b5ec48645bd575bdcd2f6f0d1644775909c486/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:60cf7fc43a99da0a853345cf86d23738c03983ee5249613a6305d3e57a5dca74", size = 4188162, upload-time = "2026-03-31T22:39:58.382Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/c4/39d6e136cbeea9ca5a23aad4b33024319222adbdc059ebcda5fc7d9d5ff4/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2815a49a7a59f3e2edf0cf113ae88e8cb2ca2a221bf353fb60c609584f4884d4", size = 4424525, upload-time = "2026-03-31T22:40:00.225Z" },
+ { url = "https://files.pythonhosted.org/packages/46/f2/adc32dae6bdbc367853118b9878139ac869419a4ae7ba07185dc31251b76/hf_xet-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:42ee323265f1e6a81b0e11094564fb7f7e0ec75b5105ffd91ae63f403a11931b", size = 3671610, upload-time = "2026-03-31T22:40:10.42Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/19/25d897dcc3f81953e0c2cde9ec186c7a0fee413eb0c9a7a9130d87d94d3a/hf_xet-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:27c976ba60079fb8217f485b9c5c7fcd21c90b0367753805f87cb9f3cdc4418a", size = 3528529, upload-time = "2026-03-31T22:40:09.106Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/36/3e8f85ca9fe09b8de2b2e10c63b3b3353d7dda88a0b3d426dffbe7b8313b/hf_xet-1.4.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5251d5ece3a81815bae9abab41cf7ddb7bcb8f56411bce0827f4a3071c92fdc6", size = 3801019, upload-time = "2026-03-31T22:39:56.651Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/9c/defb6cb1de28bccb7bd8d95f6e60f72a3d3fa4cb3d0329c26fb9a488bfe7/hf_xet-1.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1feb0f3abeacee143367c326a128a2e2b60868ec12a36c225afb1d6c5a05e6d2", size = 3558746, upload-time = "2026-03-31T22:39:54.766Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/bd/8d001191893178ff8e826e46ad5299446e62b93cd164e17b0ffea08832ec/hf_xet-1.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b301fc150290ca90b4fccd079829b84bb4786747584ae08b94b4577d82fb791", size = 4207692, upload-time = "2026-03-31T22:39:46.246Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/48/6790b402803250e9936435613d3a78b9aaeee7973439f0918848dde58309/hf_xet-1.4.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:d972fbe95ddc0d3c0fc49b31a8a69f47db35c1e3699bf316421705741aab6653", size = 3986281, upload-time = "2026-03-31T22:39:44.648Z" },
+ { url = "https://files.pythonhosted.org/packages/51/56/ea62552fe53db652a9099eda600b032d75554d0e86c12a73824bfedef88b/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c5b48db1ee344a805a1b9bd2cda9b6b65fe77ed3787bd6e87ad5521141d317cd", size = 4187414, upload-time = "2026-03-31T22:40:04.951Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/f5/bc1456d4638061bea997e6d2db60a1a613d7b200e0755965ec312dc1ef79/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:22bdc1f5fb8b15bf2831440b91d1c9bbceeb7e10c81a12e8d75889996a5c9da8", size = 4424368, upload-time = "2026-03-31T22:40:06.347Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/76/ab597bae87e1f06d18d3ecb8ed7f0d3c9a37037fc32ce76233d369273c64/hf_xet-1.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:0392c79b7cf48418cd61478c1a925246cf10639f4cd9d94368d8ca1e8df9ea07", size = 3672280, upload-time = "2026-03-31T22:40:16.401Z" },
+ { url = "https://files.pythonhosted.org/packages/62/05/2e462d34e23a09a74d73785dbed71cc5dbad82a72eee2ad60a72a554155d/hf_xet-1.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:681c92a07796325778a79d76c67011764ecc9042a8c3579332b61b63ae512075", size = 3528945, upload-time = "2026-03-31T22:40:14.995Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/9f/9c23e4a447b8f83120798f9279d0297a4d1360bdbf59ef49ebec78fe2545/hf_xet-1.4.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d0da85329eaf196e03e90b84c2d0aca53bd4573d097a75f99609e80775f98025", size = 3805048, upload-time = "2026-03-31T22:39:53.105Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/f8/7aacb8e5f4a7899d39c787b5984e912e6c18b11be136ef13947d7a66d265/hf_xet-1.4.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e23717ce4186b265f69afa66e6f0069fe7efbf331546f5c313d00e123dc84583", size = 3562178, upload-time = "2026-03-31T22:39:51.295Z" },
+ { url = "https://files.pythonhosted.org/packages/df/9a/a24b26dc8a65f0ecc0fe5be981a19e61e7ca963b85e062c083f3a9100529/hf_xet-1.4.3-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc360b70c815bf340ed56c7b8c63aacf11762a4b099b2fe2c9bd6d6068668c08", size = 4212320, upload-time = "2026-03-31T22:39:42.922Z" },
+ { url = "https://files.pythonhosted.org/packages/53/60/46d493db155d2ee2801b71fb1b0fd67696359047fdd8caee2c914cc50c79/hf_xet-1.4.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:39f2d2e9654cd9b4319885733993807aab6de9dfbd34c42f0b78338d6617421f", size = 3991546, upload-time = "2026-03-31T22:39:41.335Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/f5/067363e1c96c6b17256910830d1b54099d06287e10f4ec6ec4e7e08371fc/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:49ad8a8cead2b56051aa84d7fce3e1335efe68df3cf6c058f22a65513885baac", size = 4193200, upload-time = "2026-03-31T22:40:01.936Z" },
+ { url = "https://files.pythonhosted.org/packages/42/4b/53951592882d9c23080c7644542fda34a3813104e9e11fa1a7d82d419cb8/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7716d62015477a70ea272d2d68cd7cad140f61c52ee452e133e139abfe2c17ba", size = 4429392, upload-time = "2026-03-31T22:40:03.492Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/21/75a6c175b4e79662ad8e62f46a40ce341d8d6b206b06b4320d07d55b188c/hf_xet-1.4.3-cp37-abi3-win_amd64.whl", hash = "sha256:6b591fcad34e272a5b02607485e4f2a1334aebf1bc6d16ce8eb1eb8978ac2021", size = 3677359, upload-time = "2026-03-31T22:40:13.619Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/7c/44314ecd0e89f8b2b51c9d9e5e7a60a9c1c82024ac471d415860557d3cd8/hf_xet-1.4.3-cp37-abi3-win_arm64.whl", hash = "sha256:7c2c7e20bcfcc946dc67187c203463f5e932e395845d098cc2a93f5b67ca0b47", size = 3533664, upload-time = "2026-03-31T22:40:12.152Z" },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "certifi" },
+ { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "certifi" },
+ { name = "httpcore" },
+ { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
+]
+
+[[package]]
+name = "httpx-sse"
+version = "0.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = "2025-10-10T21:48:21.158Z" },
+]
+
+[[package]]
+name = "huggingface-hub"
+version = "1.12.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "filelock" },
+ { name = "fsspec" },
+ { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
+ { name = "httpx" },
+ { name = "packaging" },
+ { name = "pyyaml" },
+ { name = "tqdm" },
+ { name = "typer" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/56/52/1b54cb569509c725a32c1315261ac9fd0e6b91bbbf74d86fca10d3376164/huggingface_hub-1.12.0.tar.gz", hash = "sha256:7c3fe85e24b652334e5d456d7a812cd9a071e75630fac4365d9165ab5e4a34b6", size = 763091, upload-time = "2026-04-24T13:32:08.674Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7e/2b/ef03ddb96bd1123503c2bd6932001020292deea649e9bf4caa2cb65a85bf/huggingface_hub-1.12.0-py3-none-any.whl", hash = "sha256:d74939969585ee35748bd66de09baf84099d461bda7287cd9043bfb99b0e424d", size = 646806, upload-time = "2026-04-24T13:32:06.717Z" },
+]
+
+[[package]]
+name = "idna"
+version = "3.13"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ce/cc/762dfb036166873f0059f3b7de4565e1b5bc3d6f28a414c13da27e442f99/idna-3.13.tar.gz", hash = "sha256:585ea8fe5d69b9181ec1afba340451fba6ba764af97026f92a91d4eef164a242", size = 194210, upload-time = "2026-04-22T16:42:42.314Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" },
+]
+
+[[package]]
+name = "importlib-metadata"
+version = "8.7.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "zipp" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
+[[package]]
+name = "jaraco-classes"
+version = "3.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "more-itertools" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/c0/ed4a27bc5571b99e3cff68f8a9fa5b56ff7df1c2251cc715a652ddd26402/jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd", size = 11780, upload-time = "2024-03-31T07:27:36.643Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7f/66/b15ce62552d84bbfcec9a4873ab79d993a1dd4edb922cbfccae192bd5b5f/jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790", size = 6777, upload-time = "2024-03-31T07:27:34.792Z" },
+]
+
+[[package]]
+name = "jaraco-context"
+version = "6.1.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "backports-tarfile", marker = "python_full_version < '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/af/50/4763cd07e722bb6285316d390a164bc7e479db9d90daa769f22578f698b4/jaraco_context-6.1.2.tar.gz", hash = "sha256:f1a6c9d391e661cc5b8d39861ff077a7dc24dc23833ccee564b234b81c82dfe3", size = 16801, upload-time = "2026-03-20T22:13:33.922Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f2/58/bc8954bda5fcda97bd7c19be11b85f91973d67a706ed4a3aec33e7de22db/jaraco_context-6.1.2-py3-none-any.whl", hash = "sha256:bf8150b79a2d5d91ae48629d8b427a8f7ba0e1097dd6202a9059f29a36379535", size = 7871, upload-time = "2026-03-20T22:13:32.808Z" },
+]
+
+[[package]]
+name = "jaraco-functools"
+version = "4.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "more-itertools" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0f/27/056e0638a86749374d6f57d0b0db39f29509cce9313cf91bdc0ac4d91084/jaraco_functools-4.4.0.tar.gz", hash = "sha256:da21933b0417b89515562656547a77b4931f98176eb173644c0d35032a33d6bb", size = 19943, upload-time = "2025-12-21T09:29:43.6Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/fd/c4/813bb09f0985cb21e959f21f2464169eca882656849adf727ac7bb7e1767/jaraco_functools-4.4.0-py3-none-any.whl", hash = "sha256:9eec1e36f45c818d9bf307c8948eb03b2b56cd44087b3cdc989abca1f20b9176", size = 10481, upload-time = "2025-12-21T09:29:42.27Z" },
+]
+
+[[package]]
+name = "jeepney"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/6f/357efd7602486741aa73ffc0617fb310a29b588ed0fd69c2399acbb85b0c/jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732", size = 106758, upload-time = "2025-02-27T18:51:01.684Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b2/a3/e137168c9c44d18eff0376253da9f1e9234d0239e0ee230d2fee6cea8e55/jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683", size = 49010, upload-time = "2025-02-27T18:51:00.104Z" },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
+]
+
+[[package]]
+name = "jiter"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/64/2e/a9959997739c403378d0a4a3a1c4ed80b60aeace216c4d37b303a9fc60a4/jiter-0.14.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:02f36a5c700f105ac04a6556fe664a59037a2c200db3b7e88784fac2ddf02531", size = 316927, upload-time = "2026-04-10T14:25:40.753Z" },
+ { url = "https://files.pythonhosted.org/packages/27/72/b6de8a531e0adbadd839bec301165feb1fccf00e9ff55073ba2dd20f0043/jiter-0.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41eab6c09ceffb6f0fe25e214b3068146edb1eda3649ca2aee2a061029c7ba2e", size = 321181, upload-time = "2026-04-10T14:25:42.621Z" },
+ { url = "https://files.pythonhosted.org/packages/db/d8/2040b9efa13c917f855c40890ae4119fe02c25b7c7677d5b4fa820a851fc/jiter-0.14.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cf4d4c109641f9cfaf4a7b6aebd51654e405cd00fa9ebbf87163b8b97b325aa", size = 347387, upload-time = "2026-04-10T14:25:44.212Z" },
+ { url = "https://files.pythonhosted.org/packages/49/62/655c0ad5ce6a8e90f9068c175b8a236877d753e460762b3183c136db1c5b/jiter-0.14.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b80c7b41a628e6be2213ad0ece763c5f88aa5ee003fa394d58acaaee1f4b8342", size = 373083, upload-time = "2026-04-10T14:25:45.55Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/66/549c40fa068f08710b7570869c306a051eb67a29758bd64f4114f730554c/jiter-0.14.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb3dbf7cc0d4dbe73cce307ebe7eefa7f73a7d3d854dd119ea0c243f03e40927", size = 463639, upload-time = "2026-04-10T14:25:47.452Z" },
+ { url = "https://files.pythonhosted.org/packages/25/2f/97a32a05fed14ed58a18e181fdfb619e05163f3726b54ee6080ec0539c09/jiter-0.14.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7054adcdeb06b46efd17b5734f75817a44a2d06d3748e36c3a023a1bb52af9ec", size = 380735, upload-time = "2026-04-10T14:25:49.305Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/3b/4347e1d6c2a973d653bbb7a2d671a2d2426e54b52ba735b8ff0d0a29b75c/jiter-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d597cd1bf6790376f3fffc7c708766e57301d99a19314824ea0ccc9c3c70e1e2", size = 358632, upload-time = "2026-04-10T14:25:50.931Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/24/ca452fbf2ea33548ed30ce68a39a50442d3f7c9bf0704a7af958a930c057/jiter-0.14.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:df63a14878da754427926281626fd3ee249424a186e25a274e78176d42945264", size = 359969, upload-time = "2026-04-10T14:25:52.381Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/a3/94470a0d199287caabeb4da2bb2ae5f6d17f3cf05dfc975d7cb064d58e0f/jiter-0.14.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4ea73187627bcc5810e085df715e8a99da8bdfd96a7eb36b4b4df700ba6d4c9c", size = 397529, upload-time = "2026-04-10T14:25:53.801Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/71/6768edc09d7c45c39f093feb3de105fa718a3e982b5208b8a2ed6382b44b/jiter-0.14.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f541eaf7bb8382367a1a23d6fc3d6aad57f8dd8c18c3c17f838bee20f217220", size = 522342, upload-time = "2026-04-10T14:25:55.396Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/6b/5c2e17559a0f4e96e934479f7137df46c939e983fa05244e674815befb73/jiter-0.14.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:107465250de4fce00fdb47166bcd51df8e634e049541174fe3c71848e44f52ce", size = 556784, upload-time = "2026-04-10T14:25:56.927Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/83/c25f3556a60fc74d11199100f1b6cc0c006b815c8494dea8ca16fe398732/jiter-0.14.0-cp310-cp310-win32.whl", hash = "sha256:ffb2a08a406465bb076b7cc1df41d833106d3cf7905076cc73f0cb90078c7d10", size = 208439, upload-time = "2026-04-10T14:25:58.796Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/99/781a1b413f0989b7f2ea203b094b331685f1a35e52e0a45e5d000ecaab27/jiter-0.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:cb8b682d10cb0cce7ff4c1af7244af7022c9b01ae16d46c357bdd0df13afb25d", size = 204558, upload-time = "2026-04-10T14:26:00.208Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/1f/198ae537fccb7080a0ed655eb56abf64a92f79489dfbf79f40fa34225bcd/jiter-0.14.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:7e791e247b8044512e070bd1f3633dc08350d32776d2d6e7473309d0edf256a2", size = 316896, upload-time = "2026-04-10T14:26:01.986Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/34/da67cff3fce964a36d03c3e365fb0f8726ade2a6cfd4d3c70107e216ead6/jiter-0.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:71527ce13fd5a0c4e40ad37331f8c547177dbb2dd0a93e5278b6a5eecf748804", size = 321085, upload-time = "2026-04-10T14:26:03.364Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/36/4c72e67180d4e71a4f5dcf7886d0840e83c49ab11788172177a77570326e/jiter-0.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c4a7ab56f746014874f2c525584c0daca1dec37f66fd707ecef3b7e5c2228c", size = 347393, upload-time = "2026-04-10T14:26:05.314Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/db/9b39e09ceafa9878235c0fc29e3e3f9b12a4c6a98ea3085b998cadf3accc/jiter-0.14.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:376e9dafff914253bb9d46cdc5f7965607fbe7feb0a491c34e35f92b2770702e", size = 372937, upload-time = "2026-04-10T14:26:06.884Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/96/0dcba1d7a82c1b720774b48ef239376addbaf30df24c34742ac4a57b67b2/jiter-0.14.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23ad2a7a9da1935575c820428dd8d2490ce4d23189691ce33da1fc0a58e14e1c", size = 463646, upload-time = "2026-04-10T14:26:08.345Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/e3/f61b71543e746e6b8b805e7755814fc242715c16f1dba58e1cbccb8032c2/jiter-0.14.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:54b3ddf5786bc7732d293bba3411ac637ecfa200a39983166d1df86a59a43c9f", size = 380225, upload-time = "2026-04-10T14:26:10.161Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/5e/0ddeb7096aca099114abe36c4921016e8d251e6f35f5890240b31f1f60ae/jiter-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c001d5a646c2a50dc055dd526dad5d5245969e8234d2b1131d0451e81f3a373", size = 358682, upload-time = "2026-04-10T14:26:11.574Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/d1/fe0c46cd7fda9cad8f1ff9ad217dc61f1e4280b21052ec6dfe88c1446ef2/jiter-0.14.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:834bb5bdabca2e91592a03d373838a8d0a1b8bbde7077ae6913fd2fc51812d00", size = 359973, upload-time = "2026-04-10T14:26:13.316Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/21/f5317f91729b501019184771c80d60abd89907009e7bfa6c7e348c5bdd44/jiter-0.14.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4e9178be60e229b1b2b0710f61b9e24d1f4f8556985a83ff4c4f95920eea7314", size = 397568, upload-time = "2026-04-10T14:26:15.212Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/05/79d8f33fb2bf168db0df5c9cd16fe440a8ada57e929d3677b22712c2568f/jiter-0.14.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a7e4ccff04ec03614e62c613e976a3a5860dc9714ce8266f44328bdc8b1cab2c", size = 522535, upload-time = "2026-04-10T14:26:16.956Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/00/d1e3ff3d2a465e67f08507d74bafb2dcd29eba91dc939820e39e8dea38b8/jiter-0.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69539d936fb5d55caf6ecd33e2e884de083ff0ea28579780d56c4403094bb8d9", size = 556709, upload-time = "2026-04-10T14:26:18.5Z" },
+ { url = "https://files.pythonhosted.org/packages/60/5b/bbb2189f62ace8d95e869aa4c84c9946616f301e2d02895a6f20dcc3bba3/jiter-0.14.0-cp311-cp311-win32.whl", hash = "sha256:4927d09b3e572787cc5e0a5318601448e1ab9391bcef95677f5840c2d00eaa6d", size = 208660, upload-time = "2026-04-10T14:26:20.511Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/86/c500b53dcbf08575f5963e536ebd757a1f7c568272ba5d180b212c9a87fb/jiter-0.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:42d6ed359ac49eb922fdd565f209c57340aa06d589c84c8413e42a0f9ae1b842", size = 204659, upload-time = "2026-04-10T14:26:22.152Z" },
+ { url = "https://files.pythonhosted.org/packages/75/4a/a676249049d42cb29bef82233e4fe0524d414cbe3606c7a4b311193c2f77/jiter-0.14.0-cp311-cp311-win_arm64.whl", hash = "sha256:6dd689f5f4a5a33747b28686e051095beb214fe28cfda5e9fe58a295a788f593", size = 194772, upload-time = "2026-04-10T14:26:23.458Z" },
+ { url = "https://files.pythonhosted.org/packages/5a/68/7390a418f10897da93b158f2d5a8bd0bcd73a0f9ec3bb36917085bb759ef/jiter-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:2fb2ce3a7bc331256dfb14cefc34832366bb28a9aca81deaf43bbf2a5659e607", size = 316295, upload-time = "2026-04-10T14:26:24.887Z" },
+ { url = "https://files.pythonhosted.org/packages/60/a0/5854ac00ff63551c52c6c89534ec6aba4b93474e7924d64e860b1c94165b/jiter-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5252a7ca23785cef5d02d4ece6077a1b556a410c591b379f82091c3001e14844", size = 315898, upload-time = "2026-04-10T14:26:26.601Z" },
+ { url = "https://files.pythonhosted.org/packages/41/a1/4f44832650a16b18e8391f1bf1d6ca4909bc738351826bcc198bba4357f4/jiter-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c409578cbd77c338975670ada777add4efd53379667edf0aceea730cabede6fb", size = 343730, upload-time = "2026-04-10T14:26:28.326Z" },
+ { url = "https://files.pythonhosted.org/packages/48/64/a329e9d469f86307203594b1707e11ae51c3348d03bfd514a5f997870012/jiter-0.14.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7ede4331a1899d604463369c730dbb961ffdc5312bc7f16c41c2896415b1304a", size = 370102, upload-time = "2026-04-10T14:26:30.089Z" },
+ { url = "https://files.pythonhosted.org/packages/94/c1/5e3dfc59635aa4d4c7bd20a820ac1d09b8ed851568356802cf1c08edb3cf/jiter-0.14.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92cd8b6025981a041f5310430310b55b25ca593972c16407af8837d3d7d2ca01", size = 461335, upload-time = "2026-04-10T14:26:31.911Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/1b/dd157009dbc058f7b00108f545ccb72a2d56461395c4fc7b9cfdccb00af4/jiter-0.14.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:351bf6eda4e3a7ceb876377840c702e9a3e4ecc4624dbfb2d6463c67ae52637d", size = 378536, upload-time = "2026-04-10T14:26:33.595Z" },
+ { url = "https://files.pythonhosted.org/packages/91/78/256013667b7c10b8834f8e6e54cd3e562d4c6e34227a1596addccc05e38c/jiter-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1dcfbeb93d9ecd9ca128bbf8910120367777973fa193fb9a39c31237d8df165", size = 353859, upload-time = "2026-04-10T14:26:35.098Z" },
+ { url = "https://files.pythonhosted.org/packages/de/d9/137d65ade9093a409fe80955ce60b12bb753722c986467aeda47faf450ad/jiter-0.14.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ae039aaef8de3f8157ecc1fdd4d85043ac4f57538c245a0afaecb8321ec951c3", size = 357626, upload-time = "2026-04-10T14:26:36.685Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/48/76750835b87029342727c1a268bea8878ab988caf81ee4e7b880900eeb5a/jiter-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7d9d51eb96c82a9652933bd769fe6de66877d6eb2b2440e281f2938c51b5643e", size = 393172, upload-time = "2026-04-10T14:26:38.097Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/60/456c4e81d5c8045279aefe60e9e483be08793828800a4e64add8fdde7f2a/jiter-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d824ca4148b705970bf4e120924a212fdfca9859a73e42bd7889a63a4ea6bb98", size = 520300, upload-time = "2026-04-10T14:26:39.532Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/9f/2020e0984c235f678dced38fe4eec3058cf528e6af36ebf969b410305941/jiter-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ff3a6465b3a0f54b1a430f45c3c0ba7d61ceb45cbc3e33f9e1a7f638d690baf3", size = 553059, upload-time = "2026-04-10T14:26:40.991Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/32/e2d298e1a22a4bbe6062136d1c7192db7dba003a6975e51d9a9eecabc4c2/jiter-0.14.0-cp312-cp312-win32.whl", hash = "sha256:5dec7c0a3e98d2a3f8a2e67382d0d7c3ac60c69103a4b271da889b4e8bb1e129", size = 206030, upload-time = "2026-04-10T14:26:42.517Z" },
+ { url = "https://files.pythonhosted.org/packages/36/ac/96369141b3d8a4a8e4590e983085efe1c436f35c0cda940dd76d942e3e40/jiter-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc7e37b4b8bc7e80a63ad6cfa5fc11fab27dbfea4cc4ae644b1ab3f273dc348f", size = 201603, upload-time = "2026-04-10T14:26:44.328Z" },
+ { url = "https://files.pythonhosted.org/packages/01/c3/75d847f264647017d7e3052bbcc8b1e24b95fa139c320c5f5066fa7a0bdd/jiter-0.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:ee4a72f12847ef29b072aee9ad5474041ab2924106bdca9fcf5d7d965853e057", size = 191525, upload-time = "2026-04-10T14:26:46Z" },
+ { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" },
+ { url = "https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" },
+ { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" },
+ { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" },
+ { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" },
+ { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" },
+ { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" },
+ { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/1e/354ed92461b165bd581f9ef5150971a572c873ec3b68a916d5aa91da3cc2/jiter-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:6f396837fc7577871ca8c12edaf239ed9ccef3bbe39904ae9b8b63ce0a48b140", size = 315277, upload-time = "2026-04-10T14:27:18.109Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/95/8c7c7028aa8636ac21b7a55faef3e34215e6ed0cbf5ae58258427f621aa3/jiter-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a4d50ea3d8ba4176f79754333bd35f1bbcd28e91adc13eb9b7ca91bc52a6cef9", size = 315923, upload-time = "2026-04-10T14:27:19.603Z" },
+ { url = "https://files.pythonhosted.org/packages/47/40/e2a852a44c4a089f2681a16611b7ce113224a80fd8504c46d78491b47220/jiter-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce17f8a050447d1b4153bda4fb7d26e6a9e74eb4f4a41913f30934c5075bf615", size = 344943, upload-time = "2026-04-10T14:27:21.262Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/1f/670f92adee1e9895eac41e8a4d623b6da68c4d46249d8b556b60b63f949e/jiter-0.14.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f4f1c4b125e1652aefbc2e2c1617b60a160ab789d180e3d423c41439e5f32850", size = 369725, upload-time = "2026-04-10T14:27:22.766Z" },
+ { url = "https://files.pythonhosted.org/packages/01/2f/541c9ba567d05de1c4874a0f8f8c5e3fd78e2b874266623da9a775cf46e0/jiter-0.14.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be808176a6a3a14321d18c603f2d40741858a7c4fc982f83232842689fe86dd9", size = 461210, upload-time = "2026-04-10T14:27:24.315Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/a9/c31cbec09627e0d5de7aeaec7690dba03e090caa808fefd8133137cf45bc/jiter-0.14.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26679d58ba816f88c3849306dd58cb863a90a1cf352cdd4ef67e30ccf8a77994", size = 380002, upload-time = "2026-04-10T14:27:26.155Z" },
+ { url = "https://files.pythonhosted.org/packages/50/02/3c05c1666c41904a2f607475a73e7a4763d1cbde2d18229c4f85b22dc253/jiter-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80381f5a19af8fa9aef743f080e34f6b25ebd89656475f8cf0470ec6157052aa", size = 354678, upload-time = "2026-04-10T14:27:27.701Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/97/e15b33545c2b13518f560d695f974b9891b311641bdcf178d63177e8801e/jiter-0.14.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:004df5fdb8ecbd6d99f3227df18ba1a259254c4359736a2e6f036c944e02d7c5", size = 358920, upload-time = "2026-04-10T14:27:29.256Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/d2/8b1461def6b96ba44530df20d07ef7a1c7da22f3f9bf1727e2d611077bf1/jiter-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cff5708f7ed0fa098f2b53446c6fa74c48469118e5cd7497b4f1cd569ab06928", size = 394512, upload-time = "2026-04-10T14:27:31.344Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/88/837566dd6ed6e452e8d3205355afd484ce44b2533edfa4ed73a298ea893e/jiter-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:2492e5f06c36a976d25c7cc347a60e26d5470178d44cde1b9b75e60b4e519f28", size = 521120, upload-time = "2026-04-10T14:27:33.299Z" },
+ { url = "https://files.pythonhosted.org/packages/89/6b/b00b45c4d1b4c031777fe161d620b755b5b02cdade1e316dcb46e4471d63/jiter-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7609cfbe3a03d37bfdbf5052012d5a879e72b83168a363deae7b3a26564d57de", size = 553668, upload-time = "2026-04-10T14:27:34.868Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/d8/6fe5b42011d19397433d345716eac16728ac241862a2aac9c91923c7509a/jiter-0.14.0-cp314-cp314-win32.whl", hash = "sha256:7282342d32e357543565286b6450378c3cd402eea333fc1ebe146f1fabb306fc", size = 207001, upload-time = "2026-04-10T14:27:36.455Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/43/5c2e08da1efad5e410f0eaaabeadd954812612c33fbbd8fd5328b489139d/jiter-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:bd77945f38866a448e73b0b7637366afa814d4617790ecd88a18ca74377e6c02", size = 202187, upload-time = "2026-04-10T14:27:38Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/1f/6e39ac0b4cdfa23e606af5b245df5f9adaa76f35e0c5096790da430ca506/jiter-0.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:f2d4c61da0821ee42e0cdf5489da60a6d074306313a377c2b35af464955a3611", size = 192257, upload-time = "2026-04-10T14:27:39.504Z" },
+ { url = "https://files.pythonhosted.org/packages/05/57/7dbc0ffbbb5176a27e3518716608aa464aee2e2887dc938f0b900a120449/jiter-0.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1bf7ff85517dd2f20a5750081d2b75083c1b269cf75afc7511bdf1f9548beb3b", size = 323441, upload-time = "2026-04-10T14:27:41.039Z" },
+ { url = "https://files.pythonhosted.org/packages/83/6e/7b3314398d8983f06b557aa21b670511ec72d3b79a68ee5e4d9bff972286/jiter-0.14.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ef8791c3e78d6c6b157c6d360fbb5c715bebb8113bc6a9303c5caff012754a", size = 348109, upload-time = "2026-04-10T14:27:42.552Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/4f/8dc674bcd7db6dba566de73c08c763c337058baff1dbeb34567045b27cdc/jiter-0.14.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e74663b8b10da1fe0f4e4703fd7980d24ad17174b6bb35d8498d6e3ebce2ae6a", size = 368328, upload-time = "2026-04-10T14:27:44.574Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/5f/188e09a1f20906f98bbdec44ed820e19f4e8eb8aff88b9d1a5a497587ff3/jiter-0.14.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1aca29ba52913f78362ec9c2da62f22cdc4c3083313403f90c15460979b84d9b", size = 463301, upload-time = "2026-04-10T14:27:46.717Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/f0/19046ef965ed8f349e8554775bb12ff4352f443fbe12b95d31f575891256/jiter-0.14.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b39b7d87a952b79949af5fef44d2544e58c21a28da7f1bae3ef166455c61746", size = 378891, upload-time = "2026-04-10T14:27:48.32Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/c3/da43bd8431ee175695777ee78cf0e93eacbb47393ff493f18c45231b427d/jiter-0.14.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d918a68b26e9fab068c2b5453577ef04943ab2807b9a6275df2a812599a310", size = 360749, upload-time = "2026-04-10T14:27:49.88Z" },
+ { url = "https://files.pythonhosted.org/packages/72/26/e054771be889707c6161dbdec9c23d33a9ec70945395d70f07cfea1e9a6f/jiter-0.14.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:b08997c35aee1201c1a5361466a8fb9162d03ae7bf6568df70b6c859f1e654a4", size = 358526, upload-time = "2026-04-10T14:27:51.504Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/0f/7bea65ea2a6d91f2bf989ff11a18136644392bf2b0497a1fa50934c30a9c/jiter-0.14.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:260bf7ca20704d58d41f669e5e9fe7fe2fa72901a6b324e79056f5d52e9c9be2", size = 393926, upload-time = "2026-04-10T14:27:53.368Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/a1/b1ff7d70deef61ac0b7c6c2f12d2ace950cdeecb4fdc94500a0926802857/jiter-0.14.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:37826e3df29e60f30a382f9294348d0238ef127f4b5d7f5f8da78b5b9e050560", size = 521052, upload-time = "2026-04-10T14:27:55.058Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/7b/3b0649983cbaf15eda26a414b5b1982e910c67bd6f7b1b490f3cfc76896a/jiter-0.14.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:645be49c46f2900937ba0eaf871ad5183c96858c0af74b6becc7f4e367e36e06", size = 553716, upload-time = "2026-04-10T14:27:57.269Z" },
+ { url = "https://files.pythonhosted.org/packages/97/f8/33d78c83bd93ae0c0af05293a6660f88a1977caef39a6d72a84afab94ce0/jiter-0.14.0-cp314-cp314t-win32.whl", hash = "sha256:2f7877ed45118de283786178eceaf877110abacd04fde31efff3940ae9672674", size = 207957, upload-time = "2026-04-10T14:27:59.285Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/ac/2b760516c03e2227826d1f7025d89bf6bf6357a28fe75c2a2800873c50bf/jiter-0.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:14c0cb10337c49f5eafe8e7364daca5e29a020ea03580b8f8e6c597fed4e1588", size = 204690, upload-time = "2026-04-10T14:28:00.962Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" },
+ { url = "https://files.pythonhosted.org/packages/32/a1/ef34ca2cab2962598591636a1804b93645821201cc0095d4a93a9a329c9d/jiter-0.14.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:a25ffa2dbbdf8721855612f6dca15c108224b12d0c4024d0ac3d7902132b4211", size = 311366, upload-time = "2026-04-10T14:28:27.943Z" },
+ { url = "https://files.pythonhosted.org/packages/60/bb/520576a532a6b8a6f42747afed289c8448c879a34d7802fe2c832d4fd38f/jiter-0.14.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ac9cbaa86c10996b92bd12c91659b60f939f8e28fcfa6bc11a0e90a774ce95b", size = 309873, upload-time = "2026-04-10T14:28:29.688Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/7c/c16db114ea1f2f532f198aa8dc39585026af45af362c69a0492f31bc4821/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:844e73b6c56b505e9e169234ea3bdea2ea43f769f847f47ac559ba1d2361ebea", size = 344816, upload-time = "2026-04-10T14:28:31.348Z" },
+ { url = "https://files.pythonhosted.org/packages/99/8f/15e7741ff19e9bcd4d753f7ff22f988fd54592f134ca13701c13ea8c20e0/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52c076f187405fc21523c746c04399c9af8ece566077ed147b2126f2bcba577", size = 351445, upload-time = "2026-04-10T14:28:33.093Z" },
+ { url = "https://files.pythonhosted.org/packages/21/42/9042c3f3019de4adcb8c16591c325ec7255beea9fcd33a42a43f3b0b1000/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:fbd9e482663ca9d005d051330e4d2d8150bb208a209409c10f7e7dfdf7c49da9", size = 308810, upload-time = "2026-04-10T14:28:34.673Z" },
+ { url = "https://files.pythonhosted.org/packages/60/cf/a7e19b308bd86bb04776803b1f01a5f9a287a4c55205f4708827ee487fbf/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:33a20d838b91ef376b3a56896d5b04e725c7df5bc4864cc6569cf046a8d73b6d", size = 308443, upload-time = "2026-04-10T14:28:36.658Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/44/e26ede3f0caeff93f222559cb0cc4ca68579f07d009d7b6010c5b586f9b1/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:432c4db5255d86a259efde91e55cb4c8d18c0521d844c9e2e7efcce3899fb016", size = 343039, upload-time = "2026-04-10T14:28:38.356Z" },
+ { url = "https://files.pythonhosted.org/packages/da/e9/1f9ada30cef7b05e74bb06f52127e7a724976c225f46adb65c37b1dadfb6/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f00d94b281174144d6532a04b66a12cb866cbdc47c3af3bfe2973677f9861a", size = 349613, upload-time = "2026-04-10T14:28:40.066Z" },
+]
+
+[[package]]
+name = "joserfc"
+version = "1.6.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cryptography" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/de/c6/de8fdbdfa75c8ca04fead38a82d573df8a82906e984c349d58665f459558/joserfc-1.6.4.tar.gz", hash = "sha256:34ce5f499bfcc5e9ad4cc75077f9278ab3227b71da9aaf28f9ab705f8a560d3c", size = 231866, upload-time = "2026-04-13T13:15:40.632Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b6/f7/210b27752e972edb36d239315b08d3eb6b14824cc4a590da2337d195260b/joserfc-1.6.4-py3-none-any.whl", hash = "sha256:3e4a22b509b41908989237a045e25c8308d5fd47ab96bdae2dd8057c6451003a", size = 70464, upload-time = "2026-04-13T13:15:39.259Z" },
+]
+
+[[package]]
+name = "jsonref"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" },
+]
+
+[[package]]
+name = "jsonschema"
+version = "4.26.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "attrs" },
+ { name = "jsonschema-specifications" },
+ { name = "referencing" },
+ { name = "rpds-py" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" },
+]
+
+[[package]]
+name = "jsonschema-path"
+version = "0.4.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pathable" },
+ { name = "pyyaml" },
+ { name = "referencing" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/8a/7e6102f2b8bdc6705a9eb5294f8f6f9ccd3a8420e8e8e19671d1dd773251/jsonschema_path-0.4.5.tar.gz", hash = "sha256:c6cd7d577ae290c7defd4f4029e86fdb248ca1bd41a07557795b3c95e5144918", size = 15113, upload-time = "2026-03-03T09:56:46.87Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/d5/4e96c44f6c1ea3d812cf5391d81a4f5abaa540abf8d04ecd7f66e0ed11df/jsonschema_path-0.4.5-py3-none-any.whl", hash = "sha256:7d77a2c3f3ec569a40efe5c5f942c44c1af2a6f96fe0866794c9ef5b8f87fd65", size = 19368, upload-time = "2026-03-03T09:56:45.39Z" },
+]
+
+[[package]]
+name = "jsonschema-specifications"
+version = "2025.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "referencing" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
+]
+
+[[package]]
+name = "keyring"
+version = "25.7.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "importlib-metadata", marker = "python_full_version < '3.12'" },
+ { name = "jaraco-classes" },
+ { name = "jaraco-context" },
+ { name = "jaraco-functools" },
+ { name = "jeepney", marker = "sys_platform == 'linux'" },
+ { name = "pywin32-ctypes", marker = "sys_platform == 'win32'" },
+ { name = "secretstorage", marker = "sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/4b/674af6ef2f97d56f0ab5153bf0bfa28ccb6c3ed4d1babf4305449668807b/keyring-25.7.0.tar.gz", hash = "sha256:fe01bd85eb3f8fb3dd0405defdeac9a5b4f6f0439edbb3149577f244a2e8245b", size = 63516, upload-time = "2025-11-16T16:26:09.482Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
+]
+
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559", size = 11631, upload-time = "2025-09-27T18:36:05.558Z" },
+ { url = "https://files.pythonhosted.org/packages/98/1b/fbd8eed11021cabd9226c37342fa6ca4e8a98d8188a8d9b66740494960e4/markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419", size = 12057, upload-time = "2025-09-27T18:36:07.165Z" },
+ { url = "https://files.pythonhosted.org/packages/40/01/e560d658dc0bb8ab762670ece35281dec7b6c1b33f5fbc09ebb57a185519/markupsafe-3.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695", size = 22050, upload-time = "2025-09-27T18:36:08.005Z" },
+ { url = "https://files.pythonhosted.org/packages/af/cd/ce6e848bbf2c32314c9b237839119c5a564a59725b53157c856e90937b7a/markupsafe-3.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591", size = 20681, upload-time = "2025-09-27T18:36:08.881Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/2a/b5c12c809f1c3045c4d580b035a743d12fcde53cf685dbc44660826308da/markupsafe-3.0.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c", size = 20705, upload-time = "2025-09-27T18:36:10.131Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/e3/9427a68c82728d0a88c50f890d0fc072a1484de2f3ac1ad0bfc1a7214fd5/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f", size = 21524, upload-time = "2025-09-27T18:36:11.324Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/36/23578f29e9e582a4d0278e009b38081dbe363c5e7165113fad546918a232/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6", size = 20282, upload-time = "2025-09-27T18:36:12.573Z" },
+ { url = "https://files.pythonhosted.org/packages/56/21/dca11354e756ebd03e036bd8ad58d6d7168c80ce1fe5e75218e4945cbab7/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1", size = 20745, upload-time = "2025-09-27T18:36:13.504Z" },
+ { url = "https://files.pythonhosted.org/packages/87/99/faba9369a7ad6e4d10b6a5fbf71fa2a188fe4a593b15f0963b73859a1bbd/markupsafe-3.0.3-cp310-cp310-win32.whl", hash = "sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa", size = 14571, upload-time = "2025-09-27T18:36:14.779Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/25/55dc3ab959917602c96985cb1253efaa4ff42f71194bddeb61eb7278b8be/markupsafe-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8", size = 15056, upload-time = "2025-09-27T18:36:16.125Z" },
+ { url = "https://files.pythonhosted.org/packages/d0/9e/0a02226640c255d1da0b8d12e24ac2aa6734da68bff14c05dd53b94a0fc3/markupsafe-3.0.3-cp310-cp310-win_arm64.whl", hash = "sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1", size = 13932, upload-time = "2025-09-27T18:36:17.311Z" },
+ { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" },
+ { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" },
+ { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" },
+ { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" },
+ { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" },
+ { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" },
+ { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" },
+ { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" },
+ { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" },
+ { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" },
+ { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" },
+ { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
+ { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
+ { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
+ { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
+ { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
+ { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
+ { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
+ { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
+ { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
+ { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
+ { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
+ { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
+ { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
+ { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
+ { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
+ { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
+ { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
+ { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
+ { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
+ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
+]
+
+[[package]]
+name = "mcp"
+version = "1.27.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "httpx" },
+ { name = "httpx-sse" },
+ { name = "jsonschema" },
+ { name = "pydantic" },
+ { name = "pydantic-settings" },
+ { name = "pyjwt", extra = ["crypto"] },
+ { name = "python-multipart" },
+ { name = "pywin32", marker = "sys_platform == 'win32'" },
+ { name = "sse-starlette" },
+ { name = "starlette" },
+ { name = "typing-extensions" },
+ { name = "typing-inspection" },
+ { name = "uvicorn", marker = "sys_platform != 'emscripten'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8b/eb/c0cfc62075dc6e1ec1c64d352ae09ac051d9334311ed226f1f425312848a/mcp-1.27.0.tar.gz", hash = "sha256:d3dc35a7eec0d458c1da4976a48f982097ddaab87e278c5511d5a4a56e852b83", size = 607509, upload-time = "2026-04-02T14:48:08.88Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9c/46/f6b4ad632c67ef35209a66127e4bddc95759649dd595f71f13fba11bdf9a/mcp-1.27.0-py3-none-any.whl", hash = "sha256:5ce1fa81614958e267b21fb2aa34e0aea8e2c6ede60d52aba45fd47246b4d741", size = 215967, upload-time = "2026-04-02T14:48:07.24Z" },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
+]
+
+[[package]]
+name = "more-itertools"
+version = "11.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/f7/139d22fef48ac78127d18e01d80cf1be40236ae489769d17f35c3d425293/more_itertools-11.0.2.tar.gz", hash = "sha256:392a9e1e362cbc106a2457d37cabf9b36e5e12efd4ebff1654630e76597df804", size = 144659, upload-time = "2026-04-09T15:01:33.297Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cb/98/6af411189d9413534c3eb691182bff1f5c6d44ed2f93f2edfe52a1bbceb8/more_itertools-11.0.2-py3-none-any.whl", hash = "sha256:6e35b35f818b01f691643c6c611bc0902f2e92b46c18fffa77ae1e7c46e912e4", size = 71939, upload-time = "2026-04-09T15:01:32.21Z" },
+]
+
+[[package]]
+name = "numpy"
+version = "2.2.6"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.11'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time = "2025-05-17T21:27:58.555Z" },
+ { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" },
+ { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" },
+ { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" },
+ { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" },
+ { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" },
+ { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" },
+ { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" },
+ { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" },
+ { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" },
+ { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" },
+ { url = "https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" },
+ { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" },
+ { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" },
+ { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828, upload-time = "2025-05-17T21:37:56.699Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006, upload-time = "2025-05-17T21:38:18.291Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765, upload-time = "2025-05-17T21:38:27.319Z" },
+ { url = "https://files.pythonhosted.org/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736, upload-time = "2025-05-17T21:38:38.141Z" },
+ { url = "https://files.pythonhosted.org/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719, upload-time = "2025-05-17T21:38:58.433Z" },
+ { url = "https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072, upload-time = "2025-05-17T21:39:22.638Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213, upload-time = "2025-05-17T21:39:45.865Z" },
+ { url = "https://files.pythonhosted.org/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632, upload-time = "2025-05-17T21:40:13.331Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467, upload-time = "2025-05-17T21:40:44Z" },
+ { url = "https://files.pythonhosted.org/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144, upload-time = "2025-05-17T21:41:05.695Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217, upload-time = "2025-05-17T21:41:15.903Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014, upload-time = "2025-05-17T21:41:27.321Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935, upload-time = "2025-05-17T21:41:49.738Z" },
+ { url = "https://files.pythonhosted.org/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122, upload-time = "2025-05-17T21:42:14.046Z" },
+ { url = "https://files.pythonhosted.org/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143, upload-time = "2025-05-17T21:42:37.464Z" },
+ { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" },
+ { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" },
+ { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" },
+ { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" },
+ { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" },
+ { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" },
+]
+
+[[package]]
+name = "numpy"
+version = "2.4.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+ "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587, upload-time = "2026-03-29T13:22:01.298Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ef/c6/4218570d8c8ecc9704b5157a3348e486e84ef4be0ed3e38218ab473c83d2/numpy-2.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f983334aea213c99992053ede6168500e5f086ce74fbc4acc3f2b00f5762e9db", size = 16976799, upload-time = "2026-03-29T13:18:15.438Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/92/b4d922c4a5f5dab9ed44e6153908a5c665b71acf183a83b93b690996e39b/numpy-2.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72944b19f2324114e9dc86a159787333b77874143efcf89a5167ef83cfee8af0", size = 14971552, upload-time = "2026-03-29T13:18:18.606Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/dc/df98c095978fa6ee7b9a9387d1d58cbb3d232d0e69ad169a4ce784bde4fd/numpy-2.4.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:86b6f55f5a352b48d7fbfd2dbc3d5b780b2d79f4d3c121f33eb6efb22e9a2015", size = 5476566, upload-time = "2026-03-29T13:18:21.532Z" },
+ { url = "https://files.pythonhosted.org/packages/28/34/b3fdcec6e725409223dd27356bdf5a3c2cc2282e428218ecc9cb7acc9763/numpy-2.4.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:ba1f4fc670ed79f876f70082eff4f9583c15fb9a4b89d6188412de4d18ae2f40", size = 6806482, upload-time = "2026-03-29T13:18:23.634Z" },
+ { url = "https://files.pythonhosted.org/packages/68/62/63417c13aa35d57bee1337c67446761dc25ea6543130cf868eace6e8157b/numpy-2.4.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a87ec22c87be071b6bdbd27920b129b94f2fc964358ce38f3822635a3e2e03d", size = 15973376, upload-time = "2026-03-29T13:18:26.677Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/c5/9fcb7e0e69cef59cf10c746b84f7d58b08bc66a6b7d459783c5a4f6101a6/numpy-2.4.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df3775294accfdd75f32c74ae39fcba920c9a378a2fc18a12b6820aa8c1fb502", size = 16925137, upload-time = "2026-03-29T13:18:30.14Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/43/80020edacb3f84b9efdd1591120a4296462c23fd8db0dde1666f6ef66f13/numpy-2.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d4e437e295f18ec29bc79daf55e8a47a9113df44d66f702f02a293d93a2d6dd", size = 17329414, upload-time = "2026-03-29T13:18:33.733Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/06/af0658593b18a5f73532d377188b964f239eb0894e664a6c12f484472f97/numpy-2.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6aa3236c78803afbcb255045fbef97a9e25a1f6c9888357d205ddc42f4d6eba5", size = 18658397, upload-time = "2026-03-29T13:18:37.511Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/ce/13a09ed65f5d0ce5c7dd0669250374c6e379910f97af2c08c57b0608eee4/numpy-2.4.4-cp311-cp311-win32.whl", hash = "sha256:30caa73029a225b2d40d9fae193e008e24b2026b7ee1a867b7ee8d96ca1a448e", size = 6239499, upload-time = "2026-03-29T13:18:40.372Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/63/05d193dbb4b5eec1eca73822d80da98b511f8328ad4ae3ca4caf0f4db91d/numpy-2.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6bbe4eb67390b0a0265a2c25458f6b90a409d5d069f1041e6aff1e27e3d9a79e", size = 12614257, upload-time = "2026-03-29T13:18:42.95Z" },
+ { url = "https://files.pythonhosted.org/packages/87/c5/8168052f080c26fa984c413305012be54741c9d0d74abd7fbeeccae3889f/numpy-2.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:fcfe2045fd2e8f3cb0ce9d4ba6dba6333b8fa05bb8a4939c908cd43322d14c7e", size = 10486775, upload-time = "2026-03-29T13:18:45.835Z" },
+ { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272, upload-time = "2026-03-29T13:18:49.223Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573, upload-time = "2026-03-29T13:18:52.629Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782, upload-time = "2026-03-29T13:18:55.579Z" },
+ { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038, upload-time = "2026-03-29T13:18:57.769Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666, upload-time = "2026-03-29T13:19:00.341Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480, upload-time = "2026-03-29T13:19:03.63Z" },
+ { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036, upload-time = "2026-03-29T13:19:07.428Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643, upload-time = "2026-03-29T13:19:10.775Z" },
+ { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117, upload-time = "2026-03-29T13:19:13.464Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584, upload-time = "2026-03-29T13:19:16.155Z" },
+ { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450, upload-time = "2026-03-29T13:19:18.994Z" },
+ { url = "https://files.pythonhosted.org/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50", size = 16684933, upload-time = "2026-03-29T13:19:22.47Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115", size = 14694532, upload-time = "2026-03-29T13:19:25.581Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af", size = 5199661, upload-time = "2026-03-29T13:19:28.31Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/da/464d551604320d1491bc345efed99b4b7034143a85787aab78d5691d5a0e/numpy-2.4.4-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d2a8490669bfe99a233298348acc2d824d496dee0e66e31b66a6022c2ad74a5c", size = 6547539, upload-time = "2026-03-29T13:19:30.97Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/90/8d23e3b0dafd024bf31bdec225b3bb5c2dbfa6912f8a53b8659f21216cbf/numpy-2.4.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45dbed2ab436a9e826e302fcdcbe9133f9b0006e5af7168afb8963a6520da103", size = 15668806, upload-time = "2026-03-29T13:19:33.887Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/73/a9d864e42a01896bb5974475438f16086be9ba1f0d19d0bb7a07427c4a8b/numpy-2.4.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c901b15172510173f5cb310eae652908340f8dede90fff9e3bf6c0d8dfd92f83", size = 16632682, upload-time = "2026-03-29T13:19:37.336Z" },
+ { url = "https://files.pythonhosted.org/packages/34/fb/14570d65c3bde4e202a031210475ae9cde9b7686a2e7dc97ee67d2833b35/numpy-2.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:99d838547ace2c4aace6c4f76e879ddfe02bb58a80c1549928477862b7a6d6ed", size = 17019810, upload-time = "2026-03-29T13:19:40.963Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/77/2ba9d87081fd41f6d640c83f26fb7351e536b7ce6dd9061b6af5904e8e46/numpy-2.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0aec54fd785890ecca25a6003fd9a5aed47ad607bbac5cd64f836ad8666f4959", size = 18357394, upload-time = "2026-03-29T13:19:44.859Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/23/52666c9a41708b0853fa3b1a12c90da38c507a3074883823126d4e9d5b30/numpy-2.4.4-cp313-cp313-win32.whl", hash = "sha256:07077278157d02f65c43b1b26a3886bce886f95d20aabd11f87932750dfb14ed", size = 5959556, upload-time = "2026-03-29T13:19:47.661Z" },
+ { url = "https://files.pythonhosted.org/packages/57/fb/48649b4971cde70d817cf97a2a2fdc0b4d8308569f1dd2f2611959d2e0cf/numpy-2.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:5c70f1cc1c4efbe316a572e2d8b9b9cc44e89b95f79ca3331553fbb63716e2bf", size = 12317311, upload-time = "2026-03-29T13:19:50.67Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/d8/11490cddd564eb4de97b4579ef6bfe6a736cc07e94c1598590ae25415e01/numpy-2.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:ef4059d6e5152fa1a39f888e344c73fdc926e1b2dd58c771d67b0acfbf2aa67d", size = 10222060, upload-time = "2026-03-29T13:19:54.229Z" },
+ { url = "https://files.pythonhosted.org/packages/99/5d/dab4339177a905aad3e2221c915b35202f1ec30d750dd2e5e9d9a72b804b/numpy-2.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4bbc7f303d125971f60ec0aaad5e12c62d0d2c925f0ab1273debd0e4ba37aba5", size = 14822302, upload-time = "2026-03-29T13:19:57.585Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/e4/0564a65e7d3d97562ed6f9b0fd0fb0a6f559ee444092f105938b50043876/numpy-2.4.4-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:4d6d57903571f86180eb98f8f0c839fa9ebbfb031356d87f1361be91e433f5b7", size = 5327407, upload-time = "2026-03-29T13:20:00.601Z" },
+ { url = "https://files.pythonhosted.org/packages/29/8d/35a3a6ce5ad371afa58b4700f1c820f8f279948cca32524e0a695b0ded83/numpy-2.4.4-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:4636de7fd195197b7535f231b5de9e4b36d2c440b6e566d2e4e4746e6af0ca93", size = 6647631, upload-time = "2026-03-29T13:20:02.855Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/da/477731acbd5a58a946c736edfdabb2ac5b34c3d08d1ba1a7b437fa0884df/numpy-2.4.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad2e2ef14e0b04e544ea2fa0a36463f847f113d314aa02e5b402fdf910ef309e", size = 15727691, upload-time = "2026-03-29T13:20:06.004Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/db/338535d9b152beabeb511579598418ba0212ce77cf9718edd70262cc4370/numpy-2.4.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a285b3b96f951841799528cd1f4f01cd70e7e0204b4abebac9463eecfcf2a40", size = 16681241, upload-time = "2026-03-29T13:20:09.417Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/a9/ad248e8f58beb7a0219b413c9c7d8151c5d285f7f946c3e26695bdbbe2df/numpy-2.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f8474c4241bc18b750be2abea9d7a9ec84f46ef861dbacf86a4f6e043401f79e", size = 17085767, upload-time = "2026-03-29T13:20:13.126Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/1a/3b88ccd3694681356f70da841630e4725a7264d6a885c8d442a697e1146b/numpy-2.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4e874c976154687c1f71715b034739b45c7711bec81db01914770373d125e392", size = 18403169, upload-time = "2026-03-29T13:20:17.096Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/c9/fcfd5d0639222c6eac7f304829b04892ef51c96a75d479214d77e3ce6e33/numpy-2.4.4-cp313-cp313t-win32.whl", hash = "sha256:9c585a1790d5436a5374bac930dad6ed244c046ed91b2b2a3634eb2971d21008", size = 6083477, upload-time = "2026-03-29T13:20:20.195Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/e3/3938a61d1c538aaec8ed6fd6323f57b0c2d2d2219512434c5c878db76553/numpy-2.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:93e15038125dc1e5345d9b5b68aa7f996ec33b98118d18c6ca0d0b7d6198b7e8", size = 12457487, upload-time = "2026-03-29T13:20:22.946Z" },
+ { url = "https://files.pythonhosted.org/packages/97/6a/7e345032cc60501721ef94e0e30b60f6b0bd601f9174ebd36389a2b86d40/numpy-2.4.4-cp313-cp313t-win_arm64.whl", hash = "sha256:0dfd3f9d3adbe2920b68b5cd3d51444e13a10792ec7154cd0a2f6e74d4ab3233", size = 10292002, upload-time = "2026-03-29T13:20:25.909Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/06/c54062f85f673dd5c04cbe2f14c3acb8c8b95e3384869bb8cc9bff8cb9df/numpy-2.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f169b9a863d34f5d11b8698ead99febeaa17a13ca044961aa8e2662a6c7766a0", size = 16684353, upload-time = "2026-03-29T13:20:29.504Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/39/8a320264a84404c74cc7e79715de85d6130fa07a0898f67fb5cd5bd79908/numpy-2.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2483e4584a1cb3092da4470b38866634bafb223cbcd551ee047633fd2584599a", size = 14704914, upload-time = "2026-03-29T13:20:33.547Z" },
+ { url = "https://files.pythonhosted.org/packages/91/fb/287076b2614e1d1044235f50f03748f31fa287e3dbe6abeb35cdfa351eca/numpy-2.4.4-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2d19e6e2095506d1736b7d80595e0f252d76b89f5e715c35e06e937679ea7d7a", size = 5210005, upload-time = "2026-03-29T13:20:36.45Z" },
+ { url = "https://files.pythonhosted.org/packages/63/eb/fcc338595309910de6ecabfcef2419a9ce24399680bfb149421fa2df1280/numpy-2.4.4-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6a246d5914aa1c820c9443ddcee9c02bec3e203b0c080349533fae17727dfd1b", size = 6544974, upload-time = "2026-03-29T13:20:39.014Z" },
+ { url = "https://files.pythonhosted.org/packages/44/5d/e7e9044032a716cdfaa3fba27a8e874bf1c5f1912a1ddd4ed071bf8a14a6/numpy-2.4.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:989824e9faf85f96ec9c7761cd8d29c531ad857bfa1daa930cba85baaecf1a9a", size = 15684591, upload-time = "2026-03-29T13:20:42.146Z" },
+ { url = "https://files.pythonhosted.org/packages/98/7c/21252050676612625449b4807d6b695b9ce8a7c9e1c197ee6216c8a65c7c/numpy-2.4.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:27a8d92cd10f1382a67d7cf4db7ce18341b66438bdd9f691d7b0e48d104c2a9d", size = 16637700, upload-time = "2026-03-29T13:20:46.204Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/29/56d2bbef9465db24ef25393383d761a1af4f446a1df9b8cded4fe3a5a5d7/numpy-2.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e44319a2953c738205bf3354537979eaa3998ed673395b964c1176083dd46252", size = 17035781, upload-time = "2026-03-29T13:20:50.242Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/2b/a35a6d7589d21f44cea7d0a98de5ddcbb3d421b2622a5c96b1edf18707c3/numpy-2.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e892aff75639bbef0d2a2cfd55535510df26ff92f63c92cd84ef8d4ba5a5557f", size = 18362959, upload-time = "2026-03-29T13:20:54.019Z" },
+ { url = "https://files.pythonhosted.org/packages/64/c9/d52ec581f2390e0f5f85cbfd80fb83d965fc15e9f0e1aec2195faa142cde/numpy-2.4.4-cp314-cp314-win32.whl", hash = "sha256:1378871da56ca8943c2ba674530924bb8ca40cd228358a3b5f302ad60cf875fc", size = 6008768, upload-time = "2026-03-29T13:20:56.912Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/22/4cc31a62a6c7b74a8730e31a4274c5dc80e005751e277a2ce38e675e4923/numpy-2.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:715d1c092715954784bc79e1174fc2a90093dc4dc84ea15eb14dad8abdcdeb74", size = 12449181, upload-time = "2026-03-29T13:20:59.548Z" },
+ { url = "https://files.pythonhosted.org/packages/70/2e/14cda6f4d8e396c612d1bf97f22958e92148801d7e4f110cabebdc0eef4b/numpy-2.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:2c194dd721e54ecad9ad387c1d35e63dce5c4450c6dc7dd5611283dda239aabb", size = 10496035, upload-time = "2026-03-29T13:21:02.524Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/e8/8fed8c8d848d7ecea092dc3469643f9d10bc3a134a815a3b033da1d2039b/numpy-2.4.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2aa0613a5177c264ff5921051a5719d20095ea586ca88cc802c5c218d1c67d3e", size = 14824958, upload-time = "2026-03-29T13:21:05.671Z" },
+ { url = "https://files.pythonhosted.org/packages/05/1a/d8007a5138c179c2bf33ef44503e83d70434d2642877ee8fbb230e7c0548/numpy-2.4.4-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:42c16925aa5a02362f986765f9ebabf20de75cdefdca827d14315c568dcab113", size = 5330020, upload-time = "2026-03-29T13:21:08.635Z" },
+ { url = "https://files.pythonhosted.org/packages/99/64/ffb99ac6ae93faf117bcbd5c7ba48a7f45364a33e8e458545d3633615dda/numpy-2.4.4-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:874f200b2a981c647340f841730fc3a2b54c9d940566a3c4149099591e2c4c3d", size = 6650758, upload-time = "2026-03-29T13:21:10.949Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/6e/795cc078b78a384052e73b2f6281ff7a700e9bf53bcce2ee579d4f6dd879/numpy-2.4.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9b39d38a9bd2ae1becd7eac1303d031c5c110ad31f2b319c6e7d98b135c934d", size = 15729948, upload-time = "2026-03-29T13:21:14.047Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/86/2acbda8cc2af5f3d7bfc791192863b9e3e19674da7b5e533fded124d1299/numpy-2.4.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b268594bccac7d7cf5844c7732e3f20c50921d94e36d7ec9b79e9857694b1b2f", size = 16679325, upload-time = "2026-03-29T13:21:17.561Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/59/cafd83018f4aa55e0ac6fa92aa066c0a1877b77a615ceff1711c260ffae8/numpy-2.4.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ac6b31e35612a26483e20750126d30d0941f949426974cace8e6b5c58a3657b0", size = 17084883, upload-time = "2026-03-29T13:21:21.106Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/85/a42548db84e65ece46ab2caea3d3f78b416a47af387fcbb47ec28e660dc2/numpy-2.4.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8e3ed142f2728df44263aaf5fb1f5b0b99f4070c553a0d7f033be65338329150", size = 18403474, upload-time = "2026-03-29T13:21:24.828Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871", size = 6155500, upload-time = "2026-03-29T13:21:28.205Z" },
+ { url = "https://files.pythonhosted.org/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = "sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e", size = 12637755, upload-time = "2026-03-29T13:21:31.107Z" },
+ { url = "https://files.pythonhosted.org/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7", size = 10566643, upload-time = "2026-03-29T13:21:34.339Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/33/8fae8f964a4f63ed528264ddf25d2b683d0b663e3cba26961eb838a7c1bd/numpy-2.4.4-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:58c8b5929fcb8287cbd6f0a3fae19c6e03a5c48402ae792962ac465224a629a4", size = 16854491, upload-time = "2026-03-29T13:21:38.03Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/d0/1aabee441380b981cf8cdda3ae7a46aa827d1b5a8cce84d14598bc94d6d9/numpy-2.4.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:eea7ac5d2dce4189771cedb559c738a71512768210dc4e4753b107a2048b3d0e", size = 14895830, upload-time = "2026-03-29T13:21:41.509Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/b8/aafb0d1065416894fccf4df6b49ef22b8db045187949545bced89c034b8e/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:51fc224f7ca4d92656d5a5eb315f12eb5fe2c97a66249aa7b5f562528a3be38c", size = 5400927, upload-time = "2026-03-29T13:21:44.747Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/77/063baa20b08b431038c7f9ff5435540c7b7265c78cf56012a483019ca72d/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:28a650663f7314afc3e6ec620f44f333c386aad9f6fc472030865dc0ebb26ee3", size = 6715557, upload-time = "2026-03-29T13:21:47.406Z" },
+ { url = "https://files.pythonhosted.org/packages/c7/a8/379542d45a14f149444c5c4c4e7714707239ce9cc1de8c2803958889da14/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19710a9ca9992d7174e9c52f643d4272dcd1558c5f7af7f6f8190f633bd651a7", size = 15804253, upload-time = "2026-03-29T13:21:50.753Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/c8/f0a45426d6d21e7ea3310a15cf90c43a14d9232c31a837702dba437f3373/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b2aec6af35c113b05695ebb5749a787acd63cafc83086a05771d1e1cd1e555f", size = 16753552, upload-time = "2026-03-29T13:21:54.344Z" },
+ { url = "https://files.pythonhosted.org/packages/04/74/f4c001f4714c3ad9ce037e18cf2b9c64871a84951eaa0baf683a9ca9301c/numpy-2.4.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f2cf083b324a467e1ab358c105f6cad5ea950f50524668a80c486ff1db24e119", size = 12509075, upload-time = "2026-03-29T13:21:57.644Z" },
+]
+
+[[package]]
+name = "openai"
+version = "2.32.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "distro" },
+ { name = "httpx" },
+ { name = "jiter" },
+ { name = "pydantic" },
+ { name = "sniffio" },
+ { name = "tqdm" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ed/59/bdcc6b759b8c42dd73afaf5bf8f902c04b37987a5514dbc1c64dba390fef/openai-2.32.0.tar.gz", hash = "sha256:c54b27a9e4cb8d51f0dd94972ffd1a04437efeb259a9e60d8922b8bd26fe55e0", size = 693286, upload-time = "2026-04-15T22:28:19.434Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/1e/c1/d6e64ccd0536bf616556f0cad2b6d94a8125f508d25cfd814b1d2db4e2f1/openai-2.32.0-py3-none-any.whl", hash = "sha256:4dcc9badeb4bf54ad0d187453742f290226d30150890b7890711bda4f32f192f", size = 1162570, upload-time = "2026-04-15T22:28:17.714Z" },
+]
+
+[[package]]
+name = "openapi-pydantic"
+version = "0.5.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/02/2e/58d83848dd1a79cb92ed8e63f6ba901ca282c5f09d04af9423ec26c56fd7/openapi_pydantic-0.5.1.tar.gz", hash = "sha256:ff6835af6bde7a459fb93eb93bb92b8749b754fc6e51b2f1590a19dc3005ee0d", size = 60892, upload-time = "2025-01-08T19:29:27.083Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/12/cf/03675d8bd8ecbf4445504d8071adab19f5f993676795708e36402ab38263/openapi_pydantic-0.5.1-py3-none-any.whl", hash = "sha256:a3a09ef4586f5bd760a8df7f43028b60cafb6d9f61de2acba9574766255ab146", size = 96381, upload-time = "2025-01-08T19:29:25.275Z" },
+]
+
+[[package]]
+name = "openenv-axiomforgeai"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+ { name = "openenv-core", extra = ["core"] },
+]
+
+[package.optional-dependencies]
+dev = [
+ { name = "pytest" },
+ { name = "pytest-cov" },
+]
+
+[package.metadata]
+requires-dist = [
+ { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
+ { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
+ { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" },
+]
+provides-extras = ["dev"]
+
+[[package]]
+name = "openenv-core"
+version = "0.2.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "fastapi" },
+ { name = "fastmcp" },
+ { name = "gradio" },
+ { name = "httpx" },
+ { name = "huggingface-hub" },
+ { name = "openai" },
+ { name = "pydantic" },
+ { name = "pyyaml" },
+ { name = "requests" },
+ { name = "rich" },
+ { name = "tomli" },
+ { name = "tomli-w" },
+ { name = "typer" },
+ { name = "uvicorn" },
+ { name = "websockets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/93/f3/41a5ed932a2507438c985e9d959dcaa1a6c46f293995c064348c0e52dd40/openenv_core-0.2.3.tar.gz", hash = "sha256:48aefd774474556297ce012b80f2ceb271db51253d7fd0838e6e2dcc329db0c3", size = 146944, upload-time = "2026-03-28T18:56:28.415Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2f/22/38c339e370d198008f2c17ebdda1ae8f23bb4e1509dc7ae8eab6dc9b9cbe/openenv_core-0.2.3-py3-none-any.whl", hash = "sha256:f75a20c94452057a5f53a86e6d71a9f6a461524c3d6a865aa9344d257a92b795", size = 174557, upload-time = "2026-03-28T18:56:26.874Z" },
+]
+
+[package.optional-dependencies]
+core = [
+ { name = "fastapi" },
+ { name = "pydantic" },
+ { name = "requests" },
+ { name = "uvicorn" },
+ { name = "websockets" },
+]
+
+[[package]]
+name = "opentelemetry-api"
+version = "1.41.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "importlib-metadata" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fa/fc/b7564cbef36601aef0d6c9bc01f7badb64be8e862c2e1c3c5c3b43b53e4f/opentelemetry_api-1.41.1.tar.gz", hash = "sha256:0ad1814d73b875f84494387dae86ce0b12c68556331ce6ce8fe789197c949621", size = 71416, upload-time = "2026-04-24T13:15:38.262Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/29/59/3e7118ed140f76b0982ba4321bdaed1997a0473f9720de2d10788a577033/opentelemetry_api-1.41.1-py3-none-any.whl", hash = "sha256:a22df900e75c76dc08440710e51f52f1aa6b451b429298896023e60db5b3139f", size = 69007, upload-time = "2026-04-24T13:15:15.662Z" },
+]
+
+[[package]]
+name = "orjson"
+version = "3.11.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/1b/2024d06792d0779f9dbc51531b61c24f76c75b9f4ce05e6f3377a1814cea/orjson-3.11.8.tar.gz", hash = "sha256:96163d9cdc5a202703e9ad1b9ae757d5f0ca62f4fa0cc93d1f27b0e180cc404e", size = 5603832, upload-time = "2026-03-31T16:16:27.878Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2f/90/5d81f61fe3e4270da80c71442864c091cee3003cc8984c75f413fe742a07/orjson-3.11.8-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e6693ff90018600c72fd18d3d22fa438be26076cd3c823da5f63f7bab28c11cb", size = 229663, upload-time = "2026-03-31T16:14:30.708Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/ef/85e06b0eb11de6fb424120fd5788a07035bd4c5e6bb7841ae9972a0526d1/orjson-3.11.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93de06bc920854552493c81f1f729fab7213b7db4b8195355db5fda02c7d1363", size = 132321, upload-time = "2026-03-31T16:14:32.317Z" },
+ { url = "https://files.pythonhosted.org/packages/86/71/089338ee51b3132f050db0864a7df9bdd5e94c2a03820ab8a91e8f655618/orjson-3.11.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fe0b8c83e0f36247fc9431ce5425a5d95f9b3a689133d494831bdbd6f0bceb13", size = 130658, upload-time = "2026-03-31T16:14:33.935Z" },
+ { url = "https://files.pythonhosted.org/packages/10/0d/f39d8802345d0ad65f7fd4374b29b9b59f98656dc30f21ca5c773265b2f0/orjson-3.11.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97d823831105c01f6c8029faf297633dbeb30271892bd430e9c24ceae3734744", size = 135708, upload-time = "2026-03-31T16:14:35.224Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/b5/40aae576b3473511696dcffea84fde638b2b64774eb4dcb8b2c262729f8a/orjson-3.11.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c60c0423f15abb6cf78f56dff00168a1b582f7a1c23f114036e2bfc697814d5f", size = 147047, upload-time = "2026-03-31T16:14:36.489Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/f0/778a84458d1fdaa634b2e572e51ce0b354232f580b2327e1f00a8d88c38c/orjson-3.11.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:01928d0476b216ad2201823b0a74000440360cef4fed1912d297b8d84718f277", size = 133072, upload-time = "2026-03-31T16:14:37.715Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/d3/1bbf2fc3ffcc4b829ade554b574af68cec898c9b5ad6420a923c75a073d3/orjson-3.11.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a4a639049c44d36a6d1ae0f4a94b271605c745aee5647fa8ffaabcdc01b69a6", size = 133867, upload-time = "2026-03-31T16:14:39.356Z" },
+ { url = "https://files.pythonhosted.org/packages/08/94/6413da22edc99a69a8d0c2e83bf42973b8aa94d83ef52a6d39ac85da00bc/orjson-3.11.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3222adff1e1ff0dce93c16146b93063a7793de6c43d52309ae321234cdaf0f4d", size = 142268, upload-time = "2026-03-31T16:14:40.972Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/5f/aa5dbaa6136d7ba55f5461ac2e885efc6e6349424a428927fd46d68f4396/orjson-3.11.8-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3223665349bbfb68da234acd9846955b1a0808cbe5520ff634bf253a4407009b", size = 424008, upload-time = "2026-03-31T16:14:42.637Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/aa/2c1962d108c7fe5e27aa03a354b378caf56d8eafdef15fd83dec081ce45a/orjson-3.11.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:61c9d357a59465736022d5d9ba06687afb7611dfb581a9d2129b77a6fcf78e59", size = 147942, upload-time = "2026-03-31T16:14:44.256Z" },
+ { url = "https://files.pythonhosted.org/packages/47/d1/65f404f4c47eb1b0b4476f03ec838cac0c4aa933920ff81e5dda4dee14e7/orjson-3.11.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58fb9b17b4472c7b1dcf1a54583629e62e23779b2331052f09a9249edf81675b", size = 136640, upload-time = "2026-03-31T16:14:45.884Z" },
+ { url = "https://files.pythonhosted.org/packages/90/5f/7b784aea98bdb125a2f2da7c27d6c2d2f6d943d96ef0278bae596d563f85/orjson-3.11.8-cp310-cp310-win32.whl", hash = "sha256:b43dc2a391981d36c42fa57747a49dae793ef1d2e43898b197925b5534abd10a", size = 132066, upload-time = "2026-03-31T16:14:47.397Z" },
+ { url = "https://files.pythonhosted.org/packages/92/ec/2e284af8d6c9478df5ef938917743f61d68f4c70d17f1b6e82f7e3b8dba1/orjson-3.11.8-cp310-cp310-win_amd64.whl", hash = "sha256:c98121237fea2f679480765abd566f7713185897f35c9e6c2add7e3a9900eb61", size = 127609, upload-time = "2026-03-31T16:14:48.78Z" },
+ { url = "https://files.pythonhosted.org/packages/67/41/5aa7fa3b0f4dc6b47dcafc3cea909299c37e40e9972feabc8b6a74e2730d/orjson-3.11.8-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:003646067cc48b7fcab2ae0c562491c9b5d2cbd43f1e5f16d98fd118c5522d34", size = 229229, upload-time = "2026-03-31T16:14:50.424Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/d7/57e7f2458e0a2c41694f39fc830030a13053a84f837a5b73423dca1f0938/orjson-3.11.8-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:ed193ce51d77a3830cad399a529cd4ef029968761f43ddc549e1bc62b40d88f8", size = 128871, upload-time = "2026-03-31T16:14:51.888Z" },
+ { url = "https://files.pythonhosted.org/packages/53/4a/e0fdb9430983e6c46e0299559275025075568aad5d21dd606faee3703924/orjson-3.11.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30491bc4f862aa15744b9738517454f1e46e56c972a2be87d70d727d5b2a8f8", size = 132104, upload-time = "2026-03-31T16:14:53.142Z" },
+ { url = "https://files.pythonhosted.org/packages/08/4a/2025a60ff3f5c8522060cda46612d9b1efa653de66ed2908591d8d82f22d/orjson-3.11.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eda5b8b6be91d3f26efb7dc6e5e68ee805bc5617f65a328587b35255f138bf4", size = 130483, upload-time = "2026-03-31T16:14:54.605Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/3c/b9cde05bdc7b2385c66014e0620627da638d3d04e4954416ab48c31196c5/orjson-3.11.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee8db7bfb6fe03581bbab54d7c4124a6dd6a7f4273a38f7267197890f094675f", size = 135481, upload-time = "2026-03-31T16:14:55.901Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/f2/a8238e7734de7cb589fed319857a8025d509c89dc52fdcc88f39c6d03d5a/orjson-3.11.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d8b5231de76c528a46b57010bbd83fb51e056aa0220a372fd5065e978406f1c", size = 146819, upload-time = "2026-03-31T16:14:57.548Z" },
+ { url = "https://files.pythonhosted.org/packages/db/10/dbf1e2a3cafea673b1b4350e371877b759060d6018a998643b7040e5de48/orjson-3.11.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58a4a208a6fbfdb7a7327b8f201c6014f189f721fd55d047cafc4157af1bc62a", size = 132846, upload-time = "2026-03-31T16:14:58.91Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/fc/55e667ec9c85694038fcff00573d221b085d50777368ee3d77f38668bf3c/orjson-3.11.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f8952d6d2505c003e8f0224ff7858d341fa4e33fef82b91c4ff0ef070f2393c", size = 133580, upload-time = "2026-03-31T16:15:00.519Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/a6/c08c589a9aad0cb46c4831d17de212a2b6901f9d976814321ff8e69e8785/orjson-3.11.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0022bb50f90da04b009ce32c512dc1885910daa7cb10b7b0cba4505b16db82a8", size = 142042, upload-time = "2026-03-31T16:15:01.906Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/cc/2f78ea241d52b717d2efc38878615fe80425bf2beb6e68c984dde257a766/orjson-3.11.8-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ff51f9d657d1afb6f410cb435792ce4e1fe427aab23d2fcd727a2876e21d4cb6", size = 423845, upload-time = "2026-03-31T16:15:03.703Z" },
+ { url = "https://files.pythonhosted.org/packages/70/07/c17dcf05dd8045457538428a983bf1f1127928df5bf328cb24d2b7cddacb/orjson-3.11.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6dbe9a97bdb4d8d9d5367b52a7c32549bba70b2739c58ef74a6964a6d05ae054", size = 147729, upload-time = "2026-03-31T16:15:05.203Z" },
+ { url = "https://files.pythonhosted.org/packages/90/6c/0fb6e8a24e682e0958d71711ae6f39110e4b9cd8cab1357e2a89cb8e1951/orjson-3.11.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5c370674ebabe16c6ccac33ff80c62bf8a6e59439f5e9d40c1f5ab8fd2215b7", size = 136425, upload-time = "2026-03-31T16:15:07.052Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/35/4d3cc3a3d616035beb51b24a09bb872942dc452cf2df0c1d11ab35046d9f/orjson-3.11.8-cp311-cp311-win32.whl", hash = "sha256:0e32f7154299f42ae66f13488963269e5eccb8d588a65bc839ed986919fc9fac", size = 131870, upload-time = "2026-03-31T16:15:08.678Z" },
+ { url = "https://files.pythonhosted.org/packages/13/26/9fe70f81d16b702f8c3a775e8731b50ad91d22dacd14c7599b60a0941cd1/orjson-3.11.8-cp311-cp311-win_amd64.whl", hash = "sha256:25e0c672a2e32348d2eb33057b41e754091f2835f87222e4675b796b92264f06", size = 127440, upload-time = "2026-03-31T16:15:09.994Z" },
+ { url = "https://files.pythonhosted.org/packages/e8/c6/b038339f4145efd2859c1ca53097a52c0bb9cbdd24f947ebe146da1ad067/orjson-3.11.8-cp311-cp311-win_arm64.whl", hash = "sha256:9185589c1f2a944c17e26c9925dcdbc2df061cc4a145395c57f0c51f9b5dbfcd", size = 127399, upload-time = "2026-03-31T16:15:11.412Z" },
+ { url = "https://files.pythonhosted.org/packages/01/f6/8d58b32ab32d9215973a1688aebd098252ee8af1766c0e4e36e7831f0295/orjson-3.11.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1cd0b77e77c95758f8e1100139844e99f3ccc87e71e6fc8e1c027e55807c549f", size = 229233, upload-time = "2026-03-31T16:15:12.762Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/8b/2ffe35e71f6b92622e8ea4607bf33ecf7dfb51b3619dcfabfd36cbe2d0a5/orjson-3.11.8-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6a3d159d5ffa0e3961f353c4b036540996bf8b9697ccc38261c0eac1fd3347a6", size = 128772, upload-time = "2026-03-31T16:15:14.237Z" },
+ { url = "https://files.pythonhosted.org/packages/27/d2/1f8682ae50d5c6897a563cb96bc106da8c9cb5b7b6e81a52e4cc086679b9/orjson-3.11.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76070a76e9c5ae661e2d9848f216980d8d533e0f8143e6ed462807b242e3c5e8", size = 131946, upload-time = "2026-03-31T16:15:15.607Z" },
+ { url = "https://files.pythonhosted.org/packages/52/4b/5500f76f0eece84226e0689cb48dcde081104c2fa6e2483d17ca13685ffb/orjson-3.11.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:54153d21520a71a4c82a0dbb4523e468941d549d221dc173de0f019678cf3813", size = 130368, upload-time = "2026-03-31T16:15:17.066Z" },
+ { url = "https://files.pythonhosted.org/packages/da/4e/58b927e08fbe9840e6c920d9e299b051ea667463b1f39a56e668669f8508/orjson-3.11.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:469ac2125611b7c5741a0b3798cd9e5786cbad6345f9f400c77212be89563bec", size = 135540, upload-time = "2026-03-31T16:15:18.404Z" },
+ { url = "https://files.pythonhosted.org/packages/56/7c/ba7cb871cba1bcd5cd02ee34f98d894c6cea96353ad87466e5aef2429c60/orjson-3.11.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:14778ffd0f6896aa613951a7fbf4690229aa7a543cb2bfbe9f358e08aafa9546", size = 146877, upload-time = "2026-03-31T16:15:19.833Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/5d/eb9c25fc1386696c6a342cd361c306452c75e0b55e86ad602dd4827a7fd7/orjson-3.11.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea56a955056a6d6c550cf18b3348656a9d9a4f02e2d0c02cabf3c73f1055d506", size = 132837, upload-time = "2026-03-31T16:15:21.282Z" },
+ { url = "https://files.pythonhosted.org/packages/37/87/5ddeb7fc1fbd9004aeccab08426f34c81a5b4c25c7061281862b015fce2b/orjson-3.11.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53a0f57e59a530d18a142f4d4ba6dfc708dc5fdedce45e98ff06b44930a2a48f", size = 133624, upload-time = "2026-03-31T16:15:22.641Z" },
+ { url = "https://files.pythonhosted.org/packages/22/09/90048793db94ee4b2fcec4ac8e5ddb077367637d6650be896b3494b79bb7/orjson-3.11.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b48e274f8824567d74e2158199e269597edf00823a1b12b63d48462bbf5123e", size = 141904, upload-time = "2026-03-31T16:15:24.435Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/cf/eb284847487821a5d415e54149a6449ba9bfc5872ce63ab7be41b8ec401c/orjson-3.11.8-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3f262401086a3960586af06c054609365e98407151f5ea24a62893a40d80dbbb", size = 423742, upload-time = "2026-03-31T16:15:26.155Z" },
+ { url = "https://files.pythonhosted.org/packages/44/09/e12423d327071c851c13e76936f144a96adacfc037394dec35ac3fc8d1e8/orjson-3.11.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e8c6218b614badf8e229b697865df4301afa74b791b6c9ade01d19a9953a942", size = 147806, upload-time = "2026-03-31T16:15:27.909Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/6d/37c2589ba864e582ffe7611643314785c6afb1f83c701654ef05daa8fcc7/orjson-3.11.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:093d489fa039ddade2db541097dbb484999fcc65fc2b0ff9819141e2ab364f25", size = 136485, upload-time = "2026-03-31T16:15:29.749Z" },
+ { url = "https://files.pythonhosted.org/packages/be/c9/135194a02ab76b04ed9a10f68624b7ebd238bbe55548878b11ff15a0f352/orjson-3.11.8-cp312-cp312-win32.whl", hash = "sha256:e0950ed1bcb9893f4293fd5c5a7ee10934fbf82c4101c70be360db23ce24b7d2", size = 131966, upload-time = "2026-03-31T16:15:31.687Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/9a/9796f8fbe3cf30ce9cb696748dbb535e5c87be4bf4fe2e9ca498ef1fa8cf/orjson-3.11.8-cp312-cp312-win_amd64.whl", hash = "sha256:3cf17c141617b88ced4536b2135c552490f07799f6ad565948ea07bef0dcb9a6", size = 127441, upload-time = "2026-03-31T16:15:33.333Z" },
+ { url = "https://files.pythonhosted.org/packages/cc/47/5aaf54524a7a4a0dd09dd778f3fa65dd2108290615b652e23d944152bc8e/orjson-3.11.8-cp312-cp312-win_arm64.whl", hash = "sha256:48854463b0572cc87dac7d981aa72ed8bf6deedc0511853dc76b8bbd5482d36d", size = 127364, upload-time = "2026-03-31T16:15:34.748Z" },
+ { url = "https://files.pythonhosted.org/packages/66/7f/95fba509bb2305fab0073558f1e8c3a2ec4b2afe58ed9fcb7d3b8beafe94/orjson-3.11.8-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:3f23426851d98478c8970da5991f84784a76682213cd50eb73a1da56b95239dc", size = 229180, upload-time = "2026-03-31T16:15:36.426Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/9d/b237215c743ca073697d759b5503abd2cb8a0d7b9c9e21f524bcf176ab66/orjson-3.11.8-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:ebaed4cef74a045b83e23537b52ef19a367c7e3f536751e355a2a394f8648559", size = 128754, upload-time = "2026-03-31T16:15:38.049Z" },
+ { url = "https://files.pythonhosted.org/packages/42/3d/27d65b6d11e63f133781425f132807aef793ed25075fec686fc8e46dd528/orjson-3.11.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97c8f5d3b62380b70c36ffacb2a356b7c6becec86099b177f73851ba095ef623", size = 131877, upload-time = "2026-03-31T16:15:39.484Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/cc/faee30cd8f00421999e40ef0eba7332e3a625ce91a58200a2f52c7fef235/orjson-3.11.8-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:436c4922968a619fb7fef1ccd4b8b3a76c13b67d607073914d675026e911a65c", size = 130361, upload-time = "2026-03-31T16:15:41.274Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/bb/a6c55896197f97b6d4b4e7c7fd77e7235517c34f5d6ad5aadd43c54c6d7c/orjson-3.11.8-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ab359aff0436d80bfe8a23b46b5fea69f1e18aaf1760a709b4787f1318b317f", size = 135521, upload-time = "2026-03-31T16:15:42.758Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/7c/ca3a3525aa32ff636ebb1778e77e3587b016ab2edb1b618b36ba96f8f2c0/orjson-3.11.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f89b6d0b3a8d81e1929d3ab3d92bbc225688bd80a770c49432543928fe09ac55", size = 146862, upload-time = "2026-03-31T16:15:44.341Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/0c/18a9d7f18b5edd37344d1fd5be17e94dc652c67826ab749c6e5948a78112/orjson-3.11.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29c009e7a2ca9ad0ed1376ce20dd692146a5d9fe4310848904b6b4fee5c5c137", size = 132847, upload-time = "2026-03-31T16:15:46.368Z" },
+ { url = "https://files.pythonhosted.org/packages/23/91/7e722f352ad67ca573cee44de2a58fb810d0f4eb4e33276c6a557979fd8a/orjson-3.11.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:705b895b781b3e395c067129d8551655642dfe9437273211d5404e87ac752b53", size = 133637, upload-time = "2026-03-31T16:15:48.123Z" },
+ { url = "https://files.pythonhosted.org/packages/af/04/32845ce13ac5bd1046ddb02ac9432ba856cc35f6d74dde95864fe0ad5523/orjson-3.11.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:88006eda83858a9fdf73985ce3804e885c2befb2f506c9a3723cdeb5a2880e3e", size = 141906, upload-time = "2026-03-31T16:15:49.626Z" },
+ { url = "https://files.pythonhosted.org/packages/02/5e/c551387ddf2d7106d9039369862245c85738b828844d13b99ccb8d61fd06/orjson-3.11.8-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:55120759e61309af7fcf9e961c6f6af3dde5921cdb3ee863ef63fd9db126cae6", size = 423722, upload-time = "2026-03-31T16:15:51.176Z" },
+ { url = "https://files.pythonhosted.org/packages/00/a3/ecfe62434096f8a794d4976728cb59bcfc4a643977f21c2040545d37eb4c/orjson-3.11.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:98bdc6cb889d19bed01de46e67574a2eab61f5cc6b768ed50e8ac68e9d6ffab6", size = 147801, upload-time = "2026-03-31T16:15:52.939Z" },
+ { url = "https://files.pythonhosted.org/packages/18/6d/0dce10b9f6643fdc59d99333871a38fa5a769d8e2fc34a18e5d2bfdee900/orjson-3.11.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:708c95f925a43ab9f34625e45dcdadf09ec8a6e7b664a938f2f8d5650f6c090b", size = 136460, upload-time = "2026-03-31T16:15:54.431Z" },
+ { url = "https://files.pythonhosted.org/packages/01/d6/6dde4f31842d87099238f1f07b459d24edc1a774d20687187443ab044191/orjson-3.11.8-cp313-cp313-win32.whl", hash = "sha256:01c4e5a6695dc09098f2e6468a251bc4671c50922d4d745aff1a0a33a0cf5b8d", size = 131956, upload-time = "2026-03-31T16:15:56.081Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/f9/4e494a56e013db957fb77186b818b916d4695b8fa2aa612364974160e91b/orjson-3.11.8-cp313-cp313-win_amd64.whl", hash = "sha256:c154a35dd1330707450bb4d4e7dd1f17fa6f42267a40c1e8a1daa5e13719b4b8", size = 127410, upload-time = "2026-03-31T16:15:57.54Z" },
+ { url = "https://files.pythonhosted.org/packages/57/7f/803203d00d6edb6e9e7eef421d4e1adbb5ea973e40b3533f3cfd9aeb374e/orjson-3.11.8-cp313-cp313-win_arm64.whl", hash = "sha256:4861bde57f4d253ab041e374f44023460e60e71efaa121f3c5f0ed457c3a701e", size = 127338, upload-time = "2026-03-31T16:15:59.106Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/35/b01910c3d6b85dc882442afe5060cbf719c7d1fc85749294beda23d17873/orjson-3.11.8-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ec795530a73c269a55130498842aaa762e4a939f6ce481a7e986eeaa790e9da4", size = 229171, upload-time = "2026-03-31T16:16:00.651Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/56/c9ec97bd11240abef39b9e5d99a15462809c45f677420fd148a6c5e6295e/orjson-3.11.8-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:c492a0e011c0f9066e9ceaa896fbc5b068c54d365fea5f3444b697ee01bc8625", size = 128746, upload-time = "2026-03-31T16:16:02.673Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/e4/66d4f30a90de45e2f0cbd9623588e8ae71eef7679dbe2ae954ed6d66a41f/orjson-3.11.8-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:883206d55b1bd5f5679ad5e6ddd3d1a5e3cac5190482927fdb8c78fb699193b5", size = 131867, upload-time = "2026-03-31T16:16:04.342Z" },
+ { url = "https://files.pythonhosted.org/packages/19/30/2a645fc9286b928675e43fa2a3a16fb7b6764aa78cc719dc82141e00f30b/orjson-3.11.8-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5774c1fdcc98b2259800b683b19599c133baeb11d60033e2095fd9d4667b82db", size = 124664, upload-time = "2026-03-31T16:16:05.837Z" },
+ { url = "https://files.pythonhosted.org/packages/db/44/77b9a86d84a28d52ba3316d77737f6514e17118119ade3f91b639e859029/orjson-3.11.8-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ac7381c83dd3d4a6347e6635950aa448f54e7b8406a27c7ecb4a37e9f1ae08b", size = 129701, upload-time = "2026-03-31T16:16:07.407Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/ea/eff3d9bfe47e9bc6969c9181c58d9f71237f923f9c86a2d2f490cd898c82/orjson-3.11.8-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:14439063aebcb92401c11afc68ee4e407258d2752e62d748b6942dad20d2a70d", size = 141202, upload-time = "2026-03-31T16:16:09.48Z" },
+ { url = "https://files.pythonhosted.org/packages/52/c8/90d4b4c60c84d62068d0cf9e4d8f0a4e05e76971d133ac0c60d818d4db20/orjson-3.11.8-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa72e71977bff96567b0f500fc5bfd2fdf915f34052c782a4c6ebbdaa97aa858", size = 127194, upload-time = "2026-03-31T16:16:11.02Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/c7/ea9e08d1f0ba981adffb629811148b44774d935171e7b3d780ae43c4c254/orjson-3.11.8-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7679bc2f01bb0d219758f1a5f87bb7c8a81c0a186824a393b366876b4948e14f", size = 133639, upload-time = "2026-03-31T16:16:13.434Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/8c/ddbbfd6ba59453c8fc7fe1d0e5983895864e264c37481b2a791db635f046/orjson-3.11.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:14f7b8fcb35ef403b42fa5ecfa4ed032332a91f3dc7368fbce4184d59e1eae0d", size = 141914, upload-time = "2026-03-31T16:16:14.955Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/31/dbfbefec9df060d34ef4962cd0afcb6fa7a9ec65884cb78f04a7859526c3/orjson-3.11.8-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:c2bdf7b2facc80b5e34f48a2d557727d5c5c57a8a450de122ae81fa26a81c1bc", size = 423800, upload-time = "2026-03-31T16:16:16.594Z" },
+ { url = "https://files.pythonhosted.org/packages/87/cf/f74e9ae9803d4ab46b163494adba636c6d7ea955af5cc23b8aaa94cfd528/orjson-3.11.8-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ccd7ba1b0605813a0715171d39ec4c314cb97a9c85893c2c5c0c3a3729df38bf", size = 147837, upload-time = "2026-03-31T16:16:18.585Z" },
+ { url = "https://files.pythonhosted.org/packages/64/e6/9214f017b5db85e84e68602792f742e5dc5249e963503d1b356bee611e01/orjson-3.11.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cdbc8c9c02463fef4d3c53a9ba3336d05496ec8e1f1c53326a1e4acc11f5c600", size = 136441, upload-time = "2026-03-31T16:16:20.151Z" },
+ { url = "https://files.pythonhosted.org/packages/24/dd/3590348818f58f837a75fb969b04cdf187ae197e14d60b5e5a794a38b79d/orjson-3.11.8-cp314-cp314-win32.whl", hash = "sha256:0b57f67710a8cd459e4e54eb96d5f77f3624eba0c661ba19a525807e42eccade", size = 131983, upload-time = "2026-03-31T16:16:21.823Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/0f/b6cb692116e05d058f31ceee819c70f097fa9167c82f67fabe7516289abc/orjson-3.11.8-cp314-cp314-win_amd64.whl", hash = "sha256:735e2262363dcbe05c35e3a8869898022af78f89dde9e256924dc02e99fe69ca", size = 127396, upload-time = "2026-03-31T16:16:23.685Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/d1/facb5b5051fabb0ef9d26c6544d87ef19a939a9a001198655d0d891062dd/orjson-3.11.8-cp314-cp314-win_arm64.whl", hash = "sha256:6ccdea2c213cf9f3d9490cbd5d427693c870753df41e6cb375bd79bcbafc8817", size = 127330, upload-time = "2026-03-31T16:16:25.496Z" },
+]
+
+[[package]]
+name = "packaging"
+version = "26.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
+]
+
+[[package]]
+name = "pandas"
+version = "2.3.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.11'",
+]
+dependencies = [
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "python-dateutil", marker = "python_full_version < '3.11'" },
+ { name = "pytz", marker = "python_full_version < '3.11'" },
+ { name = "tzdata", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
+ { url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
+ { url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
+ { url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
+ { url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
+ { url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
+ { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
+ { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
+ { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
+ { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
+ { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
+ { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
+ { url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
+ { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
+ { url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
+ { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
+ { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
+ { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
+ { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
+ { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
+ { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
+ { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
+ { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
+ { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
+]
+
+[[package]]
+name = "pandas"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
+ "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'win32'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform == 'emscripten'",
+ "python_full_version >= '3.11' and python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+]
+dependencies = [
+ { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "python-dateutil", marker = "python_full_version >= '3.11'" },
+ { name = "tzdata", marker = "(python_full_version >= '3.11' and sys_platform == 'emscripten') or (python_full_version >= '3.11' and sys_platform == 'win32')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855, upload-time = "2026-03-31T06:48:30.816Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/97/35/6411db530c618e0e0005187e35aa02ce60ae4c4c4d206964a2f978217c27/pandas-3.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a727a73cbdba2f7458dc82449e2315899d5140b449015d822f515749a46cbbe0", size = 10326926, upload-time = "2026-03-31T06:46:08.29Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/d3/b7da1d5d7dbdc5ef52ed7debd2b484313b832982266905315dad5a0bf0b1/pandas-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbbd4aa20ca51e63b53bbde6a0fa4254b1aaabb74d2f542df7a7959feb1d760c", size = 9926987, upload-time = "2026-03-31T06:46:11.724Z" },
+ { url = "https://files.pythonhosted.org/packages/52/77/9b1c2d6070b5dbe239a7bc889e21bfa58720793fb902d1e070695d87c6d0/pandas-3.0.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:339dda302bd8369dedeae979cb750e484d549b563c3f54f3922cb8ff4978c5eb", size = 10757067, upload-time = "2026-03-31T06:46:14.903Z" },
+ { url = "https://files.pythonhosted.org/packages/20/17/ec40d981705654853726e7ac9aea9ddbb4a5d9cf54d8472222f4f3de06c2/pandas-3.0.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61c2fd96d72b983a9891b2598f286befd4ad262161a609c92dc1652544b46b76", size = 11258787, upload-time = "2026-03-31T06:46:17.683Z" },
+ { url = "https://files.pythonhosted.org/packages/90/e3/3f1126d43d3702ca8773871a81c9f15122a1f412342cc56284ffda5b1f70/pandas-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c934008c733b8bbea273ea308b73b3156f0181e5b72960790b09c18a2794fe1e", size = 11771616, upload-time = "2026-03-31T06:46:20.532Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/cf/0f4e268e1f5062e44a6bda9f925806721cd4c95c2b808a4c82ebe914f96b/pandas-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:60a80bb4feacbef5e1447a3f82c33209c8b7e07f28d805cfd1fb951e5cb443aa", size = 12337623, upload-time = "2026-03-31T06:46:23.754Z" },
+ { url = "https://files.pythonhosted.org/packages/44/a0/97a6339859d4acb2536efb24feb6708e82f7d33b2ed7e036f2983fcced82/pandas-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:ed72cb3f45190874eb579c64fa92d9df74e98fd63e2be7f62bce5ace0ade61df", size = 9897372, upload-time = "2026-03-31T06:46:26.703Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/eb/781516b808a99ddf288143cec46b342b3016c3414d137da1fdc3290d8860/pandas-3.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:f12b1a9e332c01e09510586f8ca9b108fd631fd656af82e452d7315ef6df5f9f", size = 9154922, upload-time = "2026-03-31T06:46:30.284Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921, upload-time = "2026-03-31T06:46:33.36Z" },
+ { url = "https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127, upload-time = "2026-03-31T06:46:36.253Z" },
+ { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577, upload-time = "2026-03-31T06:46:39.224Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/a8/3a61a721472959ab0ce865ef05d10b0d6bfe27ce8801c99f33d4fa996e65/pandas-3.0.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef8b27695c3d3dc78403c9a7d5e59a62d5464a7e1123b4e0042763f7104dc74f", size = 10880030, upload-time = "2026-03-31T06:46:42.412Z" },
+ { url = "https://files.pythonhosted.org/packages/da/65/7225c0ea4d6ce9cb2160a7fb7f39804871049f016e74782e5dade4d14109/pandas-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f8d68083e49e16b84734eb1a4dcae4259a75c90fb6e2251ab9a00b61120c06ab", size = 11409468, upload-time = "2026-03-31T06:46:45.2Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/5b/46e7c76032639f2132359b5cf4c785dd8cf9aea5ea64699eac752f02b9db/pandas-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:32cc41f310ebd4a296d93515fcac312216adfedb1894e879303987b8f1e2b97d", size = 11936381, upload-time = "2026-03-31T06:46:48.293Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/8b/721a9cff6fa6a91b162eb51019c6243b82b3226c71bb6c8ef4a9bd65cbc6/pandas-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:a4785e1d6547d8427c5208b748ae2efb64659a21bd82bf440d4262d02bfa02a4", size = 9744993, upload-time = "2026-03-31T06:46:51.488Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/18/7f0bd34ae27b28159aa80f2a6799f47fda34f7fb938a76e20c7b7fe3b200/pandas-3.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:08504503f7101300107ecdc8df73658e4347586db5cfdadabc1592e9d7e7a0fd", size = 9056118, upload-time = "2026-03-31T06:46:54.548Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/ca/3e639a1ea6fcd0617ca4e8ca45f62a74de33a56ae6cd552735470b22c8d3/pandas-3.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5918ba197c951dec132b0c5929a00c0bf05d5942f590d3c10a807f6e15a57d3", size = 10321105, upload-time = "2026-03-31T06:46:57.327Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/77/dbc82ff2fb0e63c6564356682bf201edff0ba16c98630d21a1fb312a8182/pandas-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d606a041c89c0a474a4702d532ab7e73a14fe35c8d427b972a625c8e46373668", size = 9864088, upload-time = "2026-03-31T06:46:59.935Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/2b/341f1b04bbca2e17e13cd3f08c215b70ef2c60c5356ef1e8c6857449edc7/pandas-3.0.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:710246ba0616e86891b58ab95f2495143bb2bc83ab6b06747c74216f583a6ac9", size = 10369066, upload-time = "2026-03-31T06:47:02.792Z" },
+ { url = "https://files.pythonhosted.org/packages/12/c5/cbb1ffefb20a93d3f0e1fdcda699fb84976210d411b008f97f48bf6ce27e/pandas-3.0.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5d3cfe227c725b1f3dff4278b43d8c784656a42a9325b63af6b1492a8232209e", size = 10876780, upload-time = "2026-03-31T06:47:06.205Z" },
+ { url = "https://files.pythonhosted.org/packages/98/fe/2249ae5e0a69bd0ddf17353d0a5d26611d70970111f5b3600cdc8be883e7/pandas-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c3b723df9087a9a9a840e263ebd9f88b64a12075d1bf2ea401a5a42f254f084d", size = 11375181, upload-time = "2026-03-31T06:47:09.383Z" },
+ { url = "https://files.pythonhosted.org/packages/de/64/77a38b09e70b6464883b8d7584ab543e748e42c1b5d337a2ee088e0df741/pandas-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a3096110bf9eac0070b7208465f2740e2d8a670d5cb6530b5bb884eca495fd39", size = 11928899, upload-time = "2026-03-31T06:47:12.686Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/52/42855bf626868413f761addd574acc6195880ae247a5346477a4361c3acb/pandas-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:07a10f5c36512eead51bc578eb3354ad17578b22c013d89a796ab5eee90cd991", size = 9746574, upload-time = "2026-03-31T06:47:15.64Z" },
+ { url = "https://files.pythonhosted.org/packages/88/39/21304ae06a25e8bf9fc820d69b29b2c495b2ae580d1e143146c309941760/pandas-3.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:5fdbfa05931071aba28b408e59226186b01eb5e92bea2ab78b65863ca3228d84", size = 9047156, upload-time = "2026-03-31T06:47:18.595Z" },
+ { url = "https://files.pythonhosted.org/packages/72/20/7defa8b27d4f330a903bb68eea33be07d839c5ea6bdda54174efcec0e1d2/pandas-3.0.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:dbc20dea3b9e27d0e66d74c42b2d0c1bed9c2ffe92adea33633e3bedeb5ac235", size = 10756238, upload-time = "2026-03-31T06:47:22.012Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/95/49433c14862c636afc0e9b2db83ff16b3ad92959364e52b2955e44c8e94c/pandas-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b75c347eff42497452116ce05ef461822d97ce5b9ff8df6edacb8076092c855d", size = 10408520, upload-time = "2026-03-31T06:47:25.197Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/f8/462ad2b5881d6b8ec8e5f7ed2ea1893faa02290d13870a1600fe72ad8efc/pandas-3.0.2-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1478075142e83a5571782ad007fb201ed074bdeac7ebcc8890c71442e96adf7", size = 10324154, upload-time = "2026-03-31T06:47:28.097Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/65/d1e69b649cbcddda23ad6e4c40ef935340f6f652a006e5cbc3555ac8adb3/pandas-3.0.2-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5880314e69e763d4c8b27937090de570f1fb8d027059a7ada3f7f8e98bdcb677", size = 10714449, upload-time = "2026-03-31T06:47:30.85Z" },
+ { url = "https://files.pythonhosted.org/packages/47/a4/85b59bc65b8190ea3689882db6cdf32a5003c0ccd5a586c30fdcc3ffc4fc/pandas-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b5329e26898896f06035241a626d7c335daa479b9bbc82be7c2742d048e41172", size = 11338475, upload-time = "2026-03-31T06:47:34.026Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/c4/bc6966c6e38e5d9478b935272d124d80a589511ed1612a5d21d36f664c68/pandas-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:81526c4afd31971f8b62671442a4b2b51e0aa9acc3819c9f0f12a28b6fcf85f1", size = 11786568, upload-time = "2026-03-31T06:47:36.941Z" },
+ { url = "https://files.pythonhosted.org/packages/e8/74/09298ca9740beed1d3504e073d67e128aa07e5ca5ca2824b0c674c0b8676/pandas-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:7cadd7e9a44ec13b621aec60f9150e744cfc7a3dd32924a7e2f45edff31823b0", size = 10488652, upload-time = "2026-03-31T06:47:40.612Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/40/c6ea527147c73b24fc15c891c3fcffe9c019793119c5742b8784a062c7db/pandas-3.0.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:db0dbfd2a6cdf3770aa60464d50333d8f3d9165b2f2671bcc299b72de5a6677b", size = 10326084, upload-time = "2026-03-31T06:47:43.834Z" },
+ { url = "https://files.pythonhosted.org/packages/95/25/bdb9326c3b5455f8d4d3549fce7abcf967259de146fe2cf7a82368141948/pandas-3.0.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0555c5882688a39317179ab4a0ed41d3ebc8812ab14c69364bbee8fb7a3f6288", size = 9914146, upload-time = "2026-03-31T06:47:46.67Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/77/3a227ff3337aa376c60d288e1d61c5d097131d0ac71f954d90a8f369e422/pandas-3.0.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01f31a546acd5574ef77fe199bc90b55527c225c20ccda6601cf6b0fd5ed597c", size = 10444081, upload-time = "2026-03-31T06:47:49.681Z" },
+ { url = "https://files.pythonhosted.org/packages/15/88/3cdd54fa279341afa10acf8d2b503556b1375245dccc9315659f795dd2e9/pandas-3.0.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:deeca1b5a931fdf0c2212c8a659ade6d3b1edc21f0914ce71ef24456ca7a6535", size = 10897535, upload-time = "2026-03-31T06:47:53.033Z" },
+ { url = "https://files.pythonhosted.org/packages/06/9d/98cc7a7624f7932e40f434299260e2917b090a579d75937cb8a57b9d2de3/pandas-3.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0f48afd9bb13300ffb5a3316973324c787054ba6665cda0da3fbd67f451995db", size = 11446992, upload-time = "2026-03-31T06:47:56.193Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/cd/19ff605cc3760e80602e6826ddef2824d8e7050ed80f2e11c4b079741dc3/pandas-3.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6c4d8458b97a35717b62469a4ea0e85abd5ed8687277f5ccfc67f8a5126f8c53", size = 11968257, upload-time = "2026-03-31T06:47:59.137Z" },
+ { url = "https://files.pythonhosted.org/packages/db/60/aba6a38de456e7341285102bede27514795c1eaa353bc0e7638b6b785356/pandas-3.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:b35d14bb5d8285d9494fe93815a9e9307c0876e10f1e8e89ac5b88f728ec8dcf", size = 9865893, upload-time = "2026-03-31T06:48:02.038Z" },
+ { url = "https://files.pythonhosted.org/packages/08/71/e5ec979dd2e8a093dacb8864598c0ff59a0cee0bbcdc0bfec16a51684d4f/pandas-3.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:63d141b56ef686f7f0d714cfb8de4e320475b86bf4b620aa0b7da89af8cbdbbb", size = 9188644, upload-time = "2026-03-31T06:48:05.045Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/6c/7b45d85db19cae1eb524f2418ceaa9d85965dcf7b764ed151386b7c540f0/pandas-3.0.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:140f0cffb1fa2524e874dde5b477d9defe10780d8e9e220d259b2c0874c89d9d", size = 10776246, upload-time = "2026-03-31T06:48:07.789Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/3e/7b00648b086c106e81766f25322b48aa8dfa95b55e621dbdf2fdd413a117/pandas-3.0.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ae37e833ff4fed0ba352f6bdd8b73ba3ab3256a85e54edfd1ab51ae40cca0af8", size = 10424801, upload-time = "2026-03-31T06:48:10.897Z" },
+ { url = "https://files.pythonhosted.org/packages/da/6e/558dd09a71b53b4008e7fc8a98ec6d447e9bfb63cdaeea10e5eb9b2dabe8/pandas-3.0.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d888a5c678a419a5bb41a2a93818e8ed9fd3172246555c0b37b7cc27027effd", size = 10345643, upload-time = "2026-03-31T06:48:13.7Z" },
+ { url = "https://files.pythonhosted.org/packages/be/e3/921c93b4d9a280409451dc8d07b062b503bbec0531d2627e73a756e99a82/pandas-3.0.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b444dc64c079e84df91baa8bf613d58405645461cabca929d9178f2cd392398d", size = 10743641, upload-time = "2026-03-31T06:48:16.659Z" },
+ { url = "https://files.pythonhosted.org/packages/56/ca/fd17286f24fa3b4d067965d8d5d7e14fe557dd4f979a0b068ac0deaf8228/pandas-3.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4544c7a54920de8eeacaa1466a6b7268ecfbc9bc64ab4dbb89c6bbe94d5e0660", size = 11361993, upload-time = "2026-03-31T06:48:19.475Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/a5/2f6ed612056819de445a433ca1f2821ac3dab7f150d569a59e9cc105de1d/pandas-3.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:734be7551687c00fbd760dc0522ed974f82ad230d4a10f54bf51b80d44a08702", size = 11815274, upload-time = "2026-03-31T06:48:22.695Z" },
+ { url = "https://files.pythonhosted.org/packages/00/2f/b622683e99ec3ce00b0854bac9e80868592c5b051733f2cf3a868e5fea26/pandas-3.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:57a07209bebcbcf768d2d13c9b78b852f9a15978dac41b9e6421a81ad4cdd276", size = 10888530, upload-time = "2026-03-31T06:48:25.806Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/2b/f8434233fab2bd66a02ec014febe4e5adced20e2693e0e90a07d118ed30e/pandas-3.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:5371b72c2d4d415d08765f32d689217a43227484e81b2305b52076e328f6f482", size = 9455341, upload-time = "2026-03-31T06:48:28.418Z" },
+]
+
+[[package]]
+name = "pathable"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/55/b748445cb4ea6b125626f15379be7c96d1035d4fa3e8fee362fa92298abf/pathable-0.5.0.tar.gz", hash = "sha256:d81938348a1cacb525e7c75166270644782c0fb9c8cecc16be033e71427e0ef1", size = 16655, upload-time = "2026-02-20T08:47:00.748Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/52/96/5a770e5c461462575474468e5af931cff9de036e7c2b4fea23c1c58d2cbe/pathable-0.5.0-py3-none-any.whl", hash = "sha256:646e3d09491a6351a0c82632a09c02cdf70a252e73196b36d8a15ba0a114f0a6", size = 16867, upload-time = "2026-02-20T08:46:59.536Z" },
+]
+
+[[package]]
+name = "pillow"
+version = "12.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819, upload-time = "2026-04-01T14:46:17.687Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3a/aa/d0b28e1c811cd4d5f5c2bfe2e022292bd255ae5744a3b9ac7d6c8f72dd75/pillow-12.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:a4e8f36e677d3336f35089648c8955c51c6d386a13cf6ee9c189c5f5bd713a9f", size = 5354355, upload-time = "2026-04-01T14:42:15.402Z" },
+ { url = "https://files.pythonhosted.org/packages/27/8e/1d5b39b8ae2bd7650d0c7b6abb9602d16043ead9ebbfef4bc4047454da2a/pillow-12.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e589959f10d9824d39b350472b92f0ce3b443c0a3442ebf41c40cb8361c5b97", size = 4695871, upload-time = "2026-04-01T14:42:18.234Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/c5/dcb7a6ca6b7d3be41a76958e90018d56c8462166b3ef223150360850c8da/pillow-12.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a52edc8bfff4429aaabdf4d9ee0daadbbf8562364f940937b941f87a4290f5ff", size = 6269734, upload-time = "2026-04-01T14:42:20.608Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/f1/aa1bb13b2f4eba914e9637893c73f2af8e48d7d4023b9d3750d4c5eb2d0c/pillow-12.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:975385f4776fafde056abb318f612ef6285b10a1f12b8570f3647ad0d74b48ec", size = 8076080, upload-time = "2026-04-01T14:42:23.095Z" },
+ { url = "https://files.pythonhosted.org/packages/a1/2a/8c79d6a53169937784604a8ae8d77e45888c41537f7f6f65ed1f407fe66d/pillow-12.2.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd9c0c7a0c681a347b3194c500cb1e6ca9cab053ea4d82a5cf45b6b754560136", size = 6382236, upload-time = "2026-04-01T14:42:25.82Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/42/bbcb6051030e1e421d103ce7a8ecadf837aa2f39b8f82ef1a8d37c3d4ebc/pillow-12.2.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88d387ff40b3ff7c274947ed3125dedf5262ec6919d83946753b5f3d7c67ea4c", size = 7070220, upload-time = "2026-04-01T14:42:28.68Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/e1/c2a7d6dd8cfa6b231227da096fd2d58754bab3603b9d73bf609d3c18b64f/pillow-12.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:51c4167c34b0d8ba05b547a3bb23578d0ba17b80a5593f93bd8ecb123dd336a3", size = 6493124, upload-time = "2026-04-01T14:42:31.579Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/41/7c8617da5d32e1d2f026e509484fdb6f3ad7efaef1749a0c1928adbb099e/pillow-12.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34c0d99ecccea270c04882cb3b86e7b57296079c9a4aff88cb3b33563d95afaa", size = 7194324, upload-time = "2026-04-01T14:42:34.615Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/de/a777627e19fd6d62f84070ee1521adde5eeda4855b5cf60fe0b149118bca/pillow-12.2.0-cp310-cp310-win32.whl", hash = "sha256:b85f66ae9eb53e860a873b858b789217ba505e5e405a24b85c0464822fe88032", size = 6376363, upload-time = "2026-04-01T14:42:37.19Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/34/fc4cb5204896465842767b96d250c08410f01f2f28afc43b257de842eed5/pillow-12.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:673aa32138f3e7531ccdbca7b3901dba9b70940a19ccecc6a37c77d5fdeb05b5", size = 7083523, upload-time = "2026-04-01T14:42:39.62Z" },
+ { url = "https://files.pythonhosted.org/packages/2d/a0/32852d36bc7709f14dc3f64f929a275e958ad8c19a6deba9610d458e28b3/pillow-12.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:3e080565d8d7c671db5802eedfb438e5565ffa40115216eabb8cd52d0ecce024", size = 2463318, upload-time = "2026-04-01T14:42:42.063Z" },
+ { url = "https://files.pythonhosted.org/packages/68/e1/748f5663efe6edcfc4e74b2b93edfb9b8b99b67f21a854c3ae416500a2d9/pillow-12.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:8be29e59487a79f173507c30ddf57e733a357f67881430449bb32614075a40ab", size = 5354347, upload-time = "2026-04-01T14:42:44.255Z" },
+ { url = "https://files.pythonhosted.org/packages/47/a1/d5ff69e747374c33a3b53b9f98cca7889fce1fd03d79cdc4e1bccc6c5a87/pillow-12.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:71cde9a1e1551df7d34a25462fc60325e8a11a82cc2e2f54578e5e9a1e153d65", size = 4695873, upload-time = "2026-04-01T14:42:46.452Z" },
+ { url = "https://files.pythonhosted.org/packages/df/21/e3fbdf54408a973c7f7f89a23b2cb97a7ef30c61ab4142af31eee6aebc88/pillow-12.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f490f9368b6fc026f021db16d7ec2fbf7d89e2edb42e8ec09d2c60505f5729c7", size = 6280168, upload-time = "2026-04-01T14:42:49.228Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/f1/00b7278c7dd52b17ad4329153748f87b6756ec195ff786c2bdf12518337d/pillow-12.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8bd7903a5f2a4545f6fd5935c90058b89d30045568985a71c79f5fd6edf9b91e", size = 8088188, upload-time = "2026-04-01T14:42:51.735Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/cf/220a5994ef1b10e70e85748b75649d77d506499352be135a4989c957b701/pillow-12.2.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3997232e10d2920a68d25191392e3a4487d8183039e1c74c2297f00ed1c50705", size = 6394401, upload-time = "2026-04-01T14:42:54.343Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/bd/e51a61b1054f09437acfbc2ff9106c30d1eb76bc1453d428399946781253/pillow-12.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e74473c875d78b8e9d5da2a70f7099549f9eb37ded4e2f6a463e60125bccd176", size = 7079655, upload-time = "2026-04-01T14:42:56.954Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/3d/45132c57d5fb4b5744567c3817026480ac7fc3ce5d4c47902bc0e7f6f853/pillow-12.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:56a3f9c60a13133a98ecff6197af34d7824de9b7b38c3654861a725c970c197b", size = 6503105, upload-time = "2026-04-01T14:42:59.847Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/2e/9df2fc1e82097b1df3dce58dc43286aa01068e918c07574711fcc53e6fb4/pillow-12.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90e6f81de50ad6b534cab6e5aef77ff6e37722b2f5d908686f4a5c9eba17a909", size = 7203402, upload-time = "2026-04-01T14:43:02.664Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/2e/2941e42858ebb67e50ae741473de81c2984e6eff7b397017623c676e2e8d/pillow-12.2.0-cp311-cp311-win32.whl", hash = "sha256:8c984051042858021a54926eb597d6ee3012393ce9c181814115df4c60b9a808", size = 6378149, upload-time = "2026-04-01T14:43:05.274Z" },
+ { url = "https://files.pythonhosted.org/packages/69/42/836b6f3cd7f3e5fa10a1f1a5420447c17966044c8fbf589cc0452d5502db/pillow-12.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e6b2a0c538fc200b38ff9eb6628228b77908c319a005815f2dde585a0664b60", size = 7082626, upload-time = "2026-04-01T14:43:08.557Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/88/549194b5d6f1f494b485e493edc6693c0a16f4ada488e5bd974ed1f42fad/pillow-12.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:9a8a34cc89c67a65ea7437ce257cea81a9dad65b29805f3ecee8c8fe8ff25ffe", size = 2463531, upload-time = "2026-04-01T14:43:10.743Z" },
+ { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279, upload-time = "2026-04-01T14:43:13.246Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490, upload-time = "2026-04-01T14:43:15.584Z" },
+ { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462, upload-time = "2026-04-01T14:43:18.268Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/9e/c05e19657fd57841e476be1ab46c4d501bffbadbafdc31a6d665f8b737b6/pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76", size = 8094744, upload-time = "2026-04-01T14:43:20.716Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/54/1789c455ed10176066b6e7e6da1b01e50e36f94ba584dc68d9eebfe9156d/pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005", size = 6398371, upload-time = "2026-04-01T14:43:23.443Z" },
+ { url = "https://files.pythonhosted.org/packages/43/e3/fdc657359e919462369869f1c9f0e973f353f9a9ee295a39b1fea8ee1a77/pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780", size = 7087215, upload-time = "2026-04-01T14:43:26.758Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/f8/2f6825e441d5b1959d2ca5adec984210f1ec086435b0ed5f52c19b3b8a6e/pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5", size = 6509783, upload-time = "2026-04-01T14:43:29.56Z" },
+ { url = "https://files.pythonhosted.org/packages/67/f9/029a27095ad20f854f9dba026b3ea6428548316e057e6fc3545409e86651/pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5", size = 7212112, upload-time = "2026-04-01T14:43:32.091Z" },
+ { url = "https://files.pythonhosted.org/packages/be/42/025cfe05d1be22dbfdb4f264fe9de1ccda83f66e4fc3aac94748e784af04/pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940", size = 6378489, upload-time = "2026-04-01T14:43:34.601Z" },
+ { url = "https://files.pythonhosted.org/packages/5d/7b/25a221d2c761c6a8ae21bfa3874988ff2583e19cf8a27bf2fee358df7942/pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5", size = 7084129, upload-time = "2026-04-01T14:43:37.213Z" },
+ { url = "https://files.pythonhosted.org/packages/10/e1/542a474affab20fd4a0f1836cb234e8493519da6b76899e30bcc5d990b8b/pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414", size = 2463612, upload-time = "2026-04-01T14:43:39.421Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/01/53d10cf0dbad820a8db274d259a37ba50b88b24768ddccec07355382d5ad/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8297651f5b5679c19968abefd6bb84d95fe30ef712eb1b2d9b2d31ca61267f4c", size = 4100837, upload-time = "2026-04-01T14:43:41.506Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/98/f3a6657ecb698c937f6c76ee564882945f29b79bad496abcba0e84659ec5/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:50d8520da2a6ce0af445fa6d648c4273c3eeefbc32d7ce049f22e8b5c3daecc2", size = 4176528, upload-time = "2026-04-01T14:43:43.773Z" },
+ { url = "https://files.pythonhosted.org/packages/69/bc/8986948f05e3ea490b8442ea1c1d4d990b24a7e43d8a51b2c7d8b1dced36/pillow-12.2.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:766cef22385fa1091258ad7e6216792b156dc16d8d3fa607e7545b2b72061f1c", size = 3640401, upload-time = "2026-04-01T14:43:45.87Z" },
+ { url = "https://files.pythonhosted.org/packages/34/46/6c717baadcd62bc8ed51d238d521ab651eaa74838291bda1f86fe1f864c9/pillow-12.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5d2fd0fa6b5d9d1de415060363433f28da8b1526c1c129020435e186794b3795", size = 5308094, upload-time = "2026-04-01T14:43:48.438Z" },
+ { url = "https://files.pythonhosted.org/packages/71/43/905a14a8b17fdb1ccb58d282454490662d2cb89a6bfec26af6d3520da5ec/pillow-12.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56b25336f502b6ed02e889f4ece894a72612fe885889a6e8c4c80239ff6e5f5f", size = 4695402, upload-time = "2026-04-01T14:43:51.292Z" },
+ { url = "https://files.pythonhosted.org/packages/73/dd/42107efcb777b16fa0393317eac58f5b5cf30e8392e266e76e51cff28c3d/pillow-12.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f1c943e96e85df3d3478f7b691f229887e143f81fedab9b20205349ab04d73ed", size = 6280005, upload-time = "2026-04-01T14:43:54.242Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/68/b93e09e5e8549019e61acf49f65b1a8530765a7f812c77a7461bca7e4494/pillow-12.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:03f6fab9219220f041c74aeaa2939ff0062bd5c364ba9ce037197f4c6d498cd9", size = 8090669, upload-time = "2026-04-01T14:43:57.335Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/6e/3ccb54ce8ec4ddd1accd2d89004308b7b0b21c4ac3d20fa70af4760a4330/pillow-12.2.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdfebd752ec52bf5bb4e35d9c64b40826bc5b40a13df7c3cda20a2c03a0f5ed", size = 6395194, upload-time = "2026-04-01T14:43:59.864Z" },
+ { url = "https://files.pythonhosted.org/packages/67/ee/21d4e8536afd1a328f01b359b4d3997b291ffd35a237c877b331c1c3b71c/pillow-12.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eedf4b74eda2b5a4b2b2fb4c006d6295df3bf29e459e198c90ea48e130dc75c3", size = 7082423, upload-time = "2026-04-01T14:44:02.74Z" },
+ { url = "https://files.pythonhosted.org/packages/78/5f/e9f86ab0146464e8c133fe85df987ed9e77e08b29d8d35f9f9f4d6f917ba/pillow-12.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:00a2865911330191c0b818c59103b58a5e697cae67042366970a6b6f1b20b7f9", size = 6505667, upload-time = "2026-04-01T14:44:05.381Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/1e/409007f56a2fdce61584fd3acbc2bbc259857d555196cedcadc68c015c82/pillow-12.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e1757442ed87f4912397c6d35a0db6a7b52592156014706f17658ff58bbf795", size = 7208580, upload-time = "2026-04-01T14:44:08.39Z" },
+ { url = "https://files.pythonhosted.org/packages/23/c4/7349421080b12fb35414607b8871e9534546c128a11965fd4a7002ccfbee/pillow-12.2.0-cp313-cp313-win32.whl", hash = "sha256:144748b3af2d1b358d41286056d0003f47cb339b8c43a9ea42f5fea4d8c66b6e", size = 6375896, upload-time = "2026-04-01T14:44:11.197Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/82/8a3739a5e470b3c6cbb1d21d315800d8e16bff503d1f16b03a4ec3212786/pillow-12.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:390ede346628ccc626e5730107cde16c42d3836b89662a115a921f28440e6a3b", size = 7081266, upload-time = "2026-04-01T14:44:13.947Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/25/f968f618a062574294592f668218f8af564830ccebdd1fa6200f598e65c5/pillow-12.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:8023abc91fba39036dbce14a7d6535632f99c0b857807cbbbf21ecc9f4717f06", size = 2463508, upload-time = "2026-04-01T14:44:16.312Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/a4/b342930964e3cb4dce5038ae34b0eab4653334995336cd486c5a8c25a00c/pillow-12.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:042db20a421b9bafecc4b84a8b6e444686bd9d836c7fd24542db3e7df7baad9b", size = 5309927, upload-time = "2026-04-01T14:44:18.89Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/de/23198e0a65a9cf06123f5435a5d95cea62a635697f8f03d134d3f3a96151/pillow-12.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd025009355c926a84a612fecf58bb315a3f6814b17ead51a8e48d3823d9087f", size = 4698624, upload-time = "2026-04-01T14:44:21.115Z" },
+ { url = "https://files.pythonhosted.org/packages/01/a6/1265e977f17d93ea37aa28aa81bad4fa597933879fac2520d24e021c8da3/pillow-12.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88ddbc66737e277852913bd1e07c150cc7bb124539f94c4e2df5344494e0a612", size = 6321252, upload-time = "2026-04-01T14:44:23.663Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/83/5982eb4a285967baa70340320be9f88e57665a387e3a53a7f0db8231a0cd/pillow-12.2.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d362d1878f00c142b7e1a16e6e5e780f02be8195123f164edf7eddd911eefe7c", size = 8126550, upload-time = "2026-04-01T14:44:26.772Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/48/6ffc514adce69f6050d0753b1a18fd920fce8cac87620d5a31231b04bfc5/pillow-12.2.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c727a6d53cb0018aadd8018c2b938376af27914a68a492f59dfcaca650d5eea", size = 6433114, upload-time = "2026-04-01T14:44:29.615Z" },
+ { url = "https://files.pythonhosted.org/packages/36/a3/f9a77144231fb8d40ee27107b4463e205fa4677e2ca2548e14da5cf18dce/pillow-12.2.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:efd8c21c98c5cc60653bcb311bef2ce0401642b7ce9d09e03a7da87c878289d4", size = 7115667, upload-time = "2026-04-01T14:44:32.773Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/fc/ac4ee3041e7d5a565e1c4fd72a113f03b6394cc72ab7089d27608f8aaccb/pillow-12.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f08483a632889536b8139663db60f6724bfcb443c96f1b18855860d7d5c0fd4", size = 6538966, upload-time = "2026-04-01T14:44:35.252Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/a8/27fb307055087f3668f6d0a8ccb636e7431d56ed0750e07a60547b1e083e/pillow-12.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dac8d77255a37e81a2efcbd1fc05f1c15ee82200e6c240d7e127e25e365c39ea", size = 7238241, upload-time = "2026-04-01T14:44:37.875Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/4b/926ab182c07fccae9fcb120043464e1ff1564775ec8864f21a0ebce6ac25/pillow-12.2.0-cp313-cp313t-win32.whl", hash = "sha256:ee3120ae9dff32f121610bb08e4313be87e03efeadfc6c0d18f89127e24d0c24", size = 6379592, upload-time = "2026-04-01T14:44:40.336Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/c4/f9e476451a098181b30050cc4c9a3556b64c02cf6497ea421ac047e89e4b/pillow-12.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:325ca0528c6788d2a6c3d40e3568639398137346c3d6e66bb61db96b96511c98", size = 7085542, upload-time = "2026-04-01T14:44:43.251Z" },
+ { url = "https://files.pythonhosted.org/packages/00/a4/285f12aeacbe2d6dc36c407dfbbe9e96d4a80b0fb710a337f6d2ad978c75/pillow-12.2.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e5a76d03a6c6dcef67edabda7a52494afa4035021a79c8558e14af25313d453", size = 2465765, upload-time = "2026-04-01T14:44:45.996Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/98/4595daa2365416a86cb0d495248a393dfc84e96d62ad080c8546256cb9c0/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8", size = 4100848, upload-time = "2026-04-01T14:44:48.48Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/79/40184d464cf89f6663e18dfcf7ca21aae2491fff1a16127681bf1fa9b8cf/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b", size = 4176515, upload-time = "2026-04-01T14:44:51.353Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/63/703f86fd4c422a9cf722833670f4f71418fb116b2853ff7da722ea43f184/pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295", size = 3640159, upload-time = "2026-04-01T14:44:53.588Z" },
+ { url = "https://files.pythonhosted.org/packages/71/e0/fb22f797187d0be2270f83500aab851536101b254bfa1eae10795709d283/pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed", size = 5312185, upload-time = "2026-04-01T14:44:56.039Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/8c/1a9e46228571de18f8e28f16fabdfc20212a5d019f3e3303452b3f0a580d/pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae", size = 4695386, upload-time = "2026-04-01T14:44:58.663Z" },
+ { url = "https://files.pythonhosted.org/packages/70/62/98f6b7f0c88b9addd0e87c217ded307b36be024d4ff8869a812b241d1345/pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601", size = 6280384, upload-time = "2026-04-01T14:45:01.5Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/03/688747d2e91cfbe0e64f316cd2e8005698f76ada3130d0194664174fa5de/pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be", size = 8091599, upload-time = "2026-04-01T14:45:04.5Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/35/577e22b936fcdd66537329b33af0b4ccfefaeabd8aec04b266528cddb33c/pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f", size = 6396021, upload-time = "2026-04-01T14:45:07.117Z" },
+ { url = "https://files.pythonhosted.org/packages/11/8d/d2532ad2a603ca2b93ad9f5135732124e57811d0168155852f37fbce2458/pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286", size = 7083360, upload-time = "2026-04-01T14:45:09.763Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/26/d325f9f56c7e039034897e7380e9cc202b1e368bfd04d4cbe6a441f02885/pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50", size = 6507628, upload-time = "2026-04-01T14:45:12.378Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/f7/769d5632ffb0988f1c5e7660b3e731e30f7f8ec4318e94d0a5d674eb65a4/pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104", size = 7209321, upload-time = "2026-04-01T14:45:15.122Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/7a/c253e3c645cd47f1aceea6a8bacdba9991bf45bb7dfe927f7c893e89c93c/pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7", size = 6479723, upload-time = "2026-04-01T14:45:17.797Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/8b/601e6566b957ca50e28725cb6c355c59c2c8609751efbecd980db44e0349/pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150", size = 7217400, upload-time = "2026-04-01T14:45:20.529Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/94/220e46c73065c3e2951bb91c11a1fb636c8c9ad427ac3ce7d7f3359b9b2f/pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1", size = 2554835, upload-time = "2026-04-01T14:45:23.162Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/ab/1b426a3974cb0e7da5c29ccff4807871d48110933a57207b5a676cccc155/pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463", size = 5314225, upload-time = "2026-04-01T14:45:25.637Z" },
+ { url = "https://files.pythonhosted.org/packages/19/1e/dce46f371be2438eecfee2a1960ee2a243bbe5e961890146d2dee1ff0f12/pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3", size = 4698541, upload-time = "2026-04-01T14:45:28.355Z" },
+ { url = "https://files.pythonhosted.org/packages/55/c3/7fbecf70adb3a0c33b77a300dc52e424dc22ad8cdc06557a2e49523b703d/pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166", size = 6322251, upload-time = "2026-04-01T14:45:30.924Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/3c/7fbc17cfb7e4fe0ef1642e0abc17fc6c94c9f7a16be41498e12e2ba60408/pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe", size = 8127807, upload-time = "2026-04-01T14:45:33.908Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/c3/a8ae14d6defd2e448493ff512fae903b1e9bd40b72efb6ec55ce0048c8ce/pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd", size = 6433935, upload-time = "2026-04-01T14:45:36.623Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/32/2880fb3a074847ac159d8f902cb43278a61e85f681661e7419e6596803ed/pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e", size = 7116720, upload-time = "2026-04-01T14:45:39.258Z" },
+ { url = "https://files.pythonhosted.org/packages/46/87/495cc9c30e0129501643f24d320076f4cc54f718341df18cc70ec94c44e1/pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06", size = 6540498, upload-time = "2026-04-01T14:45:41.879Z" },
+ { url = "https://files.pythonhosted.org/packages/18/53/773f5edca692009d883a72211b60fdaf8871cbef075eaa9d577f0a2f989e/pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43", size = 7239413, upload-time = "2026-04-01T14:45:44.705Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084, upload-time = "2026-04-01T14:45:47.568Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152, upload-time = "2026-04-01T14:45:50.032Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579, upload-time = "2026-04-01T14:45:52.529Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/b7/2437044fb910f499610356d1352e3423753c98e34f915252aafecc64889f/pillow-12.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538bd5e05efec03ae613fd89c4ce0368ecd2ba239cc25b9f9be7ed426b0af1f", size = 5273969, upload-time = "2026-04-01T14:45:55.538Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/f4/8316e31de11b780f4ac08ef3654a75555e624a98db1056ecb2122d008d5a/pillow-12.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:394167b21da716608eac917c60aa9b969421b5dcbbe02ae7f013e7b85811c69d", size = 4659674, upload-time = "2026-04-01T14:45:58.093Z" },
+ { url = "https://files.pythonhosted.org/packages/d4/37/664fca7201f8bb2aa1d20e2c3d5564a62e6ae5111741966c8319ca802361/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d04bfa02cc2d23b497d1e90a0f927070043f6cbf303e738300532379a4b4e0f", size = 5288479, upload-time = "2026-04-01T14:46:01.141Z" },
+ { url = "https://files.pythonhosted.org/packages/49/62/5b0ed78fce87346be7a5cfcfaaad91f6a1f98c26f86bdbafa2066c647ef6/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0c838a5125cee37e68edec915651521191cef1e6aa336b855f495766e77a366e", size = 7032230, upload-time = "2026-04-01T14:46:03.874Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/28/ec0fc38107fc32536908034e990c47914c57cd7c5a3ece4d8d8f7ffd7e27/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a6c9fa44005fa37a91ebfc95d081e8079757d2e904b27103f4f5fa6f0bf78c0", size = 5355404, upload-time = "2026-04-01T14:46:06.33Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/8b/51b0eddcfa2180d60e41f06bd6d0a62202b20b59c68f5a132e615b75aecf/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25373b66e0dd5905ed63fa3cae13c82fbddf3079f2c8bf15c6fb6a35586324c1", size = 6002215, upload-time = "2026-04-01T14:46:08.83Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/60/5382c03e1970de634027cee8e1b7d39776b778b81812aaf45b694dfe9e28/pillow-12.2.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bfa9c230d2fe991bed5318a5f119bd6780cda2915cca595393649fc118ab895e", size = 7080946, upload-time = "2026-04-01T14:46:11.734Z" },
+]
+
+[[package]]
+name = "platformdirs"
+version = "4.9.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9f/4a/0883b8e3802965322523f0b200ecf33d31f10991d0401162f4b23c698b42/platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a", size = 29400, upload-time = "2026-04-09T00:04:10.812Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348, upload-time = "2026-04-09T00:04:09.463Z" },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
+[[package]]
+name = "py-key-value-aio"
+version = "0.4.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "beartype" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/04/3c/0397c072a38d4bc580994b42e0c90c5f44f679303489e4376289534735e5/py_key_value_aio-0.4.4.tar.gz", hash = "sha256:e3012e6243ed7cc09bb05457bd4d03b1ba5c2b1ca8700096b3927db79ffbbe55", size = 92300, upload-time = "2026-02-16T21:21:43.245Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/32/69/f1b537ee70b7def42d63124a539ed3026a11a3ffc3086947a1ca6e861868/py_key_value_aio-0.4.4-py3-none-any.whl", hash = "sha256:18e17564ecae61b987f909fc2cd41ee2012c84b4b1dcb8c055cf8b4bc1bf3f5d", size = 152291, upload-time = "2026-02-16T21:21:44.241Z" },
+]
+
+[package.optional-dependencies]
+filetree = [
+ { name = "aiofile" },
+ { name = "anyio" },
+]
+keyring = [
+ { name = "keyring" },
+]
+memory = [
+ { name = "cachetools" },
+]
+
+[[package]]
+name = "pycparser"
+version = "3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
+]
+
+[[package]]
+name = "pydantic"
+version = "2.13.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "annotated-types" },
+ { name = "pydantic-core" },
+ { name = "typing-extensions" },
+ { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/e4/40d09941a2cebcb20609b86a559817d5b9291c49dd6f8c87e5feffbe703a/pydantic-2.13.3.tar.gz", hash = "sha256:af09e9d1d09f4e7fe37145c1f577e1d61ceb9a41924bf0094a36506285d0a84d", size = 844068, upload-time = "2026-04-20T14:46:43.632Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f3/0a/fd7d723f8f8153418fb40cf9c940e82004fce7e987026b08a68a36dd3fe7/pydantic-2.13.3-py3-none-any.whl", hash = "sha256:6db14ac8dfc9a1e57f87ea2c0de670c251240f43cb0c30a5130e9720dc612927", size = 471981, upload-time = "2026-04-20T14:46:41.402Z" },
+]
+
+[package.optional-dependencies]
+email = [
+ { name = "email-validator" },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.46.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2a/ef/f7abb56c49382a246fd2ce9c799691e3c3e7175ec74b14d99e798bcddb1a/pydantic_core-2.46.3.tar.gz", hash = "sha256:41c178f65b8c29807239d47e6050262eb6bf84eb695e41101e62e38df4a5bc2c", size = 471412, upload-time = "2026-04-20T14:40:56.672Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/22/98/b50eb9a411e87483b5c65dba4fa430a06bac4234d3403a40e5a9905ebcd0/pydantic_core-2.46.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:1da3786b8018e60349680720158cc19161cc3b4bdd815beb0a321cd5ce1ad5b1", size = 2108971, upload-time = "2026-04-20T14:43:51.945Z" },
+ { url = "https://files.pythonhosted.org/packages/08/4b/f364b9d161718ff2217160a4b5d41ce38de60aed91c3689ebffa1c939d23/pydantic_core-2.46.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cc0988cb29d21bf4a9d5cf2ef970b5c0e38d8d8e107a493278c05dc6c1dda69f", size = 1949588, upload-time = "2026-04-20T14:44:10.386Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/8b/30bd03ee83b2f5e29f5ba8e647ab3c456bf56f2ec72fdbcc0215484a0854/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f9067c3bfadd04c55484b89c0d267981b2f3512850f6f66e1e74204a4e4ce3", size = 1975986, upload-time = "2026-04-20T14:43:57.106Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/54/13ccf954d84ec275d5d023d5786e4aa48840bc9f161f2838dc98e1153518/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a642ac886ecf6402d9882d10c405dcf4b902abeb2972cd5fb4a48c83cd59279a", size = 2055830, upload-time = "2026-04-20T14:44:15.499Z" },
+ { url = "https://files.pythonhosted.org/packages/be/0e/65f38125e660fdbd72aa858e7dfae893645cfa0e7b13d333e174a367cd23/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79f561438481f28681584b89e2effb22855e2179880314bcddbf5968e935e807", size = 2222340, upload-time = "2026-04-20T14:41:51.353Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/88/f3ab7739efe0e7e80777dbb84c59eb98518e3f57ea433206194c2e425272/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57a973eae4665352a47cf1a99b4ee864620f2fe663a217d7a8da68a1f3a5bfda", size = 2280727, upload-time = "2026-04-20T14:41:30.461Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/6d/c228219080817bec4982f9531cadb18da6aaa770fdeb114f49c237ac2c9f/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83d002b97072a53ea150d63e0a3adfae5670cef5aa8a6e490240e482d3b22e57", size = 2092158, upload-time = "2026-04-20T14:44:07.305Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/b1/525a16711e7c6d61635fac3b0bd54600b5c5d9f60c6fc5aaab26b64a2297/pydantic_core-2.46.3-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:b40ddd51e7c44b28cfaef746c9d3c506d658885e0a46f9eeef2ee815cbf8e045", size = 2116626, upload-time = "2026-04-20T14:42:34.118Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/7c/17d30673351439a6951bf54f564cf2443ab00ae264ec9df00e2efd710eb5/pydantic_core-2.46.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ac5ec7fb9b87f04ee839af2d53bcadea57ded7d229719f56c0ed895bff987943", size = 2160691, upload-time = "2026-04-20T14:41:14.023Z" },
+ { url = "https://files.pythonhosted.org/packages/86/66/af8adbcbc0886ead7f1a116606a534d75a307e71e6e08226000d51b880d2/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a3b11c812f61b3129c4905781a2601dfdfdea5fe1e6c1cfb696b55d14e9c054f", size = 2182543, upload-time = "2026-04-20T14:40:48.886Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/37/6de71e0f54c54a4190010f57deb749e1ddf75c568ada3b1320b70067f121/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:1108da631e602e5b3c38d6d04fe5bb3bfa54349e6918e3ca6cf570b2e2b2f9d4", size = 2324513, upload-time = "2026-04-20T14:42:36.121Z" },
+ { url = "https://files.pythonhosted.org/packages/51/b1/9fc74ce94f603d5ef59ff258ca9c2c8fb902fb548d340a96f77f4d1c3b7f/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de885175515bcfa98ae618c1df7a072f13d179f81376c8007112af20567fd08a", size = 2361853, upload-time = "2026-04-20T14:43:24.886Z" },
+ { url = "https://files.pythonhosted.org/packages/40/d0/4c652fc592db35f100279ee751d5a145aca1b9a7984b9684ba7c1b5b0535/pydantic_core-2.46.3-cp310-cp310-win32.whl", hash = "sha256:d11058e3201527d41bc6b545c79187c9e4bf85e15a236a6007f0e991518882b7", size = 1980465, upload-time = "2026-04-20T14:44:46.239Z" },
+ { url = "https://files.pythonhosted.org/packages/27/b8/a920453c38afbe1f355e1ea0b0d94a0a3e0b0879d32d793108755fa171d5/pydantic_core-2.46.3-cp310-cp310-win_amd64.whl", hash = "sha256:3612edf65c8ea67ac13616c4d23af12faef1ae435a8a93e5934c2a0cbbdd1fd6", size = 2073884, upload-time = "2026-04-20T14:43:01.201Z" },
+ { url = "https://files.pythonhosted.org/packages/22/a2/1ba90a83e85a3f94c796b184f3efde9c72f2830dcda493eea8d59ba78e6d/pydantic_core-2.46.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ab124d49d0459b2373ecf54118a45c28a1e6d4192a533fbc915e70f556feb8e5", size = 2106740, upload-time = "2026-04-20T14:41:20.932Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/f6/99ae893c89a0b9d3daec9f95487aa676709aa83f67643b3f0abaf4ab628a/pydantic_core-2.46.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cca67d52a5c7a16aed2b3999e719c4bcf644074eac304a5d3d62dd70ae7d4b2c", size = 1948293, upload-time = "2026-04-20T14:43:42.115Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/b8/2e8e636dc9e3f16c2e16bf0849e24be82c5ee82c603c65fc0326666328fc/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c024e08c0ba23e6fd68c771a521e9d6a792f2ebb0fa734296b36394dc30390e", size = 1973222, upload-time = "2026-04-20T14:41:57.841Z" },
+ { url = "https://files.pythonhosted.org/packages/34/36/0e730beec4d83c5306f417afbd82ff237d9a21e83c5edf675f31ed84c1fe/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6645ce7eec4928e29a1e3b3d5c946621d105d3e79f0c9cddf07c2a9770949287", size = 2053852, upload-time = "2026-04-20T14:40:43.077Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/f0/3071131f47e39136a17814576e0fada9168569f7f8c0e6ac4d1ede6a4958/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a712c7118e6c5ea96562f7b488435172abb94a3c53c22c9efc1412264a45cbbe", size = 2221134, upload-time = "2026-04-20T14:43:03.349Z" },
+ { url = "https://files.pythonhosted.org/packages/2f/a9/a2dc023eec5aa4b02a467874bad32e2446957d2adcab14e107eab502e978/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69a868ef3ff206343579021c40faf3b1edc64b1cc508ff243a28b0a514ccb050", size = 2279785, upload-time = "2026-04-20T14:41:19.285Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/44/93f489d16fb63fbd41c670441536541f6e8cfa1e5a69f40bc9c5d30d8c90/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc7e8c32db809aa0f6ea1d6869ebc8518a65d5150fdfad8bcae6a49ae32a22e2", size = 2089404, upload-time = "2026-04-20T14:43:10.108Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/78/8692e3aa72b2d004f7a5d937f1dfdc8552ba26caf0bec75f342c40f00dec/pydantic_core-2.46.3-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:3481bd1341dc85779ee506bc8e1196a277ace359d89d28588a9468c3ecbe63fa", size = 2114898, upload-time = "2026-04-20T14:44:51.475Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/62/e83133f2e7832532060175cebf1f13748f4c7e7e7165cdd1f611f174494b/pydantic_core-2.46.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8690eba565c6d68ffd3a8655525cbdd5246510b44a637ee2c6c03a7ebfe64d3c", size = 2157856, upload-time = "2026-04-20T14:43:46.64Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/ec/6a500e3ad7718ee50583fae79c8651f5d37e3abce1fa9ae177ae65842c53/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4de88889d7e88d50d40ee5b39d5dac0bcaef9ba91f7e536ac064e6b2834ecccf", size = 2180168, upload-time = "2026-04-20T14:42:00.302Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/53/8267811054b1aa7fc1dc7ded93812372ef79a839f5e23558136a6afbfde1/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:e480080975c1ef7f780b8f99ed72337e7cc5efea2e518a20a692e8e7b278eb8b", size = 2322885, upload-time = "2026-04-20T14:41:05.253Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/c1/1c0acdb3aa0856ddc4ecc55214578f896f2de16f400cf51627eb3c26c1c4/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:de3a5c376f8cd94da9a1b8fd3dd1c16c7a7b216ed31dc8ce9fd7a22bf13b836e", size = 2360328, upload-time = "2026-04-20T14:41:43.991Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/d0/ef39cd0f4a926814f360e71c1adeab48ad214d9727e4deb48eedfb5bce1a/pydantic_core-2.46.3-cp311-cp311-win32.whl", hash = "sha256:fc331a5314ffddd5385b9ee9d0d2fee0b13c27e0e02dad71b1ae5d6561f51eeb", size = 1979464, upload-time = "2026-04-20T14:43:12.215Z" },
+ { url = "https://files.pythonhosted.org/packages/18/9c/f41951b0d858e343f1cf09398b2a7b3014013799744f2c4a8ad6a3eec4f2/pydantic_core-2.46.3-cp311-cp311-win_amd64.whl", hash = "sha256:b5b9c6cf08a8a5e502698f5e153056d12c34b8fb30317e0c5fd06f45162a6346", size = 2070837, upload-time = "2026-04-20T14:41:47.707Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/1e/264a17cd582f6ed50950d4d03dd5fefd84e570e238afe1cb3e25cf238769/pydantic_core-2.46.3-cp311-cp311-win_arm64.whl", hash = "sha256:5dfd51cf457482f04ec49491811a2b8fd5b843b64b11eecd2d7a1ee596ea78a6", size = 2053647, upload-time = "2026-04-20T14:42:27.535Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/cb/5b47425556ecc1f3fe18ed2a0083188aa46e1dd812b06e406475b3a5d536/pydantic_core-2.46.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b11b59b3eee90a80a36701ddb4576d9ae31f93f05cb9e277ceaa09e6bf074a67", size = 2101946, upload-time = "2026-04-20T14:40:52.581Z" },
+ { url = "https://files.pythonhosted.org/packages/a1/4f/2fb62c2267cae99b815bbf4a7b9283812c88ca3153ef29f7707200f1d4e5/pydantic_core-2.46.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:af8653713055ea18a3abc1537fe2ebc42f5b0bbb768d1eb79fd74eb47c0ac089", size = 1951612, upload-time = "2026-04-20T14:42:42.996Z" },
+ { url = "https://files.pythonhosted.org/packages/50/6e/b7348fd30d6556d132cddd5bd79f37f96f2601fe0608afac4f5fb01ec0b3/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75a519dab6d63c514f3a81053e5266c549679e4aa88f6ec57f2b7b854aceb1b0", size = 1977027, upload-time = "2026-04-20T14:42:02.001Z" },
+ { url = "https://files.pythonhosted.org/packages/82/11/31d60ee2b45540d3fb0b29302a393dbc01cd771c473f5b5147bcd353e593/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6cd87cb1575b1ad05ba98894c5b5c96411ef678fa2f6ed2576607095b8d9789", size = 2063008, upload-time = "2026-04-20T14:44:17.952Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/db/3a9d1957181b59258f44a2300ab0f0be9d1e12d662a4f57bb31250455c52/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f80a55484b8d843c8ada81ebf70a682f3f00a3d40e378c06cf17ecb44d280d7d", size = 2233082, upload-time = "2026-04-20T14:40:57.934Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/e1/3277c38792aeb5cfb18c2f0c5785a221d9ff4e149abbe1184d53d5f72273/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3861f1731b90c50a3266316b9044f5c9b405eecb8e299b0a7120596334e4fe9c", size = 2304615, upload-time = "2026-04-20T14:42:12.584Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/d5/e3d9717c9eba10855325650afd2a9cba8e607321697f18953af9d562da2f/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb528e295ed31570ac3dcc9bfdd6e0150bc11ce6168ac87a8082055cf1a67395", size = 2094380, upload-time = "2026-04-20T14:43:05.522Z" },
+ { url = "https://files.pythonhosted.org/packages/a1/20/abac35dedcbfd66c6f0b03e4e3564511771d6c9b7ede10a362d03e110d9b/pydantic_core-2.46.3-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:367508faa4973b992b271ba1494acaab36eb7e8739d1e47be5035fb1ea225396", size = 2135429, upload-time = "2026-04-20T14:41:55.549Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/a5/41bfd1df69afad71b5cf0535055bccc73022715ad362edbc124bc1e021d7/pydantic_core-2.46.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ad3c826fe523e4becf4fe39baa44286cff85ef137c729a2c5e269afbfd0905d", size = 2174582, upload-time = "2026-04-20T14:41:45.96Z" },
+ { url = "https://files.pythonhosted.org/packages/79/65/38d86ea056b29b2b10734eb23329b7a7672ca604df4f2b6e9c02d4ee22fe/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ec638c5d194ef8af27db69f16c954a09797c0dc25015ad6123eb2c73a4d271ca", size = 2187533, upload-time = "2026-04-20T14:40:55.367Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/55/a1129141678a2026badc539ad1dee0a71d06f54c2f06a4bd68c030ac781b/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:28ed528c45446062ee66edb1d33df5d88828ae167de76e773a3c7f64bd14e976", size = 2332985, upload-time = "2026-04-20T14:44:13.05Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/60/cb26f4077719f709e54819f4e8e1d43f4091f94e285eb6bd21e1190a7b7c/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aed19d0c783886d5bd86d80ae5030006b45e28464218747dcf83dabfdd092c7b", size = 2373670, upload-time = "2026-04-20T14:41:53.421Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/7e/c3f21882bdf1d8d086876f81b5e296206c69c6082551d776895de7801fa0/pydantic_core-2.46.3-cp312-cp312-win32.whl", hash = "sha256:06d5d8820cbbdb4147578c1fe7ffcd5b83f34508cb9f9ab76e807be7db6ff0a4", size = 1966722, upload-time = "2026-04-20T14:44:30.588Z" },
+ { url = "https://files.pythonhosted.org/packages/57/be/6b5e757b859013ebfbd7adba02f23b428f37c86dcbf78b5bb0b4ffd36e99/pydantic_core-2.46.3-cp312-cp312-win_amd64.whl", hash = "sha256:c3212fda0ee959c1dd04c60b601ec31097aaa893573a3a1abd0a47bcac2968c1", size = 2072970, upload-time = "2026-04-20T14:42:54.248Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/f8/a989b21cc75e9a32d24192ef700eea606521221a89faa40c919ce884f2b1/pydantic_core-2.46.3-cp312-cp312-win_arm64.whl", hash = "sha256:f1f8338dd7a7f31761f1f1a3c47503a9a3b34eea3c8b01fa6ee96408affb5e72", size = 2035963, upload-time = "2026-04-20T14:44:20.4Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/3c/9b5e8eb9821936d065439c3b0fb1490ffa64163bfe7e1595985a47896073/pydantic_core-2.46.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:12bc98de041458b80c86c56b24df1d23832f3e166cbaff011f25d187f5c62c37", size = 2102109, upload-time = "2026-04-20T14:41:24.219Z" },
+ { url = "https://files.pythonhosted.org/packages/91/97/1c41d1f5a19f241d8069f1e249853bcce378cdb76eec8ab636d7bc426280/pydantic_core-2.46.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:85348b8f89d2c3508b65b16c3c33a4da22b8215138d8b996912bb1532868885f", size = 1951820, upload-time = "2026-04-20T14:42:14.236Z" },
+ { url = "https://files.pythonhosted.org/packages/30/b4/d03a7ae14571bc2b6b3c7b122441154720619afe9a336fa3a95434df5e2f/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1105677a6df914b1fb71a81b96c8cce7726857e1717d86001f29be06a25ee6f8", size = 1977785, upload-time = "2026-04-20T14:42:31.648Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/0c/4086f808834b59e3c8f1aa26df8f4b6d998cdcf354a143d18ef41529d1fe/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:87082cd65669a33adeba5470769e9704c7cf026cc30afb9cc77fd865578ebaad", size = 2062761, upload-time = "2026-04-20T14:40:37.093Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/71/a649be5a5064c2df0db06e0a512c2281134ed2fcc981f52a657936a7527c/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60e5f66e12c4f5212d08522963380eaaeac5ebd795826cfd19b2dfb0c7a52b9c", size = 2232989, upload-time = "2026-04-20T14:42:59.254Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/84/7756e75763e810b3a710f4724441d1ecc5883b94aacb07ca71c5fb5cfb69/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b6cdf19bf84128d5e7c37e8a73a0c5c10d51103a650ac585d42dd6ae233f2b7f", size = 2303975, upload-time = "2026-04-20T14:41:32.287Z" },
+ { url = "https://files.pythonhosted.org/packages/6c/35/68a762e0c1e31f35fa0dac733cbd9f5b118042853698de9509c8e5bf128b/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:031bb17f4885a43773c8c763089499f242aee2ea85cf17154168775dccdecf35", size = 2095325, upload-time = "2026-04-20T14:42:47.685Z" },
+ { url = "https://files.pythonhosted.org/packages/77/bf/1bf8c9a8e91836c926eae5e3e51dce009bf495a60ca56060689d3df3f340/pydantic_core-2.46.3-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:bcf2a8b2982a6673693eae7348ef3d8cf3979c1d63b54fca7c397a635cc68687", size = 2133368, upload-time = "2026-04-20T14:41:22.766Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/50/87d818d6bab915984995157ceb2380f5aac4e563dddbed6b56f0ed057aba/pydantic_core-2.46.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28e8cf2f52d72ced402a137145923a762cbb5081e48b34312f7a0c8f55928ec3", size = 2173908, upload-time = "2026-04-20T14:42:52.044Z" },
+ { url = "https://files.pythonhosted.org/packages/91/88/a311fb306d0bd6185db41fa14ae888fb81d0baf648a761ae760d30819d33/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:17eaface65d9fc5abb940003020309c1bf7a211f5f608d7870297c367e6f9022", size = 2186422, upload-time = "2026-04-20T14:43:29.55Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/79/28fd0d81508525ab2054fef7c77a638c8b5b0afcbbaeee493cf7c3fef7e1/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:93fd339f23408a07e98950a89644f92c54d8729719a40b30c0a30bb9ebc55d23", size = 2332709, upload-time = "2026-04-20T14:42:16.134Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/21/795bf5fe5c0f379308b8ef19c50dedab2e7711dbc8d0c2acf08f1c7daa05/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:23cbdb3aaa74dfe0837975dbf69b469753bbde8eacace524519ffdb6b6e89eb7", size = 2372428, upload-time = "2026-04-20T14:41:10.974Z" },
+ { url = "https://files.pythonhosted.org/packages/45/b3/ed14c659cbe7605e3ef063077680a64680aec81eb1a04763a05190d49b7f/pydantic_core-2.46.3-cp313-cp313-win32.whl", hash = "sha256:610eda2e3838f401105e6326ca304f5da1e15393ae25dacae5c5c63f2c275b13", size = 1965601, upload-time = "2026-04-20T14:41:42.128Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/bb/adb70d9a762ddd002d723fbf1bd492244d37da41e3af7b74ad212609027e/pydantic_core-2.46.3-cp313-cp313-win_amd64.whl", hash = "sha256:68cc7866ed863db34351294187f9b729964c371ba33e31c26f478471c52e1ed0", size = 2071517, upload-time = "2026-04-20T14:43:36.096Z" },
+ { url = "https://files.pythonhosted.org/packages/52/eb/66faefabebfe68bd7788339c9c9127231e680b11906368c67ce112fdb47f/pydantic_core-2.46.3-cp313-cp313-win_arm64.whl", hash = "sha256:f64b5537ac62b231572879cd08ec05600308636a5d63bcbdb15063a466977bec", size = 2035802, upload-time = "2026-04-20T14:43:38.507Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/db/a7bcb4940183fda36022cd18ba8dd12f2dff40740ec7b58ce7457befa416/pydantic_core-2.46.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:afa3aa644f74e290cdede48a7b0bee37d1c35e71b05105f6b340d484af536d9b", size = 2097614, upload-time = "2026-04-20T14:44:38.374Z" },
+ { url = "https://files.pythonhosted.org/packages/24/35/e4066358a22e3e99519db370494c7528f5a2aa1367370e80e27e20283543/pydantic_core-2.46.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ced3310e51aa425f7f77da8bbbb5212616655bedbe82c70944320bc1dbe5e018", size = 1951896, upload-time = "2026-04-20T14:40:53.996Z" },
+ { url = "https://files.pythonhosted.org/packages/87/92/37cf4049d1636996e4b888c05a501f40a43ff218983a551d57f9d5e14f0d/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e29908922ce9da1a30b4da490bd1d3d82c01dcfdf864d2a74aacee674d0bfa34", size = 1979314, upload-time = "2026-04-20T14:41:49.446Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/36/9ff4d676dfbdfb2d591cf43f3d90ded01e15b1404fd101180ed2d62a2fd3/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0c9ff69140423eea8ed2d5477df3ba037f671f5e897d206d921bc9fdc39613e7", size = 2056133, upload-time = "2026-04-20T14:42:23.574Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/f0/405b442a4d7ba855b06eec8b2bf9c617d43b8432d099dfdc7bf999293495/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b675ab0a0d5b1c8fdb81195dc5bcefea3f3c240871cdd7ff9a2de8aa50772eb2", size = 2228726, upload-time = "2026-04-20T14:44:22.816Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/f8/65cd92dd5a0bd89ba277a98ecbfaf6fc36bbd3300973c7a4b826d6ab1391/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0087084960f209a9a4af50ecd1fb063d9ad3658c07bb81a7a53f452dacbfb2ba", size = 2301214, upload-time = "2026-04-20T14:44:48.792Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/86/ef96a4c6e79e7a2d0410826a68fbc0eccc0fd44aa733be199d5fcac3bb87/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed42e6cc8e1b0e2b9b96e2276bad70ae625d10d6d524aed0c93de974ae029f9f", size = 2099927, upload-time = "2026-04-20T14:41:40.196Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/53/269caf30e0096e0a8a8f929d1982a27b3879872cca2d917d17c2f9fdf4fe/pydantic_core-2.46.3-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:f1771ce258afb3e4201e67d154edbbae712a76a6081079fe247c2f53c6322c22", size = 2128789, upload-time = "2026-04-20T14:41:15.868Z" },
+ { url = "https://files.pythonhosted.org/packages/00/b0/1a6d9b6a587e118482910c244a1c5acf4d192604174132efd12bf0ac486f/pydantic_core-2.46.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a7610b6a5242a6c736d8ad47fd5fff87fcfe8f833b281b1c409c3d6835d9227f", size = 2173815, upload-time = "2026-04-20T14:44:25.152Z" },
+ { url = "https://files.pythonhosted.org/packages/87/56/e7e00d4041a7e62b5a40815590114db3b535bf3ca0bf4dca9f16cef25246/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:ff5e7783bcc5476e1db448bf268f11cb257b1c276d3e89f00b5727be86dd0127", size = 2181608, upload-time = "2026-04-20T14:41:28.933Z" },
+ { url = "https://files.pythonhosted.org/packages/e8/22/4bd23c3d41f7c185d60808a1de83c76cf5aeabf792f6c636a55c3b1ec7f9/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:9d2e32edcc143bc01e95300671915d9ca052d4f745aa0a49c48d4803f8a85f2c", size = 2326968, upload-time = "2026-04-20T14:42:03.962Z" },
+ { url = "https://files.pythonhosted.org/packages/24/ac/66cd45129e3915e5ade3b292cb3bc7fd537f58f8f8dbdaba6170f7cabb74/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d83d1c6b87fa56b521479cff237e626a292f3b31b6345c15a99121b454c1", size = 2369842, upload-time = "2026-04-20T14:41:35.52Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/51/dd4248abb84113615473aa20d5545b7c4cd73c8644003b5259686f93996c/pydantic_core-2.46.3-cp314-cp314-win32.whl", hash = "sha256:07bc6d2a28c3adb4f7c6ae46aa4f2d2929af127f587ed44057af50bf1ce0f505", size = 1959661, upload-time = "2026-04-20T14:41:00.042Z" },
+ { url = "https://files.pythonhosted.org/packages/20/eb/59980e5f1ae54a3b86372bd9f0fa373ea2d402e8cdcd3459334430f91e91/pydantic_core-2.46.3-cp314-cp314-win_amd64.whl", hash = "sha256:8940562319bc621da30714617e6a7eaa6b98c84e8c685bcdc02d7ed5e7c7c44e", size = 2071686, upload-time = "2026-04-20T14:43:16.471Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/db/1cf77e5247047dfee34bc01fa9bca134854f528c8eb053e144298893d370/pydantic_core-2.46.3-cp314-cp314-win_arm64.whl", hash = "sha256:5dcbbcf4d22210ced8f837c96db941bdb078f419543472aca5d9a0bb7cddc7df", size = 2026907, upload-time = "2026-04-20T14:43:31.732Z" },
+ { url = "https://files.pythonhosted.org/packages/57/c0/b3df9f6a543276eadba0a48487b082ca1f201745329d97dbfa287034a230/pydantic_core-2.46.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:d0fe3dce1e836e418f912c1ad91c73357d03e556a4d286f441bf34fed2dbeecf", size = 2095047, upload-time = "2026-04-20T14:42:37.982Z" },
+ { url = "https://files.pythonhosted.org/packages/66/57/886a938073b97556c168fd99e1a7305bb363cd30a6d2c76086bf0587b32a/pydantic_core-2.46.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9ce92e58abc722dac1bf835a6798a60b294e48eb0e625ec9fd994b932ac5feee", size = 1934329, upload-time = "2026-04-20T14:43:49.655Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/7c/b42eaa5c34b13b07ecb51da21761297a9b8eb43044c864a035999998f328/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a03e6467f0f5ab796a486146d1b887b2dc5e5f9b3288898c1b1c3ad974e53e4a", size = 1974847, upload-time = "2026-04-20T14:42:10.737Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/9b/92b42db6543e7de4f99ae977101a2967b63122d4b6cf7773812da2d7d5b5/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2798b6ba041b9d70acfb9071a2ea13c8456dd1e6a5555798e41ba7b0790e329c", size = 2041742, upload-time = "2026-04-20T14:40:44.262Z" },
+ { url = "https://files.pythonhosted.org/packages/0f/19/46fbe1efabb5aa2834b43b9454e70f9a83ad9c338c1291e48bdc4fecf167/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9be3e221bdc6d69abf294dcf7aff6af19c31a5cdcc8f0aa3b14be29df4bd03b1", size = 2236235, upload-time = "2026-04-20T14:41:27.307Z" },
+ { url = "https://files.pythonhosted.org/packages/77/da/b3f95bc009ad60ec53120f5d16c6faa8cabdbe8a20d83849a1f2b8728148/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f13936129ce841f2a5ddf6f126fea3c43cd128807b5a59588c37cf10178c2e64", size = 2282633, upload-time = "2026-04-20T14:44:33.271Z" },
+ { url = "https://files.pythonhosted.org/packages/cc/6e/401336117722e28f32fb8220df676769d28ebdf08f2f4469646d404c43a3/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28b5f2ef03416facccb1c6ef744c69793175fd27e44ef15669201601cf423acb", size = 2109679, upload-time = "2026-04-20T14:44:41.065Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/53/b289f9bc8756a32fe718c46f55afaeaf8d489ee18d1a1e7be1db73f42cc4/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:830d1247d77ad23852314f069e9d7ddafeec5f684baf9d7e7065ed46a049c4e6", size = 2108342, upload-time = "2026-04-20T14:42:50.144Z" },
+ { url = "https://files.pythonhosted.org/packages/10/5b/8292fc7c1f9111f1b2b7c1b0dcf1179edcd014fc3ea4517499f50b829d71/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0793c90c1a3c74966e7975eaef3ed30ebdff3260a0f815a62a22adc17e4c01c", size = 2157208, upload-time = "2026-04-20T14:42:08.133Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/9e/f80044e9ec07580f057a89fc131f78dda7a58751ddf52bbe05eaf31db50f/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:d2d0aead851b66f5245ec0c4fb2612ef457f8bbafefdf65a2bf9d6bac6140f47", size = 2167237, upload-time = "2026-04-20T14:42:25.412Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/84/6781a1b037f3b96be9227edbd1101f6d3946746056231bf4ac48cdff1a8d/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:2f40e4246676beb31c5ce77c38a55ca4e465c6b38d11ea1bd935420568e0b1ab", size = 2312540, upload-time = "2026-04-20T14:40:40.313Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/db/19c0839feeb728e7df03255581f198dfdf1c2aeb1e174a8420b63c5252e5/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:cf489cf8986c543939aeee17a09c04d6ffb43bfef8ca16fcbcc5cfdcbed24dba", size = 2369556, upload-time = "2026-04-20T14:41:09.427Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/15/3228774cb7cd45f5f721ddf1b2242747f4eb834d0c491f0c02d606f09fed/pydantic_core-2.46.3-cp314-cp314t-win32.whl", hash = "sha256:ffe0883b56cfc05798bf994164d2b2ff03efe2d22022a2bb080f3b626176dd56", size = 1949756, upload-time = "2026-04-20T14:41:25.717Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/2a/c79cf53fd91e5a87e30d481809f52f9a60dd221e39de66455cf04deaad37/pydantic_core-2.46.3-cp314-cp314t-win_amd64.whl", hash = "sha256:706d9d0ce9cf4593d07270d8e9f53b161f90c57d315aeec4fb4fd7a8b10240d8", size = 2051305, upload-time = "2026-04-20T14:43:18.627Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/db/d8182a7f1d9343a032265aae186eb063fe26ca4c40f256b21e8da4498e89/pydantic_core-2.46.3-cp314-cp314t-win_arm64.whl", hash = "sha256:77706aeb41df6a76568434701e0917da10692da28cb69d5fb6919ce5fdb07374", size = 2026310, upload-time = "2026-04-20T14:41:01.778Z" },
+ { url = "https://files.pythonhosted.org/packages/66/7f/03dbad45cd3aa9083fbc93c210ae8b005af67e4136a14186950a747c6874/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:9715525891ed524a0a1eb6d053c74d4d4ad5017677fb00af0b7c2644a31bae46", size = 2105683, upload-time = "2026-04-20T14:42:19.779Z" },
+ { url = "https://files.pythonhosted.org/packages/26/22/4dc186ac8ea6b257e9855031f51b62a9637beac4d68ac06bee02f046f836/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:9d2f400712a99a013aff420ef1eb9be077f8189a36c1e3ef87660b4e1088a874", size = 1940052, upload-time = "2026-04-20T14:43:59.274Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/ca/d376391a5aff1f2e8188960d7873543608130a870961c2b6b5236627c116/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd2aab0e2e9dc2daf36bd2686c982535d5e7b1d930a1344a7bb6e82baab42a76", size = 1988172, upload-time = "2026-04-20T14:41:17.469Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/6b/523b9f85c23788755d6ab949329de692a2e3a584bc6beb67fef5e035aa9d/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e9d76736da5f362fabfeea6a69b13b7f2be405c6d6966f06b2f6bfff7e64531", size = 2128596, upload-time = "2026-04-20T14:40:41.707Z" },
+ { url = "https://files.pythonhosted.org/packages/34/42/f426db557e8ab2791bc7562052299944a118655496fbff99914e564c0a94/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:b12dd51f1187c2eb489af8e20f880362db98e954b54ab792fa5d92e8bcc6b803", size = 2091877, upload-time = "2026-04-20T14:43:27.091Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/4f/86a832a9d14df58e663bfdf4627dc00d3317c2bd583c4fb23390b0f04b8e/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f00a0961b125f1a47af7bcc17f00782e12f4cd056f83416006b30111d941dfa3", size = 1932428, upload-time = "2026-04-20T14:40:45.781Z" },
+ { url = "https://files.pythonhosted.org/packages/11/1a/fe857968954d93fb78e0d4b6df5c988c74c4aaa67181c60be7cfe327c0ca/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57697d7c056aca4bbb680200f96563e841a6386ac1129370a0102592f4dddff5", size = 1997550, upload-time = "2026-04-20T14:44:02.425Z" },
+ { url = "https://files.pythonhosted.org/packages/17/eb/9d89ad2d9b0ba8cd65393d434471621b98912abb10fbe1df08e480ba57b5/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd35aa21299def8db7ef4fe5c4ff862941a9a158ca7b63d61e66fe67d30416b4", size = 2137657, upload-time = "2026-04-20T14:42:45.149Z" },
+ { url = "https://files.pythonhosted.org/packages/1f/da/99d40830684f81dec901cac521b5b91c095394cc1084b9433393cde1c2df/pydantic_core-2.46.3-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:13afdd885f3d71280cf286b13b310ee0f7ccfefd1dbbb661514a474b726e2f25", size = 2107973, upload-time = "2026-04-20T14:42:06.175Z" },
+ { url = "https://files.pythonhosted.org/packages/99/a5/87024121818d75bbb2a98ddbaf638e40e7a18b5e0f5492c9ca4b1b316107/pydantic_core-2.46.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f91c0aff3e3ee0928edd1232c57f643a7a003e6edf1860bc3afcdc749cb513f3", size = 1947191, upload-time = "2026-04-20T14:43:14.319Z" },
+ { url = "https://files.pythonhosted.org/packages/60/62/0c1acfe10945b83a6a59d19fbaa92f48825381509e5701b855c08f13db76/pydantic_core-2.46.3-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6529d1d128321a58d30afcc97b49e98836542f68dd41b33c2e972bb9e5290536", size = 2123791, upload-time = "2026-04-20T14:43:22.766Z" },
+ { url = "https://files.pythonhosted.org/packages/75/3e/3b2393b4c8f44285561dc30b00cf307a56a2eff7c483a824db3b8221ca51/pydantic_core-2.46.3-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:975c267cff4f7e7272eacbe50f6cc03ca9a3da4c4fbd66fffd89c94c1e311aa1", size = 2153197, upload-time = "2026-04-20T14:44:27.932Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/75/5af02fb35505051eee727c061f2881c555ab4f8ddb2d42da715a42c9731b/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:2b8e4f2bbdf71415c544b4b1138b8060db7b6611bc927e8064c769f64bed651c", size = 2181073, upload-time = "2026-04-20T14:43:20.729Z" },
+ { url = "https://files.pythonhosted.org/packages/10/92/7e0e1bd9ca3c68305db037560ca2876f89b2647deb2f8b6319005de37505/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:e61ea8e9fff9606d09178f577ff8ccdd7206ff73d6552bcec18e1033c4254b85", size = 2315886, upload-time = "2026-04-20T14:44:04.826Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/d8/101655f27eaf3e44558ead736b2795d12500598beed4683f279396fa186e/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b504bda01bafc69b6d3c7a0c7f039dcf60f47fab70e06fe23f57b5c75bdc82b8", size = 2360528, upload-time = "2026-04-20T14:40:47.431Z" },
+ { url = "https://files.pythonhosted.org/packages/07/0f/1c34a74c8d07136f0d729ffe5e1fdab04fbdaa7684f61a92f92511a84a15/pydantic_core-2.46.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b00b76f7142fc60c762ce579bd29c8fa44aaa56592dd3c54fab3928d0d4ca6ff", size = 2184144, upload-time = "2026-04-20T14:42:57Z" },
+]
+
+[[package]]
+name = "pydantic-settings"
+version = "2.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pydantic" },
+ { name = "python-dotenv" },
+ { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/98/c8345dccdc31de4228c039a98f6467a941e39558da41c1744fbe29fa5666/pydantic_settings-2.14.0.tar.gz", hash = "sha256:24285fd4b0e0c06507dd9fdfd331ee23794305352aaec8fc4eb92d4047aeb67d", size = 235709, upload-time = "2026-04-20T13:37:40.293Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/01/dd/bebff3040138f00ae8a102d426b27349b9a49acc310fcae7f92112d867e3/pydantic_settings-2.14.0-py3-none-any.whl", hash = "sha256:fc8d5d692eb7092e43c8647c1c35a3ecd00e040fcf02ed86f4cb5458ca62182e", size = 60940, upload-time = "2026-04-20T13:37:38.586Z" },
+]
+
+[[package]]
+name = "pydub"
+version = "0.25.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326, upload-time = "2021-03-10T02:09:54.659Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327, upload-time = "2021-03-10T02:09:53.503Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
+]
+
+[[package]]
+name = "pyjwt"
+version = "2.12.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c2/27/a3b6e5bf6ff856d2509292e95c8f57f0df7017cf5394921fc4e4ef40308a/pyjwt-2.12.1.tar.gz", hash = "sha256:c74a7a2adf861c04d002db713dd85f84beb242228e671280bf709d765b03672b", size = 102564, upload-time = "2026-03-13T19:27:37.25Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e5/7a/8dd906bd22e79e47397a61742927f6747fe93242ef86645ee9092e610244/pyjwt-2.12.1-py3-none-any.whl", hash = "sha256:28ca37c070cad8ba8cd9790cd940535d40274d22f80ab87f3ac6a713e6e8454c", size = 29726, upload-time = "2026-03-13T19:27:35.677Z" },
+]
+
+[package.optional-dependencies]
+crypto = [
+ { name = "cryptography" },
+]
+
+[[package]]
+name = "pyperclip"
+version = "1.11.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/52/d87eba7cb129b81563019d1679026e7a112ef76855d6159d24754dbd2a51/pyperclip-1.11.0.tar.gz", hash = "sha256:244035963e4428530d9e3a6101a1ef97209c6825edab1567beac148ccc1db1b6", size = 12185, upload-time = "2025-09-26T14:40:37.245Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = "sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+ { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+ { name = "iniconfig" },
+ { name = "packaging" },
+ { name = "pluggy" },
+ { name = "pygments" },
+ { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+]
+
+[[package]]
+name = "pytest-cov"
+version = "7.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "coverage", extra = ["toml"] },
+ { name = "pluggy" },
+ { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" },
+]
+
+[[package]]
+name = "python-dateutil"
+version = "2.9.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
+]
+
+[[package]]
+name = "python-dotenv"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" },
+]
+
+[[package]]
+name = "python-multipart"
+version = "0.0.26"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" },
+]
+
+[[package]]
+name = "pytz"
+version = "2026.1.post1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" },
+]
+
+[[package]]
+name = "pywin32"
+version = "311"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7b/40/44efbb0dfbd33aca6a6483191dae0716070ed99e2ecb0c53683f400a0b4f/pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3", size = 8760432, upload-time = "2025-07-14T20:13:05.9Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/bf/360243b1e953bd254a82f12653974be395ba880e7ec23e3731d9f73921cc/pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b", size = 9590103, upload-time = "2025-07-14T20:13:07.698Z" },
+ { url = "https://files.pythonhosted.org/packages/57/38/d290720e6f138086fb3d5ffe0b6caa019a791dd57866940c82e4eeaf2012/pywin32-311-cp310-cp310-win_arm64.whl", hash = "sha256:0502d1facf1fed4839a9a51ccbcc63d952cf318f78ffc00a7e78528ac27d7a2b", size = 8778557, upload-time = "2025-07-14T20:13:11.11Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" },
+ { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" },
+ { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" },
+ { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" },
+ { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" },
+]
+
+[[package]]
+name = "pywin32-ctypes"
+version = "0.2.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/85/9f/01a1a99704853cb63f253eea009390c88e7131c67e66a0a02099a8c917cb/pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755", size = 29471, upload-time = "2024-08-14T10:15:34.626Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/de/3d/8161f7711c017e01ac9f008dfddd9410dff3674334c233bde66e7ba65bbf/pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8", size = 30756, upload-time = "2024-08-14T10:15:33.187Z" },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" },
+ { url = "https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size = 174019, upload-time = "2025-09-25T21:31:47.706Z" },
+ { url = "https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size = 740646, upload-time = "2025-09-25T21:31:49.21Z" },
+ { url = "https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size = 840793, upload-time = "2025-09-25T21:31:50.735Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size = 770293, upload-time = "2025-09-25T21:31:51.828Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size = 732872, upload-time = "2025-09-25T21:31:53.282Z" },
+ { url = "https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size = 758828, upload-time = "2025-09-25T21:31:54.807Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size = 142415, upload-time = "2025-09-25T21:31:55.885Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size = 158561, upload-time = "2025-09-25T21:31:57.406Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+ { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+ { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+ { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+ { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+ { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
+ { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
+ { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" },
+ { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" },
+ { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" },
+ { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" },
+ { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+ { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+ { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+ { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+ { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+ { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+ { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+ { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+ { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+ { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+ { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+ { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+ { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
+[[package]]
+name = "referencing"
+version = "0.37.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "attrs" },
+ { name = "rpds-py" },
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" },
+]
+
+[[package]]
+name = "requests"
+version = "2.33.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "certifi" },
+ { name = "charset-normalizer" },
+ { name = "idna" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5f/a4/98b9c7c6428a668bf7e42ebb7c79d576a1c3c1e3ae2d47e674b468388871/requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517", size = 134120, upload-time = "2026-03-30T16:09:15.531Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl", hash = "sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a", size = 64947, upload-time = "2026-03-30T16:09:13.83Z" },
+]
+
+[[package]]
+name = "rich"
+version = "15.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "markdown-it-py" },
+ { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
+]
+
+[[package]]
+name = "rich-rst"
+version = "1.3.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "docutils" },
+ { name = "rich" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bc/6d/a506aaa4a9eaa945ed8ab2b7347859f53593864289853c5d6d62b77246e0/rich_rst-1.3.2.tar.gz", hash = "sha256:a1196fdddf1e364b02ec68a05e8ff8f6914fee10fbca2e6b6735f166bb0da8d4", size = 14936, upload-time = "2025-10-14T16:49:45.332Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/13/2f/b4530fbf948867702d0a3f27de4a6aab1d156f406d72852ab902c4d04de9/rich_rst-1.3.2-py3-none-any.whl", hash = "sha256:a99b4907cbe118cf9d18b0b44de272efa61f15117c61e39ebdc431baf5df722a", size = 12567, upload-time = "2025-10-14T16:49:42.953Z" },
+]
+
+[[package]]
+name = "rpds-py"
+version = "0.30.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" },
+ { url = "https://files.pythonhosted.org/packages/19/6a/4ba3d0fb7297ebae71171822554abe48d7cab29c28b8f9f2c04b79988c05/rpds_py-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4cc2206b76b4f576934f0ed374b10d7ca5f457858b157ca52064bdfc26b9fc00", size = 359751, upload-time = "2025-11-30T20:21:34.591Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/7c/e4933565ef7f7a0818985d87c15d9d273f1a649afa6a52ea35ad011195ea/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:389a2d49eded1896c3d48b0136ead37c48e221b391c052fba3f4055c367f60a6", size = 389696, upload-time = "2025-11-30T20:21:36.122Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/01/6271a2511ad0815f00f7ed4390cf2567bec1d4b1da39e2c27a41e6e3b4de/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:32c8528634e1bf7121f3de08fa85b138f4e0dc47657866630611b03967f041d7", size = 403136, upload-time = "2025-11-30T20:21:37.728Z" },
+ { url = "https://files.pythonhosted.org/packages/55/64/c857eb7cd7541e9b4eee9d49c196e833128a55b89a9850a9c9ac33ccf897/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f207f69853edd6f6700b86efb84999651baf3789e78a466431df1331608e5324", size = 524699, upload-time = "2025-11-30T20:21:38.92Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/ed/94816543404078af9ab26159c44f9e98e20fe47e2126d5d32c9d9948d10a/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:67b02ec25ba7a9e8fa74c63b6ca44cf5707f2fbfadae3ee8e7494297d56aa9df", size = 412022, upload-time = "2025-11-30T20:21:40.407Z" },
+ { url = "https://files.pythonhosted.org/packages/61/b5/707f6cf0066a6412aacc11d17920ea2e19e5b2f04081c64526eb35b5c6e7/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0e95f6819a19965ff420f65578bacb0b00f251fefe2c8b23347c37174271f3", size = 390522, upload-time = "2025-11-30T20:21:42.17Z" },
+ { url = "https://files.pythonhosted.org/packages/13/4e/57a85fda37a229ff4226f8cbcf09f2a455d1ed20e802ce5b2b4a7f5ed053/rpds_py-0.30.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a452763cc5198f2f98898eb98f7569649fe5da666c2dc6b5ddb10fde5a574221", size = 404579, upload-time = "2025-11-30T20:21:43.769Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/da/c9339293513ec680a721e0e16bf2bac3db6e5d7e922488de471308349bba/rpds_py-0.30.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e0b65193a413ccc930671c55153a03ee57cecb49e6227204b04fae512eb657a7", size = 421305, upload-time = "2025-11-30T20:21:44.994Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/be/522cb84751114f4ad9d822ff5a1aa3c98006341895d5f084779b99596e5c/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:858738e9c32147f78b3ac24dc0edb6610000e56dc0f700fd5f651d0a0f0eb9ff", size = 572503, upload-time = "2025-11-30T20:21:46.91Z" },
+ { url = "https://files.pythonhosted.org/packages/a2/9b/de879f7e7ceddc973ea6e4629e9b380213a6938a249e94b0cdbcc325bb66/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:da279aa314f00acbb803da1e76fa18666778e8a8f83484fba94526da5de2cba7", size = 598322, upload-time = "2025-11-30T20:21:48.709Z" },
+ { url = "https://files.pythonhosted.org/packages/48/ac/f01fc22efec3f37d8a914fc1b2fb9bcafd56a299edbe96406f3053edea5a/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7c64d38fb49b6cdeda16ab49e35fe0da2e1e9b34bc38bd78386530f218b37139", size = 560792, upload-time = "2025-11-30T20:21:50.024Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/da/4e2b19d0f131f35b6146425f846563d0ce036763e38913d917187307a671/rpds_py-0.30.0-cp310-cp310-win32.whl", hash = "sha256:6de2a32a1665b93233cde140ff8b3467bdb9e2af2b91079f0333a0974d12d464", size = 221901, upload-time = "2025-11-30T20:21:51.32Z" },
+ { url = "https://files.pythonhosted.org/packages/96/cb/156d7a5cf4f78a7cc571465d8aec7a3c447c94f6749c5123f08438bcf7bc/rpds_py-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:1726859cd0de969f88dc8673bdd954185b9104e05806be64bcd87badbe313169", size = 235823, upload-time = "2025-11-30T20:21:52.505Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" },
+ { url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" },
+ { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" },
+ { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" },
+ { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" },
+ { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" },
+ { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" },
+ { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" },
+ { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" },
+ { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" },
+ { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" },
+ { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" },
+ { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" },
+ { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" },
+ { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" },
+ { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" },
+ { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" },
+ { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" },
+ { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" },
+ { url = "https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" },
+ { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" },
+ { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" },
+ { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" },
+ { url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" },
+ { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = "2025-11-30T20:23:07.825Z" },
+ { url = "https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" },
+ { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" },
+ { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" },
+ { url = "https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = "2025-11-30T20:23:25.908Z" },
+ { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" },
+ { url = "https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" },
+ { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" },
+ { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" },
+ { url = "https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" },
+ { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, upload-time = "2025-11-30T20:23:44.086Z" },
+ { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" },
+ { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" },
+ { url = "https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" },
+ { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" },
+ { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" },
+ { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" },
+ { url = "https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = "2025-11-30T20:24:01.759Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" },
+ { url = "https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" },
+ { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" },
+ { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" },
+ { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" },
+ { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" },
+ { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" },
+ { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" },
+ { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" },
+ { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" },
+ { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" },
+ { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" },
+ { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" },
+]
+
+[[package]]
+name = "safehttpx"
+version = "0.1.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/89/d1/4282284d9cf1ee873607a46442da977fc3c985059315ab23610be31d5885/safehttpx-0.1.7.tar.gz", hash = "sha256:db201c0978c41eddb8bb480f3eee59dd67304fdd91646035e9d9a720049a9d23", size = 10385, upload-time = "2025-10-24T18:30:09.783Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2e/a3/0f0b7d78e2f1eb9e8e1afbff1d2bff8d60144aee17aca51c065b516743dd/safehttpx-0.1.7-py3-none-any.whl", hash = "sha256:c4f4a162db6993464d7ca3d7cc4af0ffc6515a606dfd220b9f82c6945d869cde", size = 8959, upload-time = "2025-10-24T18:30:08.733Z" },
+]
+
+[[package]]
+name = "secretstorage"
+version = "3.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cryptography", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+ { name = "jeepney", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1c/03/e834bcd866f2f8a49a85eaff47340affa3bfa391ee9912a952a1faa68c7b/secretstorage-3.5.0.tar.gz", hash = "sha256:f04b8e4689cbce351744d5537bf6b1329c6fc68f91fa666f60a380edddcd11be", size = 19884, upload-time = "2025-11-23T19:02:53.191Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b7/46/f5af3402b579fd5e11573ce652019a67074317e18c1935cc0b4ba9b35552/secretstorage-3.5.0-py3-none-any.whl", hash = "sha256:0ce65888c0725fcb2c5bc0fdb8e5438eece02c523557ea40ce0703c266248137", size = 15554, upload-time = "2025-11-23T19:02:51.545Z" },
+]
+
+[[package]]
+name = "semantic-version"
+version = "2.10.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/31/f2289ce78b9b473d582568c234e104d2a342fd658cc288a7553d83bb8595/semantic_version-2.10.0.tar.gz", hash = "sha256:bdabb6d336998cbb378d4b9db3a4b56a1e3235701dc05ea2690d9a997ed5041c", size = 52289, upload-time = "2022-05-26T13:35:23.454Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" },
+]
+
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
+]
+
+[[package]]
+name = "six"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
+]
+
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
+]
+
+[[package]]
+name = "sse-starlette"
+version = "3.3.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "starlette" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/8c/f9290339ef6d79badbc010f067cd769d6601ec11a57d78569c683fb4dd87/sse_starlette-3.3.4.tar.gz", hash = "sha256:aaf92fc067af8a5427192895ac028e947b484ac01edbc3caf00e7e7137c7bef1", size = 32427, upload-time = "2026-03-29T09:00:23.307Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f8/7f/3de5402f39890ac5660b86bcf5c03f9d855dad5c4ed764866d7b592b46fd/sse_starlette-3.3.4-py3-none-any.whl", hash = "sha256:84bb06e58939a8b38d8341f1bc9792f06c2b53f48c608dd207582b664fc8f3c1", size = 14330, upload-time = "2026-03-29T09:00:21.846Z" },
+]
+
+[[package]]
+name = "starlette"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/81/69/17425771797c36cded50b7fe44e850315d039f28b15901ab44839e70b593/starlette-1.0.0.tar.gz", hash = "sha256:6a4beaf1f81bb472fd19ea9b918b50dc3a77a6f2e190a12954b25e6ed5eea149", size = 2655289, upload-time = "2026-03-22T18:29:46.779Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/0b/c9/584bc9651441b4ba60cc4d557d8a547b5aff901af35bda3a4ee30c819b82/starlette-1.0.0-py3-none-any.whl", hash = "sha256:d3ec55e0bb321692d275455ddfd3df75fff145d009685eb40dc91fc66b03d38b", size = 72651, upload-time = "2026-03-22T18:29:45.111Z" },
+]
+
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
+ { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
+ { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
+ { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
+ { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
+ { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
+ { url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
+ { url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
+ { url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
+ { url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
+ { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
+ { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
+ { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
+ { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
+ { url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
+ { url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
+ { url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
+ { url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
+ { url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
+ { url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
+ { url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
+ { url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
+ { url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
+ { url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
+ { url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
+ { url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
+ { url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
+ { url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
+[[package]]
+name = "tomli-w"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/75/241269d1da26b624c0d5e110e8149093c759b7a286138f4efd61a60e75fe/tomli_w-1.2.0.tar.gz", hash = "sha256:2dd14fac5a47c27be9cd4c976af5a12d87fb1f0b4512f81d69cce3b35ae25021", size = 7184, upload-time = "2025-01-15T12:07:24.262Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c7/18/c86eb8e0202e32dd3df50d43d7ff9854f8e0603945ff398974c1d91ac1ef/tomli_w-1.2.0-py3-none-any.whl", hash = "sha256:188306098d013b691fcadc011abd66727d3c414c571bb01b1a174ba8c983cf90", size = 6675, upload-time = "2025-01-15T12:07:22.074Z" },
+]
+
+[[package]]
+name = "tomlkit"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
+]
+
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
+]
+
+[[package]]
+name = "typer"
+version = "0.24.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "annotated-doc" },
+ { name = "click" },
+ { name = "rich" },
+ { name = "shellingham" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/83/b8/9ebb531b6c2d377af08ac6746a5df3425b21853a5d2260876919b58a2a4a/typer-0.24.2.tar.gz", hash = "sha256:ec070dcfca1408e85ee203c6365001e818c3b7fffe686fd07ff2d68095ca0480", size = 119849, upload-time = "2026-04-22T17:45:34.413Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/39/d1/9484b497e0a0410b901c12b8251c3e746e1e863f7d28419ffe06f7892fda/typer-0.24.2-py3-none-any.whl", hash = "sha256:b618bc3d721f9a8d30f3e05565be26416d06e9bcc29d49bc491dc26aba674fa8", size = 55977, upload-time = "2026-04-22T17:45:33.055Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
+]
+
+[[package]]
+name = "tzdata"
+version = "2026.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/19/1b9b0e29f30c6d35cb345486df41110984ea67ae69dddbc0e8a100999493/tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10", size = 198254, upload-time = "2026-04-24T15:22:08.651Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321, upload-time = "2026-04-24T15:22:05.876Z" },
+]
+
+[[package]]
+name = "uncalled-for"
+version = "0.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e1/68/35c1d87e608940badbcfeb630347aa0509897284684f61fab6423d02b253/uncalled_for-0.3.1.tar.gz", hash = "sha256:5e412ac6708f04b56bef5867b5dcf6690ebce4eb7316058d9c50787492bb4bca", size = 49693, upload-time = "2026-04-07T13:05:06.462Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/11/e1/7ec67882ad8fc9f86384bef6421fa252c9cbe5744f8df6ce77afc9eca1f5/uncalled_for-0.3.1-py3-none-any.whl", hash = "sha256:074cdc92da8356278f93d0ded6f2a66dd883dbecaf9bc89437646ee2289cc200", size = 11361, upload-time = "2026-04-07T13:05:05.341Z" },
+]
+
+[[package]]
+name = "urllib3"
+version = "2.6.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
+]
+
+[[package]]
+name = "uvicorn"
+version = "0.46.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "click" },
+ { name = "h11" },
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1f/93/041fca8274050e40e6791f267d82e0e2e27dd165627bd640d3e0e378d877/uvicorn-0.46.0.tar.gz", hash = "sha256:fb9da0926999cc6cb22dc7cd71a94a632f078e6ae47ff683c5c420750fb7413d", size = 88758, upload-time = "2026-04-23T07:16:00.151Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/31/a3/5b1562db76a5a488274b2332a97199b32d0442aca0ed193697fd47786316/uvicorn-0.46.0-py3-none-any.whl", hash = "sha256:bbebbcbed972d162afca128605223022bedd345b7bc7855ce66deb31487a9048", size = 70926, upload-time = "2026-04-23T07:15:58.355Z" },
+]
+
+[[package]]
+name = "watchfiles"
+version = "1.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c2/c9/8869df9b2a2d6c59d79220a4db37679e74f807c559ffe5265e08b227a210/watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2", size = 94440, upload-time = "2025-10-14T15:06:21.08Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a7/1a/206e8cf2dd86fddf939165a57b4df61607a1e0add2785f170a3f616b7d9f/watchfiles-1.1.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:eef58232d32daf2ac67f42dea51a2c80f0d03379075d44a587051e63cc2e368c", size = 407318, upload-time = "2025-10-14T15:04:18.753Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/0f/abaf5262b9c496b5dad4ed3c0e799cbecb1f8ea512ecb6ddd46646a9fca3/watchfiles-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03fa0f5237118a0c5e496185cafa92878568b652a2e9a9382a5151b1a0380a43", size = 394478, upload-time = "2025-10-14T15:04:20.297Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/04/9cc0ba88697b34b755371f5ace8d3a4d9a15719c07bdc7bd13d7d8c6a341/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ca65483439f9c791897f7db49202301deb6e15fe9f8fe2fed555bf986d10c31", size = 449894, upload-time = "2025-10-14T15:04:21.527Z" },
+ { url = "https://files.pythonhosted.org/packages/d2/9c/eda4615863cd8621e89aed4df680d8c3ec3da6a4cf1da113c17decd87c7f/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f0ab1c1af0cb38e3f598244c17919fb1a84d1629cc08355b0074b6d7f53138ac", size = 459065, upload-time = "2025-10-14T15:04:22.795Z" },
+ { url = "https://files.pythonhosted.org/packages/84/13/f28b3f340157d03cbc8197629bc109d1098764abe1e60874622a0be5c112/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bc570d6c01c206c46deb6e935a260be44f186a2f05179f52f7fcd2be086a94d", size = 488377, upload-time = "2025-10-14T15:04:24.138Z" },
+ { url = "https://files.pythonhosted.org/packages/86/93/cfa597fa9389e122488f7ffdbd6db505b3b915ca7435ecd7542e855898c2/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e84087b432b6ac94778de547e08611266f1f8ffad28c0ee4c82e028b0fc5966d", size = 595837, upload-time = "2025-10-14T15:04:25.057Z" },
+ { url = "https://files.pythonhosted.org/packages/57/1e/68c1ed5652b48d89fc24d6af905d88ee4f82fa8bc491e2666004e307ded1/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:620bae625f4cb18427b1bb1a2d9426dc0dd5a5ba74c7c2cdb9de405f7b129863", size = 473456, upload-time = "2025-10-14T15:04:26.497Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/dc/1a680b7458ffa3b14bb64878112aefc8f2e4f73c5af763cbf0bd43100658/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:544364b2b51a9b0c7000a4b4b02f90e9423d97fbbf7e06689236443ebcad81ab", size = 455614, upload-time = "2025-10-14T15:04:27.539Z" },
+ { url = "https://files.pythonhosted.org/packages/61/a5/3d782a666512e01eaa6541a72ebac1d3aae191ff4a31274a66b8dd85760c/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bbe1ef33d45bc71cf21364df962af171f96ecaeca06bd9e3d0b583efb12aec82", size = 630690, upload-time = "2025-10-14T15:04:28.495Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/73/bb5f38590e34687b2a9c47a244aa4dd50c56a825969c92c9c5fc7387cea1/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1a0bb430adb19ef49389e1ad368450193a90038b5b752f4ac089ec6942c4dff4", size = 622459, upload-time = "2025-10-14T15:04:29.491Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/ac/c9bb0ec696e07a20bd58af5399aeadaef195fb2c73d26baf55180fe4a942/watchfiles-1.1.1-cp310-cp310-win32.whl", hash = "sha256:3f6d37644155fb5beca5378feb8c1708d5783145f2a0f1c4d5a061a210254844", size = 272663, upload-time = "2025-10-14T15:04:30.435Z" },
+ { url = "https://files.pythonhosted.org/packages/11/a0/a60c5a7c2ec59fa062d9a9c61d02e3b6abd94d32aac2d8344c4bdd033326/watchfiles-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:a36d8efe0f290835fd0f33da35042a1bb5dc0e83cbc092dcf69bce442579e88e", size = 287453, upload-time = "2025-10-14T15:04:31.53Z" },
+ { url = "https://files.pythonhosted.org/packages/1f/f8/2c5f479fb531ce2f0564eda479faecf253d886b1ab3630a39b7bf7362d46/watchfiles-1.1.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f57b396167a2565a4e8b5e56a5a1c537571733992b226f4f1197d79e94cf0ae5", size = 406529, upload-time = "2025-10-14T15:04:32.899Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/cd/f515660b1f32f65df671ddf6f85bfaca621aee177712874dc30a97397977/watchfiles-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:421e29339983e1bebc281fab40d812742268ad057db4aee8c4d2bce0af43b741", size = 394384, upload-time = "2025-10-14T15:04:33.761Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/c3/28b7dc99733eab43fca2d10f55c86e03bd6ab11ca31b802abac26b23d161/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e43d39a741e972bab5d8100b5cdacf69db64e34eb19b6e9af162bccf63c5cc6", size = 448789, upload-time = "2025-10-14T15:04:34.679Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/24/33e71113b320030011c8e4316ccca04194bf0cbbaeee207f00cbc7d6b9f5/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f537afb3276d12814082a2e9b242bdcf416c2e8fd9f799a737990a1dbe906e5b", size = 460521, upload-time = "2025-10-14T15:04:35.963Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/c3/3c9a55f255aa57b91579ae9e98c88704955fa9dac3e5614fb378291155df/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2cd9e04277e756a2e2d2543d65d1e2166d6fd4c9b183f8808634fda23f17b14", size = 488722, upload-time = "2025-10-14T15:04:37.091Z" },
+ { url = "https://files.pythonhosted.org/packages/49/36/506447b73eb46c120169dc1717fe2eff07c234bb3232a7200b5f5bd816e9/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3f58818dc0b07f7d9aa7fe9eb1037aecb9700e63e1f6acfed13e9fef648f5d", size = 596088, upload-time = "2025-10-14T15:04:38.39Z" },
+ { url = "https://files.pythonhosted.org/packages/82/ab/5f39e752a9838ec4d52e9b87c1e80f1ee3ccdbe92e183c15b6577ab9de16/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb9f66367023ae783551042d31b1d7fd422e8289eedd91f26754a66f44d5cff", size = 472923, upload-time = "2025-10-14T15:04:39.666Z" },
+ { url = "https://files.pythonhosted.org/packages/af/b9/a419292f05e302dea372fa7e6fda5178a92998411f8581b9830d28fb9edb/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aebfd0861a83e6c3d1110b78ad54704486555246e542be3e2bb94195eabb2606", size = 456080, upload-time = "2025-10-14T15:04:40.643Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/c3/d5932fd62bde1a30c36e10c409dc5d54506726f08cb3e1d8d0ba5e2bc8db/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5fac835b4ab3c6487b5dbad78c4b3724e26bcc468e886f8ba8cc4306f68f6701", size = 629432, upload-time = "2025-10-14T15:04:41.789Z" },
+ { url = "https://files.pythonhosted.org/packages/f7/77/16bddd9779fafb795f1a94319dc965209c5641db5bf1edbbccace6d1b3c0/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:399600947b170270e80134ac854e21b3ccdefa11a9529a3decc1327088180f10", size = 623046, upload-time = "2025-10-14T15:04:42.718Z" },
+ { url = "https://files.pythonhosted.org/packages/46/ef/f2ecb9a0f342b4bfad13a2787155c6ee7ce792140eac63a34676a2feeef2/watchfiles-1.1.1-cp311-cp311-win32.whl", hash = "sha256:de6da501c883f58ad50db3a32ad397b09ad29865b5f26f64c24d3e3281685849", size = 271473, upload-time = "2025-10-14T15:04:43.624Z" },
+ { url = "https://files.pythonhosted.org/packages/94/bc/f42d71125f19731ea435c3948cad148d31a64fccde3867e5ba4edee901f9/watchfiles-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:35c53bd62a0b885bf653ebf6b700d1bf05debb78ad9292cf2a942b23513dc4c4", size = 287598, upload-time = "2025-10-14T15:04:44.516Z" },
+ { url = "https://files.pythonhosted.org/packages/57/c9/a30f897351f95bbbfb6abcadafbaca711ce1162f4db95fc908c98a9165f3/watchfiles-1.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:57ca5281a8b5e27593cb7d82c2ac927ad88a96ed406aa446f6344e4328208e9e", size = 277210, upload-time = "2025-10-14T15:04:45.883Z" },
+ { url = "https://files.pythonhosted.org/packages/74/d5/f039e7e3c639d9b1d09b07ea412a6806d38123f0508e5f9b48a87b0a76cc/watchfiles-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8c89f9f2f740a6b7dcc753140dd5e1ab9215966f7a3530d0c0705c83b401bd7d", size = 404745, upload-time = "2025-10-14T15:04:46.731Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/96/a881a13aa1349827490dab2d363c8039527060cfcc2c92cc6d13d1b1049e/watchfiles-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bd404be08018c37350f0d6e34676bd1e2889990117a2b90070b3007f172d0610", size = 391769, upload-time = "2025-10-14T15:04:48.003Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/5b/d3b460364aeb8da471c1989238ea0e56bec24b6042a68046adf3d9ddb01c/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8526e8f916bb5b9a0a777c8317c23ce65de259422bba5b31325a6fa6029d33af", size = 449374, upload-time = "2025-10-14T15:04:49.179Z" },
+ { url = "https://files.pythonhosted.org/packages/b9/44/5769cb62d4ed055cb17417c0a109a92f007114a4e07f30812a73a4efdb11/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2edc3553362b1c38d9f06242416a5d8e9fe235c204a4072e988ce2e5bb1f69f6", size = 459485, upload-time = "2025-10-14T15:04:50.155Z" },
+ { url = "https://files.pythonhosted.org/packages/19/0c/286b6301ded2eccd4ffd0041a1b726afda999926cf720aab63adb68a1e36/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30f7da3fb3f2844259cba4720c3fc7138eb0f7b659c38f3bfa65084c7fc7abce", size = 488813, upload-time = "2025-10-14T15:04:51.059Z" },
+ { url = "https://files.pythonhosted.org/packages/c7/2b/8530ed41112dd4a22f4dcfdb5ccf6a1baad1ff6eed8dc5a5f09e7e8c41c7/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8979280bdafff686ba5e4d8f97840f929a87ed9cdf133cbbd42f7766774d2aa", size = 594816, upload-time = "2025-10-14T15:04:52.031Z" },
+ { url = "https://files.pythonhosted.org/packages/ce/d2/f5f9fb49489f184f18470d4f99f4e862a4b3e9ac2865688eb2099e3d837a/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dcc5c24523771db3a294c77d94771abcfcb82a0e0ee8efd910c37c59ec1b31bb", size = 475186, upload-time = "2025-10-14T15:04:53.064Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/68/5707da262a119fb06fbe214d82dd1fe4a6f4af32d2d14de368d0349eb52a/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db5d7ae38ff20153d542460752ff397fcf5c96090c1230803713cf3147a6803", size = 456812, upload-time = "2025-10-14T15:04:55.174Z" },
+ { url = "https://files.pythonhosted.org/packages/66/ab/3cbb8756323e8f9b6f9acb9ef4ec26d42b2109bce830cc1f3468df20511d/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:28475ddbde92df1874b6c5c8aaeb24ad5be47a11f87cde5a28ef3835932e3e94", size = 630196, upload-time = "2025-10-14T15:04:56.22Z" },
+ { url = "https://files.pythonhosted.org/packages/78/46/7152ec29b8335f80167928944a94955015a345440f524d2dfe63fc2f437b/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:36193ed342f5b9842edd3532729a2ad55c4160ffcfa3700e0d54be496b70dd43", size = 622657, upload-time = "2025-10-14T15:04:57.521Z" },
+ { url = "https://files.pythonhosted.org/packages/0a/bf/95895e78dd75efe9a7f31733607f384b42eb5feb54bd2eb6ed57cc2e94f4/watchfiles-1.1.1-cp312-cp312-win32.whl", hash = "sha256:859e43a1951717cc8de7f4c77674a6d389b106361585951d9e69572823f311d9", size = 272042, upload-time = "2025-10-14T15:04:59.046Z" },
+ { url = "https://files.pythonhosted.org/packages/87/0a/90eb755f568de2688cb220171c4191df932232c20946966c27a59c400850/watchfiles-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:91d4c9a823a8c987cce8fa2690923b069966dabb196dd8d137ea2cede885fde9", size = 288410, upload-time = "2025-10-14T15:05:00.081Z" },
+ { url = "https://files.pythonhosted.org/packages/36/76/f322701530586922fbd6723c4f91ace21364924822a8772c549483abed13/watchfiles-1.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:a625815d4a2bdca61953dbba5a39d60164451ef34c88d751f6c368c3ea73d404", size = 278209, upload-time = "2025-10-14T15:05:01.168Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/f4/f750b29225fe77139f7ae5de89d4949f5a99f934c65a1f1c0b248f26f747/watchfiles-1.1.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:130e4876309e8686a5e37dba7d5e9bc77e6ed908266996ca26572437a5271e18", size = 404321, upload-time = "2025-10-14T15:05:02.063Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/f9/f07a295cde762644aa4c4bb0f88921d2d141af45e735b965fb2e87858328/watchfiles-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f3bde70f157f84ece3765b42b4a52c6ac1a50334903c6eaf765362f6ccca88a", size = 391783, upload-time = "2025-10-14T15:05:03.052Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/11/fc2502457e0bea39a5c958d86d2cb69e407a4d00b85735ca724bfa6e0d1a/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14e0b1fe858430fc0251737ef3824c54027bedb8c37c38114488b8e131cf8219", size = 449279, upload-time = "2025-10-14T15:05:04.004Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/1f/d66bc15ea0b728df3ed96a539c777acfcad0eb78555ad9efcaa1274688f0/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f27db948078f3823a6bb3b465180db8ebecf26dd5dae6f6180bd87383b6b4428", size = 459405, upload-time = "2025-10-14T15:05:04.942Z" },
+ { url = "https://files.pythonhosted.org/packages/be/90/9f4a65c0aec3ccf032703e6db02d89a157462fbb2cf20dd415128251cac0/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:059098c3a429f62fc98e8ec62b982230ef2c8df68c79e826e37b895bc359a9c0", size = 488976, upload-time = "2025-10-14T15:05:05.905Z" },
+ { url = "https://files.pythonhosted.org/packages/37/57/ee347af605d867f712be7029bb94c8c071732a4b44792e3176fa3c612d39/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfb5862016acc9b869bb57284e6cb35fdf8e22fe59f7548858e2f971d045f150", size = 595506, upload-time = "2025-10-14T15:05:06.906Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/78/cc5ab0b86c122047f75e8fc471c67a04dee395daf847d3e59381996c8707/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:319b27255aacd9923b8a276bb14d21a5f7ff82564c744235fc5eae58d95422ae", size = 474936, upload-time = "2025-10-14T15:05:07.906Z" },
+ { url = "https://files.pythonhosted.org/packages/62/da/def65b170a3815af7bd40a3e7010bf6ab53089ef1b75d05dd5385b87cf08/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c755367e51db90e75b19454b680903631d41f9e3607fbd941d296a020c2d752d", size = 456147, upload-time = "2025-10-14T15:05:09.138Z" },
+ { url = "https://files.pythonhosted.org/packages/57/99/da6573ba71166e82d288d4df0839128004c67d2778d3b566c138695f5c0b/watchfiles-1.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c22c776292a23bfc7237a98f791b9ad3144b02116ff10d820829ce62dff46d0b", size = 630007, upload-time = "2025-10-14T15:05:10.117Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/51/7439c4dd39511368849eb1e53279cd3454b4a4dbace80bab88feeb83c6b5/watchfiles-1.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3a476189be23c3686bc2f4321dd501cb329c0a0469e77b7b534ee10129ae6374", size = 622280, upload-time = "2025-10-14T15:05:11.146Z" },
+ { url = "https://files.pythonhosted.org/packages/95/9c/8ed97d4bba5db6fdcdb2b298d3898f2dd5c20f6b73aee04eabe56c59677e/watchfiles-1.1.1-cp313-cp313-win32.whl", hash = "sha256:bf0a91bfb5574a2f7fc223cf95eeea79abfefa404bf1ea5e339c0c1560ae99a0", size = 272056, upload-time = "2025-10-14T15:05:12.156Z" },
+ { url = "https://files.pythonhosted.org/packages/1f/f3/c14e28429f744a260d8ceae18bf58c1d5fa56b50d006a7a9f80e1882cb0d/watchfiles-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:52e06553899e11e8074503c8e716d574adeeb7e68913115c4b3653c53f9bae42", size = 288162, upload-time = "2025-10-14T15:05:13.208Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/61/fe0e56c40d5cd29523e398d31153218718c5786b5e636d9ae8ae79453d27/watchfiles-1.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac3cc5759570cd02662b15fbcd9d917f7ecd47efe0d6b40474eafd246f91ea18", size = 277909, upload-time = "2025-10-14T15:05:14.49Z" },
+ { url = "https://files.pythonhosted.org/packages/79/42/e0a7d749626f1e28c7108a99fb9bf524b501bbbeb9b261ceecde644d5a07/watchfiles-1.1.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:563b116874a9a7ce6f96f87cd0b94f7faf92d08d0021e837796f0a14318ef8da", size = 403389, upload-time = "2025-10-14T15:05:15.777Z" },
+ { url = "https://files.pythonhosted.org/packages/15/49/08732f90ce0fbbc13913f9f215c689cfc9ced345fb1bcd8829a50007cc8d/watchfiles-1.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3ad9fe1dae4ab4212d8c91e80b832425e24f421703b5a42ef2e4a1e215aff051", size = 389964, upload-time = "2025-10-14T15:05:16.85Z" },
+ { url = "https://files.pythonhosted.org/packages/27/0d/7c315d4bd5f2538910491a0393c56bf70d333d51bc5b34bee8e68e8cea19/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce70f96a46b894b36eba678f153f052967a0d06d5b5a19b336ab0dbbd029f73e", size = 448114, upload-time = "2025-10-14T15:05:17.876Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/24/9e096de47a4d11bc4df41e9d1e61776393eac4cb6eb11b3e23315b78b2cc/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cb467c999c2eff23a6417e58d75e5828716f42ed8289fe6b77a7e5a91036ca70", size = 460264, upload-time = "2025-10-14T15:05:18.962Z" },
+ { url = "https://files.pythonhosted.org/packages/cc/0f/e8dea6375f1d3ba5fcb0b3583e2b493e77379834c74fd5a22d66d85d6540/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:836398932192dae4146c8f6f737d74baeac8b70ce14831a239bdb1ca882fc261", size = 487877, upload-time = "2025-10-14T15:05:20.094Z" },
+ { url = "https://files.pythonhosted.org/packages/ac/5b/df24cfc6424a12deb41503b64d42fbea6b8cb357ec62ca84a5a3476f654a/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:743185e7372b7bc7c389e1badcc606931a827112fbbd37f14c537320fca08620", size = 595176, upload-time = "2025-10-14T15:05:21.134Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/b5/853b6757f7347de4e9b37e8cc3289283fb983cba1ab4d2d7144694871d9c/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afaeff7696e0ad9f02cbb8f56365ff4686ab205fcf9c4c5b6fdfaaa16549dd04", size = 473577, upload-time = "2025-10-14T15:05:22.306Z" },
+ { url = "https://files.pythonhosted.org/packages/e1/f7/0a4467be0a56e80447c8529c9fce5b38eab4f513cb3d9bf82e7392a5696b/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7eb7da0eb23aa2ba036d4f616d46906013a68caf61b7fdbe42fc8b25132e77", size = 455425, upload-time = "2025-10-14T15:05:23.348Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/e0/82583485ea00137ddf69bc84a2db88bd92ab4a6e3c405e5fb878ead8d0e7/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:831a62658609f0e5c64178211c942ace999517f5770fe9436be4c2faeba0c0ef", size = 628826, upload-time = "2025-10-14T15:05:24.398Z" },
+ { url = "https://files.pythonhosted.org/packages/28/9a/a785356fccf9fae84c0cc90570f11702ae9571036fb25932f1242c82191c/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f9a2ae5c91cecc9edd47e041a930490c31c3afb1f5e6d71de3dc671bfaca02bf", size = 622208, upload-time = "2025-10-14T15:05:25.45Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/f4/0872229324ef69b2c3edec35e84bd57a1289e7d3fe74588048ed8947a323/watchfiles-1.1.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:d1715143123baeeaeadec0528bb7441103979a1d5f6fd0e1f915383fea7ea6d5", size = 404315, upload-time = "2025-10-14T15:05:26.501Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/22/16d5331eaed1cb107b873f6ae1b69e9ced582fcf0c59a50cd84f403b1c32/watchfiles-1.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:39574d6370c4579d7f5d0ad940ce5b20db0e4117444e39b6d8f99db5676c52fd", size = 390869, upload-time = "2025-10-14T15:05:27.649Z" },
+ { url = "https://files.pythonhosted.org/packages/b2/7e/5643bfff5acb6539b18483128fdc0ef2cccc94a5b8fbda130c823e8ed636/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7365b92c2e69ee952902e8f70f3ba6360d0d596d9299d55d7d386df84b6941fb", size = 449919, upload-time = "2025-10-14T15:05:28.701Z" },
+ { url = "https://files.pythonhosted.org/packages/51/2e/c410993ba5025a9f9357c376f48976ef0e1b1aefb73b97a5ae01a5972755/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bfff9740c69c0e4ed32416f013f3c45e2ae42ccedd1167ef2d805c000b6c71a5", size = 460845, upload-time = "2025-10-14T15:05:30.064Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/a4/2df3b404469122e8680f0fcd06079317e48db58a2da2950fb45020947734/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b27cf2eb1dda37b2089e3907d8ea92922b673c0c427886d4edc6b94d8dfe5db3", size = 489027, upload-time = "2025-10-14T15:05:31.064Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/84/4587ba5b1f267167ee715b7f66e6382cca6938e0a4b870adad93e44747e6/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:526e86aced14a65a5b0ec50827c745597c782ff46b571dbfe46192ab9e0b3c33", size = 595615, upload-time = "2025-10-14T15:05:32.074Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/0f/c6988c91d06e93cd0bb3d4a808bcf32375ca1904609835c3031799e3ecae/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04e78dd0b6352db95507fd8cb46f39d185cf8c74e4cf1e4fbad1d3df96faf510", size = 474836, upload-time = "2025-10-14T15:05:33.209Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/36/ded8aebea91919485b7bbabbd14f5f359326cb5ec218cd67074d1e426d74/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c85794a4cfa094714fb9c08d4a218375b2b95b8ed1666e8677c349906246c05", size = 455099, upload-time = "2025-10-14T15:05:34.189Z" },
+ { url = "https://files.pythonhosted.org/packages/98/e0/8c9bdba88af756a2fce230dd365fab2baf927ba42cd47521ee7498fd5211/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:74d5012b7630714b66be7b7b7a78855ef7ad58e8650c73afc4c076a1f480a8d6", size = 630626, upload-time = "2025-10-14T15:05:35.216Z" },
+ { url = "https://files.pythonhosted.org/packages/2a/84/a95db05354bf2d19e438520d92a8ca475e578c647f78f53197f5a2f17aaf/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:8fbe85cb3201c7d380d3d0b90e63d520f15d6afe217165d7f98c9c649654db81", size = 622519, upload-time = "2025-10-14T15:05:36.259Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/ce/d8acdc8de545de995c339be67711e474c77d643555a9bb74a9334252bd55/watchfiles-1.1.1-cp314-cp314-win32.whl", hash = "sha256:3fa0b59c92278b5a7800d3ee7733da9d096d4aabcfabb9a928918bd276ef9b9b", size = 272078, upload-time = "2025-10-14T15:05:37.63Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/c9/a74487f72d0451524be827e8edec251da0cc1fcf111646a511ae752e1a3d/watchfiles-1.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:c2047d0b6cea13b3316bdbafbfa0c4228ae593d995030fda39089d36e64fc03a", size = 287664, upload-time = "2025-10-14T15:05:38.95Z" },
+ { url = "https://files.pythonhosted.org/packages/df/b8/8ac000702cdd496cdce998c6f4ee0ca1f15977bba51bdf07d872ebdfc34c/watchfiles-1.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:842178b126593addc05acf6fce960d28bc5fae7afbaa2c6c1b3a7b9460e5be02", size = 277154, upload-time = "2025-10-14T15:05:39.954Z" },
+ { url = "https://files.pythonhosted.org/packages/47/a8/e3af2184707c29f0f14b1963c0aace6529f9d1b8582d5b99f31bbf42f59e/watchfiles-1.1.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:88863fbbc1a7312972f1c511f202eb30866370ebb8493aef2812b9ff28156a21", size = 403820, upload-time = "2025-10-14T15:05:40.932Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/ec/e47e307c2f4bd75f9f9e8afbe3876679b18e1bcec449beca132a1c5ffb2d/watchfiles-1.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:55c7475190662e202c08c6c0f4d9e345a29367438cf8e8037f3155e10a88d5a5", size = 390510, upload-time = "2025-10-14T15:05:41.945Z" },
+ { url = "https://files.pythonhosted.org/packages/d5/a0/ad235642118090f66e7b2f18fd5c42082418404a79205cdfca50b6309c13/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f53fa183d53a1d7a8852277c92b967ae99c2d4dcee2bfacff8868e6e30b15f7", size = 448408, upload-time = "2025-10-14T15:05:43.385Z" },
+ { url = "https://files.pythonhosted.org/packages/df/85/97fa10fd5ff3332ae17e7e40e20784e419e28521549780869f1413742e9d/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6aae418a8b323732fa89721d86f39ec8f092fc2af67f4217a2b07fd3e93c6101", size = 458968, upload-time = "2025-10-14T15:05:44.404Z" },
+ { url = "https://files.pythonhosted.org/packages/47/c2/9059c2e8966ea5ce678166617a7f75ecba6164375f3b288e50a40dc6d489/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f096076119da54a6080e8920cbdaac3dbee667eb91dcc5e5b78840b87415bd44", size = 488096, upload-time = "2025-10-14T15:05:45.398Z" },
+ { url = "https://files.pythonhosted.org/packages/94/44/d90a9ec8ac309bc26db808a13e7bfc0e4e78b6fc051078a554e132e80160/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00485f441d183717038ed2e887a7c868154f216877653121068107b227a2f64c", size = 596040, upload-time = "2025-10-14T15:05:46.502Z" },
+ { url = "https://files.pythonhosted.org/packages/95/68/4e3479b20ca305cfc561db3ed207a8a1c745ee32bf24f2026a129d0ddb6e/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a55f3e9e493158d7bfdb60a1165035f1cf7d320914e7b7ea83fe22c6023b58fc", size = 473847, upload-time = "2025-10-14T15:05:47.484Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/55/2af26693fd15165c4ff7857e38330e1b61ab8c37d15dc79118cdba115b7a/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c91ed27800188c2ae96d16e3149f199d62f86c7af5f5f4d2c61a3ed8cd3666c", size = 455072, upload-time = "2025-10-14T15:05:48.928Z" },
+ { url = "https://files.pythonhosted.org/packages/66/1d/d0d200b10c9311ec25d2273f8aad8c3ef7cc7ea11808022501811208a750/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:311ff15a0bae3714ffb603e6ba6dbfba4065ab60865d15a6ec544133bdb21099", size = 629104, upload-time = "2025-10-14T15:05:49.908Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/bd/fa9bb053192491b3867ba07d2343d9f2252e00811567d30ae8d0f78136fe/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:a916a2932da8f8ab582f242c065f5c81bed3462849ca79ee357dd9551b0e9b01", size = 622112, upload-time = "2025-10-14T15:05:50.941Z" },
+ { url = "https://files.pythonhosted.org/packages/ba/4c/a888c91e2e326872fa4705095d64acd8aa2fb9c1f7b9bd0588f33850516c/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:17ef139237dfced9da49fb7f2232c86ca9421f666d78c264c7ffca6601d154c3", size = 409611, upload-time = "2025-10-14T15:06:05.809Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/c7/5420d1943c8e3ce1a21c0a9330bcf7edafb6aa65d26b21dbb3267c9e8112/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:672b8adf25b1a0d35c96b5888b7b18699d27d4194bac8beeae75be4b7a3fc9b2", size = 396889, upload-time = "2025-10-14T15:06:07.035Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/e5/0072cef3804ce8d3aaddbfe7788aadff6b3d3f98a286fdbee9fd74ca59a7/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a13aea58bc2b90173bc69f2a90de8e282648939a00a602e1dc4ee23e26b66d", size = 451616, upload-time = "2025-10-14T15:06:08.072Z" },
+ { url = "https://files.pythonhosted.org/packages/83/4e/b87b71cbdfad81ad7e83358b3e447fedd281b880a03d64a760fe0a11fc2e/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b495de0bb386df6a12b18335a0285dda90260f51bdb505503c02bcd1ce27a8b", size = 458413, upload-time = "2025-10-14T15:06:09.209Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/8e/e500f8b0b77be4ff753ac94dc06b33d8f0d839377fee1b78e8c8d8f031bf/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:db476ab59b6765134de1d4fe96a1a9c96ddf091683599be0f26147ea1b2e4b88", size = 408250, upload-time = "2025-10-14T15:06:10.264Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/95/615e72cd27b85b61eec764a5ca51bd94d40b5adea5ff47567d9ebc4d275a/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89eef07eee5e9d1fda06e38822ad167a044153457e6fd997f8a858ab7564a336", size = 396117, upload-time = "2025-10-14T15:06:11.28Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/81/e7fe958ce8a7fb5c73cc9fb07f5aeaf755e6aa72498c57d760af760c91f8/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce19e06cbda693e9e7686358af9cd6f5d61312ab8b00488bc36f5aabbaf77e24", size = 450493, upload-time = "2025-10-14T15:06:12.321Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/d4/ed38dd3b1767193de971e694aa544356e63353c33a85d948166b5ff58b9e/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e6f39af2eab0118338902798b5aa6664f46ff66bc0280de76fca67a7f262a49", size = 457546, upload-time = "2025-10-14T15:06:13.372Z" },
+]
+
+[[package]]
+name = "websockets"
+version = "16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/04/24/4b2031d72e840ce4c1ccb255f693b15c334757fc50023e4db9537080b8c4/websockets-16.0.tar.gz", hash = "sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5", size = 179346, upload-time = "2026-01-10T09:23:47.181Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/20/74/221f58decd852f4b59cc3354cccaf87e8ef695fede361d03dc9a7396573b/websockets-16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04cdd5d2d1dacbad0a7bf36ccbcd3ccd5a30ee188f2560b7a62a30d14107b31a", size = 177343, upload-time = "2026-01-10T09:22:21.28Z" },
+ { url = "https://files.pythonhosted.org/packages/19/0f/22ef6107ee52ab7f0b710d55d36f5a5d3ef19e8a205541a6d7ffa7994e5a/websockets-16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8ff32bb86522a9e5e31439a58addbb0166f0204d64066fb955265c4e214160f0", size = 175021, upload-time = "2026-01-10T09:22:22.696Z" },
+ { url = "https://files.pythonhosted.org/packages/10/40/904a4cb30d9b61c0e278899bf36342e9b0208eb3c470324a9ecbaac2a30f/websockets-16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:583b7c42688636f930688d712885cf1531326ee05effd982028212ccc13e5957", size = 175320, upload-time = "2026-01-10T09:22:23.94Z" },
+ { url = "https://files.pythonhosted.org/packages/9d/2f/4b3ca7e106bc608744b1cdae041e005e446124bebb037b18799c2d356864/websockets-16.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7d837379b647c0c4c2355c2499723f82f1635fd2c26510e1f587d89bc2199e72", size = 183815, upload-time = "2026-01-10T09:22:25.469Z" },
+ { url = "https://files.pythonhosted.org/packages/86/26/d40eaa2a46d4302becec8d15b0fc5e45bdde05191e7628405a19cf491ccd/websockets-16.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df57afc692e517a85e65b72e165356ed1df12386ecb879ad5693be08fac65dde", size = 185054, upload-time = "2026-01-10T09:22:27.101Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/ba/6500a0efc94f7373ee8fefa8c271acdfd4dca8bd49a90d4be7ccabfc397e/websockets-16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2b9f1e0d69bc60a4a87349d50c09a037a2607918746f07de04df9e43252c77a3", size = 184565, upload-time = "2026-01-10T09:22:28.293Z" },
+ { url = "https://files.pythonhosted.org/packages/04/b4/96bf2cee7c8d8102389374a2616200574f5f01128d1082f44102140344cc/websockets-16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:335c23addf3d5e6a8633f9f8eda77efad001671e80b95c491dd0924587ece0b3", size = 183848, upload-time = "2026-01-10T09:22:30.394Z" },
+ { url = "https://files.pythonhosted.org/packages/02/8e/81f40fb00fd125357814e8c3025738fc4ffc3da4b6b4a4472a82ba304b41/websockets-16.0-cp310-cp310-win32.whl", hash = "sha256:37b31c1623c6605e4c00d466c9d633f9b812ea430c11c8a278774a1fde1acfa9", size = 178249, upload-time = "2026-01-10T09:22:32.083Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/5f/7e40efe8df57db9b91c88a43690ac66f7b7aa73a11aa6a66b927e44f26fa/websockets-16.0-cp310-cp310-win_amd64.whl", hash = "sha256:8e1dab317b6e77424356e11e99a432b7cb2f3ec8c5ab4dabbcee6add48f72b35", size = 178685, upload-time = "2026-01-10T09:22:33.345Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/db/de907251b4ff46ae804ad0409809504153b3f30984daf82a1d84a9875830/websockets-16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:31a52addea25187bde0797a97d6fc3d2f92b6f72a9370792d65a6e84615ac8a8", size = 177340, upload-time = "2026-01-10T09:22:34.539Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/fa/abe89019d8d8815c8781e90d697dec52523fb8ebe308bf11664e8de1877e/websockets-16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:417b28978cdccab24f46400586d128366313e8a96312e4b9362a4af504f3bbad", size = 175022, upload-time = "2026-01-10T09:22:36.332Z" },
+ { url = "https://files.pythonhosted.org/packages/58/5d/88ea17ed1ded2079358b40d31d48abe90a73c9e5819dbcde1606e991e2ad/websockets-16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af80d74d4edfa3cb9ed973a0a5ba2b2a549371f8a741e0800cb07becdd20f23d", size = 175319, upload-time = "2026-01-10T09:22:37.602Z" },
+ { url = "https://files.pythonhosted.org/packages/d2/ae/0ee92b33087a33632f37a635e11e1d99d429d3d323329675a6022312aac2/websockets-16.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:08d7af67b64d29823fed316505a89b86705f2b7981c07848fb5e3ea3020c1abe", size = 184631, upload-time = "2026-01-10T09:22:38.789Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/c5/27178df583b6c5b31b29f526ba2da5e2f864ecc79c99dae630a85d68c304/websockets-16.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be95cfb0a4dae143eaed2bcba8ac23f4892d8971311f1b06f3c6b78952ee70b", size = 185870, upload-time = "2026-01-10T09:22:39.893Z" },
+ { url = "https://files.pythonhosted.org/packages/87/05/536652aa84ddc1c018dbb7e2c4cbcd0db884580bf8e95aece7593fde526f/websockets-16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d6297ce39ce5c2e6feb13c1a996a2ded3b6832155fcfc920265c76f24c7cceb5", size = 185361, upload-time = "2026-01-10T09:22:41.016Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/e2/d5332c90da12b1e01f06fb1b85c50cfc489783076547415bf9f0a659ec19/websockets-16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c1b30e4f497b0b354057f3467f56244c603a79c0d1dafce1d16c283c25f6e64", size = 184615, upload-time = "2026-01-10T09:22:42.442Z" },
+ { url = "https://files.pythonhosted.org/packages/77/fb/d3f9576691cae9253b51555f841bc6600bf0a983a461c79500ace5a5b364/websockets-16.0-cp311-cp311-win32.whl", hash = "sha256:5f451484aeb5cafee1ccf789b1b66f535409d038c56966d6101740c1614b86c6", size = 178246, upload-time = "2026-01-10T09:22:43.654Z" },
+ { url = "https://files.pythonhosted.org/packages/54/67/eaff76b3dbaf18dcddabc3b8c1dba50b483761cccff67793897945b37408/websockets-16.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7f0659570eefb578dacde98e24fb60af35350193e4f56e11190787bee77dac", size = 178684, upload-time = "2026-01-10T09:22:44.941Z" },
+ { url = "https://files.pythonhosted.org/packages/84/7b/bac442e6b96c9d25092695578dda82403c77936104b5682307bd4deb1ad4/websockets-16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:71c989cbf3254fbd5e84d3bff31e4da39c43f884e64f2551d14bb3c186230f00", size = 177365, upload-time = "2026-01-10T09:22:46.787Z" },
+ { url = "https://files.pythonhosted.org/packages/b0/fe/136ccece61bd690d9c1f715baaeefd953bb2360134de73519d5df19d29ca/websockets-16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8b6e209ffee39ff1b6d0fa7bfef6de950c60dfb91b8fcead17da4ee539121a79", size = 175038, upload-time = "2026-01-10T09:22:47.999Z" },
+ { url = "https://files.pythonhosted.org/packages/40/1e/9771421ac2286eaab95b8575b0cb701ae3663abf8b5e1f64f1fd90d0a673/websockets-16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:86890e837d61574c92a97496d590968b23c2ef0aeb8a9bc9421d174cd378ae39", size = 175328, upload-time = "2026-01-10T09:22:49.809Z" },
+ { url = "https://files.pythonhosted.org/packages/18/29/71729b4671f21e1eaa5d6573031ab810ad2936c8175f03f97f3ff164c802/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9b5aca38b67492ef518a8ab76851862488a478602229112c4b0d58d63a7a4d5c", size = 184915, upload-time = "2026-01-10T09:22:51.071Z" },
+ { url = "https://files.pythonhosted.org/packages/97/bb/21c36b7dbbafc85d2d480cd65df02a1dc93bf76d97147605a8e27ff9409d/websockets-16.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e0334872c0a37b606418ac52f6ab9cfd17317ac26365f7f65e203e2d0d0d359f", size = 186152, upload-time = "2026-01-10T09:22:52.224Z" },
+ { url = "https://files.pythonhosted.org/packages/4a/34/9bf8df0c0cf88fa7bfe36678dc7b02970c9a7d5e065a3099292db87b1be2/websockets-16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a0b31e0b424cc6b5a04b8838bbaec1688834b2383256688cf47eb97412531da1", size = 185583, upload-time = "2026-01-10T09:22:53.443Z" },
+ { url = "https://files.pythonhosted.org/packages/47/88/4dd516068e1a3d6ab3c7c183288404cd424a9a02d585efbac226cb61ff2d/websockets-16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:485c49116d0af10ac698623c513c1cc01c9446c058a4e61e3bf6c19dff7335a2", size = 184880, upload-time = "2026-01-10T09:22:55.033Z" },
+ { url = "https://files.pythonhosted.org/packages/91/d6/7d4553ad4bf1c0421e1ebd4b18de5d9098383b5caa1d937b63df8d04b565/websockets-16.0-cp312-cp312-win32.whl", hash = "sha256:eaded469f5e5b7294e2bdca0ab06becb6756ea86894a47806456089298813c89", size = 178261, upload-time = "2026-01-10T09:22:56.251Z" },
+ { url = "https://files.pythonhosted.org/packages/c3/f0/f3a17365441ed1c27f850a80b2bc680a0fa9505d733fe152fdf5e98c1c0b/websockets-16.0-cp312-cp312-win_amd64.whl", hash = "sha256:5569417dc80977fc8c2d43a86f78e0a5a22fee17565d78621b6bb264a115d4ea", size = 178693, upload-time = "2026-01-10T09:22:57.478Z" },
+ { url = "https://files.pythonhosted.org/packages/cc/9c/baa8456050d1c1b08dd0ec7346026668cbc6f145ab4e314d707bb845bf0d/websockets-16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:878b336ac47938b474c8f982ac2f7266a540adc3fa4ad74ae96fea9823a02cc9", size = 177364, upload-time = "2026-01-10T09:22:59.333Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/0c/8811fc53e9bcff68fe7de2bcbe75116a8d959ac699a3200f4847a8925210/websockets-16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:52a0fec0e6c8d9a784c2c78276a48a2bdf099e4ccc2a4cad53b27718dbfd0230", size = 175039, upload-time = "2026-01-10T09:23:01.171Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/82/39a5f910cb99ec0b59e482971238c845af9220d3ab9fa76dd9162cda9d62/websockets-16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e6578ed5b6981005df1860a56e3617f14a6c307e6a71b4fff8c48fdc50f3ed2c", size = 175323, upload-time = "2026-01-10T09:23:02.341Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/28/0a25ee5342eb5d5f297d992a77e56892ecb65e7854c7898fb7d35e9b33bd/websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:95724e638f0f9c350bb1c2b0a7ad0e83d9cc0c9259f3ea94e40d7b02a2179ae5", size = 184975, upload-time = "2026-01-10T09:23:03.756Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/66/27ea52741752f5107c2e41fda05e8395a682a1e11c4e592a809a90c6a506/websockets-16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0204dc62a89dc9d50d682412c10b3542d748260d743500a85c13cd1ee4bde82", size = 186203, upload-time = "2026-01-10T09:23:05.01Z" },
+ { url = "https://files.pythonhosted.org/packages/37/e5/8e32857371406a757816a2b471939d51c463509be73fa538216ea52b792a/websockets-16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:52ac480f44d32970d66763115edea932f1c5b1312de36df06d6b219f6741eed8", size = 185653, upload-time = "2026-01-10T09:23:06.301Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/67/f926bac29882894669368dc73f4da900fcdf47955d0a0185d60103df5737/websockets-16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6e5a82b677f8f6f59e8dfc34ec06ca6b5b48bc4fcda346acd093694cc2c24d8f", size = 184920, upload-time = "2026-01-10T09:23:07.492Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/a1/3d6ccdcd125b0a42a311bcd15a7f705d688f73b2a22d8cf1c0875d35d34a/websockets-16.0-cp313-cp313-win32.whl", hash = "sha256:abf050a199613f64c886ea10f38b47770a65154dc37181bfaff70c160f45315a", size = 178255, upload-time = "2026-01-10T09:23:09.245Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/ae/90366304d7c2ce80f9b826096a9e9048b4bb760e44d3b873bb272cba696b/websockets-16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3425ac5cf448801335d6fdc7ae1eb22072055417a96cc6b31b3861f455fbc156", size = 178689, upload-time = "2026-01-10T09:23:10.483Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/1d/e88022630271f5bd349ed82417136281931e558d628dd52c4d8621b4a0b2/websockets-16.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8cc451a50f2aee53042ac52d2d053d08bf89bcb31ae799cb4487587661c038a0", size = 177406, upload-time = "2026-01-10T09:23:12.178Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/78/e63be1bf0724eeb4616efb1ae1c9044f7c3953b7957799abb5915bffd38e/websockets-16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:daa3b6ff70a9241cf6c7fc9e949d41232d9d7d26fd3522b1ad2b4d62487e9904", size = 175085, upload-time = "2026-01-10T09:23:13.511Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/f4/d3c9220d818ee955ae390cf319a7c7a467beceb24f05ee7aaaa2414345ba/websockets-16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:fd3cb4adb94a2a6e2b7c0d8d05cb94e6f1c81a0cf9dc2694fb65c7e8d94c42e4", size = 175328, upload-time = "2026-01-10T09:23:14.727Z" },
+ { url = "https://files.pythonhosted.org/packages/63/bc/d3e208028de777087e6fb2b122051a6ff7bbcca0d6df9d9c2bf1dd869ae9/websockets-16.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:781caf5e8eee67f663126490c2f96f40906594cb86b408a703630f95550a8c3e", size = 185044, upload-time = "2026-01-10T09:23:15.939Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/6e/9a0927ac24bd33a0a9af834d89e0abc7cfd8e13bed17a86407a66773cc0e/websockets-16.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:caab51a72c51973ca21fa8a18bd8165e1a0183f1ac7066a182ff27107b71e1a4", size = 186279, upload-time = "2026-01-10T09:23:17.148Z" },
+ { url = "https://files.pythonhosted.org/packages/b9/ca/bf1c68440d7a868180e11be653c85959502efd3a709323230314fda6e0b3/websockets-16.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19c4dc84098e523fd63711e563077d39e90ec6702aff4b5d9e344a60cb3c0cb1", size = 185711, upload-time = "2026-01-10T09:23:18.372Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/f8/fdc34643a989561f217bb477cbc47a3a07212cbda91c0e4389c43c296ebf/websockets-16.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a5e18a238a2b2249c9a9235466b90e96ae4795672598a58772dd806edc7ac6d3", size = 184982, upload-time = "2026-01-10T09:23:19.652Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/d1/574fa27e233764dbac9c52730d63fcf2823b16f0856b3329fc6268d6ae4f/websockets-16.0-cp314-cp314-win32.whl", hash = "sha256:a069d734c4a043182729edd3e9f247c3b2a4035415a9172fd0f1b71658a320a8", size = 177915, upload-time = "2026-01-10T09:23:21.458Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/f1/ae6b937bf3126b5134ce1f482365fde31a357c784ac51852978768b5eff4/websockets-16.0-cp314-cp314-win_amd64.whl", hash = "sha256:c0ee0e63f23914732c6d7e0cce24915c48f3f1512ec1d079ed01fc629dab269d", size = 178381, upload-time = "2026-01-10T09:23:22.715Z" },
+ { url = "https://files.pythonhosted.org/packages/06/9b/f791d1db48403e1f0a27577a6beb37afae94254a8c6f08be4a23e4930bc0/websockets-16.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:a35539cacc3febb22b8f4d4a99cc79b104226a756aa7400adc722e83b0d03244", size = 177737, upload-time = "2026-01-10T09:23:24.523Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/40/53ad02341fa33b3ce489023f635367a4ac98b73570102ad2cdd770dacc9a/websockets-16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b784ca5de850f4ce93ec85d3269d24d4c82f22b7212023c974c401d4980ebc5e", size = 175268, upload-time = "2026-01-10T09:23:25.781Z" },
+ { url = "https://files.pythonhosted.org/packages/74/9b/6158d4e459b984f949dcbbb0c5d270154c7618e11c01029b9bbd1bb4c4f9/websockets-16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:569d01a4e7fba956c5ae4fc988f0d4e187900f5497ce46339c996dbf24f17641", size = 175486, upload-time = "2026-01-10T09:23:27.033Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/2d/7583b30208b639c8090206f95073646c2c9ffd66f44df967981a64f849ad/websockets-16.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50f23cdd8343b984957e4077839841146f67a3d31ab0d00e6b824e74c5b2f6e8", size = 185331, upload-time = "2026-01-10T09:23:28.259Z" },
+ { url = "https://files.pythonhosted.org/packages/45/b0/cce3784eb519b7b5ad680d14b9673a31ab8dcb7aad8b64d81709d2430aa8/websockets-16.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:152284a83a00c59b759697b7f9e9cddf4e3c7861dd0d964b472b70f78f89e80e", size = 186501, upload-time = "2026-01-10T09:23:29.449Z" },
+ { url = "https://files.pythonhosted.org/packages/19/60/b8ebe4c7e89fb5f6cdf080623c9d92789a53636950f7abacfc33fe2b3135/websockets-16.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bc59589ab64b0022385f429b94697348a6a234e8ce22544e3681b2e9331b5944", size = 186062, upload-time = "2026-01-10T09:23:31.368Z" },
+ { url = "https://files.pythonhosted.org/packages/88/a8/a080593f89b0138b6cba1b28f8df5673b5506f72879322288b031337c0b8/websockets-16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206", size = 185356, upload-time = "2026-01-10T09:23:32.627Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/b6/b9afed2afadddaf5ebb2afa801abf4b0868f42f8539bfe4b071b5266c9fe/websockets-16.0-cp314-cp314t-win32.whl", hash = "sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6", size = 178085, upload-time = "2026-01-10T09:23:33.816Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/3e/28135a24e384493fa804216b79a6a6759a38cc4ff59118787b9fb693df93/websockets-16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd", size = 178531, upload-time = "2026-01-10T09:23:35.016Z" },
+ { url = "https://files.pythonhosted.org/packages/72/07/c98a68571dcf256e74f1f816b8cc5eae6eb2d3d5cfa44d37f801619d9166/websockets-16.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:349f83cd6c9a415428ee1005cadb5c2c56f4389bc06a9af16103c3bc3dcc8b7d", size = 174947, upload-time = "2026-01-10T09:23:36.166Z" },
+ { url = "https://files.pythonhosted.org/packages/7e/52/93e166a81e0305b33fe416338be92ae863563fe7bce446b0f687b9df5aea/websockets-16.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:4a1aba3340a8dca8db6eb5a7986157f52eb9e436b74813764241981ca4888f03", size = 175260, upload-time = "2026-01-10T09:23:37.409Z" },
+ { url = "https://files.pythonhosted.org/packages/56/0c/2dbf513bafd24889d33de2ff0368190a0e69f37bcfa19009ef819fe4d507/websockets-16.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f4a32d1bd841d4bcbffdcb3d2ce50c09c3909fbead375ab28d0181af89fd04da", size = 176071, upload-time = "2026-01-10T09:23:39.158Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/8f/aea9c71cc92bf9b6cc0f7f70df8f0b420636b6c96ef4feee1e16f80f75dd/websockets-16.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0298d07ee155e2e9fda5be8a9042200dd2e3bb0b8a38482156576f863a9d457c", size = 176968, upload-time = "2026-01-10T09:23:41.031Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/3f/f70e03f40ffc9a30d817eef7da1be72ee4956ba8d7255c399a01b135902a/websockets-16.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a653aea902e0324b52f1613332ddf50b00c06fdaf7e92624fbf8c77c78fa5767", size = 178735, upload-time = "2026-01-10T09:23:42.259Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
+]
+
+[[package]]
+name = "zipp"
+version = "3.23.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/30/21/093488dfc7cc8964ded15ab726fad40f25fd3d788fd741cc1c5a17d78ee8/zipp-3.23.1.tar.gz", hash = "sha256:32120e378d32cd9714ad503c1d024619063ec28aad2248dc6672ad13edfa5110", size = 25965, upload-time = "2026-04-13T23:21:46.6Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/08/8a/0861bec20485572fbddf3dfba2910e38fe249796cb73ecdeb74e07eeb8d3/zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc", size = 10378, upload-time = "2026-04-13T23:21:45.386Z" },
+]