# src/rl/math_environment_curriculum.py
"""
Curriculum-aware math environment with dual reward signals.
This file is deliberately minimal: a single ``collect_rollouts`` method is all
the training loop needs. Rollouts and PPO updates run in the same process on
a single GPU — no subprocesses, no RPC, no vLLM colocation.
"""
from __future__ import annotations
import logging
import random
import re
from dataclasses import asdict, dataclass
from typing import Any, Dict, List, Optional, Tuple
import torch
from sympy import simplify
from sympy.parsing.sympy_parser import parse_expr
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.config.prompts import create_generator_messages, create_solver_messages
from src.rl.curriculum_manager import CurriculumManager
from src.rl.expert_panel import SimulatedExpertPanel
from src.rl.mdp_components import Action, State, Trajectory, Transition
from src.rl.prm_scorer import ProcessRewardScorer
from src.rl.quality_filter import QualityFilter
from src.rl.question_quality_evaluator import QuestionQualityEvaluator
from src.rl.replay_buffer import GenerationalReplayBuffer
from src.rl.value_network import ValueHead
from src.sft.solution_format import extract_final_answer_numeric_str
from src.sft.sympy_normalize import normalize_for_parse_expr
logger = logging.getLogger(__name__)
@dataclass
class TrajectoryMetadata:
curriculum_iteration: int
target_topic: str
target_difficulty: float
instruction: str
generated_question: str
generated_solution: str
question_length: int
solution_length: int
detected_topic: str
detected_secondary_topics: List[str]
topic_match_score: float
estimated_difficulty: float
clarity_score: float
novelty_scores: Dict[str, float]
consensus_achieved: bool
consensus_strength: float
answer_diversity: int
majority_answer: Optional[float]
primary_matches_majority: bool
sympy_verified: bool
steps_total: int
steps_verified_ok: int
steps_failed: int
final_answer_ok: bool
question_reward: float
solution_reward: float
pre_expert_reward: float
expert_reward_modifier: float
expert_phase: str
expert_feedback: str
replay_candidate: bool
replay_novelty: float
replay_added: bool
combined_reward: float
reward_breakdown: Dict[str, object]
topics_in_sweet_spot: List[str]
current_focus_topics: List[str]
curriculum_state_snapshot: Dict[str, object]
class CurriculumMathEnvironment:
"""Standalone curriculum environment with PRM-based rewards and GRPO training support."""
def __init__(
self,
policy_model: AutoModelForCausalLM,
value_model: Optional[ValueHead],
tokenizer: AutoTokenizer,
reference_questions: Optional[List[str]] = None,
grounded_qa_pairs: Optional[List[Dict[str, str]]] = None,
prm_scorer: Optional[ProcessRewardScorer] = None,
curriculum_checkpoint_dir: str = "checkpoints/curriculum",
max_question_tokens: int = 200,
max_solution_tokens: int = 500,
temperature: float = 0.7,
top_p: float = 0.9,
consensus_temperature: float = 0.7,
device: Optional[torch.device] = None,
unified_accuracy_calc: Optional[Any] = None,
):
# ── Core model attributes (used by generation helpers) ───────────
self.policy = policy_model
self.value = value_model
self.tokenizer = tokenizer
self.max_question_tokens = max_question_tokens
self.max_solution_tokens = max_solution_tokens
self.temperature = temperature
self.top_p = top_p
if device is not None:
self.device = torch.device(device)
else:
try:
self.device = next(policy_model.parameters()).device
except StopIteration:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.reference_questions = reference_questions or []
self.grounded_qa_pairs: List[Dict[str, str]] = [
qa for qa in (grounded_qa_pairs or [])
if qa.get("question") and qa.get("gold_final")
]
self.consensus_temperature = consensus_temperature
self.curriculum_manager = CurriculumManager(checkpoint_dir=curriculum_checkpoint_dir)
self.curriculum_manager.initialize(bootstrap_questions=self.reference_questions)
self.curriculum_manager.load_checkpoint_safe()
self.question_evaluator = QuestionQualityEvaluator(
reference_questions=self.reference_questions
)
# PRM is the sole process-quality signal. Passing prm_scorer=None
# makes compute_reward raise at call time (GRPO training always
# supplies the PRM); compute_grounded_reward instead falls back to
# a gt-match + format blend when no PRM is available.
self.prm_scorer = prm_scorer
# Unified accuracy calculator — activated on Phase 2+ transition.
# When use_chain_scoring is True, chain_integrity_score from this
# calculator replaces PRM-based process_score in both grounded and
# self-play reward paths.
self.unified_accuracy_calc: Optional[Any] = unified_accuracy_calc
self.use_chain_scoring: bool = False
self.expert_panel = SimulatedExpertPanel()
self.replay_buffer = GenerationalReplayBuffer(max_size=500)
self.quality_filter = QualityFilter(novelty_threshold=0.5)
self.last_replay_ratio: float = 0.0
self.last_rollout_mix: Dict[str, int] = {
"fresh": 0,
"replay": 0,
"grounded": 0,
}
# Running counts for the most recent grounded batch, so the training
# script can log grounded accuracy per iteration without re-parsing
# trajectory metadata.
self.last_grounded_stats: Dict[str, float] = {
"count": 0,
"correct": 0,
"accuracy": 0.0,
"mean_reward": 0.0,
}
def sample_instruction(self) -> Tuple[str, str, float]:
topic, difficulty = self.curriculum_manager.select_topic_and_difficulty()
instruction = self.curriculum_manager.generate_instruction(
topic=topic, target_difficulty=difficulty
)
return instruction, topic, difficulty
def format_solution_prompt(self, question: str) -> str:
"""Format a question into a chat-templated solver prompt."""
messages = create_solver_messages(question)
return self.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
def format_question_generation_prompt(self, instruction: str) -> str:
"""Format a curriculum instruction into a chat-templated generator prompt."""
messages = create_generator_messages(instruction)
return self.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
def generate_with_logging(
self,
initial_prompt: str,
max_tokens: int,
phase: str,
) -> Tuple[str, List[Transition]]:
"""
Generate text with per-step PPO-grade transition logging.
Used by the PPO-compatible rollout methods (``collect_rollouts``,
``rollout_trajectory``, ``rollout_grounded_trajectory``). The GRPO
training loop uses ``generate_solutions_batched`` instead.
"""
import torch.nn.functional as F # local import to keep top-level clean
prompt_ids = self.tokenizer.encode(
initial_prompt, return_tensors="pt"
).to(self.device)
prompt_length = prompt_ids.shape[1]
prompt_attn = torch.ones_like(prompt_ids)
temperature = float(self.temperature)
do_sample = temperature > 1e-4
eos_id = self.tokenizer.eos_token_id
# Compare against None explicitly: a plain ``or`` would misfire on a
# legitimate pad_token_id of 0.
pad_id = (
self.tokenizer.pad_token_id
if self.tokenizer.pad_token_id is not None
else eos_id
)
gen_kwargs: Dict[str, Any] = dict(
input_ids=prompt_ids,
attention_mask=prompt_attn,
max_new_tokens=max_tokens,
do_sample=do_sample,
use_cache=True,
output_logits=True,
return_dict_in_generate=True,
pad_token_id=pad_id,
eos_token_id=eos_id,
)
if do_sample:
gen_kwargs["temperature"] = max(temperature, 1e-6)
gen_kwargs["top_p"] = float(self.top_p)
with torch.no_grad():
gen_out = self.policy.generate(**gen_kwargs)
full_ids = gen_out.sequences # [1, P + T]
T_gen = int(full_ids.shape[1] - prompt_length)
if T_gen <= 0:
return "", []
raw_logits = torch.stack([lg[0] for lg in gen_out.logits], dim=0).float()
raw_log_probs = F.log_softmax(raw_logits, dim=-1)
sampled_tokens = full_ids[0, prompt_length:]
chosen_log_probs = raw_log_probs.gather(
1, sampled_tokens.unsqueeze(1)
).squeeze(1)
entropies = -(raw_log_probs.exp() * raw_log_probs).sum(dim=-1)
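# Value indexing note: V(s_t) is read at the position of the last token
# of the state, i.e. the token *preceding* each sampled action, hence
# the ``prompt_length - 1`` offset below.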
positions = torch.arange(
prompt_length - 1, prompt_length + T_gen - 1, device=self.device
)
full_attn = torch.ones_like(full_ids)
if self.value is not None:
values = self.value.values_at_positions(
input_ids=full_ids, positions=positions, attention_mask=full_attn
)
else:
values = torch.zeros(T_gen, device=self.device)
piece_by_piece: List[str] = self.tokenizer.batch_decode(
[[tok.item()] for tok in sampled_tokens], skip_special_tokens=False
)
transitions: List[Transition] = []
running_text = initial_prompt
for t in range(T_gen):
state_input_ids = full_ids[0, : prompt_length + t]
current_state = State(
text=running_text,
input_ids=state_input_ids,
attention_mask=torch.ones_like(state_input_ids),
phase=phase,
)
action_token = int(sampled_tokens[t].item())
action = Action(
token_id=action_token,
log_prob=float(chosen_log_probs[t].item()),
entropy=float(entropies[t].item()),
)
next_text = running_text + piece_by_piece[t]
next_input_ids = full_ids[0, : prompt_length + t + 1]
next_state = State(
text=next_text,
input_ids=next_input_ids,
attention_mask=torch.ones_like(next_input_ids),
phase=phase,
)
is_done = eos_id is not None and action_token == eos_id
transitions.append(
Transition(
state=current_state,
action=action,
reward=0.0,
next_state=next_state,
value=float(values[t].item()),
done=is_done,
)
)
running_text = next_text
if is_done:
break
generated_ids = full_ids[0, prompt_length : prompt_length + len(transitions)]
generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
return generated_text, transitions
def _compute_format_score(self, solution: str) -> float:
"""
Structural format score based purely on text patterns — no SymPy.
Checks:
- Presence of 'Step N:' lines (multi-step structure)
- Presence of 'Final Answer:' line (correct termination)
- Length: ≥2 step lines scores highest
Returns a score in [0, 1].
"""
lines = solution.splitlines()
step_lines = [l for l in lines if re.match(r"^\s*Step\s+\d+\s*:", l)]
has_final = any(re.match(r"^\s*Final Answer\s*:", l, re.IGNORECASE) for l in lines)
n_steps = len(step_lines)
if n_steps >= 2:
length_bonus = 1.0
elif n_steps == 1:
length_bonus = 0.5
else:
length_bonus = 0.0
final_ok = 1.0 if has_final else 0.0
# 0.7 × step-structure + 0.3 × final-answer presence
return max(0.0, min(1.0, 0.7 * length_bonus + 0.3 * final_ok))
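# Worked example (illustrative, not executed): a solution such as
#   "Step 1: ...\nStep 2: ...\nFinal Answer: 42"
# has two step lines plus a Final Answer line, scoring
# 0.7*1.0 + 0.3*1.0 = 1.0; one step with no Final Answer line scores
# 0.7*0.5 + 0.3*0.0 = 0.35.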
def compute_reward(
self,
question: str,
solution: str,
target_topic: str,
target_difficulty: float,
) -> Dict[str, object]:
# With a PRM scorer plugged in we skip the expensive (and noisy)
# TripleVerifier consensus step. PRM gives per-step correctness
# against the actual question semantics, which is strictly better
# than "do 3 independent samples agree?"
if self.prm_scorer is not None:
return self._compute_reward_with_prm(
question=question,
solution=solution,
target_topic=target_topic,
target_difficulty=target_difficulty,
)
raise RuntimeError(
"compute_reward called without a PRM scorer. "
"CurriculumMathEnvironment requires prm_scorer to be set. "
"Pass prm_scorer=ProcessRewardScorer(...) at construction time."
)
def _compute_reward_with_prm(
self,
question: str,
solution: str,
target_topic: str,
target_difficulty: float,
) -> Dict[str, object]:
"""
Self-play reward using Qwen2.5-Math-PRM as the semantic-correctness
signal. PRM gives per-step probabilities that each reasoning step
is correct *given the question* — exactly the signal consensus
voting was supposed to approximate but couldn't (three samples
from the same policy agree on wrong answers).
Solution reward (PRM path):
R_sol = 0.45·prm_final + 0.35·prm_mean + 0.20·lccp
R = 0.4·R_q + 0.6·R_sol (then expert-panel modifier)
* ``prm_final`` (final step score) is the strongest predictor of
overall answer correctness.
* ``prm_mean`` provides a smooth gradient over all steps.
* ``lccp`` (Longest Correct Consecutive Prefix) rewards chain
integrity — consecutive correct steps before the first failure.
* The 0.4/0.6 Q/Sol split boosts gradient to question-generation
without starving the solution-correctness signal.
"""
assert self.prm_scorer is not None, "caller must check self.prm_scorer"
prm_result = self.prm_scorer.score_solution(
question=question, solution=solution
)
format_score = self._compute_format_score(solution)
prm_mean = float(prm_result.get("mean_score", 0.0))
prm_min = float(prm_result.get("min_score", 0.0))
prm_final = float(prm_result.get("final_score", 0.0))
prm_num_steps = int(prm_result.get("num_steps", 0))
prm_degraded = bool(prm_result.get("degraded", False))
# If the PRM degraded (empty solution, tokeniser mismatch, truncation),
# the output is effectively unparseable. Prior behavior was to fall
# back on SymPy+format, but the upstream ``base_combined_score`` also
# blends in the question reward — so the policy got a positive signal
# for producing a broken solution as long as the *question* looked
# fine. We now treat a degraded PRM as a hard zero on the solution
# reward; the question reward is gated below so the full combined
# score also collapses.
if prm_degraded or prm_num_steps == 0:
solution_reward = 0.0
_sp_lccp = 0.0
sol_valid = False
_sp_chain_integrity: Optional[float] = None
logger.info(
"PRM degraded (%s); sol_reward set to 0.0 (format=%.2f).",
prm_result.get("degraded_reason", "unknown"),
format_score,
)
else:
# LCCP for self-play: same chain-integrity measure as grounded path
_sp_step_scores = prm_result.get("step_scores", []) or []
if _sp_step_scores:
_first_fail = next(
(i for i, s in enumerate(_sp_step_scores) if s <= 0.5),
len(_sp_step_scores),
)
_sp_lccp = _first_fail / len(_sp_step_scores)
else:
_sp_lccp = 0.0
# Self-play solution: PRM-only reward blending mean, final & chain integrity.
# LCCP anchors the grade to *consecutive* correctness, not just bag-of-steps.
solution_reward = (
0.45 * prm_final
+ 0.35 * prm_mean
+ 0.20 * _sp_lccp
)
# Phase 2+ chain scoring: replace PRM solution blend with unified
# chain integrity + dependency consistency. This also populates the
# question_score from the unified calculator so the Q/Sol weighting
# below uses chain-verified signals instead of PRM proxies.
_sp_chain_integrity = None
if self.use_chain_scoring and self.unified_accuracy_calc is not None:
try:
_sp_report = self.unified_accuracy_calc.compute(
solution=solution,
gold_answer=None,
question=question,
topic=target_topic,
phase="selfplay",
)
solution_reward = _sp_report.composite_accuracy
_sp_chain_integrity = _sp_report.chain_integrity_score
except Exception as _sp_exc:
logger.debug("Unified accuracy calc (self-play) failed: %s", _sp_exc)
sol_valid = True
solution_reward = max(0.0, min(1.0, solution_reward))
question_result = self.question_evaluator.evaluate(
question=question,
solution=solution,
# Synthesize a "consensus-equivalent" dict so the question
# evaluator keeps working unchanged. PRM mean score stands
# in for consensus strength since both are correctness proxies.
consensus_result={
"has_majority": prm_mean >= 0.5,
"consensus_strength": prm_mean,
"primary_matches_majority": prm_mean >= 0.5,
"answer_diversity": 0,
"majority_answer": None,
"primary_answer": None,
},
target_topic=target_topic,
target_difficulty=target_difficulty,
)
question_reward = float(question_result["overall_score"])
# Gate the question-quality bonus on having a parseable solution.
# A great-looking question with a broken solution is not progress
# toward self-improvement — it's the policy gaming whichever
# signal is easier to produce.
effective_question_reward = question_reward if sol_valid else 0.0
# Q/Sol = 0.4/0.6 (see the weighting rationale in this method's docstring).
base_combined_score = (
0.4 * effective_question_reward + 0.6 * solution_reward
)
# Format floor: if the solution structure is broken (<0.5 format),
# cap the overall reward at 0.3 regardless of how much the PRM
# likes the prose. Previously we saw combined=0.83 with
# Format=0.30, i.e. the PRM "approved" an output that didn't have
# parseable Step/Final Answer lines — pure reward hacking.
format_floor_active = format_score < 0.5
format_cap = 0.3 if format_floor_active else 1.0
base_combined_score = min(base_combined_score, format_cap)
# Novelty gate: prevent template-copying reward hacking.
# If the model just generates "John has X apples..." with different numbers,
# n-gram similarity to the reference corpus is high → dataset_novelty is LOW.
# We cap the reward to discourage this without penalising genuinely novel questions.
# < 0.20: near-copy of a training question (template + new variables) → cap 0.35
# > 0.85: completely off-domain (not a real math problem style) → cap 0.55
# [0.20, 0.85]: Goldilocks zone → full reward (novelty_cap = 1.0)
_dataset_novelty = float(
question_result.get("novelty", {}).get("dataset_novelty", 0.5)
if isinstance(question_result.get("novelty"), dict)
else 0.5
)
if _dataset_novelty < 0.20:
_novelty_cap = 0.35
elif _dataset_novelty > 0.85:
_novelty_cap = 0.55
else:
_novelty_cap = 1.0
if _novelty_cap < 1.0:
# Capture the pre-cap score first: reconstructing it afterwards by
# dividing by the cap gives the wrong "was" value.
_pre_cap_score = base_combined_score
base_combined_score = min(base_combined_score, _novelty_cap)
logger.debug(
"Novelty gate: dataset_novelty=%.2f → cap=%.2f (was %.3f → now %.3f)",
_dataset_novelty,
_novelty_cap,
_pre_cap_score,
base_combined_score,
)
expert_adjustment = self.expert_panel.apply_expert_preferences(
base_reward=base_combined_score,
question_metrics=question_result,
solution_metrics={
# Only format_compliance still influences shaping — the
# PRM/correctness signal lives inside ``solution_reward``
# already and must not be double-counted here.
"format_compliance": format_score,
},
iteration=self.curriculum_manager.current_iteration,
)
combined_score = float(expert_adjustment["adjusted_reward"])
# Re-clip after additive shaping + respect the format cap one more
# time so the shaping can't lift a badly-formatted solution back
# above the cap.
combined_score = max(0.0, min(format_cap, combined_score))
# Curriculum mastery: consider self-play solution "successful" when
# both the chain mean AND the final concluding step are above threshold.
# Using prm_final as a required condition prevents a solution that gets
# most steps right but fails the conclusion from being marked "mastered".
solution_success = (
(not prm_degraded)
and (prm_mean >= 0.65)
and (prm_final >= 0.50)
)
self.curriculum_manager.update_from_trajectory(
topic=target_topic,
question_reward=question_reward,
solution_success=solution_success,
combined_reward=combined_score,
measured_difficulty=float(question_result["measured_difficulty"]),
)
modifier_val = float(expert_adjustment.get("reward_modifier", 0.0))
floor_tag = " FLOOR" if format_floor_active else ""
valid_tag = "" if sol_valid else " [SOL_INVALID]"
logger.info(
"PRM reward%s: combined=%.3f = clip(base=%.3f + mod=%+.3f, cap=%.2f)%s "
"| Q=%.2f sol=%.3f novelty=%.2f | "
"sol=0.45*prm_final(%.2f)+0.35*prm_mean(%.2f)+0.20*lccp(%.2f) "
"| steps=%d",
valid_tag,
combined_score,
base_combined_score,
modifier_val,
format_cap,
floor_tag,
effective_question_reward,
solution_reward,
_dataset_novelty,
prm_final,
prm_mean,
_sp_lccp if sol_valid else 0.0,
prm_num_steps,
)
# Shape a consensus-style verification_details dict so downstream
# aggregation (which reads these keys) keeps working unchanged.
verification_details = {
"consensus": {
"has_majority": prm_mean >= 0.5,
"consensus_strength": prm_mean,
"primary_matches_majority": prm_mean >= 0.5,
"answer_diversity": 0,
"majority_answer": None,
"primary_answer": extract_final_answer_numeric_str(solution) or None,
"prm_mean_score": prm_mean,
"prm_min_score": prm_min,
"prm_final_score": prm_final,
"prm_step_scores": prm_result.get("step_scores", []),
"prm_num_steps": prm_num_steps,
"prm_degraded": prm_degraded,
},
}
return {
"combined_score": combined_score,
"base_combined_score": base_combined_score,
"effective_question_reward": effective_question_reward, # gated (0 when sol invalid)
"question_metrics": question_result,
"solution_metrics": {
"overall_score": solution_reward,
"correctness": prm_mean,
"format_compliance": format_score,
"efficiency": prm_mean, # legacy slot
"consensus_score": prm_mean, # legacy slot
"prm_mean_score": prm_mean,
"prm_min_score": prm_min,
"prm_final_score": prm_final,
"prm_step_scores": prm_result.get("step_scores", []),
"prm_num_steps": prm_num_steps,
"prm_degraded": prm_degraded,
"verification_details": verification_details,
},
"curriculum_metrics": {
"target_topic": target_topic,
"target_difficulty": target_difficulty,
"detected_topic": question_result["detected_topic"],
"measured_difficulty": question_result["measured_difficulty"],
},
"expert_metrics": expert_adjustment,
# Chain scoring metrics (Phase 2+; None when use_chain_scoring=False)
"sp_chain_integrity_score": _sp_chain_integrity,
}
# ------------------------------------------------------------------
# Grounded (GSM8K-anchored) rollouts
# ------------------------------------------------------------------
#
# Why this exists: self-play rewards were originally dominated by consensus
# voting between 3 same-model samples, which correlates poorly with GSM8K
# accuracy (all three samples can be wrong in the same way), and even the
# current PRM-based self-play reward is only a correctness proxy. For the
# grounded path we solve a known GSM8K problem and score the solution
# directly against the gold final answer, which is the only signal
# guaranteed to move the benchmark we actually evaluate on.
#
# The reward: R = 0.50·gt_match + 0.40·process(PRM) + 0.10·format
#
# * gt_match = 1.0 iff the model's Final Answer is mathematically
#   equivalent to the GSM8K gold final (via sympy.simplify on the
#   extracted numeric string); numeric near-misses earn partial
#   credit capped at 0.85 (see compute_grounded_reward).
# * process = 0.60·prm_final + 0.40·prm_mean (PRM step-level quality)
# * format rewards Step N: lines and a Final Answer: line.
#
# No TripleVerifier call on this path — ground truth obviates consensus.
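# Worked example (illustrative): gt_match=1.0, prm_final=0.90,
# prm_mean=0.80, format=1.0 →
#   process = 0.60*0.90 + 0.40*0.80 = 0.86
#   R = 0.50*1.0 + 0.40*0.86 + 0.10*1.0 = 0.944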
@staticmethod
def _norm_expr_for_match(s: str) -> str:
s = (s or "").strip()
s = s.replace("^", "**")
s = re.sub(r"[,$€£\s]+", "", s)
return s
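# Examples (illustrative): "$1,234" → "1234"; "2^3" → "2**3";
# "1 000 €" → "1000".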
@classmethod
def _answers_equivalent(cls, pred: str, gold: str) -> bool:
"""Return True iff ``pred`` and ``gold`` parse to the same number."""
if not pred or not gold:
return False
p = cls._norm_expr_for_match(pred)
g = cls._norm_expr_for_match(gold)
if p == g:
return True
try:
diff = simplify(
parse_expr(normalize_for_parse_expr(p))
- parse_expr(normalize_for_parse_expr(g))
)
return bool(diff == 0)
except Exception:
return False
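# Illustrative behaviour (assuming ``normalize_for_parse_expr`` passes
# plain numerals through unchanged): ("1,200", "1200") matches via the
# string path; ("3/2", "1.5") matches via SymPy, since
# simplify(parse_expr("3/2") - parse_expr("1.5")) == 0.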
def compute_grounded_reward(
self,
question: str,
solution: str,
gold_final: str,
) -> Dict[str, object]:
"""
Compute a ground-truth-anchored reward for a solution to a known
GSM8K problem. No TripleVerifier call — the gold final answer
replaces consensus voting as the semantic check.
"""
format_score = self._compute_format_score(solution)
pred_final = extract_final_answer_numeric_str(solution) or ""
gt_match_bool = self._answers_equivalent(pred_final, gold_final)
if gt_match_bool:
gt_match = 1.0
else:
# Soft numeric proximity: reward near-misses rather than cliffing at 0.
# Gives partial credit proportional to how close the numeric answer is.
# Capped at 0.85 so an exact match (1.0) is always strictly better.
# Non-numeric wrong answers still get 0.0.
try:
_p = float(pred_final.replace(",", "").strip())
_g = float(gold_final.replace(",", "").strip())
_denom = max(abs(_g), 1.0)
gt_match = min(0.85, 1.0 / (1.0 + 2.0 * abs(_p - _g) / _denom))
except (ValueError, TypeError, AttributeError):
gt_match = 0.0
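# Worked examples of the proximity curve (illustrative): pred=48 vs
# gold=50 → 1/(1 + 2*2/50) ≈ 0.926, capped to 0.85; pred=30 vs
# gold=50 → 1/(1 + 2*20/50) ≈ 0.556; pred=500 vs gold=50 → ≈ 0.05.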
# Optional PRM step-level quality on grounded rollouts.
# prm_final (last step score) is the strongest single predictor of
# answer correctness. step_accuracy = fraction of steps the PRM
# considers correct — the direct measure of reasoning process quality.
prm_mean = 0.0
prm_final = 0.0
prm_step_scores: List[float] = []
prm_num_steps = 0
prm_degraded = True
if self.prm_scorer is not None:
prm_result = self.prm_scorer.score_solution(
question=question, solution=solution
)
prm_degraded = bool(prm_result.get("degraded", False))
if not prm_degraded:
prm_mean = float(prm_result.get("mean_score", 0.0))
prm_final = float(prm_result.get("final_score", 0.0))
prm_step_scores = list(prm_result.get("step_scores", []))
prm_num_steps = int(prm_result.get("num_steps", 0))
# Step accuracy: fraction of individual steps rated correct by PRM.
step_accuracy = (
sum(1.0 for s in prm_step_scores if s > 0.5) / len(prm_step_scores)
if prm_step_scores else 0.0
)
# Longest Correct Consecutive Prefix (LCCP): fraction of steps from
# the start that are ALL rated correct before the first failure.
# This captures chain integrity — a broken step 3 makes steps 4+ invalid
# regardless of their individual PRM scores.
# LCCP=1.0 means every step was correct (necessary condition for right answer).
# LCCP=0.0 means step 1 itself was wrong (model never had a valid chain).
if prm_step_scores:
first_fail = next(
(i for i, s in enumerate(prm_step_scores) if s <= 0.5), len(prm_step_scores)
)
lccp = first_fail / len(prm_step_scores)
else:
lccp = 0.0
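# Worked example (illustrative): step_scores=[0.9, 0.8, 0.3, 0.7]
# gives step_accuracy=3/4=0.75 but first_fail=2, so lccp=2/4=0.5;
# the steps after the broken one no longer count toward chain
# integrity even if they score well individually.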
if self.prm_scorer is not None and not prm_degraded:
# process_score: weight prm_final (conclusion step) more than mean
# — the final step is the most critical and most predictive.
process_score = 0.60 * prm_final + 0.40 * prm_mean
combined = (
0.50 * gt_match
+ 0.40 * process_score
+ 0.10 * format_score
)
_gt_tag = "exact" if gt_match_bool else f"prox={gt_match:.2f}"
components_str = (
f"0.50×{gt_match:.2f}({_gt_tag}) + 0.40×proc({process_score:.3f}"
f"[fin={prm_final:.2f},mean={prm_mean:.2f}]) + "
f"0.10×fmt({format_score:.3f})"
)
else:
combined = 0.85 * gt_match + 0.15 * format_score
components_str = (
f"0.85×{gt_match:.2f} + 0.15×fmt({format_score:.3f})"
)
# Phase 2+ chain scoring: override process_score, step_accuracy, lccp,
# and combined with formally-verified chain integrity metrics.
# PRM is still called above so its scores remain logged for comparison.
_chain_report = None
if self.use_chain_scoring and self.unified_accuracy_calc is not None:
try:
_chain_report = self.unified_accuracy_calc.compute(
solution=solution,
gold_answer=gold_final,
topic="grounded",
phase="grounded",
)
process_score = _chain_report.chain_integrity_score
step_accuracy = _chain_report.step_arithmetic_score
lccp = _chain_report.lccp_score
combined = max(0.0, min(1.0,
0.50 * gt_match + 0.30 * process_score + 0.20 * lccp
))
components_str = (
f"0.50×{gt_match:.2f} + 0.30×chain({process_score:.3f}"
f"[arith={_chain_report.step_arithmetic_score:.2f},"
f"dep={_chain_report.step_dependency_score:.2f}]) + "
f"0.20×lccp({lccp:.3f})"
)
except Exception as _chain_exc:
logger.debug("Unified accuracy calc failed, keeping PRM scores: %s", _chain_exc)
else:
combined = max(0.0, min(1.0, combined))
# Hard negative mining: wrong-answer solutions still get a partial signal
# proportional to how far they got before the first error (LCCP).
# This prevents gradient starvation on hard problems where no solution in
# the group is fully correct — the model still learns "longer correct prefix
# is better" rather than receiving zero reward for all K samples.
if gt_match < 0.5 and lccp > 0.0 and self.prm_scorer is not None:
# Bonus = 0.15 × LCCP, capped so that a wrong answer (combined ≈ 0.40)
# can never exceed 0.55 — always well below a correct answer (≈ 0.90+).
_hnm_bonus = 0.15 * lccp
combined = min(combined + _hnm_bonus, 0.55)
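# Worked example (illustrative): gt_match=0.30, process=0.60,
# format=1.0 gives combined = 0.15 + 0.24 + 0.10 = 0.49; with
# lccp=0.5 the bonus is 0.075 and min(0.565, 0.55) = 0.55.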
_chain_depth = first_fail if prm_step_scores else 0
logger.info(
"Grounded reward: combined=%.3f = %s | pred=%r gold=%r | "
"step_acc=%.0f%% lccp=%.0f%% (chain=%d/%d ok_count=%d) n_steps=%d",
combined,
components_str,
pred_final,
gold_final,
100 * step_accuracy,
100 * lccp,
_chain_depth,
len(prm_step_scores),
sum(1 for s in prm_step_scores if s > 0.5),
prm_num_steps,
)
return {
"combined_score": combined,
"gt_match": gt_match_bool,
# process metrics
"step_accuracy": step_accuracy,
"lccp": lccp, # longest correct consecutive prefix ratio
"prm_mean_score": prm_mean,
"prm_final_score": prm_final,
"prm_step_scores": prm_step_scores,
"prm_num_steps": prm_num_steps,
"prm_degraded": prm_degraded,
# format / answer
"format_score": format_score,
"pred_final": pred_final,
"gold_final": gold_final,
# chain scoring metrics (populated in Phase 2+, None otherwise)
"chain_arith_score": _chain_report.step_arithmetic_score if _chain_report else None,
"chain_dep_score": _chain_report.step_dependency_score if _chain_report else None,
"chain_integrity_score": _chain_report.chain_integrity_score if _chain_report else None,
"first_failure_step": _chain_report.first_failure_step if _chain_report else None,
"final_consistent": _chain_report.final_answer_consistent if _chain_report else None,
}
def rollout_grounded_trajectory(self, qa_pair: Dict[str, str]) -> Trajectory:
"""
Run a rollout on a known GSM8K (question, gold_final) pair.
The policy generates a solution to the real question; reward is
dominated by whether the model's final number matches the gold
final (ground-truth-anchored).
"""
question = str(qa_pair["question"]).strip()
gold_final = str(qa_pair["gold_final"]).strip()
solution_prompt = self.format_solution_prompt(question)
generated_solution, solution_transitions = self.generate_with_logging(
initial_prompt=solution_prompt,
max_tokens=self.max_solution_tokens,
phase="grounded_solution",
)
reward_result = self.compute_grounded_reward(
question=question,
solution=generated_solution,
gold_final=gold_final,
)
terminal_reward = float(reward_result["combined_score"])
trajectory = Trajectory()
for idx, transition in enumerate(solution_transitions):
transition.reward = (
terminal_reward if idx == len(solution_transitions) - 1 else 0.0
)
trajectory.add(transition)
metadata = {
"rollout_source": "grounded",
"curriculum_iteration": self.curriculum_manager.current_iteration,
"target_topic": "grounded_gsm8k",
"target_difficulty": 0.5,
"instruction": "",
"generated_question": question,
"generated_solution": generated_solution,
"question_length": 0,
"solution_length": len(solution_transitions),
"detected_topic": "grounded_gsm8k",
"detected_secondary_topics": [],
"topic_match_score": 1.0,
"estimated_difficulty": 0.5,
"clarity_score": 1.0,
"novelty_scores": {"combined": 0.0},
"consensus_achieved": bool(reward_result["gt_match"]),
"consensus_strength": 1.0 if reward_result["gt_match"] else 0.0,
"answer_diversity": 0,
"majority_answer": None,
"primary_matches_majority": bool(reward_result["gt_match"]),
"question_reward": 0.0,
"solution_reward": terminal_reward,
"pre_expert_reward": terminal_reward,
"expert_reward_modifier": 0.0,
"expert_phase": "grounded",
"expert_feedback": "ground-truth anchored",
"replay_candidate": False,
"replay_novelty": 0.0,
"replay_added": False,
"combined_reward": terminal_reward,
"reward_breakdown": {
"grounded": True,
"gt_match": bool(reward_result["gt_match"]),
"format_score": float(reward_result["format_score"]),
"pred_final": reward_result["pred_final"],
"gold_final": reward_result["gold_final"],
"prm_mean_score": float(reward_result.get("prm_mean_score", 0.0)),
"prm_num_steps": int(reward_result.get("prm_num_steps", 0)),
"prm_step_scores": list(reward_result.get("prm_step_scores", [])),
"prm_degraded": bool(reward_result.get("prm_degraded", True)),
},
"topics_in_sweet_spot": self.curriculum_manager.get_sweet_spot_topics(),
"current_focus_topics": self.curriculum_manager.get_current_focus(),
"curriculum_state_snapshot": self.curriculum_manager.get_curriculum_stats(),
"grounded_gt_match": bool(reward_result["gt_match"]),
"grounded_pred_final": reward_result["pred_final"],
"grounded_gold_final": reward_result["gold_final"],
}
trajectory.metadata = metadata
return trajectory
def rollout_trajectory(self) -> Trajectory:
instruction, target_topic, target_difficulty = self.sample_instruction()
question_prompt = self.format_question_generation_prompt(instruction)
generated_question, question_transitions = self.generate_with_logging(
initial_prompt=question_prompt,
max_tokens=self.max_question_tokens,
phase="question_generation",
)
return self._build_trajectory_from_question(
instruction=instruction,
target_topic=target_topic,
target_difficulty=target_difficulty,
generated_question=generated_question,
question_transitions=question_transitions,
)
def _build_trajectory_from_question(
self,
instruction: str,
target_topic: str,
target_difficulty: float,
generated_question: str,
question_transitions: Optional[List] = None,
) -> Trajectory:
trajectory = Trajectory()
question_transitions = question_transitions or []
solution_prompt = self.format_solution_prompt(generated_question)
generated_solution, solution_transitions = self.generate_with_logging(
initial_prompt=solution_prompt,
max_tokens=self.max_solution_tokens,
phase="solution",
)
reward_result = self.compute_reward(
question=generated_question,
solution=generated_solution,
target_topic=target_topic,
target_difficulty=target_difficulty,
)
terminal_reward = float(reward_result["combined_score"])
all_transitions = question_transitions + solution_transitions
# Terminal-only reward — gae_lambda=1.0 makes A_t = R - V(s_t) for all t.
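# (With gamma=1 as well, the GAE deltas telescope:
#  A_t = sum_l delta_{t+l} = r_T - V(s_t) = R - V(s_t),
#  since intermediate rewards are 0 and V(s_{T+1}) = 0.)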
for idx, transition in enumerate(all_transitions):
transition.reward = (
terminal_reward if idx == len(all_transitions) - 1 else 0.0
)
trajectory.add(transition)
verification = reward_result["solution_metrics"]["verification_details"]
consensus = verification["consensus"]
question_metrics = reward_result["question_metrics"]
metadata = TrajectoryMetadata(
curriculum_iteration=self.curriculum_manager.current_iteration,
target_topic=target_topic,
target_difficulty=target_difficulty,
instruction=instruction,
generated_question=generated_question,
generated_solution=generated_solution,
question_length=len(question_transitions),
solution_length=len(solution_transitions),
detected_topic=str(question_metrics["detected_topic"]["primary_topic"]),
detected_secondary_topics=[
str(x) for x in question_metrics["detected_topic"]["secondary_topics"]
],
topic_match_score=float(question_metrics["topic_match"]),
estimated_difficulty=float(question_metrics["measured_difficulty"]),
clarity_score=float(question_metrics["clarity"]),
novelty_scores=dict(question_metrics["novelty"]),
consensus_achieved=bool(consensus["has_majority"]),
consensus_strength=float(consensus["consensus_strength"]),
answer_diversity=int(consensus["answer_diversity"]),
majority_answer=consensus.get("majority_answer"),
primary_matches_majority=bool(consensus["primary_matches_majority"]),
sympy_verified=True,
steps_total=int(consensus.get("prm_num_steps", 0)),
steps_verified_ok=int(consensus.get("prm_num_steps", 0)),
steps_failed=0,
final_answer_ok=bool(consensus.get("primary_matches_majority", False)),
question_reward=float(question_metrics["overall_score"]),
solution_reward=float(reward_result["solution_metrics"]["overall_score"]),
pre_expert_reward=float(reward_result["base_combined_score"]),
expert_reward_modifier=float(
reward_result["expert_metrics"]["reward_modifier"]
),
expert_phase=str(reward_result["expert_metrics"]["phase"]),
expert_feedback=str(reward_result["expert_metrics"]["feedback"]),
replay_candidate=False,
replay_novelty=0.0,
replay_added=False,
combined_reward=terminal_reward,
reward_breakdown=reward_result,
topics_in_sweet_spot=self.curriculum_manager.get_sweet_spot_topics(),
current_focus_topics=self.curriculum_manager.get_current_focus(),
curriculum_state_snapshot=self.curriculum_manager.get_curriculum_stats(),
)
metadata_dict = asdict(metadata)
trajectory.metadata = metadata_dict
# Replay admission: requires trajectory.metadata to already exist
# because check_novelty reads metadata["generated_question"].
is_candidate, reason = self.quality_filter.meets_replay_criteria(metadata_dict)
metadata_dict["replay_candidate"] = is_candidate
if is_candidate:
novelty_score = self.quality_filter.check_novelty(
trajectory, self.replay_buffer.buffer
)
metadata_dict["replay_novelty"] = float(novelty_score)
if self.quality_filter.is_novel_enough(novelty_score):
quality_score = self.quality_filter.compute_quality_score(metadata_dict)
self.replay_buffer.add_trajectory(
trajectory=trajectory,
metadata=metadata_dict,
iteration=self.curriculum_manager.current_iteration,
quality_score=quality_score,
)
metadata_dict["replay_added"] = True
else:
metadata_dict["replay_added"] = False
else:
metadata_dict["replay_added"] = False
metadata_dict["replay_reject_reason"] = reason
trajectory.metadata = metadata_dict
return trajectory
def _get_adaptive_replay_ratio(self) -> float:
iteration = self.curriculum_manager.current_iteration
if iteration < 3:
return 0.0
if iteration < 5:
return 0.15
buffer_stats = self.replay_buffer.get_buffer_stats(current_iteration=iteration)
buffer_health = float(buffer_stats.get("buffer_health", 0.0))
if buffer_health >= 0.75:
return 0.3
if buffer_health >= 0.6:
return 0.25
return 0.2
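# Schedule summary: iterations 0-2 → no replay; 3-4 → 0.15; afterwards
# buffer-health-driven: health ≥ 0.75 → 0.30, ≥ 0.60 → 0.25, else 0.20.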
def collect_rollouts(
self,
num_trajectories: int,
verbose: bool = True,
grounded_ratio: float = 0.0,
) -> List[Trajectory]:
"""
Generate ``num_trajectories`` episodes in-process on the current
device.
Mix:
* ``grounded_ratio`` of rollouts are GSM8K-anchored (real question,
reward scored against gold final answer). These give the policy
a clean gradient toward benchmark correctness and are also ~3x
faster than self-play rollouts (no question-generation pass).
* an adaptive fraction is drawn from the replay buffer when buffer
health is good (self-play only).
* the remainder are fresh self-play rollouts.
"""
if num_trajectories <= 0:
return []
# Defensive .eval() on both policy and value before any generation.
# The first iteration runs rollouts right after model load (HF default
# is .train()). Qwen2.5 has zero dropout so this is currently cosmetic,
# but cheap insurance against any future model swap with stochastic layers.
if self.policy is not None:
self.policy.eval()
if self.value is not None:
self.value.eval()
# Grounded rollouts: only if we actually have QA pairs loaded.
if grounded_ratio > 0.0 and self.grounded_qa_pairs:
num_grounded = int(round(num_trajectories * grounded_ratio))
num_grounded = min(num_grounded, num_trajectories)
else:
num_grounded = 0
num_selfplay = num_trajectories - num_grounded
# Within the self-play share, the existing replay-buffer mix applies.
replay_ratio = self._get_adaptive_replay_ratio()
num_replay = int(num_selfplay * replay_ratio)
num_replay = min(num_replay, len(self.replay_buffer))
num_fresh = max(0, num_selfplay - num_replay)
# ---- Grounded rollouts (GSM8K-anchored) --------------------------
grounded_trajectories: List[Trajectory] = []
grounded_correct = 0
grounded_reward_sum = 0.0
if num_grounded > 0:
qa_sample = random.sample(
self.grounded_qa_pairs,
k=min(num_grounded, len(self.grounded_qa_pairs)),
)
# If we asked for more grounded rollouts than we have distinct
# pairs, pad by re-sampling with replacement.
while len(qa_sample) < num_grounded:
qa_sample.append(random.choice(self.grounded_qa_pairs))
pbar = tqdm(
qa_sample,
desc="Grounded rollouts",
unit="ep",
dynamic_ncols=True,
leave=False,
disable=not verbose,
)
for qa in pbar:
trajectory = self.rollout_grounded_trajectory(qa)
grounded_trajectories.append(trajectory)
r = float(trajectory.metadata.get("combined_reward", 0.0))
grounded_reward_sum += r
if bool(trajectory.metadata.get("grounded_gt_match", False)):
grounded_correct += 1
done = len(grounded_trajectories)
pbar.set_postfix(
acc=f"{grounded_correct / done:.1%}",
reward=f"{grounded_reward_sum / done:+.3f}",
refresh=False,
)
# ---- Fresh self-play rollouts ------------------------------------
fresh_trajectories: List[Trajectory] = []
pbar = tqdm(
range(num_fresh),
desc="Self-play rollouts",
unit="ep",
dynamic_ncols=True,
leave=False,
disable=not verbose,
)
running_reward = 0.0
running_ok = 0
for _ in pbar:
trajectory = self.rollout_trajectory()
trajectory.metadata["rollout_source"] = "fresh"
fresh_trajectories.append(trajectory)
running_reward += float(trajectory.metadata.get("combined_reward", 0.0))
if trajectory.metadata.get("final_answer_ok", False):
running_ok += 1
done = len(fresh_trajectories)
pbar.set_postfix(
reward=f"{running_reward / done:+.3f}",
ok=f"{running_ok}/{done}",
refresh=False,
)
# ---- Replay buffer draws -----------------------------------------
replay_trajectories = self.replay_buffer.sample_replay_batch(
num_replay, diversity_sample=True
)
for trajectory in replay_trajectories:
trajectory.metadata["rollout_source"] = "replay"
trajectories = (
grounded_trajectories + fresh_trajectories + replay_trajectories
)
random.shuffle(trajectories)
self.last_replay_ratio = replay_ratio
self.last_rollout_mix = {
"fresh": len(fresh_trajectories),
"replay": len(replay_trajectories),
"grounded": len(grounded_trajectories),
}
grounded_count = len(grounded_trajectories)
self.last_grounded_stats = {
"count": grounded_count,
"correct": grounded_correct,
"accuracy": (
grounded_correct / grounded_count if grounded_count > 0 else 0.0
),
"mean_reward": (
grounded_reward_sum / grounded_count if grounded_count > 0 else 0.0
),
}
if verbose:
buffer_stats = self.replay_buffer.get_buffer_stats(
current_iteration=self.curriculum_manager.current_iteration
)
logger.info(
"Rollout mix: %d grounded + %d fresh + %d replay "
"(grounded_ratio=%.2f, replay_ratio=%.2f, buffer_size=%d, health=%.3f)",
len(grounded_trajectories),
len(fresh_trajectories),
len(replay_trajectories),
grounded_ratio,
replay_ratio,
len(self.replay_buffer),
float(buffer_stats.get("buffer_health", 0.0)),
)
if grounded_count > 0:
logger.info(
"Grounded accuracy this iter: %d/%d = %.1f%% (mean reward %.3f)",
grounded_correct,
grounded_count,
100.0 * grounded_correct / grounded_count,
grounded_reward_sum / grounded_count,
)
self.curriculum_manager.increment_iteration()
self.curriculum_manager.save_state(
iteration=self.curriculum_manager.current_iteration, rollout=None
)
return trajectories
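# Illustrative usage sketch (not part of the training code; ``policy``,
# ``value_head``, ``tok``, ``gsm8k_pairs`` and ``prm`` are hypothetical
# stand-ins for whatever the GRPO/PPO scripts actually construct):
#
#     env = CurriculumMathEnvironment(
#         policy_model=policy, value_model=value_head, tokenizer=tok,
#         grounded_qa_pairs=gsm8k_pairs, prm_scorer=prm,
#     )
#     trajectories = env.collect_rollouts(num_trajectories=32, grounded_ratio=0.5)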