"""Tests for the per-component reward accumulator + EpisodeStats path. These guard the "watch individual reward function columns" view (hackathon FAQ Q17, Q43, Q52). If a future change accidentally regresses the breakdown so only mean reward is logged, the verifier-hack monitoring loses one of its main inputs and these tests fail loudly. """ from __future__ import annotations import json import pytest from server.environment import CERNCollisionEnvironment from training.training_script import ( EpisodeContext, EpisodeStats, RewardComponentAccumulator, _stepwise_reward, make_reward_fn, ) def _make_ctx() -> EpisodeContext: return EpisodeContext( env=CERNCollisionEnvironment(max_steps=8), seed=11, scenario="easy_diphoton_160", difficulty="easy", ) def test_episode_stats_populated_when_out_param_given() -> None: stats = EpisodeStats() ctx = _make_ctx() completion = json.dumps({ "action_type": "configure_beam", "parameters": {"beam_energy": "13TeV"}, }) reward = _stepwise_reward( completion_text=completion, ctx=ctx, out_stats=stats, ) # The scalar reward must equal the cumulative we recorded (within fp). assert pytest.approx(reward, abs=1e-9) == stats.cumulative_reward # We did at least one step. assert stats.n_steps >= 1 # Decomposition arithmetic holds: cumulative = step_shaping + terminal. assert pytest.approx(stats.cumulative_reward, abs=1e-9) == ( stats.step_shaping + stats.terminal_reward ) # The completion was a valid action. assert stats.parsed_ok is True # The env reported a difficulty for the rollout. assert stats.difficulty in {"easy", "medium", "hard"} def test_episode_stats_marks_unparseable_completion() -> None: stats = EpisodeStats() ctx = _make_ctx() _stepwise_reward( completion_text="this is not json at all", ctx=ctx, out_stats=stats, ) assert stats.parsed_ok is False def test_accumulator_thread_safe_drain_returns_all_appended() -> None: acc = RewardComponentAccumulator() for i in range(5): s = EpisodeStats(cumulative_reward=float(i), parsed_ok=(i % 2 == 0)) acc.append(s) drained = acc.drain() assert len(drained) == 5 # Drain is destructive: subsequent drain returns empty. assert acc.drain() == [] # Order is preserved. assert [s.cumulative_reward for s in drained] == [0.0, 1.0, 2.0, 3.0, 4.0] def test_accumulator_summarise_basic_rates() -> None: drained = [ EpisodeStats( cumulative_reward=2.0, terminal_reward=1.5, step_shaping=0.5, discovered=True, correct_mass=True, correct_channel=True, parsed_ok=True, n_steps=10, ), EpisodeStats( cumulative_reward=-1.0, terminal_reward=-2.0, step_shaping=1.0, discovered=False, correct_mass=False, correct_channel=False, parsed_ok=False, n_steps=4, ), ] summary = RewardComponentAccumulator.summarise(drained) assert summary["n"] == 2 assert pytest.approx(summary["mean_cumulative"]) == 0.5 assert pytest.approx(summary["mean_terminal"]) == -0.25 assert pytest.approx(summary["mean_step_shaping"]) == 0.75 assert summary["discovered_rate"] == 0.5 assert summary["mass_correct_rate"] == 0.5 assert summary["channel_correct_rate"] == 0.5 assert summary["parsed_rate"] == 0.5 assert pytest.approx(summary["mean_n_steps"]) == 7.0 def test_accumulator_summarise_empty_returns_zeros() -> None: summary = RewardComponentAccumulator.summarise([]) assert summary["n"] == 0 assert summary["mean_cumulative"] == 0.0 assert summary["discovered_rate"] == 0.0 def test_make_reward_fn_writes_to_accumulator() -> None: """The production reward path (make_reward_fn) must populate the accumulator one entry per completion when one is provided. """ acc = RewardComponentAccumulator() ctx = _make_ctx() rf = make_reward_fn(ctx, accumulator=acc) rewards = rf( prompts=["p1", "p2", "p3"], completions=[ json.dumps({"action_type": "configure_beam"}), "not-json", json.dumps({"action_type": "select_channel", "parameters": {"channel": "diphoton"}}), ], ) assert len(rewards) == 3 drained = acc.drain() assert len(drained) == 3 # Two of the three completions parsed cleanly. parsed_count = sum(1 for s in drained if s.parsed_ok) assert parsed_count == 2 def test_make_reward_fn_without_accumulator_is_a_noop_for_stats() -> None: """When no accumulator is supplied, no per-completion EpisodeStats should be allocated (minor perf win for non-monitored runs). """ ctx = _make_ctx() rf = make_reward_fn(ctx, accumulator=None) rewards = rf( prompts=["p1"], completions=[json.dumps({"action_type": "configure_beam"})], ) assert len(rewards) == 1 # Nothing to assert on accumulator here because we passed None; # the implicit contract is "doesn't crash".