"""End-to-end smoke tests for :class:`PhysiXEnvironment`. These tests exercise the full pipeline (parse + simulate + score + record) without spinning up a FastAPI server. They serve as the first sanity check that the parser, simulator, metrics, and reward composer interoperate. """ from __future__ import annotations import pytest from physix.models import CONVERGENCE_THRESHOLD, PhysiXAction from physix.server.environment import PhysiXEnvironment from physix.systems import SystemTier # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def env() -> PhysiXEnvironment: """Deterministic env restricted to Tier 1 systems for fast tests.""" return PhysiXEnvironment(seed=42, train_tiers=(SystemTier.TIER_1,)) # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- def test_reset_returns_well_formed_observation(env: PhysiXEnvironment) -> None: obs = env.reset(system_id="free_fall") assert obs.system_id == "free_fall" assert obs.turn == 0 assert obs.turn_remaining > 0 assert obs.history == [] assert obs.mismatch_summary == "" assert "y" in obs.state_variables and "vy" in obs.state_variables assert len(obs.trajectory) == 100 assert obs.hint # non-empty assert obs.done is False def test_step_with_ground_truth_rewards_high(env: PhysiXEnvironment) -> None: """The exact ground-truth equation should yield r_match close to 1.""" env.reset(system_id="free_fall") obs = env.step(PhysiXAction(equation="d2y/dt2 = -9.81", params={})) breakdown = obs.reward_breakdown assert breakdown["format"] == 1.0 assert breakdown["match"] >= 0.95 assert obs.reward >= CONVERGENCE_THRESHOLD * 0.5 # weighted total floor def test_step_with_unparseable_equation_short_circuits( env: PhysiXEnvironment, ) -> None: """A junk payload should set r_format=0 and other components to 0.""" env.reset(system_id="free_fall") obs = env.step(PhysiXAction(equation="not a real equation")) breakdown = obs.reward_breakdown assert breakdown["format"] == 0.0 assert breakdown["match"] == 0.0 assert breakdown["progress"] == 0.0 assert breakdown["simplicity"] == 0.0 assert "Parse error" in obs.mismatch_summary def test_episode_terminates_on_convergence(env: PhysiXEnvironment) -> None: """High-quality match should set done=True via the convergence threshold.""" env.reset(system_id="free_fall") obs = env.step(PhysiXAction(equation="d2y/dt2 = -9.81")) assert obs.done is True def test_history_accumulates_across_turns(env: PhysiXEnvironment) -> None: """Each step should append exactly one history entry.""" env.reset(system_id="free_fall_drag") obs1 = env.step(PhysiXAction(equation="d2y/dt2 = -9.81")) assert len(obs1.history) == 1 assert obs1.history[0]["equation"] == "d2y/dt2 = -9.81" if not obs1.done: obs2 = env.step( PhysiXAction(equation="d2y/dt2 = -9.81 + 0.05 * vy**2"), ) assert len(obs2.history) == 2 assert obs2.history[1]["equation"] == "d2y/dt2 = -9.81 + 0.05 * vy**2" def test_progress_reward_rewards_improvement(env: PhysiXEnvironment) -> None: """A second-turn improvement should yield positive r_progress.""" env.reset(system_id="free_fall_drag") # Turn 1: pure gravity (decent fit but missing drag). obs1 = env.step(PhysiXAction(equation="d2y/dt2 = -9.81")) if obs1.done: pytest.skip("episode converged on turn 1") # Turn 2: add drag (closer fit). obs2 = env.step( PhysiXAction(equation="d2y/dt2 = -9.81 + 0.05 * vy**2"), ) assert obs2.reward_breakdown["match"] >= obs1.reward_breakdown["match"] if obs2.reward_breakdown["match"] > obs1.reward_breakdown["total"]: assert obs2.reward_breakdown["progress"] > 0.0 def test_max_turns_terminates_episode() -> None: """When budget is exhausted with no convergence, ``done`` flips true.""" env = PhysiXEnvironment(seed=0, max_turns=3, train_tiers=(SystemTier.TIER_1,)) env.reset(system_id="simple_pendulum") last_obs = None for _ in range(3): # Deliberately wrong-but-parseable equation. last_obs = env.step(PhysiXAction(equation="d2theta/dt2 = 0")) assert last_obs is not None assert last_obs.done is True assert last_obs.turn_remaining == 0 def test_state_property_exposes_episode_id(env: PhysiXEnvironment) -> None: obs = env.reset(system_id="free_fall") assert env.state.episode_id is not None assert env.state.episode_id # non-empty string assert env.state.system_id == "free_fall" assert obs.system_id == env.state.system_id @pytest.mark.parametrize( "system_id, equation", [ # Pendulum-like system with a sqrt of an Add — historically # produced a TypeError ("loop of ufunc does not support argument # 0 of type Add which has no callable sqrt method") that escaped # the simulator and 500-ed the route. ("simple_pendulum", "d2theta/dt2 = -sqrt(dtheta**2 + theta**2) * sin(theta)"), # sqrt of a guaranteed-negative quantity → numpy emits NaN. ("simple_pendulum", "d2theta/dt2 = -sqrt(-theta**2 - 1)"), # Division by zero from constant numerics in the RHS. ("free_fall", "d2y/dt2 = -9.81 / (y - y)"), # Pathological growth that overflows odeint. ("free_fall", "d2y/dt2 = exp(exp(exp(y)))"), # log of zero (-inf) propagating through the RHS. ("free_fall", "d2y/dt2 = log(0 * y)"), ], ) def test_step_swallows_simulator_failures_as_format_zero_match_zero( system_id: str, equation: str ) -> None: """``step`` must never propagate a TypeError / overflow / NaN out of the simulator into the route layer. A model-emitted equation that parses but blows up numerically should score ``r_match=0`` cleanly, surface a ``Simulation error: ...`` mismatch, and let the episode continue. Without the broadened exception catch in :func:`simulate_hypothesis`, several of these would 500 the server. """ env = PhysiXEnvironment(seed=0, train_tiers=(SystemTier.TIER_1,)) env.reset(system_id=system_id) obs = env.step(PhysiXAction(equation=equation)) assert obs.reward_breakdown["match"] == 0.0 # The equation parses, so format should be 1; any "format=0" here # indicates parse rejection (also acceptable for these inputs). assert obs.reward_breakdown["format"] in (0.0, 1.0) # Either path must produce a non-empty diagnostic string. assert obs.mismatch_summary