# Source: Hugging Face Spaces — Pratyush-01/physix (Space status: Sleeping)
# File size: 6,790 Bytes; revision 0e24aff


"""End-to-end smoke tests for :class:`PhysiXEnvironment`.

These tests exercise the full pipeline (parse + simulate + score + record)
without spinning up a FastAPI server. They serve as the first sanity check
that the parser, simulator, metrics, and reward composer interoperate.
"""

from __future__ import annotations

import pytest

from physix.models import CONVERGENCE_THRESHOLD, PhysiXAction
from physix.server.environment import PhysiXEnvironment
from physix.systems import SystemTier


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture
def env() -> PhysiXEnvironment:
    """Provide a seeded (``seed=42``) environment limited to Tier 1 systems."""
    tier_one_only = (SystemTier.TIER_1,)
    return PhysiXEnvironment(seed=42, train_tiers=tier_one_only)


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


def test_reset_returns_well_formed_observation(env: PhysiXEnvironment) -> None:
    """``reset`` on ``free_fall`` should return a pristine turn-0 observation."""
    initial = env.reset(system_id="free_fall")

    # Identity and turn bookkeeping.
    assert initial.system_id == "free_fall"
    assert initial.turn == 0
    assert initial.turn_remaining > 0

    # No hypothesis has been submitted yet, so there is nothing to report.
    assert initial.history == []
    assert initial.mismatch_summary == ""

    # Observation payload: both state variables, a 100-entry trajectory, a hint.
    for variable in ("y", "vy"):
        assert variable in initial.state_variables
    assert len(initial.trajectory) == 100
    assert initial.hint  # non-empty
    assert initial.done is False


def test_step_with_ground_truth_rewards_high(env: PhysiXEnvironment) -> None:
    """Submitting the exact ground-truth equation should score r_match near 1."""
    env.reset(system_id="free_fall")
    ground_truth = PhysiXAction(equation="d2y/dt2 = -9.81", params={})

    result = env.step(ground_truth)
    scores = result.reward_breakdown

    assert scores["format"] == 1.0
    assert scores["match"] >= 0.95
    # Floor on the weighted total: at least half the convergence threshold.
    minimum_total = CONVERGENCE_THRESHOLD * 0.5
    assert result.reward >= minimum_total


def test_step_with_unparseable_equation_short_circuits(
    env: PhysiXEnvironment,
) -> None:
    """Junk input should zero every reward component and report a parse error."""
    env.reset(system_id="free_fall")

    result = env.step(PhysiXAction(equation="not a real equation"))
    scores = result.reward_breakdown

    # r_format is 0 and no other component may award anything either.
    for component in ("format", "match", "progress", "simplicity"):
        assert scores[component] == 0.0
    assert "Parse error" in result.mismatch_summary


def test_episode_terminates_on_convergence(env: PhysiXEnvironment) -> None:
    """Stepping with the ground-truth free-fall equation should set ``done``."""
    env.reset(system_id="free_fall")
    ground_truth = PhysiXAction(equation="d2y/dt2 = -9.81")

    result = env.step(ground_truth)

    assert result.done is True


def test_history_accumulates_across_turns(env: PhysiXEnvironment) -> None:
    """Every ``step`` should add one history entry holding the submitted equation."""
    gravity_only = "d2y/dt2 = -9.81"
    gravity_with_drag = "d2y/dt2 = -9.81 + 0.05 * vy**2"
    env.reset(system_id="free_fall_drag")

    first = env.step(PhysiXAction(equation=gravity_only))
    assert len(first.history) == 1
    assert first.history[0]["equation"] == gravity_only

    # Only take a second turn when the first one did not end the episode;
    # otherwise the test finishes here with the turn-1 checks alone.
    if first.done:
        return

    second = env.step(
        PhysiXAction(equation=gravity_with_drag),
    )
    assert len(second.history) == 2
    assert second.history[1]["equation"] == gravity_with_drag


def test_progress_reward_rewards_improvement(env: PhysiXEnvironment) -> None:
    """Improving the hypothesis on turn 2 should earn a positive r_progress.

    The test is skipped when the first submission already ends the episode,
    because there is then no second turn to compare against.
    """
    env.reset(system_id="free_fall_drag")

    # First guess: gravity alone, with the drag term left out.
    first = env.step(PhysiXAction(equation="d2y/dt2 = -9.81"))
    if first.done:
        pytest.skip("episode converged on turn 1")

    # Second guess: the same law plus a quadratic drag term.
    second = env.step(
        PhysiXAction(equation="d2y/dt2 = -9.81 + 0.05 * vy**2"),
    )

    before = first.reward_breakdown
    after = second.reward_breakdown

    # The refined equation must not fit worse than the first guess.
    assert after["match"] >= before["match"]

    # NOTE(review): this guard compares turn-2 "match" with turn-1 "total",
    # not turn-1 "match" — confirm this is intended and not a typo.
    beat_previous_total = after["match"] > before["total"]
    if beat_previous_total:
        assert after["progress"] > 0.0


def test_max_turns_terminates_episode() -> None:
    """Using up the whole turn budget without converging should set ``done``."""
    turn_budget = 3
    env = PhysiXEnvironment(
        seed=0,
        max_turns=turn_budget,
        train_tiers=(SystemTier.TIER_1,),
    )
    env.reset(system_id="simple_pendulum")

    final = None
    for _ in range(turn_budget):
        # Parseable but deliberately wrong, so the episode should not converge.
        final = env.step(PhysiXAction(equation="d2theta/dt2 = 0"))

    assert final is not None
    assert final.done is True
    assert final.turn_remaining == 0


def test_state_property_exposes_episode_id(env: PhysiXEnvironment) -> None:
    """After ``reset``, ``env.state`` should carry an episode id and the system id."""
    requested_system = "free_fall"
    first_obs = env.reset(system_id=requested_system)

    # The episode id must be set and truthy.
    assert env.state.episode_id is not None
    assert env.state.episode_id  # non-empty string

    # State and observation must agree on which system is active.
    assert env.state.system_id == requested_system
    assert first_obs.system_id == env.state.system_id


@pytest.mark.parametrize(
    ("system_id", "equation"),
    [
        # sqrt applied to a symbolic Add on a pendulum-like system. This
        # used to raise a TypeError ("loop of ufunc does not support
        # argument 0 of type Add which has no callable sqrt method") that
        # escaped the simulator and 500-ed the route.
        ("simple_pendulum", "d2theta/dt2 = -sqrt(dtheta**2 + theta**2) * sin(theta)"),
        # sqrt of an always-negative quantity, so numpy emits NaN.
        ("simple_pendulum", "d2theta/dt2 = -sqrt(-theta**2 - 1)"),
        # Denominator (y - y) is identically zero: division by zero in the RHS.
        ("free_fall", "d2y/dt2 = -9.81 / (y - y)"),
        # Triple-nested exponential: pathological growth that overflows odeint.
        ("free_fall", "d2y/dt2 = exp(exp(exp(y)))"),
        # log(0) is -inf, which then propagates through the RHS.
        ("free_fall", "d2y/dt2 = log(0 * y)"),
    ],
)
def test_step_swallows_simulator_failures_as_format_zero_match_zero(
    system_id: str,
    equation: str,
) -> None:
    """Numerically pathological equations must not make ``step`` raise.

    Each case is an equation that a model could emit and that blows up in
    the simulator (TypeError, overflow, NaN, division by zero). ``step``
    has to return an observation instead of letting the failure reach the
    route layer. The test checks that ``r_match`` is 0, that ``r_format``
    is either 0 or 1, and that a non-empty ``mismatch_summary`` is
    produced. It is a regression guard for the broadened exception
    handling in :func:`simulate_hypothesis`; without it several of these
    inputs would 500 the server.
    """
    env = PhysiXEnvironment(seed=0, train_tiers=(SystemTier.TIER_1,))
    env.reset(system_id=system_id)

    result = env.step(PhysiXAction(equation=equation))
    scores = result.reward_breakdown

    assert scores["match"] == 0.0
    # format == 1 means the equation parsed and failed later in simulation;
    # format == 0 means the parser rejected it. Both outcomes are accepted.
    assert scores["format"] in (0.0, 1.0)
    # Whichever path was taken, a diagnostic message must be present.
    assert result.mismatch_summary