Spaces:
Sleeping
Sleeping
| """Unit tests for :mod:`physix.training.prompt`.""" | |
| from __future__ import annotations | |
| from physix.models import PhysiXObservation | |
| from physix.training.prompt import ( | |
| SYSTEM_MESSAGE, | |
| build_prompt, | |
| parse_completion, | |
| render_observation_for_prompt, | |
| ) | |
def _sample_observation() -> PhysiXObservation:
    """Build the shared fixture: a turn-1 observation of a dropped object.

    The observation carries three trajectory samples, a single prior
    history entry, summary stats and a reward breakdown, so each test can
    render it as-is or override one field without rebuilding the payload.
    """
    samples = [
        {"t": 0.0, "y": 50.0, "vy": 0.0},
        {"t": 0.5, "y": 48.7, "vy": -4.9},
        {"t": 1.0, "y": 45.1, "vy": -9.7},
    ]
    # The same summary text appears on the history row and on the
    # observation itself.
    mismatch = "predicted y diverges past t=2.0s."
    first_turn = {
        "turn": 1,
        "equation": "d2y/dt2 = -9.81",
        "params": {},
        "reward_total": 0.42,
        "reward_components": {"match": 0.42, "progress": 0.0, "simplicity": 0.95, "format": 1.0},
        "mismatch_summary": mismatch,
    }
    return PhysiXObservation(
        done=False,
        reward=None,
        trajectory=samples,
        state_variables=["y", "vy"],
        hint="Object dropped from 50 m, mass 2 kg.",
        history=[first_turn],
        mismatch_summary=mismatch,
        turn=1,
        turn_remaining=7,
        system_id="free_fall_drag",
        stats={"y_min": 0.0, "y_max": 50.0, "duration": 6.0},
        reward_breakdown={"match": 0.42, "total": 0.40},
    )
# ---------------------------------------------------------------------------
# render_observation_for_prompt
# ---------------------------------------------------------------------------
def test_render_includes_metadata_block() -> None:
    """The rendered prompt exposes system id, state variables, hint and stats."""
    rendered = render_observation_for_prompt(_sample_observation())
    expected_fragments = (
        "SYSTEM_ID: free_fall_drag",
        "STATE_VARIABLES: y, vy",
        "HINT: Object dropped from 50 m",
        "STATS:",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_render_includes_trajectory_samples() -> None:
    """The rendered prompt carries a TRAJECTORY block with per-sample values.

    Only the ``y=50`` prefix is checked, so the assertion holds whether the
    value is printed as ``y=50``, ``y=50.0`` or ``y=50.000``.
    """
    text = render_observation_for_prompt(_sample_observation())
    assert "TRAJECTORY" in text
    # The former ``or "y=50.000" in text`` alternative could never change the
    # outcome: any text containing "y=50.000" also contains "y=50".
    assert "y=50" in text
    assert "vy=" in text
def test_render_includes_history_when_present() -> None:
    """A non-empty history is rendered as a HISTORY block with turn and equation."""
    rendered = render_observation_for_prompt(_sample_observation())
    for needle in ("HISTORY", "turn=1", "d2y/dt2 = -9.81"):
        assert needle in rendered
def test_history_uses_equation_field_name_not_shorthand() -> None:
    """Regression: the HISTORY block must label equations as ``equation=``.

    HISTORY originally displayed ``eqn=`` as a shorthand. Mid-strength chat
    models then mimicked it in their own JSON (``{"eqn": "..."}``); because
    the parser only read ``equation``, every post-first turn silently
    scored ``r_format=0`` even when the model's equation was perfect. The
    prompt must therefore use the same field name the parser expects.
    """
    rendered = render_observation_for_prompt(_sample_observation())
    has_canonical_label = "equation=`d2y/dt2 = -9.81`" in rendered
    has_shorthand = "eqn=" in rendered
    assert has_canonical_label
    assert not has_shorthand
def test_history_block_surfaces_dense_reward_components() -> None:
    """Each history row must show the individual reward components.

    The model needs to see *which* component scored what in order to
    attribute its progress turn-over-turn, e.g. push on grammar when
    ``format=0``, or try a structurally different equation when ``match``
    plateaus while ``progress=0``. Showing only ``reward=`` (the weighted
    total) would hide that signal.
    """
    components = {
        "match": 0.42,
        "progress": 0.0,
        "simplicity": 0.95,
        "format": 1.0,
    }
    observation = _sample_observation()
    observation.history = [
        {
            "turn": 1,
            "equation": "d2y/dt2 = -9.81",
            "params": {},
            "reward_total": 0.42,
            "reward_components": components,
            "mismatch_summary": "predicted y diverges past t=2.0s.",
        }
    ]
    rendered = render_observation_for_prompt(observation)
    # Every component is expected with two decimals.
    for expected in ("match=0.42", "progress=0.00", "simplicity=0.95", "format=1.00"):
        assert expected in rendered
def test_history_block_tolerates_missing_reward_components() -> None:
    """Rendering must not crash on a history row without ``reward_components``.

    Older history rows or partial breakdowns may lack the per-component
    scores. Missing components default to ``0.00`` so the column layout
    stays stable and the model can still parse the block in-context.
    """
    obs = _sample_observation()
    obs.history = [
        {
            "turn": 1,
            "equation": "d2y/dt2 = -9.81",
            "params": {},
            "reward_total": 0.4,
            "mismatch_summary": "",
        }
    ]
    text = render_observation_for_prompt(obs)
    for name in ("match", "progress", "simplicity", "format"):
        # Name the component in the failure message so a red run points
        # straight at the missing default.
        assert f"{name}=0.00" in text, f"missing default for component={name!r}"
def test_system_message_locks_in_canonical_field_name() -> None:
    """The system prompt must name ``"equation"`` and forbid synonyms.

    Without an explicit prohibition the model may drift to
    ``eqn``/``ode``/``formula`` on later turns.
    """
    lowered = SYSTEM_MESSAGE.lower()
    assert '"equation"' in SYSTEM_MESSAGE
    # Case-insensitive check for the word "never".
    assert "never" in lowered
def test_render_omits_history_block_when_empty() -> None:
    """With an empty history the prompt must not contain a HISTORY block."""
    observation = _sample_observation()
    observation.history = []
    rendered = render_observation_for_prompt(observation)
    assert "HISTORY" not in rendered
# ---------------------------------------------------------------------------
# build_prompt
# ---------------------------------------------------------------------------
def test_build_prompt_returns_chat_pair() -> None:
    """``build_prompt`` yields a system message followed by a user message."""
    messages = build_prompt(_sample_observation())
    assert len(messages) == 2
    system_turn, user_turn = messages
    assert system_turn == {"role": "system", "content": SYSTEM_MESSAGE}
    assert user_turn["role"] == "user"
    assert "TRAJECTORY" in user_turn["content"]
# ---------------------------------------------------------------------------
# parse_completion
# ---------------------------------------------------------------------------
def test_parse_completion_extracts_clean_json() -> None:
    """A bare JSON object is parsed into equation, params and rationale."""
    raw = '{"equation": "d2y/dt2 = -9.81", "params": {"g": 9.81}, "rationale": "free fall"}'
    parsed = parse_completion(raw)
    observed = (parsed.equation, parsed.params, parsed.rationale)
    assert observed == ("d2y/dt2 = -9.81", {"g": 9.81}, "free fall")
def test_parse_completion_handles_code_fences() -> None:
    """JSON inside a Markdown ``json`` fence, preceded by prose, still parses."""
    fenced_reply = '''
Here is my hypothesis:
```json
{
"equation": "d2y/dt2 = -g + k * vy**2",
"params": {"g": 9.81, "k": 0.05},
"rationale": "added drag"
}
```
'''
    parsed = parse_completion(fenced_reply)
    assert parsed.params == {"g": 9.81, "k": 0.05}
    assert parsed.equation == "d2y/dt2 = -g + k * vy**2"
def test_parse_completion_with_no_json_yields_empty_equation() -> None:
    """Free text without a JSON object yields an action with an empty equation.

    The equation must be *empty* so the verifier reports a clean
    ``Empty equation payload`` error and the env scores ``r_format=0``.
    The raw text is kept in ``rationale`` so logs/UI can still show what
    the model said, but it is never handed to the equation parser as if
    it were an equation.
    """
    prose_only = "I think the equation is d2y/dt2 = -g but I'm not sure."
    parsed = parse_completion(prose_only)
    expected_rationale = prose_only.strip()
    assert parsed.equation == ""
    assert parsed.params == {}
    assert parsed.rationale == expected_rationale
def test_parse_completion_coerces_string_params() -> None:
    """Numeric strings in ``params`` become numbers; non-numeric ones vanish."""
    raw = '{"equation": "d2y/dt2 = -g", "params": {"g": "9.81", "bad": "not_a_number"}}'
    coerced = parse_completion(raw).params
    # "bad" is expected to be dropped silently rather than raise.
    assert coerced == {"g": 9.81}
def test_parse_completion_handles_latex_braces_inside_json_string() -> None:
    """Regression: braces inside a JSON string must not confuse extraction.

    A regex-based brace matcher mis-balances when a JSON *string* value
    contains literal ``{`` / ``}`` (e.g. LaTeX ``\\frac{...}``). The
    JSON-aware extractor (``json.JSONDecoder.raw_decode``) handles it
    correctly. The equation field is preserved verbatim: rewriting it
    upstream would silently corrupt the agent's output, and scoring it is
    the env's job, not the parser's.
    """
    pieces = [
        '```json\n',
        '{ "equation": "\\\\frac{d vy}{dt} = -9.81", ',
        '"params": { "g": 9.81 }, ',
        '"rationale": "free fall" }\n',
        '```',
    ]
    parsed = parse_completion("".join(pieces))
    # The JSON escape ``\\`` decodes to a single backslash.
    assert parsed.equation == "\\frac{d vy}{dt} = -9.81"
    assert parsed.params == {"g": 9.81}
    assert parsed.rationale == "free fall"
def test_parse_completion_picks_first_object_when_text_has_multiple() -> None:
    """When the text holds several JSON objects, the first one is used."""
    segments = [
        "Some thoughts here.\n",
        '{"equation": "dy/dt = vy", "params": {}}\n',
        "And then another:\n",
        '{"equation": "dvy/dt = -g", "params": {"g": 9.81}}',
    ]
    parsed = parse_completion("".join(segments))
    assert parsed.equation == "dy/dt = vy"
def test_parse_completion_accepts_eqn_synonym() -> None:
    """Regression: ``eqn`` must be accepted as an alias for ``equation``.

    qwen2.5:7b emits ``{"eqn": "..."}`` after the first turn, mimicking the
    historical HISTORY block shorthand. The parser has to accept it
    instead of silently scoring r_format=0.
    """
    raw = '{"eqn": "d2y/dt2 = -9.81", "rationale": "free fall"}'
    parsed = parse_completion(raw)
    assert (parsed.equation, parsed.rationale) == ("d2y/dt2 = -9.81", "free fall")
def test_parse_completion_accepts_other_equation_synonyms() -> None:
    """Each alternative equation key is read as the equation field."""
    template = '{"%s": "dy/dt = vy"}'
    synonyms = ("ode", "formula", "expression", "expr")
    for key in synonyms:
        parsed = parse_completion(template % key)
        assert parsed.equation == "dy/dt = vy", f"failed for key={key!r}"
def test_parse_completion_accepts_capitalised_keys() -> None:
    """Capitalised keys such as ``"Equation"`` are still recognised.

    Some models emit ``"Equation"`` / ``"EQN"``; keys are lowercased once
    before lookup so the alias table stays simple.
    """
    raw = '{"Equation": "dy/dt = vy", "Rationale": "kinematic"}'
    parsed = parse_completion(raw)
    assert (parsed.equation, parsed.rationale) == ("dy/dt = vy", "kinematic")
def test_parse_completion_accepts_parameters_synonym() -> None:
    """``parameters`` is accepted as an alias for ``params``."""
    raw = '{"equation": "d2y/dt2 = -g", "parameters": {"g": 9.81}}'
    parsed_params = parse_completion(raw).params
    assert parsed_params == {"g": 9.81}
def test_parse_completion_canonical_key_wins_over_synonym() -> None:
    """``equation`` takes precedence when a synonym is present as well.

    This is plain hygiene: the synonym fallback must not silently
    override a model that *did* emit the canonical key.
    """
    raw = '{"equation": "dy/dt = vy", "eqn": "d2y/dt2 = -9.81"}'
    chosen = parse_completion(raw).equation
    assert chosen == "dy/dt = vy"
def test_parse_completion_accepts_reasoning_synonym() -> None:
    """``reasoning`` is accepted as an alias for ``rationale``."""
    raw = '{"equation": "dy/dt = vy", "reasoning": "kinematic"}'
    rationale = parse_completion(raw).rationale
    assert rationale == "kinematic"