Spaces:

Pratyush-01
/

physix

Sleeping

App Files Files Community

physix / tests /test_prompt.py

Pratyush-01

Upload folder using huggingface_hub

0e24aff verified 12 days ago

raw

history blame contribute delete

10.7 kB

	"""Unit tests for :mod:`physix.training.prompt`."""

	from __future__ import annotations

	from physix.models import PhysiXObservation
	from physix.training.prompt import (
	SYSTEM_MESSAGE,
	build_prompt,
	parse_completion,
	render_observation_for_prompt,
	)


	def _sample_observation() -> PhysiXObservation:
	    """Build the observation fixture shared by the tests in this module.

	    The fixture carries three trajectory samples, one history row for
	    turn 1 with all four reward components, and populated ``hint``,
	    ``stats``, ``system_id`` and ``reward_breakdown`` fields.
	    """
	    samples = [
	        {"t": 0.0, "y": 50.0, "vy": 0.0},
	        {"t": 0.5, "y": 48.7, "vy": -4.9},
	        {"t": 1.0, "y": 45.1, "vy": -9.7},
	    ]
	    # The same mismatch text appears on the history row and on the
	    # observation itself.
	    mismatch = "predicted y diverges past t=2.0s."
	    first_turn = {
	        "turn": 1,
	        "equation": "d2y/dt2 = -9.81",
	        "params": {},
	        "reward_total": 0.42,
	        "reward_components": {
	            "match": 0.42,
	            "progress": 0.0,
	            "simplicity": 0.95,
	            "format": 1.0,
	        },
	        "mismatch_summary": mismatch,
	    }
	    return PhysiXObservation(
	        done=False,
	        reward=None,
	        trajectory=samples,
	        state_variables=["y", "vy"],
	        hint="Object dropped from 50 m, mass 2 kg.",
	        history=[first_turn],
	        mismatch_summary=mismatch,
	        turn=1,
	        turn_remaining=7,
	        system_id="free_fall_drag",
	        stats={"y_min": 0.0, "y_max": 50.0, "duration": 6.0},
	        reward_breakdown={"match": 0.42, "total": 0.40},
	    )


	# ---------------------------------------------------------------------------
	# render_observation_for_prompt
	# ---------------------------------------------------------------------------


	def test_render_includes_metadata_block() -> None:
	    """The rendered prompt contains the system id, state variables, hint and stats."""
	    rendered = render_observation_for_prompt(_sample_observation())

	    expected_fragments = (
	        "SYSTEM_ID: free_fall_drag",
	        "STATE_VARIABLES: y, vy",
	        "HINT: Object dropped from 50 m",
	        "STATS:",
	    )
	    for fragment in expected_fragments:
	        assert fragment in rendered


	def test_render_includes_trajectory_samples() -> None:
	    """The rendered prompt has a TRAJECTORY block showing the sampled values."""
	    rendered = render_observation_for_prompt(_sample_observation())

	    assert "TRAJECTORY" in rendered
	    # A plain substring check already accepts every float formatting of
	    # the first sample ("y=50", "y=50.0", "y=50.000", ...); the former
	    # ``or "y=50.000" in text`` alternative could never be reached.
	    assert "y=50" in rendered
	    assert "vy=" in rendered


	def test_render_includes_history_when_present() -> None:
	    """A non-empty history produces a HISTORY block with the turn and equation."""
	    rendered = render_observation_for_prompt(_sample_observation())

	    for fragment in ("HISTORY", "turn=1", "d2y/dt2 = -9.81"):
	        assert fragment in rendered


	def test_history_uses_equation_field_name_not_shorthand() -> None:
	    """Regression: HISTORY must label equations with ``equation=``.

	    The block originally used ``eqn=`` as a display shorthand, which
	    mid-strength chat models then mimicked when emitting their own JSON
	    (``{"eqn": "..."}``). The parser only reads ``equation``, so every
	    post-first turn silently scored ``r_format=0`` even when the model's
	    actual equation was perfect. The prompt must use the same field name
	    the parser expects.
	    """
	    rendered = render_observation_for_prompt(_sample_observation())

	    shorthand_present = "eqn=" in rendered
	    assert "equation=`d2y/dt2 = -9.81`" in rendered
	    assert not shorthand_present


	def test_history_block_surfaces_dense_reward_components() -> None:
	    """Each reward component is rendered individually in the HISTORY block.

	    The model needs to see which component scored what so it can
	    attribute its own progress turn-over-turn, e.g. push on grammar when
	    ``format=0``, or try a structurally different equation when
	    ``match`` plateaus while ``progress=0``. Showing only ``reward=``
	    (the weighted total) hides that signal.
	    """
	    components = {
	        "match": 0.42,
	        "progress": 0.0,
	        "simplicity": 0.95,
	        "format": 1.0,
	    }
	    row = {
	        "turn": 1,
	        "equation": "d2y/dt2 = -9.81",
	        "params": {},
	        "reward_total": 0.42,
	        "reward_components": components,
	        "mismatch_summary": "predicted y diverges past t=2.0s.",
	    }
	    observation = _sample_observation()
	    observation.history = [row]

	    rendered = render_observation_for_prompt(observation)

	    # Components are expected with two decimal places.
	    for column in ("match=0.42", "progress=0.00", "simplicity=0.95", "format=1.00"):
	        assert column in rendered


	def test_history_block_tolerates_missing_reward_components() -> None:
	    """A history row without ``reward_components`` still renders.

	    Older history rows or partial breakdowns shouldn't crash render:
	    missing components default to 0.00 so the column layout stays stable
	    and the model can still parse the block in-context.
	    """
	    bare_row = {
	        "turn": 1,
	        "equation": "d2y/dt2 = -9.81",
	        "params": {},
	        "reward_total": 0.4,
	        "mismatch_summary": "",
	    }
	    observation = _sample_observation()
	    observation.history = [bare_row]

	    rendered = render_observation_for_prompt(observation)

	    expected_columns = [
	        f"{name}=0.00" for name in ("match", "progress", "simplicity", "format")
	    ]
	    for column in expected_columns:
	        assert column in rendered


	def test_system_message_locks_in_canonical_field_name() -> None:
	    """The system prompt names ``"equation"`` and contains the word "never".

	    It must explicitly forbid synonyms so the model doesn't drift to
	    ``eqn``/``ode``/``formula`` on later turns.
	    """
	    lowered = SYSTEM_MESSAGE.lower()

	    assert '"equation"' in SYSTEM_MESSAGE
	    assert "never" in lowered


	def test_render_omits_history_block_when_empty() -> None:
	    """With an empty history list the rendered prompt has no HISTORY block."""
	    observation = _sample_observation()
	    observation.history = []

	    rendered = render_observation_for_prompt(observation)

	    assert "HISTORY" not in rendered


	# ---------------------------------------------------------------------------
	# build_prompt
	# ---------------------------------------------------------------------------


	def test_build_prompt_returns_chat_pair() -> None:
	    """``build_prompt`` returns a system message followed by a user message."""
	    messages = build_prompt(_sample_observation())

	    assert len(messages) == 2

	    expected_system = {"role": "system", "content": SYSTEM_MESSAGE}
	    assert messages[0] == expected_system

	    user_message = messages[1]
	    assert user_message["role"] == "user"
	    assert "TRAJECTORY" in user_message["content"]


	# ---------------------------------------------------------------------------
	# parse_completion
	# ---------------------------------------------------------------------------


	def test_parse_completion_extracts_clean_json() -> None:
	    """A bare JSON object is parsed into equation, params and rationale."""
	    parsed = parse_completion(
	        '{"equation": "d2y/dt2 = -9.81", "params": {"g": 9.81}, "rationale": "free fall"}'
	    )

	    assert parsed.equation == "d2y/dt2 = -9.81"
	    assert parsed.params == {"g": 9.81}
	    assert parsed.rationale == "free fall"


	def test_parse_completion_handles_code_fences() -> None:
	    """JSON inside a Markdown ``json`` fence, preceded by prose, is extracted."""
	    fenced_reply = '''
	Here is my hypothesis:
	```json
	{
	"equation": "d2y/dt2 = -g + k * vy**2",
	"params": {"g": 9.81, "k": 0.05},
	"rationale": "added drag"
	}
	```
	'''
	    parsed = parse_completion(fenced_reply)

	    expected_params = {"g": 9.81, "k": 0.05}
	    assert parsed.equation == "d2y/dt2 = -g + k * vy**2"
	    assert parsed.params == expected_params


	def test_parse_completion_with_no_json_yields_empty_equation() -> None:
	    """Free text without a JSON object yields an empty equation.

	    When the completion contains no extractable JSON object, the action
	    must have an empty equation (so the verifier reports a clean
	    ``Empty equation payload`` error and the env scores ``r_format=0``).

	    The raw text is preserved in ``rationale`` so logs/UI can still show
	    what the model said, but it is never fed to the equation parser as
	    if it were an equation.
	    """
	    free_text = "I think the equation is d2y/dt2 = -g but I'm not sure."

	    parsed = parse_completion(free_text)

	    assert parsed.equation == ""
	    assert parsed.params == {}
	    assert parsed.rationale == free_text.strip()


	def test_parse_completion_coerces_string_params() -> None:
	    """Numeric strings in ``params`` become floats; non-numeric ones are dropped."""
	    parsed = parse_completion(
	        '{"equation": "d2y/dt2 = -g", "params": {"g": "9.81", "bad": "not_a_number"}}'
	    )

	    # "bad" must not survive: its value cannot be read as a number.
	    surviving_params = {"g": 9.81}
	    assert parsed.params == surviving_params


	def test_parse_completion_handles_latex_braces_inside_json_string() -> None:
	    """Regression: braces inside a JSON string value must not break extraction.

	    A regex-based brace matcher mis-balances when the JSON string value
	    contains literal ``{`` / ``}`` (e.g. LaTeX ``\\frac{...}``). The
	    JSON-aware extractor (``json.JSONDecoder.raw_decode``) handles it
	    correctly. The equation field is preserved verbatim: rewriting it
	    upstream would silently corrupt the agent's output and is the env's
	    job to score, not ours.
	    """
	    payload = (
	        '{ "equation": "\\\\frac{d vy}{dt} = -9.81", '
	        '"params": { "g": 9.81 }, '
	        '"rationale": "free fall" }'
	    )
	    # Wrap the payload in a Markdown json fence, one element per line.
	    fenced_reply = "\n".join(['```json', payload, '```'])

	    parsed = parse_completion(fenced_reply)

	    assert parsed.equation == "\\frac{d vy}{dt} = -9.81"
	    assert parsed.params == {"g": 9.81}
	    assert parsed.rationale == "free fall"


	def test_parse_completion_picks_first_object_when_text_has_multiple() -> None:
	    """When the text holds two JSON objects, the first one is used."""
	    reply_lines = [
	        "Some thoughts here.",
	        '{"equation": "dy/dt = vy", "params": {}}',
	        "And then another:",
	        '{"equation": "dvy/dt = -g", "params": {"g": 9.81}}',
	    ]

	    parsed = parse_completion("\n".join(reply_lines))

	    assert parsed.equation == "dy/dt = vy"


	def test_parse_completion_accepts_eqn_synonym() -> None:
	    """Regression: the ``eqn`` key is accepted as the equation.

	    qwen2.5:7b emits ``{"eqn": "..."}`` after the first turn (mimicking
	    the historical HISTORY block shorthand). The parser must accept this
	    rather than silently scoring r_format=0.
	    """
	    parsed = parse_completion('{"eqn": "d2y/dt2 = -9.81", "rationale": "free fall"}')

	    assert parsed.equation == "d2y/dt2 = -9.81"
	    assert parsed.rationale == "free fall"


	def test_parse_completion_accepts_other_equation_synonyms() -> None:
	    """``ode``, ``formula``, ``expression`` and ``expr`` all map to the equation."""
	    aliases = ("ode", "formula", "expression", "expr")
	    for alias in aliases:
	        # Plain concatenation keeps the literal JSON braces unescaped.
	        payload = '{"' + alias + '": "dy/dt = vy"}'
	        parsed = parse_completion(payload)
	        assert parsed.equation == "dy/dt = vy", f"failed for key={alias!r}"


	def test_parse_completion_accepts_capitalised_keys() -> None:
	    """Capitalised keys are recognised.

	    Some models emit ``"Equation"`` / ``"EQN"``; we lowercase keys once
	    before lookup so the alias table stays simple.
	    """
	    parsed = parse_completion('{"Equation": "dy/dt = vy", "Rationale": "kinematic"}')

	    assert parsed.equation == "dy/dt = vy"
	    assert parsed.rationale == "kinematic"


	def test_parse_completion_accepts_parameters_synonym() -> None:
	    """The ``parameters`` key is accepted in place of ``params``."""
	    parsed = parse_completion('{"equation": "d2y/dt2 = -g", "parameters": {"g": 9.81}}')

	    expected_params = {"g": 9.81}
	    assert parsed.params == expected_params


	def test_parse_completion_canonical_key_wins_over_synonym() -> None:
	    """``equation`` takes precedence when a synonym is also present.

	    This is just hygiene: we don't want the synonym fallback to silently
	    override a model that did emit the canonical key.
	    """
	    parsed = parse_completion('{"equation": "dy/dt = vy", "eqn": "d2y/dt2 = -9.81"}')

	    assert parsed.equation == "dy/dt = vy"


	def test_parse_completion_accepts_reasoning_synonym() -> None:
	    """The ``reasoning`` key is accepted in place of ``rationale``."""
	    parsed = parse_completion('{"equation": "dy/dt = vy", "reasoning": "kinematic"}')

	    assert parsed.rationale == "kinematic"