Spaces:

modelbuilderhq
/

ghostexec

Sleeping

App Files Files Community

ghostexec / tests /test_reward_dead_suite.py

modelbuilderhq

Upload folder using huggingface_hub

ff293b1 verified 12 days ago

raw

history blame contribute delete

12.6 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	#
	# Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on
	# phase2_core.json. Asserts penalization (do_nothing, invalid), priority
	# ordering (VIP critical > normal), and legal-action signatures for GRPO-style
	# post-training signal quality.

	from __future__ import annotations

	from pathlib import Path

	import pytest

	from ghostexec.models import GhostexecAction
	from ghostexec.server import reward as reward_mod
	from ghostexec.server.ghostexec_environment import GhostexecEnvironment

	ROOT = Path(__file__).resolve().parents[1]
	SCENARIO = ROOT / "scenarios" / "phase2_core.json"

	# All inbox ids from phase2_core (e01–e30).
	REPLY_EMAIL_IDS = [f"e{i:02d}" for i in range(1, 31)]

	# Unread or replyable ids suitable for archive (skip if unknown — all exist).
	ARCHIVE_EMAIL_IDS = [f"e{i:02d}" for i in range(1, 16)]

	# Pending / in-progress tasks only (t09 is done in fixture).
	COMPLETE_TASK_IDS = [f"t{i:02d}" for i in range(1, 13) if i != 9]

	# Known non-overlapping reschedules for 08:00 sim time (from phase4 tests).
	_SAFE_RESCHEDULES: list[tuple[str, str]] = [
	("m02", "2026-04-21T18:00:00"),
	("m03", "2026-04-21T18:30:00"),
	("m06", "2026-04-21T20:00:00"),
	("m09", "2026-04-21T21:00:00"),
	("m04", "2026-04-21T19:00:00"),
	("m05", "2026-04-21T19:30:00"),
	("m07", "2026-04-21T20:30:00"),
	("m08", "2026-04-21T21:30:00"),
	("m01", "2026-04-21T17:00:00"),
	("m10", "2026-04-21T22:00:00"),
	]

	MEETING_IDS_CANCEL = [f"m{i:02d}" for i in range(1, 11)]

	KNOWN_CONTACTS = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen", "Priya Sharma", "David Okonkwo"]

	_BODY = "Thanks — triaging and will follow up shortly."


	# --- 30 cases: reply every email id -------------------------------------------


	@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
	def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
	assert obs.metadata.get("step_ok") is True
	assert obs.reward is not None
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert bd.get("invalid_step_adjustment", 0) == pytest.approx(0.0)
	assert bd.get("do_nothing_floor", 0) == pytest.approx(0.0)
	# No snapshot -4 conflict tax: legal reply should not tank below -0.5
	assert float(obs.reward) > -0.5


	@pytest.mark.parametrize("email_id", ("e01", "e03", "e12", "e21", "e27"))
	def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
	assert obs.metadata.get("step_ok") is True
	# VIP+critical micro + critical_queue bonus; exact float varies slightly (0.48 scale).
	assert float(obs.reward or 0) > 0.06
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("critical_queue_bonus") or 0) > 0


	@pytest.mark.parametrize("email_id", ("e02", "e04", "e06", "e14", "e23"))
	def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
	assert obs.metadata.get("step_ok") is True
	assert float(obs.reward or 0) > 0.0


	# --- 20 cases: do_nothing always penalized ------------------------------------


	@pytest.mark.parametrize("seed", range(20))
	def test_dead_do_nothing_strict_penalty(seed: int) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="do_nothing"))
	assert obs.metadata.get("step_ok") is True
	assert float(obs.reward or 0) < 0
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("do_nothing_floor") or 0) == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)


	# --- 15 cases: archive --------------------------------------------------------


	@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
	def test_dead_archive_email_step_ok(email_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="archive_email", email_id=email_id))
	assert obs.metadata.get("step_ok") is True
	assert obs.reward is not None


	# --- 11 cases: complete pending task -----------------------------------------


	@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
	def test_dead_complete_task_step_ok(task_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(GhostexecAction(action_type="complete_task", task_id=task_id))
	assert obs.metadata.get("step_ok") is True
	assert obs.reward is not None
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("task") or 0) >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS


	# --- 10 cases: reschedule safe slots -----------------------------------------


	@pytest.mark.parametrize("meeting_id,new_time", _SAFE_RESCHEDULES)
	def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(
	GhostexecAction(action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time)
	)
	assert obs.metadata.get("step_ok") is True
	assert obs.reward is not None
	# Should beat idle do-nothing on same fresh env
	e2 = GhostexecEnvironment(SCENARIO)
	e2.reset()
	idle = e2.step(GhostexecAction(action_type="do_nothing"))
	assert float(obs.reward or 0) > float(idle.reward or 0)


	# --- 10 cases: cancel meeting --------------------------------------------------


	@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
	def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(
	GhostexecAction(action_type="cancel_meeting", meeting_id=meeting_id, reason="dead test cancel")
	)
	assert obs.metadata.get("step_ok") is True
	assert obs.reward is not None


	# --- 6 cases: send_message -----------------------------------------------------


	@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
	def test_dead_send_message_known_contact(contact_name: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(
	GhostexecAction(
	action_type="send_message",
	contact_name=contact_name,
	message_body="Quick sync on priorities.",
	)
	)
	assert obs.metadata.get("step_ok") is True
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("relationship") or 0) >= reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01


	# --- 5 cases: delegate_task ---------------------------------------------------


	@pytest.mark.parametrize(
	"task_id,contact",
	[
	("t08", "Jordan Lee"),
	("t07", "Jamie Liu"),
	("t01", "Marcus Webb"),
	("t02", "Sarah Chen"),
	("t11", "Casey Nguyen"),
	],
	)
	def test_dead_delegate_task(task_id: str, contact: str) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(
	GhostexecAction(action_type="delegate_task", task_id=task_id, contact_name=contact)
	)
	assert obs.metadata.get("step_ok") is True
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("task") or 0) >= reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01


	# --- 10 cases: invalid actions ------------------------------------------------


	@pytest.mark.parametrize(
	"action,expect_ok",
	[
	(GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"), False),
	(GhostexecAction(action_type="complete_task", task_id="t09"), False),
	(GhostexecAction(action_type="archive_email", email_id="nope"), False),
	(GhostexecAction(action_type="reschedule_meeting", meeting_id="m99", new_time="2026-04-21T18:00:00"), False),
	(GhostexecAction(action_type="cancel_meeting", meeting_id="m99", reason="x"), False),
	(GhostexecAction(action_type="delegate_task", task_id="t01", contact_name="Nobody"), False),
	(GhostexecAction(action_type="send_message", contact_name="Nobody", message_body="hi"), False),
	(GhostexecAction(action_type="reply_email", email_id="", message_body="hi"), False),
	(GhostexecAction(action_type="complete_task", task_id=""), False),
	(GhostexecAction(action_type="archive_email", email_id=""), False),
	],
	)
	def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(action)
	assert obs.metadata.get("step_ok") is expect_ok
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("invalid_step_adjustment") or 0) == pytest.approx(-0.25)


	# --- Ordering: VIP critical reply >> do_nothing --------------------------------


	def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
	r_vip: list[float] = []
	r_norm: list[float] = []
	r_idle: list[float] = []
	for _ in range(5):
	e1 = GhostexecEnvironment(SCENARIO)
	e1.reset()
	r_vip.append(float(e1.step(GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)).reward or 0))
	e2 = GhostexecEnvironment(SCENARIO)
	e2.reset()
	r_norm.append(float(e2.step(GhostexecAction(action_type="reply_email", email_id="e14", message_body=_BODY)).reward or 0))
	e3 = GhostexecEnvironment(SCENARIO)
	e3.reset()
	r_idle.append(float(e3.step(GhostexecAction(action_type="do_nothing")).reward or 0))
	assert min(r_vip) > max(r_idle)
	assert min(r_norm) > max(r_idle)
	assert sum(r_vip) / len(r_vip) > sum(r_norm) / len(r_norm)


	# --- Tone penalty: casual to angry board contact ------------------------------


	def test_dead_tone_penalty_casual_to_angry_board() -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	# Marcus Webb is board; ensure angry mood in scenario or pick contact - phase2 has Marcus ANGRY in briefing
	obs_bad = e.step(
	GhostexecAction(
	action_type="reply_email",
	email_id="e01",
	message_body="hey lol no worries",
	)
	)
	assert obs_bad.metadata.get("step_ok") is True
	e2 = GhostexecEnvironment(SCENARIO)
	e2.reset()
	obs_good = e2.step(
	GhostexecAction(
	action_type="reply_email",
	email_id="e01",
	message_body="Dear Marcus, sincerely addressing the board request now.",
	)
	)
	assert float(obs_good.reward or 0) > float(obs_bad.reward or 0)


	# --- Reschedule adds conflict channel micro even if overlap unchanged ---------


	def test_dead_reschedule_micro_in_breakdown() -> None:
	e = GhostexecEnvironment(SCENARIO)
	e.reset()
	obs = e.step(
	GhostexecAction(action_type="reschedule_meeting", meeting_id="m07", new_time="2026-04-21T20:30:00")
	)
	assert obs.metadata.get("step_ok") is True
	bd = (obs.metadata or {}).get("reward_breakdown") or {}
	assert float(bd.get("conflict_raw") or 0) >= reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01


	# --- Unit: compute_step_reward invalid vs noop delta matches contract ---------


	def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
	w = GhostexecEnvironment.load_world_from_json(SCENARIO)
	noop = GhostexecAction(action_type="do_nothing")
	bad = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")
	bd_ok = reward_mod.compute_step_reward(w, w, noop, action_ok=True, episode_done=False)
	bd_bad = reward_mod.compute_step_reward(w, w, bad, action_ok=False, episode_done=False)
	assert bd_bad.final == pytest.approx(bd_ok.final - (0.25 - 0.15))


	def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
	"""VIP x2 micro on critical senders should dominate professional critical."""
	e_vip = GhostexecEnvironment(SCENARIO)
	e_vip.reset()
	r_vip = float(
	e_vip.step(GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)).reward or 0
	)
	e_pro = GhostexecEnvironment(SCENARIO)
	e_pro.reset()
	r_pro = float(
	e_pro.step(GhostexecAction(action_type="reply_email", email_id="e21", message_body=_BODY)).reward or 0
	)
	assert r_vip > r_pro