Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # | |
| # Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on | |
| # phase2_core.json. Asserts penalization (do_nothing, invalid), priority | |
| # ordering (VIP critical > normal), and legal-action signatures for GRPO-style | |
| # post-training signal quality. | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import pytest | |
| from ghostexec.models import GhostexecAction | |
| from ghostexec.server import reward as reward_mod | |
| from ghostexec.server.ghostexec_environment import GhostexecEnvironment | |
# Project root: two levels up from this resolved test file.
ROOT = Path(__file__).resolve().parents[1]
# Scenario fixture shared by every test in this module.
SCENARIO = ROOT / "scenarios" / "phase2_core.json"

# Every inbox id in phase2_core (e01 through e30).
REPLY_EMAIL_IDS = [f"e{n:02d}" for n in range(1, 31)]

# Ids used for the archive cases (e01 through e15); all of them exist in the fixture.
ARCHIVE_EMAIL_IDS = [f"e{n:02d}" for n in range(1, 16)]

# Pending / in-progress tasks only: t01 through t12, skipping t09 (done in fixture).
COMPLETE_TASK_IDS = [f"t{n:02d}" for n in range(1, 13) if n != 9]

# (meeting_id, new_time) pairs known not to overlap at 08:00 sim time (from phase4 tests).
_SAFE_RESCHEDULES: list[tuple[str, str]] = [
    ("m02", "2026-04-21T18:00:00"),
    ("m03", "2026-04-21T18:30:00"),
    ("m06", "2026-04-21T20:00:00"),
    ("m09", "2026-04-21T21:00:00"),
    ("m04", "2026-04-21T19:00:00"),
    ("m05", "2026-04-21T19:30:00"),
    ("m07", "2026-04-21T20:30:00"),
    ("m08", "2026-04-21T21:30:00"),
    ("m01", "2026-04-21T17:00:00"),
    ("m10", "2026-04-21T22:00:00"),
]

# Meeting ids used for the cancel cases (m01 through m10).
MEETING_IDS_CANCEL = [f"m{n:02d}" for n in range(1, 11)]

# Contact names used for the send_message cases.
KNOWN_CONTACTS = [
    "Jordan Lee",
    "Jamie Liu",
    "Marcus Webb",
    "Sarah Chen",
    "Priya Sharma",
    "David Okonkwo",
]

# Default reply body reused by the reply_email cases.
_BODY = "Thanks — triaging and will follow up shortly."
| # --- 30 cases: reply every email id ------------------------------------------- | |
@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
    """Replying to any inbox email is a legal step that is never heavily penalized.

    Runs once per id in ``REPLY_EMAIL_IDS`` against a freshly reset environment and
    checks that the step succeeds, that neither the invalid-step adjustment nor the
    do-nothing floor is applied, and that the reward stays above -0.5.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
    # Normalise once so a missing metadata dict fails an assertion instead of
    # raising AttributeError on the first lookup.
    meta = obs.metadata or {}
    assert meta.get("step_ok") is True
    assert obs.reward is not None
    bd = meta.get("reward_breakdown") or {}
    assert bd.get("invalid_step_adjustment", 0) == pytest.approx(0.0)
    assert bd.get("do_nothing_floor", 0) == pytest.approx(0.0)
    # No snapshot -4 conflict tax: legal reply should not tank below -0.5
    assert float(obs.reward) > -0.5
def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
    """A reply to ``email_id`` must score above 0.06 and carry a positive critical-queue bonus."""
    # NOTE(review): no parametrization or fixture supplying ``email_id`` is visible
    # in this file — confirm where the VIP-critical id list comes from.
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    result = env.step(reply)
    assert result.metadata.get("step_ok") is True
    # VIP+critical micro + critical_queue bonus; exact float varies slightly (0.48 scale).
    reward = float(result.reward or 0)
    assert reward > 0.06
    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    queue_bonus = float(breakdown.get("critical_queue_bonus") or 0)
    assert queue_bonus > 0
def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
    """A reply to ``email_id`` must succeed and yield a strictly positive reward."""
    # NOTE(review): no parametrization or fixture supplying ``email_id`` is visible
    # in this file — confirm where the high/normal-priority id list comes from.
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    result = env.step(reply)
    assert result.metadata.get("step_ok") is True
    reward = float(result.reward or 0)
    assert reward > 0.0
| # --- 20 cases: do_nothing always penalized ------------------------------------ | |
@pytest.mark.parametrize("seed", range(20))
def test_dead_do_nothing_strict_penalty(seed: int) -> None:
    """Idling on a fresh environment is always penalized with the strict do-nothing floor.

    ``seed`` only labels the repetition (20 runs); it is not passed to the
    environment. Each run checks that the step is accepted, the reward is negative,
    and the ``do_nothing_floor`` breakdown entry equals
    ``reward_mod._DO_NOTHING_STRICT_PENALTY``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(GhostexecAction(action_type="do_nothing"))
    # Normalise once so a missing metadata dict fails an assertion instead of
    # raising AttributeError on the first lookup.
    meta = obs.metadata or {}
    assert meta.get("step_ok") is True
    assert float(obs.reward or 0) < 0
    bd = meta.get("reward_breakdown") or {}
    assert float(bd.get("do_nothing_floor") or 0) == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)
| # --- 15 cases: archive -------------------------------------------------------- | |
@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
def test_dead_archive_email_step_ok(email_id: str) -> None:
    """Archiving each id in ``ARCHIVE_EMAIL_IDS`` is accepted and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(GhostexecAction(action_type="archive_email", email_id=email_id))
    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
| # --- 11 cases: complete pending task ----------------------------------------- | |
@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
def test_dead_complete_task_step_ok(task_id: str) -> None:
    """Completing each task in ``COMPLETE_TASK_IDS`` succeeds and earns the task micro bonus.

    The ``task`` entry of the reward breakdown must be at least
    ``reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(GhostexecAction(action_type="complete_task", task_id=task_id))
    # Normalise once so a missing metadata dict fails an assertion instead of
    # raising AttributeError on the first lookup.
    meta = obs.metadata or {}
    assert meta.get("step_ok") is True
    assert obs.reward is not None
    bd = meta.get("reward_breakdown") or {}
    assert float(bd.get("task") or 0) >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS
| # --- 10 cases: reschedule safe slots ----------------------------------------- | |
@pytest.mark.parametrize(("meeting_id", "new_time"), _SAFE_RESCHEDULES)
def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
    """Each reschedule in ``_SAFE_RESCHEDULES`` succeeds and beats idling.

    The reschedule is applied to a freshly reset environment; its reward is then
    compared against a ``do_nothing`` step taken on a second fresh environment.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(
        GhostexecAction(action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time)
    )
    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
    # Should beat idle do-nothing on same fresh env
    idle_env = GhostexecEnvironment(SCENARIO)
    idle_env.reset()
    idle = idle_env.step(GhostexecAction(action_type="do_nothing"))
    assert float(obs.reward or 0) > float(idle.reward or 0)
| # --- 10 cases: cancel meeting -------------------------------------------------- | |
@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
    """Cancelling each meeting in ``MEETING_IDS_CANCEL`` is accepted and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(
        GhostexecAction(action_type="cancel_meeting", meeting_id=meeting_id, reason="dead test cancel")
    )
    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
| # --- 6 cases: send_message ----------------------------------------------------- | |
@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
def test_dead_send_message_known_contact(contact_name: str) -> None:
    """Messaging each contact in ``KNOWN_CONTACTS`` succeeds and earns the relationship micro bonus.

    The ``relationship`` entry of the reward breakdown must reach
    ``reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS`` within a 0.01 tolerance.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(
        GhostexecAction(
            action_type="send_message",
            contact_name=contact_name,
            message_body="Quick sync on priorities.",
        )
    )
    # Normalise once so a missing metadata dict fails an assertion instead of
    # raising AttributeError on the first lookup.
    meta = obs.metadata or {}
    assert meta.get("step_ok") is True
    bd = meta.get("reward_breakdown") or {}
    assert float(bd.get("relationship") or 0) >= reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01
| # --- 5 cases: delegate_task --------------------------------------------------- | |
def test_dead_delegate_task(task_id: str, contact: str) -> None:
    """Delegating ``task_id`` to ``contact`` must succeed and earn the delegate micro bonus.

    The ``task`` entry of the reward breakdown must reach
    ``reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS`` within a 0.01 tolerance.
    """
    # NOTE(review): no parametrization or fixture supplying ``task_id`` / ``contact``
    # is visible in this file — confirm where the five (task, contact) pairs come from.
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    delegate = GhostexecAction(action_type="delegate_task", task_id=task_id, contact_name=contact)
    result = env.step(delegate)
    assert result.metadata.get("step_ok") is True
    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    task_score = float(breakdown.get("task") or 0)
    assert task_score >= reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01
| # --- 10 cases: invalid actions ------------------------------------------------ | |
def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
    """Stepping with ``action`` must report ``expect_ok`` and apply a -0.25 invalid-step adjustment."""
    # NOTE(review): no parametrization or fixture supplying ``action`` / ``expect_ok``
    # is visible in this file — confirm where the invalid-action cases come from.
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    result = env.step(action)
    assert result.metadata.get("step_ok") is expect_ok
    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    adjustment = float(breakdown.get("invalid_step_adjustment") or 0)
    assert adjustment == pytest.approx(-0.25)
| # --- Ordering: VIP critical reply >> do_nothing -------------------------------- | |
def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
    """Replies to e01 and e14 must always beat idling, and e01 must beat e14 on average.

    Five rounds are run; every action is taken as the first step of its own
    freshly reset environment.
    """

    def first_step_reward(action: GhostexecAction) -> float:
        # Fresh environment per measurement so the rounds stay independent.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        return float(env.step(action).reward or 0)

    vip_rewards: list[float] = []
    normal_rewards: list[float] = []
    idle_rewards: list[float] = []
    for _ in range(5):
        vip_rewards.append(
            first_step_reward(GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY))
        )
        normal_rewards.append(
            first_step_reward(GhostexecAction(action_type="reply_email", email_id="e14", message_body=_BODY))
        )
        idle_rewards.append(first_step_reward(GhostexecAction(action_type="do_nothing")))

    best_idle = max(idle_rewards)
    assert min(vip_rewards) > best_idle
    assert min(normal_rewards) > best_idle
    vip_mean = sum(vip_rewards) / len(vip_rewards)
    normal_mean = sum(normal_rewards) / len(normal_rewards)
    assert vip_mean > normal_mean
| # --- Tone penalty: casual to angry board contact ------------------------------ | |
def test_dead_tone_penalty_casual_to_angry_board() -> None:
    """A formal reply to e01 must out-score a casual one sent from an identical fresh state."""
    # Presumably e01 comes from Marcus Webb (board, ANGRY in the phase2 briefing) —
    # verify against the scenario fixture.
    casual_env = GhostexecEnvironment(SCENARIO)
    casual_env.reset()
    casual_reply = GhostexecAction(
        action_type="reply_email",
        email_id="e01",
        message_body="hey lol no worries",
    )
    casual_obs = casual_env.step(casual_reply)
    assert casual_obs.metadata.get("step_ok") is True

    formal_env = GhostexecEnvironment(SCENARIO)
    formal_env.reset()
    formal_reply = GhostexecAction(
        action_type="reply_email",
        email_id="e01",
        message_body="Dear Marcus, sincerely addressing the board request now.",
    )
    formal_obs = formal_env.step(formal_reply)

    formal_reward = float(formal_obs.reward or 0)
    casual_reward = float(casual_obs.reward or 0)
    assert formal_reward > casual_reward
| # --- Reschedule adds conflict channel micro even if overlap unchanged --------- | |
def test_dead_reschedule_micro_in_breakdown() -> None:
    """Rescheduling m07 to 20:30 must put at least the reschedule micro bonus in ``conflict_raw``.

    The comparison allows a 0.01 tolerance below
    ``reward_mod._RESCHEDULE_VALID_MICRO_BONUS``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m07",
        new_time="2026-04-21T20:30:00",
    )
    result = env.step(move)
    assert result.metadata.get("step_ok") is True
    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    conflict_score = float(breakdown.get("conflict_raw") or 0)
    assert conflict_score >= reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01
| # --- Unit: compute_step_reward invalid vs noop delta matches contract --------- | |
def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
    """On an unchanged world, an invalid step's final reward is the no-op's minus (0.25 - 0.15).

    ``compute_step_reward`` is called directly with the same world as both of its
    first two arguments, so only the action and ``action_ok`` differ between calls.
    """
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle_action = GhostexecAction(action_type="do_nothing")
    invalid_action = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")
    idle_breakdown = reward_mod.compute_step_reward(
        world, world, idle_action, action_ok=True, episode_done=False
    )
    invalid_breakdown = reward_mod.compute_step_reward(
        world, world, invalid_action, action_ok=False, episode_done=False
    )
    expected_final = idle_breakdown.final - (0.25 - 0.15)
    assert invalid_breakdown.final == pytest.approx(expected_final)
def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
    """VIP x2 micro on critical senders should dominate professional critical.

    Compares the first-step reward of replying to e01 against replying to e21,
    each on its own freshly reset environment.
    """
    vip_env = GhostexecEnvironment(SCENARIO)
    vip_env.reset()
    vip_obs = vip_env.step(
        GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)
    )
    vip_reward = float(vip_obs.reward or 0)

    pro_env = GhostexecEnvironment(SCENARIO)
    pro_env.reset()
    pro_obs = pro_env.step(
        GhostexecAction(action_type="reply_email", email_id="e21", message_body=_BODY)
    )
    pro_reward = float(pro_obs.reward or 0)

    assert vip_reward > pro_reward