Spaces:

modelbuilderhq
/

ghostexec

Sleeping

File size: 12,566 Bytes

ff293b1

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on
# phase2_core.json. Asserts penalization (do_nothing, invalid), priority
# ordering (VIP critical > normal), and legal-action signatures for GRPO-style
# post-training signal quality.

from __future__ import annotations

from pathlib import Path

import pytest

from ghostexec.models import GhostexecAction
from ghostexec.server import reward as reward_mod
from ghostexec.server.ghostexec_environment import GhostexecEnvironment

# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Scenario fixture that every test in this module loads.
SCENARIO = ROOT.joinpath("scenarios", "phase2_core.json")

# Every inbox id in phase2_core: e01 through e30.
REPLY_EMAIL_IDS = ["e%02d" % n for n in range(1, 31)]

# Archive targets: the first fifteen inbox ids (e01 through e15).
ARCHIVE_EMAIL_IDS = ["e%02d" % n for n in range(1, 16)]

# Tasks t01 through t12 that are still pending / in progress; t09 is left out
# because it is already done in the fixture.
COMPLETE_TASK_IDS = ["t%02d" % n for n in range(1, 13) if n != 9]

# (meeting id, new start time) pairs known not to overlap at the 08:00 sim
# time (taken from the phase4 tests). All new times fall on 2026-04-21.
_SAFE_RESCHEDULES: list[tuple[str, str]] = [
    (meeting, f"2026-04-21T{clock}:00")
    for meeting, clock in (
        ("m02", "18:00"),
        ("m03", "18:30"),
        ("m06", "20:00"),
        ("m09", "21:00"),
        ("m04", "19:00"),
        ("m05", "19:30"),
        ("m07", "20:30"),
        ("m08", "21:30"),
        ("m01", "17:00"),
        ("m10", "22:00"),
    )
]

# Meeting ids m01 through m10, each cancelled once by the cancel tests.
MEETING_IDS_CANCEL = ["m%02d" % n for n in range(1, 11)]

# Contact names used as send_message recipients.
KNOWN_CONTACTS = [
    "Jordan Lee",
    "Jamie Liu",
    "Marcus Webb",
    "Sarah Chen",
    "Priya Sharma",
    "David Okonkwo",
]

# Default reply body shared by the reply_email tests.
_BODY = "Thanks — triaging and will follow up shortly."


# --- 30 cases: reply every email id -------------------------------------------


@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
    """Reply to one inbox email on a fresh environment and check it is not penalized.

    The step must be accepted, carry a reward, have zero invalid-step and
    do-nothing adjustments in the breakdown, and score above -0.5.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    result = env.step(reply)

    assert result.metadata.get("step_ok") is True
    assert result.reward is not None

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    # Missing keys count as 0, i.e. "no adjustment applied".
    invalid_adjustment = breakdown.get("invalid_step_adjustment", 0)
    assert invalid_adjustment == pytest.approx(0.0)
    idle_floor = breakdown.get("do_nothing_floor", 0)
    assert idle_floor == pytest.approx(0.0)

    # A legal reply must not be dragged below -0.5 (no snapshot -4 conflict tax).
    assert float(result.reward) > -0.5


@pytest.mark.parametrize("email_id", ("e01", "e03", "e12", "e21", "e27"))
def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
    """Replying to a listed critical email yields reward > 0.06 and a positive queue bonus."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    result = env.step(reply)

    assert result.metadata.get("step_ok") is True

    # The exact value varies slightly (0.48 scale), so only a lower bound is
    # checked; a missing reward is treated as 0.
    total_reward = float(result.reward or 0)
    assert total_reward > 0.06

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    queue_bonus = float(breakdown.get("critical_queue_bonus") or 0)
    assert queue_bonus > 0


@pytest.mark.parametrize("email_id", ("e02", "e04", "e06", "e14", "e23"))
def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
    """Replying to a listed high/normal email is accepted and earns a strictly positive reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    result = env.step(reply)

    assert result.metadata.get("step_ok") is True
    # A missing reward is treated as 0 and therefore fails the check.
    earned = float(result.reward or 0)
    assert earned > 0.0


# --- 20 cases: do_nothing always penalized ------------------------------------


@pytest.mark.parametrize("seed", range(20))
def test_dead_do_nothing_strict_penalty(seed: int) -> None:
    """Idling on a fresh environment is a valid step but is penalized.

    The reward must be negative and the ``do_nothing_floor`` breakdown entry
    must equal ``reward_mod._DO_NOTHING_STRICT_PENALTY``.

    NOTE(review): ``seed`` is not forwarded to the environment, so the 20
    parametrized cases execute the same steps — confirm whether that is
    intended.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    idle = GhostexecAction(action_type="do_nothing")
    result = env.step(idle)

    assert result.metadata.get("step_ok") is True
    idle_reward = float(result.reward or 0)
    assert idle_reward < 0

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    floor_value = float(breakdown.get("do_nothing_floor") or 0)
    assert floor_value == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)


# --- 15 cases: archive --------------------------------------------------------


@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
def test_dead_archive_email_step_ok(email_id: str) -> None:
    """Archiving a listed email on a fresh environment is accepted and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    archive = GhostexecAction(action_type="archive_email", email_id=email_id)
    result = env.step(archive)

    assert result.metadata.get("step_ok") is True
    assert result.reward is not None


# --- 11 cases: complete pending task -----------------------------------------


@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
def test_dead_complete_task_step_ok(task_id: str) -> None:
    """Completing a listed task is accepted and the ``task`` component reaches the micro bonus.

    The ``task`` breakdown entry must be at least
    ``reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS`` (no tolerance applied).
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    complete = GhostexecAction(action_type="complete_task", task_id=task_id)
    result = env.step(complete)

    assert result.metadata.get("step_ok") is True
    assert result.reward is not None

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    task_component = float(breakdown.get("task") or 0)
    assert task_component >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS


# --- 10 cases: reschedule safe slots -----------------------------------------


@pytest.mark.parametrize(("meeting_id", "new_time"), _SAFE_RESCHEDULES)
def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
    """A safe reschedule is accepted and out-scores idling on a second fresh environment."""
    moved_env = GhostexecEnvironment(SCENARIO)
    moved_env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id=meeting_id,
        new_time=new_time,
    )
    moved = moved_env.step(move)

    assert moved.metadata.get("step_ok") is True
    assert moved.reward is not None

    # Baseline: a do_nothing step taken on a separate, freshly reset environment.
    idle_env = GhostexecEnvironment(SCENARIO)
    idle_env.reset()
    idled = idle_env.step(GhostexecAction(action_type="do_nothing"))

    reschedule_reward = float(moved.reward or 0)
    idle_reward = float(idled.reward or 0)
    assert reschedule_reward > idle_reward


# --- 10 cases: cancel meeting --------------------------------------------------


@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
    """Cancelling a listed meeting on a fresh environment is accepted and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    cancel = GhostexecAction(
        action_type="cancel_meeting",
        meeting_id=meeting_id,
        reason="dead test cancel",
    )
    result = env.step(cancel)

    assert result.metadata.get("step_ok") is True
    assert result.reward is not None


# --- 6 cases: send_message -----------------------------------------------------


@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
def test_dead_send_message_known_contact(contact_name: str) -> None:
    """Messaging a known contact is accepted and the ``relationship`` component reaches the micro bonus.

    The component may fall at most 0.01 short of
    ``reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    message = GhostexecAction(
        action_type="send_message",
        contact_name=contact_name,
        message_body="Quick sync on priorities.",
    )
    result = env.step(message)

    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    relationship = float(breakdown.get("relationship") or 0)
    lower_bound = reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01
    assert relationship >= lower_bound


# --- 5 cases: delegate_task ---------------------------------------------------


@pytest.mark.parametrize(
    "task_id,contact",
    [
        ("t08", "Jordan Lee"),
        ("t07", "Jamie Liu"),
        ("t01", "Marcus Webb"),
        ("t02", "Sarah Chen"),
        ("t11", "Casey Nguyen"),
    ],
)
def test_dead_delegate_task(task_id: str, contact: str) -> None:
    """Delegating a task to the paired contact is accepted and the ``task`` component reaches the micro bonus.

    The component may fall at most 0.01 short of
    ``reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    delegate = GhostexecAction(
        action_type="delegate_task",
        task_id=task_id,
        contact_name=contact,
    )
    result = env.step(delegate)

    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    task_component = float(breakdown.get("task") or 0)
    lower_bound = reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01
    assert task_component >= lower_bound


# --- 10 cases: invalid actions ------------------------------------------------


@pytest.mark.parametrize(
    "action,expect_ok",
    [
        # Unknown ids / names.
        (GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"), False),
        # t09 is excluded from the completable task list in this module.
        (GhostexecAction(action_type="complete_task", task_id="t09"), False),
        (GhostexecAction(action_type="archive_email", email_id="nope"), False),
        (
            GhostexecAction(
                action_type="reschedule_meeting",
                meeting_id="m99",
                new_time="2026-04-21T18:00:00",
            ),
            False,
        ),
        (GhostexecAction(action_type="cancel_meeting", meeting_id="m99", reason="x"), False),
        (GhostexecAction(action_type="delegate_task", task_id="t01", contact_name="Nobody"), False),
        (GhostexecAction(action_type="send_message", contact_name="Nobody", message_body="hi"), False),
        # Empty ids.
        (GhostexecAction(action_type="reply_email", email_id="", message_body="hi"), False),
        (GhostexecAction(action_type="complete_task", task_id=""), False),
        (GhostexecAction(action_type="archive_email", email_id=""), False),
    ],
)
def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
    """An invalid action is reported via ``step_ok`` and carries a -0.25 invalid-step adjustment.

    ``expect_ok`` is ``False`` for every listed case.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    result = env.step(action)

    assert result.metadata.get("step_ok") is expect_ok

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    adjustment = float(breakdown.get("invalid_step_adjustment") or 0)
    assert adjustment == pytest.approx(-0.25)


# --- Ordering: VIP critical reply >> do_nothing --------------------------------


def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
    """Replies to e01 and e14 both beat idling, and e01 beats e14 on average.

    Five rounds are run; every sample comes from its own freshly reset
    environment.
    """

    def fresh_first_step_reward(**action_fields: str) -> float:
        # Build and reset the environment first, then take exactly one step.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        return float(env.step(GhostexecAction(**action_fields)).reward or 0)

    vip_rewards: list[float] = []
    normal_rewards: list[float] = []
    idle_rewards: list[float] = []
    for _round in range(5):
        vip_rewards.append(
            fresh_first_step_reward(action_type="reply_email", email_id="e01", message_body=_BODY)
        )
        normal_rewards.append(
            fresh_first_step_reward(action_type="reply_email", email_id="e14", message_body=_BODY)
        )
        idle_rewards.append(fresh_first_step_reward(action_type="do_nothing"))

    # The worst reply of either kind must still beat the best idle step.
    assert min(vip_rewards) > max(idle_rewards)
    assert min(normal_rewards) > max(idle_rewards)

    vip_mean = sum(vip_rewards) / len(vip_rewards)
    normal_mean = sum(normal_rewards) / len(normal_rewards)
    assert vip_mean > normal_mean


# --- Tone penalty: casual to angry board contact ------------------------------


def test_dead_tone_penalty_casual_to_angry_board() -> None:
    """A formal reply to e01 must earn more than a casual one.

    Each reply is sent as the first step of its own freshly reset
    environment, so the message body is the only difference between the two
    runs. Both steps must be accepted; otherwise the reward comparison would
    be measuring an invalid-step penalty instead of tone.
    """
    # NOTE(review): the original comment says Marcus Webb is a board contact
    # and is ANGRY in the phase2 briefing — confirm e01 is from him in the
    # scenario file.
    casual_env = GhostexecEnvironment(SCENARIO)
    casual_env.reset()
    obs_bad = casual_env.step(
        GhostexecAction(
            action_type="reply_email",
            email_id="e01",
            message_body="hey lol no worries",
        )
    )
    assert obs_bad.metadata.get("step_ok") is True

    formal_env = GhostexecEnvironment(SCENARIO)
    formal_env.reset()
    obs_good = formal_env.step(
        GhostexecAction(
            action_type="reply_email",
            email_id="e01",
            message_body="Dear Marcus, sincerely addressing the board request now.",
        )
    )
    # The control reply must be accepted too, mirroring the check on the
    # casual reply above.
    assert obs_good.metadata.get("step_ok") is True

    assert float(obs_good.reward or 0) > float(obs_bad.reward or 0)


# --- Reschedule adds conflict channel micro even if overlap unchanged ---------


def test_dead_reschedule_micro_in_breakdown() -> None:
    """Rescheduling m07 to 20:30 is accepted and ``conflict_raw`` reaches the reschedule micro bonus.

    The component may fall at most 0.01 short of
    ``reward_mod._RESCHEDULE_VALID_MICRO_BONUS``.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m07",
        new_time="2026-04-21T20:30:00",
    )
    result = env.step(move)

    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    conflict_component = float(breakdown.get("conflict_raw") or 0)
    lower_bound = reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01
    assert conflict_component >= lower_bound


# --- Unit: compute_step_reward invalid vs noop delta matches contract ---------


def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
    """An invalid reply scores exactly 0.10 below a valid no-op on an unchanged world.

    ``compute_step_reward`` is called directly, with the same world object
    passed for both world arguments, so only the action and ``action_ok``
    differ between the two calls.
    """
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle_action = GhostexecAction(action_type="do_nothing")
    invalid_action = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")

    idle_breakdown = reward_mod.compute_step_reward(
        world, world, idle_action, action_ok=True, episode_done=False
    )
    invalid_breakdown = reward_mod.compute_step_reward(
        world, world, invalid_action, action_ok=False, episode_done=False
    )

    # NOTE(review): presumably 0.25 is the invalid-step penalty and 0.15 the
    # do-nothing penalty — confirm against the reward module.
    expected_gap = 0.25 - 0.15
    assert invalid_breakdown.final == pytest.approx(idle_breakdown.final - expected_gap)


def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
    """Replying to e01 (VIP critical) must earn strictly more than replying to e21 (professional critical)."""

    def fresh_reply_reward(email_id: str) -> float:
        # Each reply is the first step of its own freshly reset environment.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
        return float(env.step(reply).reward or 0)

    vip_reward = fresh_reply_reward("e01")
    professional_reward = fresh_reply_reward("e21")
    assert vip_reward > professional_reward