File size: 5,834 Bytes
ff293b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""Hard API dead-test: 500+ calls with reward-consistency checks."""

from __future__ import annotations

from typing import Any

import pytest
from fastapi.testclient import TestClient

from ghostexec.server.app import app

# Expected reward-aggregation constants used by the reward-consistency test in
# this module. That test asserts, for every step's reported breakdown:
#   weighted_base == OUTPUT_SCALE * (W_CONFLICT * conflict
#                                    + W_REL * relationship
#                                    + W_TASK * task)
# NOTE(review): presumably these mirror the server's reward configuration —
# confirm they are kept in sync when the reward model changes.
W_CONFLICT = 0.35  # weight applied to the breakdown's "conflict" component
W_REL = 0.35  # weight applied to the breakdown's "relationship" component
W_TASK = 0.30  # weight applied to the breakdown's "task" component
OUTPUT_SCALE = 0.48  # multiplier applied to the weighted sum of the three components


def _step_payload_for(i: int) -> dict[str, Any]:
    templates: list[dict[str, Any]] = [
        {"action": {"action_type": "do_nothing"}},
        {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
        {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
        {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
        {"action": {"action_type": "archive_email", "email_id": "e09"}},
        {"action": {"action_type": "archive_email", "email_id": "bad_id"}},
        {
            "action": {
                "action_type": "reschedule_meeting",
                "meeting_id": "m02",
                "new_time": "2026-04-21T18:00:00",
            }
        },
        {
            "action": {
                "action_type": "reschedule_meeting",
                "meeting_id": "m03",
                "new_time": "2026-04-21T09:30:00",  # overlap -> invalid semantic
            }
        },
        {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
        {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
        {"action": {"action_type": "complete_task", "task_id": "t07"}},
        {"action": {"action_type": "complete_task", "task_id": "t09"}},  # already done
        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
        {
            "action": {
                "action_type": "send_message",
                "contact_name": "Jamie Liu",
                "message_body": "Quick sync please.",
            }
        },
        {
            "action": {
                "action_type": "send_message",
                "contact_name": "Nobody",
                "message_body": "hello",
            }
        },
    ]
    return templates[i % len(templates)]


@pytest.fixture(scope="module")
def client() -> TestClient:
    """Provide one ``TestClient`` for the app, shared by all tests in this module.

    ``raise_server_exceptions=True`` is spelled out so that, per the Starlette
    ``TestClient`` documentation, an unhandled exception inside the app is
    re-raised in the calling test rather than turned into a 500 response.
    """
    shared_client = TestClient(app, raise_server_exceptions=True)
    return shared_client


def test_api_surface_all_endpoints(client: TestClient) -> None:
    """Smoke-check the HTTP surface: routes, method contracts, reset and MCP.

    The requests are issued in a fixed order: GET routes, wrong-method and
    unknown-path probes, several ``/reset`` bodies, then two ``/mcp`` posts.
    """
    # Read-only routes: each must answer a plain GET with 200.
    get_routes = ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc")
    for path in get_routes:
        status = client.get(path).status_code
        assert status == 200, f"{path} -> {status}"

    # Control routes reject the wrong HTTP method; unknown paths are 404.
    reset_via_get = client.get("/reset")
    assert reset_via_get.status_code == 405
    step_via_get = client.get("/step")
    assert step_via_get.status_code == 405
    reset_via_put = client.put("/reset", json={})
    assert reset_via_put.status_code in (405, 422)
    unknown_route = client.get("/this-path-should-not-exist-ghostexec")
    assert unknown_route.status_code == 404

    # /reset must accept an empty body, a seed, an episode id, and a body
    # carrying a field named "future_field".
    reset_bodies = (
        {},
        {"seed": 42},
        {"episode_id": "dead-api-001"},
        {"seed": 1, "future_field": True},
    )
    for reset_body in reset_bodies:
        reset_response = client.post("/reset", json=reset_body)
        assert reset_response.status_code == 200
        parsed = reset_response.json()
        for required_key in ("observation", "done"):
            assert required_key in parsed

    # /mcp: a well-formed JSON-RPC "tools/list" request answers 200 ...
    list_tools_request = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    well_formed = client.post("/mcp", json=list_tools_request)
    assert well_formed.status_code == 200

    # ... and so does a truncated JSON document.
    # NOTE(review): presumably the error is reported inside the 200 body — confirm.
    malformed = client.post("/mcp", content="{", headers={"Content-Type": "application/json"})
    assert malformed.status_code == 200


@pytest.mark.parametrize("idx", range(520))
def test_api_reward_dead_520_cases(client: TestClient, idx: int) -> None:
    """Reset, take one step, and verify the reward breakdown is self-consistent.

    Each of the 520 cases posts ``/reset`` with seed 42 and a per-case episode
    id, then posts the payload chosen by ``_step_payload_for(idx)`` to
    ``/step``. The response must satisfy:

    * the structural contract (required keys in body, metadata, breakdown);
    * ``reward == reward_breakdown["final"]``;
    * ``weighted_base == OUTPUT_SCALE * (weighted sum of the three components)``;
    * ``final == weighted_base + the four adjustment terms``;
    * the expected ``do_nothing`` floor and invalid-step adjustment values.
    """
    # Same seed and a fresh episode id every time keep the cases independent.
    reset_response = client.post("/reset", json={"episode_id": f"dead-{idx:04d}", "seed": 42})
    assert reset_response.status_code == 200

    payload = _step_payload_for(idx)
    step_response = client.post("/step", json=payload)
    step_status = step_response.status_code
    assert step_status == 200, f"idx={idx} payload={payload} status={step_status}"

    body = step_response.json()
    for top_level_key in ("observation", "reward", "done"):
        assert top_level_key in body
    observation = body["observation"]
    metadata = observation.get("metadata") or {}
    breakdown = metadata.get("reward_breakdown") or {}

    def component(name: str) -> float:
        # Breakdown entries that are absent count as 0.0.
        return float(breakdown.get(name, 0.0))

    # Structural contract: a non-empty echoed string plus the required keys.
    echoed = observation.get("echoed_message")
    assert isinstance(echoed, str) and echoed
    for meta_key in ("step_ok", "step_detail"):
        assert meta_key in metadata
    for breakdown_key in ("final", "weighted_base"):
        assert breakdown_key in breakdown

    # The top-level reward has to be the breakdown's final value.
    reward = float(body["reward"])
    final = float(breakdown["final"])
    assert reward == pytest.approx(final, abs=1e-9)

    # weighted_base has to be the scaled, weighted sum of the three components.
    weighted_inner = (
        W_CONFLICT * component("conflict")
        + W_REL * component("relationship")
        + W_TASK * component("task")
    )
    expected_weighted = OUTPUT_SCALE * weighted_inner
    assert float(breakdown["weighted_base"]) == pytest.approx(expected_weighted, abs=1e-9)

    # final has to be weighted_base plus every adjustment term, added left to right.
    expected_final = component("weighted_base")
    for adjustment in (
        "invalid_step_adjustment",
        "episode_completion_bonus",
        "catastrophic_penalty",
        "do_nothing_floor",
    ):
        expected_final += component(adjustment)
    assert final == pytest.approx(expected_final, abs=1e-9)

    # Action-specific expectations.
    if payload["action"]["action_type"] == "do_nothing":
        assert component("do_nothing_floor") == pytest.approx(-0.15, abs=1e-12)
        assert reward < 0

    if metadata.get("step_ok") is False:
        assert component("invalid_step_adjustment") == pytest.approx(-0.25, abs=1e-12)