ghostexec / tests /test_api_reward_dead_500.py
modelbuilderhq's picture
Upload folder using huggingface_hub
ff293b1 verified
"""Hard API dead-test: 500+ calls with reward-consistency checks."""
from __future__ import annotations
from typing import Any
import pytest
from fastapi.testclient import TestClient
from ghostexec.server.app import app
W_CONFLICT = 0.35
W_REL = 0.35
W_TASK = 0.30
OUTPUT_SCALE = 0.48
def _step_payload_for(i: int) -> dict[str, Any]:
templates: list[dict[str, Any]] = [
{"action": {"action_type": "do_nothing"}},
{"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
{"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
{"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
{"action": {"action_type": "archive_email", "email_id": "e09"}},
{"action": {"action_type": "archive_email", "email_id": "bad_id"}},
{
"action": {
"action_type": "reschedule_meeting",
"meeting_id": "m02",
"new_time": "2026-04-21T18:00:00",
}
},
{
"action": {
"action_type": "reschedule_meeting",
"meeting_id": "m03",
"new_time": "2026-04-21T09:30:00", # overlap -> invalid semantic
}
},
{"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
{"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
{"action": {"action_type": "complete_task", "task_id": "t07"}},
{"action": {"action_type": "complete_task", "task_id": "t09"}}, # already done
{"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
{"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
{
"action": {
"action_type": "send_message",
"contact_name": "Jamie Liu",
"message_body": "Quick sync please.",
}
},
{
"action": {
"action_type": "send_message",
"contact_name": "Nobody",
"message_body": "hello",
}
},
]
return templates[i % len(templates)]
@pytest.fixture(scope="module")
def client() -> TestClient:
return TestClient(app, raise_server_exceptions=True)
def test_api_surface_all_endpoints(client: TestClient) -> None:
# Core GET endpoints.
for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"):
r = client.get(path)
assert r.status_code == 200, f"{path} -> {r.status_code}"
# Control routes: method contracts.
assert client.get("/reset").status_code == 405
assert client.get("/step").status_code == 405
assert client.put("/reset", json={}).status_code in (405, 422)
assert client.get("/this-path-should-not-exist-ghostexec").status_code == 404
# Reset variants.
for body in ({}, {"seed": 42}, {"episode_id": "dead-api-001"}, {"seed": 1, "future_field": True}):
rr = client.post("/reset", json=body)
assert rr.status_code == 200
j = rr.json()
assert "observation" in j and "done" in j
# MCP endpoint variants.
mcp_ok = client.post(
"/mcp",
json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
)
assert mcp_ok.status_code == 200
mcp_bad_json = client.post("/mcp", content="{", headers={"Content-Type": "application/json"})
assert mcp_bad_json.status_code == 200
@pytest.mark.parametrize("idx", range(520))
def test_api_reward_dead_520_cases(client: TestClient, idx: int) -> None:
# Keep each case independent and deterministic.
rr = client.post("/reset", json={"episode_id": f"dead-{idx:04d}", "seed": 42})
assert rr.status_code == 200
payload = _step_payload_for(idx)
rs = client.post("/step", json=payload)
assert rs.status_code == 200, f"idx={idx} payload={payload} status={rs.status_code}"
body = rs.json()
assert "observation" in body and "reward" in body and "done" in body
obs = body["observation"]
meta = obs.get("metadata") or {}
bd = meta.get("reward_breakdown") or {}
# Structural contracts.
assert isinstance(obs.get("echoed_message", ""), str) and obs.get("echoed_message")
assert "step_ok" in meta
assert "step_detail" in meta
assert "final" in bd
assert "weighted_base" in bd
# Reward identity: top-level reward must equal breakdown.final.
reward = float(body["reward"])
final = float(bd["final"])
assert reward == pytest.approx(final, abs=1e-9)
# Aggregation formula must hold exactly (within floating tolerance).
conflict = float(bd.get("conflict", 0.0))
relationship = float(bd.get("relationship", 0.0))
task = float(bd.get("task", 0.0))
weighted_inner = W_CONFLICT * conflict + W_REL * relationship + W_TASK * task
expected_weighted = OUTPUT_SCALE * weighted_inner
assert float(bd["weighted_base"]) == pytest.approx(expected_weighted, abs=1e-9)
expected_final = (
float(bd.get("weighted_base", 0.0))
+ float(bd.get("invalid_step_adjustment", 0.0))
+ float(bd.get("episode_completion_bonus", 0.0))
+ float(bd.get("catastrophic_penalty", 0.0))
+ float(bd.get("do_nothing_floor", 0.0))
)
assert final == pytest.approx(expected_final, abs=1e-9)
action_type = payload["action"]["action_type"]
if action_type == "do_nothing":
assert float(bd.get("do_nothing_floor", 0.0)) == pytest.approx(-0.15, abs=1e-12)
assert reward < 0
if meta.get("step_ok") is False:
assert float(bd.get("invalid_step_adjustment", 0.0)) == pytest.approx(-0.25, abs=1e-12)