# OSINT/tests/test_server.py
# Author: siddeshwar-kagatikar
# Commit d814291 — fix(rewards): never crash GRPO on malformed completions
import json
import os
from fastapi.testclient import TestClient
import server
from server import app
# Shared synchronous test client for the FastAPI app; reused by every test below.
client = TestClient(app)
def test_server_health():
    """The /healthz probe responds 200 with an ok status."""
    res = client.get("/healthz")
    assert res.status_code == 200
    payload = res.json()
    assert payload["status"] == "ok"
def test_server_health_alias():
    """The /health alias behaves exactly like /healthz."""
    res = client.get("/health")
    assert res.status_code == 200
    payload = res.json()
    assert payload["status"] == "ok"
def test_server_environment_metadata():
    """/api/environment exposes the expected top-level metadata keys."""
    res = client.get("/api/environment")
    assert res.status_code == 200
    payload = res.json()
    for key in ("action_space", "observation_space", "summary"):
        assert key in payload
def test_openenv_spec_and_tasks_endpoints():
    """The OpenEnv spec is served and each listed task carries the required fields."""
    spec_res = client.get("/openenv.yaml")
    assert spec_res.status_code == 200
    assert "reset" in spec_res.text

    tasks_res = client.get("/openenv/tasks")
    assert tasks_res.status_code == 200
    tasks = tasks_res.json()
    assert len(tasks) >= 3
    required = {"task_id", "task_type", "question", "difficulty"}
    assert required.issubset(tasks[0].keys())
def test_openenv_reset_step_and_state_cycle():
    """A full reset -> state -> step cycle completes an episode for one session."""
    reset_res = client.post("/openenv/reset", json={"task_index": 0})
    assert reset_res.status_code == 200
    reset_body = reset_res.json()
    sid = reset_body["session_id"]
    assert reset_body["done"] is False
    assert "question" in reset_body["observation"]["task"]

    # The per-session state endpoint must resolve the same session.
    state_res = client.get(f"/openenv/state/{sid}")
    assert state_res.status_code == 200
    assert state_res.json()["session_id"] == sid

    # Answering ends the episode and the ground-truth answer is surfaced in info.
    step_req = {
        "session_id": sid,
        "action_type": "ANSWER",
        "payload": {"answer": "unknown"},
    }
    step_res = client.post("/openenv/step", json=step_req)
    assert step_res.status_code == 200
    step_body = step_res.json()
    assert step_body["session_id"] == sid
    assert step_body["done"] is True
    assert "task_answer" in step_body["info"]
def test_openenv_reset_accepts_empty_body():
    """POSTing to /openenv/reset with no body at all still starts a session."""
    res = client.post("/openenv/reset")
    assert res.status_code == 200
    payload = res.json()
    assert payload["done"] is False
    assert "session_id" in payload
def test_openenv_reset_accepts_empty_json_body():
    """An empty payload with a JSON content type must not break /openenv/reset."""
    res = client.post(
        "/openenv/reset",
        data="",
        headers={"Content-Type": "application/json"},
    )
    assert res.status_code == 200
    payload = res.json()
    assert payload["done"] is False
    assert "session_id" in payload
def test_openenv_reset_trailing_slash_post_returns_json():
    """The trailing-slash variant of the reset route returns the same JSON shape."""
    res = client.post(
        "/openenv/reset/",
        data="",
        headers={"Content-Type": "application/json"},
    )
    assert res.status_code == 200
    payload = res.json()
    assert payload["done"] is False
    assert "session_id" in payload
def test_openenv_step_accepts_nested_action_payload():
    """/openenv/step also accepts the action wrapped in a nested 'action' object."""
    reset_res = client.post("/openenv/reset", json={"task_index": 0})
    assert reset_res.status_code == 200
    sid = reset_res.json()["session_id"]

    nested_action = {
        "action_type": "ANSWER",
        "payload": {"answer": "unknown"},
    }
    step_res = client.post(
        "/openenv/step",
        json={"session_id": sid, "action": nested_action},
    )
    assert step_res.status_code == 200
    assert step_res.json()["done"] is True
def test_step_alias_uses_latest_session_when_session_id_missing():
    """Without a session_id, the /step alias falls back to the latest session."""
    reset_res = client.post("/reset", json={"task_index": 0})
    assert reset_res.status_code == 200
    sid = reset_res.json()["session_id"]

    # No session_id in the request body: the server must pick the newest session.
    step_res = client.post(
        "/step",
        json={"action_type": "ANSWER", "payload": {"answer": "unknown"}},
    )
    assert step_res.status_code == 200
    step_body = step_res.json()
    assert step_body["session_id"] == sid
    assert step_body["done"] is True
def test_state_alias_returns_latest_session():
    """GET /state (no session id) reports the most recently created session."""
    reset_res = client.post("/reset", json={"task_index": 0})
    assert reset_res.status_code == 200
    sid = reset_res.json()["session_id"]

    state_res = client.get("/state")
    assert state_res.status_code == 200
    state_body = state_res.json()
    assert state_body["session_id"] == sid
    assert "task" in state_body["observation"]
def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
    """Reporting an inference run persists it to LATEST_EVALUATION_OUTPUT.

    File outputs are redirected into tmp_path and the leaderboard/dashboard
    helpers are stubbed so the endpoint's own persistence logic is isolated.
    """
    eval_path = tmp_path / "latest_evaluation.json"
    dashboard_path = tmp_path / "space_dashboard.html"
    monkeypatch.setattr(server, "LATEST_EVALUATION_OUTPUT", eval_path)
    monkeypatch.setattr(server, "SPACE_DASHBOARD", dashboard_path)
    monkeypatch.setattr(server, "load_leaderboard", lambda path: [])
    monkeypatch.setattr(
        server,
        "export_dashboard",
        lambda env, evaluation, leaderboard_records, output_path: str(dashboard_path),
    )

    episode = {
        "task_id": "seed_task_0",
        "agent_answer": "user_bharat",
        "graph_f1": 0.5,
        "reward": 1.2,
        "steps": 5,
        "tool_calls": 4,
        "success": 1,
    }
    res = client.post(
        "/openenv/report_inference",
        json={
            "run": {"name": "inference_py_run"},
            "summary": {"leaderboard_score": 0.75, "task_success_rate": 1.0},
            "episodes": [episode],
        },
    )
    assert res.status_code == 200
    assert res.json()["status"] == "ok"

    # The payload must have been written to disk, enriched with truth_edges.
    assert eval_path.exists()
    stored = json.loads(eval_path.read_text(encoding="utf-8"))
    assert stored["summary"]["leaderboard_score"] == 0.75
    assert stored["episodes"][0]["task_id"] == "seed_task_0"
    assert stored["episodes"][0]["truth_edges"]
def test_space_snapshot_prefers_newer_evaluation_payload(tmp_path, monkeypatch):
    # Purpose: when both a baseline payload and an evaluation payload exist on
    # disk, server._space_snapshot() must pick the one with the newer mtime —
    # here the evaluation file, whose timestamps are explicitly bumped below.
    baseline_path = tmp_path / "baseline.json"
    evaluation_path = tmp_path / "evaluation.json"
    baseline_dashboard = tmp_path / "baseline_dashboard.html"
    space_dashboard = tmp_path / "space_dashboard.html"
    # Older baseline payload pointing at its own dashboard file.
    baseline_path.write_text(
        json.dumps(
            {
                "run": {"dashboard_path": str(baseline_dashboard)},
                "summary": {"leaderboard_score": 0.1, "task_success_rate": 0.1},
            }
        ),
        encoding="utf-8",
    )
    baseline_dashboard.write_text("<html>baseline</html>", encoding="utf-8")
    # Newer evaluation payload with better scores and no episodes.
    evaluation_path.write_text(
        json.dumps({"summary": {"leaderboard_score": 0.9, "task_success_rate": 0.9}, "episodes": []}),
        encoding="utf-8",
    )
    space_dashboard.write_text("<html>space</html>", encoding="utf-8")
    # Force the evaluation file to be strictly newer than the baseline (+5s on
    # both atime and mtime), so the snapshot's recency comparison is decisive
    # regardless of filesystem timestamp granularity.
    os.utime(evaluation_path, (baseline_path.stat().st_atime + 5, baseline_path.stat().st_mtime + 5))
    # Point the server's file constants at the tmp_path fixtures.
    monkeypatch.setattr(server, "LATEST_BASELINE_OUTPUT", baseline_path)
    monkeypatch.setattr(server, "LATEST_EVALUATION_OUTPUT", evaluation_path)
    monkeypatch.setattr(server, "SPACE_DASHBOARD", space_dashboard)
    # Stub the environment helpers so no real environment is constructed.
    monkeypatch.setattr(
        server,
        "_base_environment_snapshot",
        lambda: {
            "task_count": 30,
            "difficulty_counts": {},
            "action_space": ["CALL_TOOL", "ADD_EDGE", "ANSWER"],
            "observation_space": {},
            "task_types": [],
            "config": {},
        },
    )
    monkeypatch.setattr(server, "_build_environment", lambda: object())
    monkeypatch.setattr(server, "export_dashboard", lambda env, evaluation, leaderboard_records, output_path: str(space_dashboard))
    snapshot = server._space_snapshot()
    # The newer evaluation payload must win, and the dashboard must be the
    # freshly exported space dashboard, not the stale baseline one.
    assert snapshot["source"] == "latest_evaluation"
    assert snapshot["summary"]["leaderboard_score"] == 0.9
    assert snapshot["dashboard_path"] == str(space_dashboard)