siddeshwar-kagatikar committed
Commit · 1166e01
Parent(s): f92e1ac

Update dashboard from inference runs
Files changed:
- README.md +3 -0
- inference.py +83 -9
- server.py +64 -0
- src/osint_env/api/__init__.py +4 -0
- src/osint_env/api/models.py +12 -0
- tests/test_server.py +37 -0
README.md CHANGED
@@ -160,6 +160,9 @@ The script is designed to stay bounded enough for a normal benchmark pass to fin

 The submission-ready inference entrypoint is the root `inference.py` file. It talks to the deployed Hugging Face Space over HTTP, uses the OpenAI client for all model calls, and emits structured stdout logs in the `[START]`, `[STEP]`, and `[END]` format.

+The script accepts `HF_TOKEN` as the primary auth variable and also supports `OPENAI_API_KEY` or `API_KEY` as local fallbacks.
+After a successful run, `inference.py` also posts the evaluation summary back to the Space so the latest `/dashboard` view reflects that run.
+
 Required environment variables:

 - `API_BASE_URL`
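For quick reference, the log formats added in this commit imply stdout lines shaped like these (values illustrative; the action string comes from the new `_format_action` helper in inference.py):

[START] task=<task> env=<env> model=<model>
[STEP] step=1 action=search(q=osint) reward=0.10 done=false error=null
[END] success=true steps=5 score=1.000 rewards=0.10,0.25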
inference.py CHANGED
@@ -9,11 +9,13 @@ from openai import OpenAI
 from requests import RequestException

 from osint_env.baselines.openai_runner import SYSTEM_PROMPT, build_action_tools
+from osint_env.eval.metrics import EvalMetrics


 API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.4-mini")
 HF_TOKEN = os.getenv("HF_TOKEN", "")
+API_KEY = os.getenv("API_KEY", "")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 SPACE_URL = os.getenv("SPACE_URL", "https://siddeshwar1625-osint.hf.space").rstrip("/")

@@ -32,19 +34,18 @@ def log_start(task: str, env: str, model: str) -> None:
     print(f"[START] task={task} env={env} model={model}", flush=True)


-def log_step(step: int, action: …
-…
-    error_text = "null" if error is None else json.dumps(error)
+def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
+    error_text = "null" if error is None else str(error)
     print(
-        f"[STEP] step={step} action={…
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={str(bool(done)).lower()} error={error_text}",
         flush=True,
     )


 def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
-    rewards_text = …
+    rewards_text = ",".join(f"{value:.2f}" for value in rewards)
     print(
-        f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.…
+        f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.3f} rewards={rewards_text}",
         flush=True,
     )

@@ -135,6 +136,28 @@ def _decode_action(tool_name: str, args: dict[str, Any]) -> dict[str, Any]:
     return {"action_type": "CALL_TOOL", "payload": {"tool_name": tool_name, "args": dict(args)}}


+def _format_action(action: dict[str, Any]) -> str:
+    action_type = str(action.get("action_type", ""))
+    payload = dict(action.get("payload", {}))
+    if action_type == "ANSWER":
+        return f"answer({payload.get('answer', 'unknown')})"
+    if action_type == "ADD_EDGE":
+        return (
+            "add_edge("
+            f"{payload.get('src', '')},"
+            f"{payload.get('rel', '')},"
+            f"{payload.get('dst', '')},"
+            f"{float(payload.get('confidence', 1.0)):.2f}"
+            ")"
+        )
+    tool_name = str(payload.get("tool_name", "tool"))
+    args = dict(payload.get("args", {}))
+    if not args:
+        return f"{tool_name}()"
+    arg_str = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
+    return f"{tool_name}({arg_str})"
+
+
 def _assistant_tool_call_id(message: dict[str, Any]) -> str | None:
     tool_calls = list(message.get("tool_calls", []))
     if not tool_calls:
@@ -192,10 +215,54 @@ def get_model_action(client: OpenAI, messages: list[dict[str, Any]], tools: list
     return {"action_type": "ANSWER", "payload": {"answer": "unknown"}}, {"role": "assistant", "content": ""}


+def _episode_row(result: dict[str, Any], task_meta: dict[str, Any]) -> dict[str, Any]:
+    info = dict(result.get("info", {}))
+    graph_snapshot = dict((result.get("observation") or {}).get("graph_snapshot", {}))
+    task_type = str(task_meta.get("task_type", "unknown"))
+    task_id = str(task_meta.get("task_id", "unknown"))
+    question = str(task_meta.get("question", ""))
+    task_answer = str(info.get("task_answer", ""))
+    agent_answer = str(info.get("agent_answer", ""))
+    graph_f1 = float(info.get("graph_f1", 0.0) or 0.0)
+    return {
+        "task_id": task_id,
+        "task_type": task_type,
+        "question": question,
+        "task_answer": task_answer,
+        "agent_answer": agent_answer,
+        "graph_f1": graph_f1,
+        "reward": float(info.get("total_reward", 0.0) or 0.0),
+        "steps": int(info.get("step_count", 0) or 0),
+        "tool_calls": int(info.get("tool_calls", 0) or 0),
+        "success": int(bool(agent_answer) and agent_answer == task_answer),
+        "reward_components": dict(info.get("reward_components", {})),
+        "pred_edges": list(graph_snapshot.get("edges", [])),
+        "truth_edges": [],
+    }
+
+
+def _publish_inference_report(summary: dict[str, Any], episodes: list[dict[str, Any]]) -> None:
+    payload = {
+        "run": {
+            "name": "inference_py_run",
+            "model": MODEL_NAME,
+            "space_url": SPACE_URL,
+            "task_indices": TASK_INDICES,
+            "max_steps": MAX_STEPS,
+        },
+        "summary": summary,
+        "episodes": episodes,
+    }
+    try:
+        _space_post("/openenv/report_inference", payload)
+    except RequestException as exc:
+        print(f"[DEBUG] Failed to publish inference report: {exc}", flush=True)
+
+
 def main() -> None:
-    api_key = OPENAI_API_KEY or HF_TOKEN
+    api_key = OPENAI_API_KEY or HF_TOKEN or API_KEY
     if not api_key:
-        raise SystemExit("Set OPENAI_API_KEY or …
+        raise SystemExit("Set HF_TOKEN, OPENAI_API_KEY, or API_KEY before running inference.py.")
     if _looks_like_placeholder_api_key(api_key):
         raise SystemExit("Replace the placeholder with your real OpenAI API key.")

@@ -212,6 +279,8 @@ def main() -> None:
     history: list[str] = []
     rewards: list[float] = []
     task_scores: list[float] = []
+    episode_rows: list[dict[str, Any]] = []
+    metrics = EvalMetrics()
     steps_taken = 0

     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
@@ -220,6 +289,7 @@ def main() -> None:
         result = _space_post("/openenv/reset", {"task_index": task_index})
         session_id = str(result["session_id"])
         done = bool(result.get("done", False))
+        task_meta = dict((result.get("observation") or {}).get("task", {}))
         messages: list[dict[str, Any]] = [
             {"role": "system", "content": SYSTEM_PROMPT},
             {
@@ -249,7 +319,7 @@ def main() -> None:
             done = bool(result.get("done", False))
             rewards.append(reward)
             steps_taken += 1
-            log_step(step=steps_taken, action=action, reward=reward, done=done, error=error)
+            log_step(step=steps_taken, action=_format_action(action), reward=reward, done=done, error=error)
             history.append(f"step={steps_taken} task_index={task_index} reward={reward:+.4f}")
             messages.append(assistant_message)
             tool_message = _tool_result_message(assistant_message, result)
@@ -262,10 +332,14 @@ def main() -> None:
         task_answer = str(info.get("task_answer", ""))
         agent_answer = str(info.get("agent_answer", ""))
         task_scores.append(1.0 if agent_answer and agent_answer == task_answer else 0.0)
+        episode_row = _episode_row(result, task_meta)
+        episode_rows.append(episode_row)
+        metrics.add(info, task_type=episode_row["task_type"], graph_f1=float(episode_row["graph_f1"]))

     score = sum(task_scores) / max(1, len(task_scores))
     success = score >= SUCCESS_SCORE_THRESHOLD
     log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+    _publish_inference_report(metrics.summary(), episode_rows)


 if __name__ == "__main__":
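As a sanity check on the `_format_action` helper added above, a minimal sketch of the strings it should produce for each action type (tool and argument names illustrative, not from the repo):

# Assumes _format_action from the diff above; inputs are illustrative.
print(_format_action({"action_type": "ANSWER", "payload": {"answer": "user_bharat"}}))
# answer(user_bharat)
print(_format_action({"action_type": "ADD_EDGE", "payload": {"src": "a", "rel": "knows", "dst": "b"}}))
# add_edge(a,knows,b,1.00) -- confidence defaults to 1.0
print(_format_action({"action_type": "CALL_TOOL", "payload": {"tool_name": "search", "args": {"q": "osint"}}}))
# search(q=osint)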
server.py CHANGED
@@ -14,6 +14,8 @@ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse

 from osint_env.api import (
     OpenEnvActionRequest,
+    OpenEnvInferenceReportRequest,
+    OpenEnvInferenceReportResponse,
     OpenEnvObservationModel,
     OpenEnvResetRequest,
     OpenEnvResponseEnvelope,
@@ -22,6 +24,7 @@ from osint_env.api import (
 from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
 from osint_env.domain.models import Action, ActionType
 from osint_env.env.environment import OSINTEnvironment
+from osint_env.eval.leaderboard import load_leaderboard
 from osint_env.eval.runner import run_evaluation
 from osint_env.llm import build_llm_client
 from osint_env.viz import export_dashboard
@@ -134,6 +137,43 @@ def _store_session(session_id: str, env: OSINTEnvironment) -> None:
     _SESSIONS[session_id] = env


+def _task_lookup(env: OSINTEnvironment) -> dict[str, Any]:
+    return {task.task_id: task for task in env.tasks}
+
+
+def _normalize_episode_rows(env: OSINTEnvironment, episodes: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    tasks_by_id = _task_lookup(env)
+    normalized: list[dict[str, Any]] = []
+    for episode in episodes:
+        row = dict(episode)
+        task = tasks_by_id.get(str(row.get("task_id", "")))
+        if task is not None:
+            row.setdefault("task_type", task.task_type)
+            row.setdefault("question", task.question)
+            row.setdefault("task_answer", task.answer)
+            row.setdefault(
+                "truth_edges",
+                [
+                    {
+                        "src": edge.src,
+                        "rel": edge.rel,
+                        "dst": edge.dst,
+                        "confidence": float(edge.confidence),
+                    }
+                    for edge in task.supporting_edges
+                ],
+            )
+        row.setdefault("pred_edges", [])
+        row.setdefault("reward_components", {})
+        row.setdefault("graph_f1", 0.0)
+        row.setdefault("reward", 0.0)
+        row.setdefault("steps", 0)
+        row.setdefault("tool_calls", 0)
+        row.setdefault("success", 0)
+        normalized.append(row)
+    return normalized
+
+
 @lru_cache(maxsize=1)
 def _base_environment_snapshot() -> dict[str, Any]:
     env = _build_environment()
@@ -409,6 +449,30 @@ def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
     )


+@app.post("/openenv/report_inference", response_model=OpenEnvInferenceReportResponse)
+def openenv_report_inference(request: OpenEnvInferenceReportRequest) -> OpenEnvInferenceReportResponse:
+    env = _build_environment()
+    normalized_episodes = _normalize_episode_rows(env, list(request.episodes))
+    payload = {
+        "run": dict(request.run),
+        "summary": dict(request.summary),
+        "episodes": normalized_episodes,
+    }
+    LATEST_EVALUATION_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
+    LATEST_EVALUATION_OUTPUT.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+    dashboard_path = export_dashboard(
+        env=env,
+        evaluation=payload,
+        leaderboard_records=load_leaderboard("artifacts/baselines/openai_fixed_levels_leaderboard.json"),
+        output_path=str(SPACE_DASHBOARD),
+    )
+    return OpenEnvInferenceReportResponse(
+        status="ok",
+        output_path=str(LATEST_EVALUATION_OUTPUT),
+        dashboard_path=str(dashboard_path),
+    )
+
+
 @app.get("/dashboard")
 def dashboard() -> FileResponse:
     snapshot = _space_snapshot()
src/osint_env/api/__init__.py CHANGED
@@ -1,5 +1,7 @@
 from osint_env.api.models import (
     OpenEnvActionRequest,
+    OpenEnvInferenceReportRequest,
+    OpenEnvInferenceReportResponse,
     OpenEnvObservationModel,
     OpenEnvResetRequest,
     OpenEnvResponseEnvelope,
@@ -8,6 +10,8 @@ from osint_env.api.models import (

 __all__ = [
     "OpenEnvActionRequest",
+    "OpenEnvInferenceReportRequest",
+    "OpenEnvInferenceReportResponse",
     "OpenEnvObservationModel",
     "OpenEnvResetRequest",
     "OpenEnvResponseEnvelope",
src/osint_env/api/models.py CHANGED
@@ -36,3 +36,15 @@ class OpenEnvResponseEnvelope(BaseModel):
     reward: float
     done: bool
     info: dict[str, Any]
+
+
+class OpenEnvInferenceReportRequest(BaseModel):
+    run: dict[str, Any] = Field(default_factory=dict)
+    summary: dict[str, Any]
+    episodes: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class OpenEnvInferenceReportResponse(BaseModel):
+    status: str
+    output_path: str
+    dashboard_path: str
tests/test_server.py CHANGED
@@ -64,6 +64,43 @@ def test_openenv_reset_step_and_state_cycle():
     assert "task_answer" in step_body["info"]


+def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
+    latest_evaluation = tmp_path / "latest_evaluation.json"
+    space_dashboard = tmp_path / "space_dashboard.html"
+
+    monkeypatch.setattr(server, "LATEST_EVALUATION_OUTPUT", latest_evaluation)
+    monkeypatch.setattr(server, "SPACE_DASHBOARD", space_dashboard)
+    monkeypatch.setattr(server, "load_leaderboard", lambda path: [])
+    monkeypatch.setattr(server, "export_dashboard", lambda env, evaluation, leaderboard_records, output_path: str(space_dashboard))
+
+    response = client.post(
+        "/openenv/report_inference",
+        json={
+            "run": {"name": "inference_py_run"},
+            "summary": {"leaderboard_score": 0.75, "task_success_rate": 1.0},
+            "episodes": [
+                {
+                    "task_id": "seed_task_0",
+                    "agent_answer": "user_bharat",
+                    "graph_f1": 0.5,
+                    "reward": 1.2,
+                    "steps": 5,
+                    "tool_calls": 4,
+                    "success": 1,
+                }
+            ],
+        },
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "ok"
+    assert latest_evaluation.exists()
+    stored = json.loads(latest_evaluation.read_text(encoding="utf-8"))
+    assert stored["summary"]["leaderboard_score"] == 0.75
+    assert stored["episodes"][0]["task_id"] == "seed_task_0"
+    assert stored["episodes"][0]["truth_edges"]
+
+
 def test_space_snapshot_prefers_newer_evaluation_payload(tmp_path, monkeypatch):
     baseline_path = tmp_path / "baseline.json"
     evaluation_path = tmp_path / "evaluation.json"