siddeshwar-kagatikar committed on
Commit 1166e01 · 1 Parent(s): f92e1ac

Update dashboard from inference runs

README.md CHANGED
@@ -160,6 +160,9 @@ The script is designed to stay bounded enough for a normal benchmark pass to fin
 
 The submission-ready inference entrypoint is the root `inference.py` file. It talks to the deployed Hugging Face Space over HTTP, uses the OpenAI client for all model calls, and emits structured stdout logs in the `[START]`, `[STEP]`, and `[END]` format.
 
+The script accepts `HF_TOKEN` as the primary auth variable and also supports `OPENAI_API_KEY` or `API_KEY` as local fallbacks.
+After a successful run, `inference.py` also posts the evaluation summary back to the Space so the latest `/dashboard` view reflects that run.
+
 Required environment variables:
 
 - `API_BASE_URL`
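
For orientation, the stdout contract mentioned above looks roughly like the lines below after this change. The layout follows `log_start`, `log_step`, and `log_end` in `inference.py`; the `<TASK_NAME>`/`<BENCHMARK>` placeholders, the hypothetical tool call `search_profiles(query=bharat)`, and the numbers are invented for illustration.

[START] task=<TASK_NAME> env=<BENCHMARK> model=gpt-5.4-mini
[STEP] step=1 action=search_profiles(query=bharat) reward=0.10 done=false error=null
[END] success=true steps=1 score=1.000 rewards=0.10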
inference.py CHANGED
@@ -9,11 +9,13 @@ from openai import OpenAI
 from requests import RequestException
 
 from osint_env.baselines.openai_runner import SYSTEM_PROMPT, build_action_tools
+from osint_env.eval.metrics import EvalMetrics
 
 
 API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.4-mini")
 HF_TOKEN = os.getenv("HF_TOKEN", "")
+API_KEY = os.getenv("API_KEY", "")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 SPACE_URL = os.getenv("SPACE_URL", "https://siddeshwar1625-osint.hf.space").rstrip("/")
 
@@ -32,19 +34,18 @@ def log_start(task: str, env: str, model: str) -> None:
     print(f"[START] task={task} env={env} model={model}", flush=True)
 
 
-def log_step(step: int, action: dict[str, Any], reward: float, done: bool, error: str | None) -> None:
-    action_text = json.dumps(action, sort_keys=True, separators=(",", ":"))
-    error_text = "null" if error is None else json.dumps(error)
+def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
+    error_text = "null" if error is None else str(error)
     print(
-        f"[STEP] step={step} action={action_text} reward={reward:.4f} done={str(bool(done)).lower()} error={error_text}",
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={str(bool(done)).lower()} error={error_text}",
         flush=True,
     )
 
 
 def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
-    rewards_text = json.dumps([round(value, 4) for value in rewards], separators=(",", ":"))
+    rewards_text = ",".join(f"{value:.2f}" for value in rewards)
     print(
-        f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.4f} rewards={rewards_text}",
+        f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.3f} rewards={rewards_text}",
         flush=True,
     )
 
@@ -135,6 +136,28 @@ def _decode_action(tool_name: str, args: dict[str, Any]) -> dict[str, Any]:
     return {"action_type": "CALL_TOOL", "payload": {"tool_name": tool_name, "args": dict(args)}}
 
 
+def _format_action(action: dict[str, Any]) -> str:
+    action_type = str(action.get("action_type", ""))
+    payload = dict(action.get("payload", {}))
+    if action_type == "ANSWER":
+        return f"answer({payload.get('answer', 'unknown')})"
+    if action_type == "ADD_EDGE":
+        return (
+            "add_edge("
+            f"{payload.get('src', '')},"
+            f"{payload.get('rel', '')},"
+            f"{payload.get('dst', '')},"
+            f"{float(payload.get('confidence', 1.0)):.2f}"
+            ")"
+        )
+    tool_name = str(payload.get("tool_name", "tool"))
+    args = dict(payload.get("args", {}))
+    if not args:
+        return f"{tool_name}()"
+    arg_str = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
+    return f"{tool_name}({arg_str})"
+
+
 def _assistant_tool_call_id(message: dict[str, Any]) -> str | None:
     tool_calls = list(message.get("tool_calls", []))
     if not tool_calls:
@@ -192,10 +215,54 @@ def get_model_action(client: OpenAI, messages: list[dict[str, Any]], tools: list
     return {"action_type": "ANSWER", "payload": {"answer": "unknown"}}, {"role": "assistant", "content": ""}
 
 
+def _episode_row(result: dict[str, Any], task_meta: dict[str, Any]) -> dict[str, Any]:
+    info = dict(result.get("info", {}))
+    graph_snapshot = dict((result.get("observation") or {}).get("graph_snapshot", {}))
+    task_type = str(task_meta.get("task_type", "unknown"))
+    task_id = str(task_meta.get("task_id", "unknown"))
+    question = str(task_meta.get("question", ""))
+    task_answer = str(info.get("task_answer", ""))
+    agent_answer = str(info.get("agent_answer", ""))
+    graph_f1 = float(info.get("graph_f1", 0.0) or 0.0)
+    return {
+        "task_id": task_id,
+        "task_type": task_type,
+        "question": question,
+        "task_answer": task_answer,
+        "agent_answer": agent_answer,
+        "graph_f1": graph_f1,
+        "reward": float(info.get("total_reward", 0.0) or 0.0),
+        "steps": int(info.get("step_count", 0) or 0),
+        "tool_calls": int(info.get("tool_calls", 0) or 0),
+        "success": int(bool(agent_answer) and agent_answer == task_answer),
+        "reward_components": dict(info.get("reward_components", {})),
+        "pred_edges": list(graph_snapshot.get("edges", [])),
+        "truth_edges": [],
+    }
+
+
+def _publish_inference_report(summary: dict[str, Any], episodes: list[dict[str, Any]]) -> None:
+    payload = {
+        "run": {
+            "name": "inference_py_run",
+            "model": MODEL_NAME,
+            "space_url": SPACE_URL,
+            "task_indices": TASK_INDICES,
+            "max_steps": MAX_STEPS,
+        },
+        "summary": summary,
+        "episodes": episodes,
+    }
+    try:
+        _space_post("/openenv/report_inference", payload)
+    except RequestException as exc:
+        print(f"[DEBUG] Failed to publish inference report: {exc}", flush=True)
+
+
 def main() -> None:
-    api_key = OPENAI_API_KEY or HF_TOKEN
+    api_key = OPENAI_API_KEY or HF_TOKEN or API_KEY
     if not api_key:
-        raise SystemExit("Set OPENAI_API_KEY or HF_TOKEN before running inference.py.")
+        raise SystemExit("Set HF_TOKEN, OPENAI_API_KEY, or API_KEY before running inference.py.")
     if _looks_like_placeholder_api_key(api_key):
         raise SystemExit("Replace the placeholder with your real OpenAI API key.")
 
@@ -212,6 +279,8 @@ def main() -> None:
     history: list[str] = []
     rewards: list[float] = []
     task_scores: list[float] = []
+    episode_rows: list[dict[str, Any]] = []
+    metrics = EvalMetrics()
     steps_taken = 0
 
     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
@@ -220,6 +289,7 @@ def main() -> None:
         result = _space_post("/openenv/reset", {"task_index": task_index})
         session_id = str(result["session_id"])
        done = bool(result.get("done", False))
+        task_meta = dict((result.get("observation") or {}).get("task", {}))
         messages: list[dict[str, Any]] = [
             {"role": "system", "content": SYSTEM_PROMPT},
             {
@@ -249,7 +319,7 @@ def main() -> None:
             done = bool(result.get("done", False))
             rewards.append(reward)
             steps_taken += 1
-            log_step(step=steps_taken, action=action, reward=reward, done=done, error=error)
+            log_step(step=steps_taken, action=_format_action(action), reward=reward, done=done, error=error)
             history.append(f"step={steps_taken} task_index={task_index} reward={reward:+.4f}")
             messages.append(assistant_message)
             tool_message = _tool_result_message(assistant_message, result)
@@ -262,10 +332,14 @@ def main() -> None:
         task_answer = str(info.get("task_answer", ""))
         agent_answer = str(info.get("agent_answer", ""))
         task_scores.append(1.0 if agent_answer and agent_answer == task_answer else 0.0)
+        episode_row = _episode_row(result, task_meta)
+        episode_rows.append(episode_row)
+        metrics.add(info, task_type=episode_row["task_type"], graph_f1=float(episode_row["graph_f1"]))
 
     score = sum(task_scores) / max(1, len(task_scores))
     success = score >= SUCCESS_SCORE_THRESHOLD
     log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+    _publish_inference_report(metrics.summary(), episode_rows)
 
 
 if __name__ == "__main__":
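
For reference, a sketch of the report body that `_publish_inference_report` assembles and posts to the Space. The keys mirror `_episode_row` and the `run` block in the diff above; the task indices, step limit, scores, and answers below are invented for illustration.

payload = {
    "run": {
        "name": "inference_py_run",
        "model": "gpt-5.4-mini",
        "space_url": "https://siddeshwar1625-osint.hf.space",
        "task_indices": [0, 1],  # illustrative
        "max_steps": 20,  # illustrative
    },
    "summary": {"leaderboard_score": 0.75, "task_success_rate": 1.0},
    "episodes": [
        {
            "task_id": "seed_task_0",
            "task_type": "unknown",
            "question": "",
            "task_answer": "user_bharat",
            "agent_answer": "user_bharat",
            "graph_f1": 0.5,
            "reward": 1.2,
            "steps": 5,
            "tool_calls": 4,
            "success": 1,
            "reward_components": {},
            "pred_edges": [],
            "truth_edges": [],
        }
    ],
}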
server.py CHANGED
@@ -14,6 +14,8 @@ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
 
 from osint_env.api import (
     OpenEnvActionRequest,
+    OpenEnvInferenceReportRequest,
+    OpenEnvInferenceReportResponse,
     OpenEnvObservationModel,
     OpenEnvResetRequest,
     OpenEnvResponseEnvelope,
@@ -22,6 +24,7 @@ from osint_env.api import (
 from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
 from osint_env.domain.models import Action, ActionType
 from osint_env.env.environment import OSINTEnvironment
+from osint_env.eval.leaderboard import load_leaderboard
 from osint_env.eval.runner import run_evaluation
 from osint_env.llm import build_llm_client
 from osint_env.viz import export_dashboard
@@ -134,6 +137,43 @@ def _store_session(session_id: str, env: OSINTEnvironment) -> None:
     _SESSIONS[session_id] = env
 
 
+def _task_lookup(env: OSINTEnvironment) -> dict[str, Any]:
+    return {task.task_id: task for task in env.tasks}
+
+
+def _normalize_episode_rows(env: OSINTEnvironment, episodes: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    tasks_by_id = _task_lookup(env)
+    normalized: list[dict[str, Any]] = []
+    for episode in episodes:
+        row = dict(episode)
+        task = tasks_by_id.get(str(row.get("task_id", "")))
+        if task is not None:
+            row.setdefault("task_type", task.task_type)
+            row.setdefault("question", task.question)
+            row.setdefault("task_answer", task.answer)
+            row.setdefault(
+                "truth_edges",
+                [
+                    {
+                        "src": edge.src,
+                        "rel": edge.rel,
+                        "dst": edge.dst,
+                        "confidence": float(edge.confidence),
+                    }
+                    for edge in task.supporting_edges
+                ],
+            )
+        row.setdefault("pred_edges", [])
+        row.setdefault("reward_components", {})
+        row.setdefault("graph_f1", 0.0)
+        row.setdefault("reward", 0.0)
+        row.setdefault("steps", 0)
+        row.setdefault("tool_calls", 0)
+        row.setdefault("success", 0)
+        normalized.append(row)
+    return normalized
+
+
 @lru_cache(maxsize=1)
 def _base_environment_snapshot() -> dict[str, Any]:
     env = _build_environment()
@@ -409,6 +449,30 @@ def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
     )
 
 
+@app.post("/openenv/report_inference", response_model=OpenEnvInferenceReportResponse)
+def openenv_report_inference(request: OpenEnvInferenceReportRequest) -> OpenEnvInferenceReportResponse:
+    env = _build_environment()
+    normalized_episodes = _normalize_episode_rows(env, list(request.episodes))
+    payload = {
+        "run": dict(request.run),
+        "summary": dict(request.summary),
+        "episodes": normalized_episodes,
+    }
+    LATEST_EVALUATION_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
+    LATEST_EVALUATION_OUTPUT.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+    dashboard_path = export_dashboard(
+        env=env,
+        evaluation=payload,
+        leaderboard_records=load_leaderboard("artifacts/baselines/openai_fixed_levels_leaderboard.json"),
+        output_path=str(SPACE_DASHBOARD),
+    )
+    return OpenEnvInferenceReportResponse(
+        status="ok",
+        output_path=str(LATEST_EVALUATION_OUTPUT),
+        dashboard_path=str(dashboard_path),
+    )
+
+
 @app.get("/dashboard")
 def dashboard() -> FileResponse:
     snapshot = _space_snapshot()
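
As a quick sanity check, the new endpoint can be exercised from outside the Space roughly as below. The route, request keys, and response fields come from the handler above, and the URL default comes from `inference.py`; the run name and episode values are invented.

import requests

space_url = "https://siddeshwar1625-osint.hf.space"  # default SPACE_URL from inference.py
report = {
    "run": {"name": "manual_check"},  # invented run name
    "summary": {"leaderboard_score": 0.75},
    "episodes": [{"task_id": "seed_task_0", "agent_answer": "user_bharat", "success": 1}],
}
response = requests.post(f"{space_url}/openenv/report_inference", json=report, timeout=60)
response.raise_for_status()
print(response.json()["dashboard_path"])  # the refreshed dashboard is then served at GET /dashboard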
src/osint_env/api/__init__.py CHANGED
@@ -1,5 +1,7 @@
 from osint_env.api.models import (
     OpenEnvActionRequest,
+    OpenEnvInferenceReportRequest,
+    OpenEnvInferenceReportResponse,
     OpenEnvObservationModel,
     OpenEnvResetRequest,
     OpenEnvResponseEnvelope,
@@ -8,6 +10,8 @@ from osint_env.api.models import (
 
 __all__ = [
     "OpenEnvActionRequest",
+    "OpenEnvInferenceReportRequest",
+    "OpenEnvInferenceReportResponse",
     "OpenEnvObservationModel",
     "OpenEnvResetRequest",
     "OpenEnvResponseEnvelope",
src/osint_env/api/models.py CHANGED
@@ -36,3 +36,15 @@ class OpenEnvResponseEnvelope(BaseModel):
     reward: float
     done: bool
     info: dict[str, Any]
+
+
+class OpenEnvInferenceReportRequest(BaseModel):
+    run: dict[str, Any] = Field(default_factory=dict)
+    summary: dict[str, Any]
+    episodes: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class OpenEnvInferenceReportResponse(BaseModel):
+    status: str
+    output_path: str
+    dashboard_path: str
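
Based only on the field declarations above, `summary` is the one required field while `run` and `episodes` fall back to empty defaults; a minimal check:

from osint_env.api.models import OpenEnvInferenceReportRequest

request = OpenEnvInferenceReportRequest(summary={"leaderboard_score": 0.75})
assert request.run == {}
assert request.episodes == []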
tests/test_server.py CHANGED
@@ -64,6 +64,43 @@ def test_openenv_reset_step_and_state_cycle():
     assert "task_answer" in step_body["info"]
 
 
+def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
+    latest_evaluation = tmp_path / "latest_evaluation.json"
+    space_dashboard = tmp_path / "space_dashboard.html"
+
+    monkeypatch.setattr(server, "LATEST_EVALUATION_OUTPUT", latest_evaluation)
+    monkeypatch.setattr(server, "SPACE_DASHBOARD", space_dashboard)
+    monkeypatch.setattr(server, "load_leaderboard", lambda path: [])
+    monkeypatch.setattr(server, "export_dashboard", lambda env, evaluation, leaderboard_records, output_path: str(space_dashboard))
+
+    response = client.post(
+        "/openenv/report_inference",
+        json={
+            "run": {"name": "inference_py_run"},
+            "summary": {"leaderboard_score": 0.75, "task_success_rate": 1.0},
+            "episodes": [
+                {
+                    "task_id": "seed_task_0",
+                    "agent_answer": "user_bharat",
+                    "graph_f1": 0.5,
+                    "reward": 1.2,
+                    "steps": 5,
+                    "tool_calls": 4,
+                    "success": 1,
+                }
+            ],
+        },
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "ok"
+    assert latest_evaluation.exists()
+    stored = json.loads(latest_evaluation.read_text(encoding="utf-8"))
+    assert stored["summary"]["leaderboard_score"] == 0.75
+    assert stored["episodes"][0]["task_id"] == "seed_task_0"
+    assert stored["episodes"][0]["truth_edges"]
+
+
 def test_space_snapshot_prefers_newer_evaluation_payload(tmp_path, monkeypatch):
     baseline_path = tmp_path / "baseline.json"
     evaluation_path = tmp_path / "evaluation.json"