Siddeshwar1625 committed on
Commit
5b2fec4
·
1 Parent(s): 6f6ecdc

fixed inference

Browse files
.gitignore CHANGED
@@ -3,3 +3,4 @@ blueprint.txt
3
  *.egg-info
4
  artifacts/*
5
  *.html
 
 
3
  *.egg-info
4
  artifacts/*
5
  *.html
6
+ .venv/
datasets/fixed_levels/leaderboard_fixed_levels.json CHANGED
@@ -121,5 +121,298 @@
121
  },
122
  "run_id": "run_0003",
123
  "run_name": "fixed_levels_qwen_swarm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
125
  ]
 
121
  },
122
  "run_id": "run_0003",
123
  "run_name": "fixed_levels_qwen_swarm"
124
+ },
125
+ {
126
+ "config": {
127
+ "max_agents": 3,
128
+ "max_breadth": 2,
129
+ "max_depth": 2,
130
+ "max_steps": 24,
131
+ "max_width": 2,
132
+ "seed": 2026,
133
+ "seeded_questions": 30,
134
+ "swarm_enabled": true
135
+ },
136
+ "created_at": "2026-04-06T18:29:39+00:00",
137
+ "episodes": 30,
138
+ "metrics": {
139
+ "avg_compactness_reward": 0.0,
140
+ "avg_connectivity_gain_reward": 0.2000000000000001,
141
+ "avg_connectivity_reward": 0.12999999999999998,
142
+ "avg_diversity_reward": 0.12433333333333325,
143
+ "avg_entity_informativeness_reward": -0.02515191749984708,
144
+ "avg_format_reward": 0.15,
145
+ "avg_graph_f1": 0.2916528337385394,
146
+ "avg_knowledge_carrier_reward": 0.5,
147
+ "avg_knowledge_indexing_reward": 0.11539120363588044,
148
+ "avg_relation_informativeness_reward": 0.0769903534735767,
149
+ "avg_reward": 4.460667345528021,
150
+ "avg_soft_shaping_reward": 0.3,
151
+ "avg_spawn_count": 4.0,
152
+ "avg_spawn_critical_steps": 6.0,
153
+ "avg_steps_to_solution": 9.0,
154
+ "deanonymization_accuracy": 0.0,
155
+ "leaderboard_score": 0.6269168609961595,
156
+ "retrieval_signal": 0.7153869212725582,
157
+ "spawn_completion_rate": 1.0,
158
+ "spawn_signal": 0.6666666666666666,
159
+ "structural_signal": 0.5815176871947458,
160
+ "task_success_rate": 1.0,
161
+ "tool_efficiency": 0.5
162
+ },
163
+ "run_id": "run_0004",
164
+ "run_name": "fixed_levels_qwen_swarm"
165
+ },
166
+ {
167
+ "config": {
168
+ "max_agents": 3,
169
+ "max_breadth": 2,
170
+ "max_depth": 2,
171
+ "max_steps": 24,
172
+ "max_width": 2,
173
+ "seed": 2026,
174
+ "seeded_questions": 30,
175
+ "swarm_enabled": true
176
+ },
177
+ "created_at": "2026-04-06T18:33:06+00:00",
178
+ "episodes": 2,
179
+ "metrics": {
180
+ "avg_compactness_reward": 0.0,
181
+ "avg_connectivity_gain_reward": 0.2,
182
+ "avg_connectivity_reward": -0.15,
183
+ "avg_diversity_reward": 0.13833333333333334,
184
+ "avg_entity_informativeness_reward": -0.026628229842114173,
185
+ "avg_format_reward": 0.15,
186
+ "avg_graph_f1": 0.6190476190476191,
187
+ "avg_knowledge_carrier_reward": 0.5,
188
+ "avg_knowledge_indexing_reward": 0.10681818181818181,
189
+ "avg_relation_informativeness_reward": 0.048120982127120335,
190
+ "avg_reward": 4.334953339016039,
191
+ "avg_soft_shaping_reward": 0.3,
192
+ "avg_spawn_count": 4.0,
193
+ "avg_spawn_critical_steps": 6.0,
194
+ "avg_steps_to_solution": 9.0,
195
+ "deanonymization_accuracy": 0.0,
196
+ "leaderboard_score": 0.685242999396977,
197
+ "retrieval_signal": 0.7123863636363637,
198
+ "spawn_completion_rate": 1.0,
199
+ "spawn_signal": 0.6666666666666666,
200
+ "structural_signal": 0.5075485504570012,
201
+ "task_success_rate": 1.0,
202
+ "tool_efficiency": 0.5
203
+ },
204
+ "run_id": "run_0005",
205
+ "run_name": "fixed_levels_qwen_swarm"
206
+ },
207
+ {
208
+ "config": {
209
+ "max_agents": 1,
210
+ "max_breadth": 2,
211
+ "max_depth": 2,
212
+ "max_steps": 24,
213
+ "max_width": 2,
214
+ "seed": 2026,
215
+ "seeded_questions": 30,
216
+ "swarm_enabled": true
217
+ },
218
+ "created_at": "2026-04-06T18:54:52+00:00",
219
+ "episodes": 1,
220
+ "metrics": {
221
+ "avg_compactness_reward": 0.0,
222
+ "avg_connectivity_gain_reward": 0.1,
223
+ "avg_connectivity_reward": -0.3,
224
+ "avg_diversity_reward": 0.08,
225
+ "avg_entity_informativeness_reward": -0.02450859227728558,
226
+ "avg_format_reward": 0.15,
227
+ "avg_graph_f1": 0.33333333333333337,
228
+ "avg_knowledge_carrier_reward": 0.5,
229
+ "avg_knowledge_indexing_reward": 0.08181818181818182,
230
+ "avg_relation_informativeness_reward": 0.04353540016904645,
231
+ "avg_reward": 3.037246438342494,
232
+ "avg_soft_shaping_reward": 0.15,
233
+ "avg_spawn_count": 2.0,
234
+ "avg_spawn_critical_steps": 6.0,
235
+ "avg_steps_to_solution": 5.0,
236
+ "deanonymization_accuracy": 0.0,
237
+ "leaderboard_score": 0.6201263424948862,
238
+ "retrieval_signal": 0.7036363636363637,
239
+ "spawn_completion_rate": 1.0,
240
+ "spawn_signal": 0.6666666666666666,
241
+ "structural_signal": 0.45080536157835216,
242
+ "task_success_rate": 1.0,
243
+ "tool_efficiency": 0.5
244
+ },
245
+ "run_id": "run_0006",
246
+ "run_name": "fixed_levels_qwen_swarm"
247
+ },
248
+ {
249
+ "config": {
250
+ "max_agents": 1,
251
+ "max_breadth": 2,
252
+ "max_depth": 2,
253
+ "max_steps": 24,
254
+ "max_width": 2,
255
+ "seed": 2026,
256
+ "seeded_questions": 30,
257
+ "swarm_enabled": true
258
+ },
259
+ "created_at": "2026-04-06T19:22:57+00:00",
260
+ "episodes": 1,
261
+ "metrics": {
262
+ "avg_compactness_reward": 0.0,
263
+ "avg_connectivity_gain_reward": 0.1,
264
+ "avg_connectivity_reward": -0.3,
265
+ "avg_diversity_reward": 0.08,
266
+ "avg_entity_informativeness_reward": -0.005263146336646693,
267
+ "avg_format_reward": 0.15,
268
+ "avg_graph_f1": 0.33333333333333337,
269
+ "avg_knowledge_carrier_reward": 0.5,
270
+ "avg_knowledge_indexing_reward": 0.08181818181818182,
271
+ "avg_relation_informativeness_reward": 0.044276243254877785,
272
+ "avg_reward": 3.057232727368964,
273
+ "avg_soft_shaping_reward": 0.15,
274
+ "avg_spawn_count": 2.0,
275
+ "avg_spawn_critical_steps": 6.0,
276
+ "avg_steps_to_solution": 5.0,
277
+ "deanonymization_accuracy": 0.0,
278
+ "leaderboard_score": 0.6205293479318178,
279
+ "retrieval_signal": 0.7036363636363637,
280
+ "spawn_completion_rate": 1.0,
281
+ "spawn_signal": 0.6666666666666666,
282
+ "structural_signal": 0.4548026193836462,
283
+ "task_success_rate": 1.0,
284
+ "tool_efficiency": 0.5
285
+ },
286
+ "run_id": "run_0007",
287
+ "run_name": "fixed_levels_qwen_swarm"
288
+ },
289
+ {
290
+ "config": {
291
+ "llm_model": "qwen3:1.7b",
292
+ "llm_provider": "ollama",
293
+ "max_agents": 1,
294
+ "max_breadth": 2,
295
+ "max_depth": 2,
296
+ "max_steps": 24,
297
+ "max_width": 2,
298
+ "seed": 2026,
299
+ "seeded_questions": 30,
300
+ "swarm_enabled": true
301
+ },
302
+ "created_at": "2026-04-06T19:48:33+00:00",
303
+ "episodes": 3,
304
+ "metrics": {
305
+ "avg_compactness_reward": 0.0,
306
+ "avg_connectivity_gain_reward": 0.10000000000000002,
307
+ "avg_connectivity_reward": -0.09999999999999999,
308
+ "avg_diversity_reward": 0.08,
309
+ "avg_entity_informativeness_reward": -0.028683816517602444,
310
+ "avg_format_reward": 0.15,
311
+ "avg_graph_f1": 0.15537340619307835,
312
+ "avg_knowledge_carrier_reward": 0.5,
313
+ "avg_knowledge_indexing_reward": 0.07932190760059611,
314
+ "avg_relation_informativeness_reward": 0.044225025032092045,
315
+ "avg_reward": 3.1324990406542437,
316
+ "avg_soft_shaping_reward": 0.15,
317
+ "avg_spawn_count": 2.0,
318
+ "avg_spawn_critical_steps": 6.0,
319
+ "avg_steps_to_solution": 5.0,
320
+ "deanonymization_accuracy": 0.0,
321
+ "leaderboard_score": 0.5890485416309927,
322
+ "retrieval_signal": 0.7027626676602087,
323
+ "spawn_completion_rate": 1.0,
324
+ "spawn_signal": 0.6666666666666666,
325
+ "structural_signal": 0.5001082417028979,
326
+ "task_success_rate": 1.0,
327
+ "tool_efficiency": 0.5
328
+ },
329
+ "run_id": "run_0008",
330
+ "run_name": "fixed_levels_qwen_swarm"
331
+ },
332
+ {
333
+ "config": {
334
+ "llm_model": "qwen3:1.7b",
335
+ "llm_provider": "ollama",
336
+ "max_agents": 1,
337
+ "max_breadth": 2,
338
+ "max_depth": 2,
339
+ "max_steps": 24,
340
+ "max_width": 2,
341
+ "seed": 2026,
342
+ "seeded_questions": 30,
343
+ "swarm_enabled": true
344
+ },
345
+ "created_at": "2026-04-06T19:55:08+00:00",
346
+ "episodes": 1,
347
+ "metrics": {
348
+ "avg_compactness_reward": 0.0,
349
+ "avg_connectivity_gain_reward": 0.1,
350
+ "avg_connectivity_reward": -0.3,
351
+ "avg_diversity_reward": 0.08,
352
+ "avg_entity_informativeness_reward": -0.005263146336646693,
353
+ "avg_format_reward": 0.15,
354
+ "avg_graph_f1": 0.33333333333333337,
355
+ "avg_knowledge_carrier_reward": 0.5,
356
+ "avg_knowledge_indexing_reward": 0.08181818181818182,
357
+ "avg_relation_informativeness_reward": 0.04406984773661544,
358
+ "avg_reward": 3.0570263318507016,
359
+ "avg_soft_shaping_reward": 0.15,
360
+ "avg_spawn_count": 2.0,
361
+ "avg_spawn_critical_steps": 6.0,
362
+ "avg_steps_to_solution": 5.0,
363
+ "deanonymization_accuracy": 0.0,
364
+ "leaderboard_score": 0.6205251901591228,
365
+ "retrieval_signal": 0.7036363636363637,
366
+ "spawn_completion_rate": 1.0,
367
+ "spawn_signal": 0.6666666666666666,
368
+ "structural_signal": 0.45476134027999376,
369
+ "task_success_rate": 1.0,
370
+ "tool_efficiency": 0.5
371
+ },
372
+ "run_id": "run_0009",
373
+ "run_name": "fixed_levels_qwen_swarm"
374
+ },
375
+ {
376
+ "config": {
377
+ "llm_model": "qwen3:1.7b",
378
+ "llm_provider": "ollama",
379
+ "max_agents": 1,
380
+ "max_breadth": 2,
381
+ "max_depth": 2,
382
+ "max_steps": 24,
383
+ "max_width": 2,
384
+ "seed": 2026,
385
+ "seeded_questions": 30,
386
+ "swarm_enabled": true
387
+ },
388
+ "created_at": "2026-04-06T20:01:34+00:00",
389
+ "episodes": 1,
390
+ "metrics": {
391
+ "avg_compactness_reward": 0.0,
392
+ "avg_connectivity_gain_reward": 0.1,
393
+ "avg_connectivity_reward": -0.3,
394
+ "avg_diversity_reward": 0.08,
395
+ "avg_entity_informativeness_reward": -0.020826953461399098,
396
+ "avg_format_reward": 0.15,
397
+ "avg_graph_f1": 0.33333333333333337,
398
+ "avg_knowledge_carrier_reward": 0.5,
399
+ "avg_knowledge_indexing_reward": 0.08181818181818182,
400
+ "avg_relation_informativeness_reward": 0.04348043923536236,
401
+ "avg_reward": 3.040873116224696,
402
+ "avg_soft_shaping_reward": 0.15,
403
+ "avg_spawn_count": 2.0,
404
+ "avg_spawn_critical_steps": 6.0,
405
+ "avg_steps_to_solution": 5.0,
406
+ "deanonymization_accuracy": 0.0,
407
+ "leaderboard_score": 0.6201995296517067,
408
+ "retrieval_signal": 0.7036363636363637,
409
+ "spawn_completion_rate": 1.0,
410
+ "spawn_signal": 0.6666666666666666,
411
+ "structural_signal": 0.45153069715479266,
412
+ "task_success_rate": 1.0,
413
+ "tool_efficiency": 0.5
414
+ },
415
+ "run_id": "run_0010",
416
+ "run_name": "fixed_levels_qwen_swarm"
417
  }
418
  ]
inference.py CHANGED
@@ -2,34 +2,73 @@ from __future__ import annotations
2
 
3
  import json
4
  import os
 
5
  from typing import Any
6
 
7
- import requests
8
- from openai import OpenAI
9
- from requests import RequestException
10
-
11
- from osint_env.baselines.openai_runner import SYSTEM_PROMPT, build_action_tools
 
 
12
  from osint_env.eval.metrics import EvalMetrics
 
 
13
 
14
 
15
- API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
16
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.4-mini")
17
- HF_TOKEN = os.getenv("HF_TOKEN", "")
18
- API_KEY = os.getenv("API_KEY", "")
 
 
 
19
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
20
- SPACE_URL = os.getenv("SPACE_URL", "https://siddeshwar1625-osint.hf.space").rstrip("/")
21
-
22
- MAX_STEPS = int(os.getenv("MAX_STEPS", "8"))
23
- TEMPERATURE = float(os.getenv("TEMPERATURE", "0.0"))
24
- MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
25
- REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "90"))
26
- TASK_INDICES = [int(part.strip()) for part in os.getenv("TASK_INDICES", "0,10,20").split(",") if part.strip()]
27
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  BENCHMARK = "osint-openenv"
30
  TASK_NAME = "fixed_levels_easy_mid_hard"
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def log_start(task: str, env: str, model: str) -> None:
34
  print(f"[START] task={task} env={env} model={model}", flush=True)
35
 
@@ -50,301 +89,257 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
50
  )
51
 
52
 
53
- def _looks_like_placeholder_api_key(value: str) -> bool:
54
- token = str(value or "").strip().lower()
55
- if not token:
56
- return True
57
- placeholder_markers = [
58
- "your_openai_api_key",
59
- "your-key",
60
- "your_key",
61
- "your real",
62
- "real-openai-key",
63
- "replace-me",
64
- "changeme",
65
- "example",
66
- "<api-key>",
67
- ]
68
- if token.startswith("your_") or token.startswith("sk-your-"):
69
- return True
70
- return any(marker in token for marker in placeholder_markers)
71
-
72
-
73
- def _supports_reasoning_effort_in_chat_completions(model: str) -> bool:
74
- model_name = str(model).strip().lower()
75
- if model_name.startswith("gpt-5.4-mini"):
76
- return False
77
- return model_name.startswith("gpt-5")
78
-
79
-
80
- def _request_kwargs(messages: list[dict[str, Any]], tools: list[dict[str, Any]]) -> dict[str, Any]:
81
- kwargs: dict[str, Any] = {
82
- "model": MODEL_NAME,
83
- "messages": messages,
84
- "tools": tools,
85
- "tool_choice": "required",
86
- "parallel_tool_calls": False,
87
- }
88
- if MODEL_NAME.strip().lower().startswith("gpt-5"):
89
- kwargs["max_completion_tokens"] = MAX_TOKENS
90
- if _supports_reasoning_effort_in_chat_completions(MODEL_NAME):
91
- kwargs["reasoning_effort"] = "none"
92
- else:
93
- kwargs["temperature"] = TEMPERATURE
94
- kwargs["max_tokens"] = MAX_TOKENS
95
- return kwargs
96
-
97
-
98
- def _message_text(message: Any) -> str:
99
- content = getattr(message, "content", "")
100
- if isinstance(content, str):
101
- return content
102
- if isinstance(content, list):
103
- parts: list[str] = []
104
- for item in content:
105
- if isinstance(item, dict) and item.get("type") == "text":
106
- parts.append(str(item.get("text", "")))
107
- return "\n".join(part for part in parts if part)
108
- return str(content or "")
109
-
110
-
111
- def _space_get(path: str) -> dict[str, Any]:
112
- response = requests.get(f"{SPACE_URL}{path}", timeout=REQUEST_TIMEOUT)
113
- response.raise_for_status()
114
- return response.json()
115
-
116
-
117
- def _space_post(path: str, payload: dict[str, Any]) -> dict[str, Any]:
118
- response = requests.post(f"{SPACE_URL}{path}", json=payload, timeout=REQUEST_TIMEOUT)
119
- response.raise_for_status()
120
- return response.json()
121
-
122
-
123
- def _decode_action(tool_name: str, args: dict[str, Any]) -> dict[str, Any]:
124
- if tool_name == "submit_answer":
125
- return {"action_type": "ANSWER", "payload": {"answer": str(args.get("answer", "")).strip()}}
126
- if tool_name == "add_edge":
127
- return {
128
- "action_type": "ADD_EDGE",
129
- "payload": {
130
- "src": str(args.get("src", "")).strip(),
131
- "rel": str(args.get("rel", "")).strip(),
132
- "dst": str(args.get("dst", "")).strip(),
133
- "confidence": float(args.get("confidence", 1.0)),
134
- },
135
- }
136
- return {"action_type": "CALL_TOOL", "payload": {"tool_name": tool_name, "args": dict(args)}}
137
 
 
 
138
 
139
- def _format_action(action: dict[str, Any]) -> str:
140
- action_type = str(action.get("action_type", ""))
141
- payload = dict(action.get("payload", {}))
142
- if action_type == "ANSWER":
143
- return f"answer({payload.get('answer', 'unknown')})"
144
- if action_type == "ADD_EDGE":
145
- return (
146
- "add_edge("
147
- f"{payload.get('src', '')},"
148
- f"{payload.get('rel', '')},"
149
- f"{payload.get('dst', '')},"
150
- f"{float(payload.get('confidence', 1.0)):.2f}"
151
- ")"
152
- )
153
- tool_name = str(payload.get("tool_name", "tool"))
154
- args = dict(payload.get("args", {}))
155
- if not args:
156
- return f"{tool_name}()"
157
- arg_str = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
158
- return f"{tool_name}({arg_str})"
159
 
 
 
 
160
 
161
- def _assistant_tool_call_id(message: dict[str, Any]) -> str | None:
162
- tool_calls = list(message.get("tool_calls", []))
163
- if not tool_calls:
164
- return None
165
- tool_call_id = tool_calls[0].get("id")
166
- return str(tool_call_id) if tool_call_id else None
167
 
 
 
168
 
169
- def _tool_result_message(assistant_message: dict[str, Any], result: dict[str, Any]) -> dict[str, Any] | None:
170
- tool_call_id = _assistant_tool_call_id(assistant_message)
171
- if not tool_call_id:
172
- return None
173
- return {
174
- "role": "tool",
175
- "tool_call_id": tool_call_id,
176
- "content": json.dumps(result, sort_keys=True),
177
- }
178
 
 
 
179
 
180
- def get_model_action(client: OpenAI, messages: list[dict[str, Any]], tools: list[dict[str, Any]]) -> tuple[dict[str, Any], dict[str, Any]]:
181
- try:
182
- completion = client.chat.completions.create(**_request_kwargs(messages, tools))
183
- message = completion.choices[0].message
184
- tool_calls = list(message.tool_calls or [])
185
- if not tool_calls:
186
- fallback_answer = _message_text(message).strip() or "unknown"
187
- return {"action_type": "ANSWER", "payload": {"answer": fallback_answer}}, {
188
- "role": "assistant",
189
- "content": _message_text(message),
190
- }
191
- tool_call = tool_calls[0]
192
- try:
193
- args = json.loads(tool_call.function.arguments or "{}")
194
- except json.JSONDecodeError:
195
- args = {}
196
- if not isinstance(args, dict):
197
- args = {}
198
- assistant_message = {
199
- "role": "assistant",
200
- "content": _message_text(message),
201
- "tool_calls": [
202
- {
203
- "id": tool_call.id,
204
- "type": "function",
205
- "function": {
206
- "name": str(tool_call.function.name),
207
- "arguments": json.dumps(args, sort_keys=True),
208
- },
209
- }
210
- ],
211
  }
212
- return _decode_action(str(tool_call.function.name), args), assistant_message
213
- except Exception as exc:
214
- print(f"[DEBUG] Model request failed: {exc}", flush=True)
215
- return {"action_type": "ANSWER", "payload": {"answer": "unknown"}}, {"role": "assistant", "content": ""}
216
-
217
-
218
- def _episode_row(result: dict[str, Any], task_meta: dict[str, Any]) -> dict[str, Any]:
219
- info = dict(result.get("info", {}))
220
- graph_snapshot = dict((result.get("observation") or {}).get("graph_snapshot", {}))
221
- task_type = str(task_meta.get("task_type", "unknown"))
222
- task_id = str(task_meta.get("task_id", "unknown"))
223
- question = str(task_meta.get("question", ""))
224
- task_answer = str(info.get("task_answer", ""))
225
- agent_answer = str(info.get("agent_answer", ""))
226
- graph_f1 = float(info.get("graph_f1", 0.0) or 0.0)
227
  return {
228
- "task_id": task_id,
229
- "task_type": task_type,
230
- "question": question,
231
- "task_answer": task_answer,
232
- "agent_answer": agent_answer,
233
  "graph_f1": graph_f1,
234
  "reward": float(info.get("total_reward", 0.0) or 0.0),
235
  "steps": int(info.get("step_count", 0) or 0),
236
  "tool_calls": int(info.get("tool_calls", 0) or 0),
237
- "success": int(bool(agent_answer) and agent_answer == task_answer),
238
  "reward_components": dict(info.get("reward_components", {})),
239
- "pred_edges": list(graph_snapshot.get("edges", [])),
240
- "truth_edges": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  }
242
 
243
 
244
- def _publish_inference_report(summary: dict[str, Any], episodes: list[dict[str, Any]]) -> None:
245
- payload = {
246
- "run": {
247
- "name": "inference_py_run",
248
- "model": MODEL_NAME,
249
- "space_url": SPACE_URL,
250
- "task_indices": TASK_INDICES,
251
- "max_steps": MAX_STEPS,
252
- },
253
- "summary": summary,
254
- "episodes": episodes,
255
- }
256
- try:
257
- _space_post("/openenv/report_inference", payload)
258
- except RequestException as exc:
259
- print(f"[DEBUG] Failed to publish inference report: {exc}", flush=True)
260
 
 
 
261
 
262
- def main() -> None:
263
- if not str(API_BASE_URL).strip():
264
- raise SystemExit("Set API_BASE_URL before running inference.py.")
265
- if not str(MODEL_NAME).strip():
266
- raise SystemExit("Set MODEL_NAME before running inference.py.")
267
-
268
- api_key = HF_TOKEN or OPENAI_API_KEY or API_KEY
269
- if not api_key:
270
- raise SystemExit("Set HF_TOKEN (or OPENAI_API_KEY/API_KEY) before running inference.py.")
271
- if _looks_like_placeholder_api_key(api_key):
272
- raise SystemExit("Replace the placeholder with your real OpenAI API key.")
273
-
274
- try:
275
- ping = _space_get("/healthz")
276
- if ping.get("status") != "ok":
277
- raise SystemExit(f"Unexpected healthz payload: {ping}")
278
- except RequestException as exc:
279
- raise SystemExit(f"Space ping failed: {exc}") from exc
280
-
281
- client = OpenAI(base_url=API_BASE_URL, api_key=api_key, timeout=REQUEST_TIMEOUT)
282
- tools = build_action_tools()
283
-
284
- history: list[str] = []
285
- rewards: list[float] = []
286
- task_scores: list[float] = []
287
- episode_rows: list[dict[str, Any]] = []
 
 
 
 
 
 
 
 
 
288
  metrics = EvalMetrics()
 
 
289
  steps_taken = 0
290
 
291
- log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
 
 
 
292
 
293
- for task_index in TASK_INDICES:
294
- result = _space_post("/openenv/reset", {"task_index": task_index})
295
- session_id = str(result["session_id"])
296
- done = bool(result.get("done", False))
297
- task_meta = dict((result.get("observation") or {}).get("task", {}))
298
- messages: list[dict[str, Any]] = [
299
- {"role": "system", "content": SYSTEM_PROMPT},
300
- {
301
- "role": "user",
302
- "content": json.dumps(result["observation"], indent=2, sort_keys=True),
303
- },
304
- ]
305
-
306
- for local_step in range(1, MAX_STEPS + 1):
307
- if done:
308
- break
309
- action, assistant_message = get_model_action(client, messages, tools)
310
- error = None
311
- try:
312
- result = _space_post(
313
- "/openenv/step",
314
- {
315
- "session_id": session_id,
316
- "action_type": action["action_type"],
317
- "payload": action["payload"],
318
- },
319
- )
320
- except RequestException as exc:
321
- error = str(exc)
322
- result = _space_get(f"/openenv/state/{session_id}")
323
- reward = float(result.get("reward", 0.0) or 0.0)
324
- done = bool(result.get("done", False))
325
  rewards.append(reward)
326
  steps_taken += 1
327
- log_step(step=steps_taken, action=_format_action(action), reward=reward, done=done, error=error)
328
- history.append(f"step={steps_taken} task_index={task_index} reward={reward:+.4f}")
329
- messages.append(assistant_message)
330
- tool_message = _tool_result_message(assistant_message, result)
331
- if tool_message is not None:
332
- messages.append(tool_message)
333
- if done:
334
- break
335
-
336
- info = dict(result.get("info", {}))
337
- task_answer = str(info.get("task_answer", ""))
338
- agent_answer = str(info.get("agent_answer", ""))
339
- task_scores.append(1.0 if agent_answer and agent_answer == task_answer else 0.0)
340
- episode_row = _episode_row(result, task_meta)
341
- episode_rows.append(episode_row)
342
- metrics.add(info, task_type=episode_row["task_type"], graph_f1=float(episode_row["graph_f1"]))
343
-
344
- score = sum(task_scores) / max(1, len(task_scores))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  success = score >= SUCCESS_SCORE_THRESHOLD
346
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
347
- _publish_inference_report(metrics.summary(), episode_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
 
350
  if __name__ == "__main__":
 
2
 
3
  import json
4
  import os
5
+ from pathlib import Path
6
  from typing import Any
7
 
8
+ from osint_env.agents.single_agent import SingleAgentRunner
9
+ from osint_env.agents.swarm_agent import SwarmAgentRunner
10
+ from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
11
+ from osint_env.domain.models import EnvironmentConfig
12
+ from osint_env.env.environment import OSINTEnvironment
13
+ from osint_env.env.reward import compute_graph_f1
14
+ from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard
15
  from osint_env.eval.metrics import EvalMetrics
16
+ from osint_env.llm import build_llm_client
17
+ from osint_env.viz import export_dashboard
18
 
19
 
20
# --- Run configuration, all overridable through environment variables. ---

# Input files: shared environment config and the seeded-question file.
CONFIG_PATH = os.getenv("CONFIG_PATH", "datasets/fixed_levels/shared_config_fixed_levels.json")
SEED_FILE = os.getenv("SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json")

# "single" or "swarm"; any other value leaves the config file's setting alone.
AGENT_MODE = os.getenv("AGENT_MODE", "swarm")

# LLM selection. The provider value "config" means "keep what the config file says".
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "ollama")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen3:1.7b")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_API_KEY_ENV = os.getenv("OPENAI_API_KEY_ENV", "OPENAI_API_KEY")

# 0 (or negative) means "do not override the configured timeout".
LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "0"))

# Number of episodes to run when TASK_INDICES is not given.
EPISODES = int(os.getenv("EPISODES", "1"))
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))

# Comma-separated task indices; parsed further down in the module.
TASK_INDICES_RAW = os.getenv("TASK_INDICES", "")

# Truthy spellings accepted for the artifact-writing switch (case-insensitive).
WRITE_BENCHMARK_ARTIFACTS = os.getenv("WRITE_BENCHMARK_ARTIFACTS", "1").strip().lower() in {
    "1",
    "true",
    "yes",
    "y",
    "on",
}
LEADERBOARD_PATH = os.getenv("LEADERBOARD_PATH", "datasets/fixed_levels/leaderboard_fixed_levels.json")
DASHBOARD_PATH = os.getenv("DASHBOARD_PATH", "datasets/fixed_levels/dashboard_fixed_levels.html")
RUN_NAME = os.getenv("RUN_NAME", "fixed_levels_qwen_swarm")

# Fixed identifiers printed in the [START] log line.
BENCHMARK = "osint-openenv"
TASK_NAME = "fixed_levels_easy_mid_hard"
47
 
48
 
49
def _parse_task_indices(raw: str) -> list[int]:
    """Parse a comma-separated list of integers, skipping anything unusable.

    Args:
        raw: Text such as ``"0, 10,20"``. ``None`` and the empty string are
            treated as "no indices".

    Returns:
        The integers in the order they appear. Blank tokens and tokens that
        ``int()`` rejects are dropped silently (best-effort parsing of an
        environment variable).
    """
    indices: list[int] = []
    for token in str(raw or "").split(","):
        text = token.strip()
        if not text:
            continue
        try:
            indices.append(int(text))
        except ValueError:
            # Deliberately tolerant: one bad token must not abort the run.
            continue
    return indices
60
+
61
+
62
def _normalize_ollama_base_url(url: str) -> str:
    """Return an Ollama base URL without trailing slashes or a ``/v1`` suffix.

    Args:
        url: Candidate base URL; may be empty, ``None`` or padded with
            whitespace.

    Returns:
        The cleaned URL, or ``"http://127.0.0.1:11434"`` when nothing is left
        after cleaning.
    """
    cleaned = str(url or "").strip().rstrip("/")
    if cleaned.endswith("/v1"):
        # Drop the three-character "/v1" suffix, then any slashes it exposed.
        cleaned = cleaned[:-3].rstrip("/")
    return cleaned or "http://127.0.0.1:11434"
67
+
68
+
69
+ TASK_INDICES = _parse_task_indices(TASK_INDICES_RAW)
70
+
71
+
72
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line for a run and flush stdout.

    Args:
        task: Task name shown after ``task=``.
        env: Benchmark/environment name shown after ``env=``.
        model: Model identifier shown after ``model=``.
    """
    print(f"[START] task={task} env={env} model={model}", flush=True)
74
 
 
89
  )
90
 
91
 
92
def _resolve_environment_config() -> EnvironmentConfig:
    """Build the environment config from the shared file plus env-var overrides.

    The shared config at ``CONFIG_PATH`` is loaded and passed through
    ``clone_environment_config``; the module-level settings are then applied
    on top of the result:

    * ``SEED_FILE`` replaces the seeding section when the file exists.
    * ``AGENT_MODE`` of ``"single"``/``"swarm"`` toggles ``swarm.enabled``;
      any other value leaves the configured setting untouched.
    * ``LLM_PROVIDER`` (unless empty or ``"config"``), ``MODEL_NAME`` and a
      positive ``LLM_TIMEOUT_SECONDS`` override the LLM section.
    * ``API_BASE_URL`` (read here, takes precedence) or ``OLLAMA_BASE_URL``
      sets the Ollama base URL after normalization.
    * ``OPENAI_BASE_URL``, ``OPENAI_API_KEY`` and ``OPENAI_API_KEY_ENV`` are
      copied over when non-blank.

    Returns:
        The adjusted ``EnvironmentConfig``.
    """
    shared = load_shared_config(CONFIG_PATH)
    env_cfg = clone_environment_config(shared.environment)

    if SEED_FILE and Path(SEED_FILE).exists():
        env_cfg.seeding = load_seeding_config(SEED_FILE)

    mode = AGENT_MODE.strip().lower()
    if mode == "single":
        env_cfg.swarm.enabled = False
    elif mode == "swarm":
        env_cfg.swarm.enabled = True

    provider = LLM_PROVIDER.strip().lower()
    if provider and provider != "config":
        env_cfg.llm.provider = provider

    model_name = MODEL_NAME.strip()
    if model_name:
        env_cfg.llm.model = model_name

    if LLM_TIMEOUT_SECONDS > 0:
        env_cfg.llm.timeout_seconds = int(LLM_TIMEOUT_SECONDS)

    # Strip BEFORE choosing between the two sources: a whitespace-only
    # API_BASE_URL is truthy and would otherwise shadow a valid
    # OLLAMA_BASE_URL, silently falling back to the localhost default.
    ollama_base = os.getenv("API_BASE_URL", "").strip() or OLLAMA_BASE_URL.strip()
    if ollama_base:
        env_cfg.llm.ollama_base_url = _normalize_ollama_base_url(ollama_base)

    openai_base = OPENAI_BASE_URL.strip()
    if openai_base:
        env_cfg.llm.openai_base_url = openai_base

    openai_key = OPENAI_API_KEY.strip()
    if openai_key:
        env_cfg.llm.openai_api_key = openai_key

    openai_key_env = OPENAI_API_KEY_ENV.strip()
    if openai_key_env:
        env_cfg.llm.openai_api_key_env = openai_key_env

    return env_cfg
129
+
130
+
131
def _runner_for(env: OSINTEnvironment, llm: Any) -> SingleAgentRunner | SwarmAgentRunner:
    """Pick the agent runner that matches the environment's swarm setting.

    Args:
        env: Environment whose ``config.swarm.enabled`` flag selects the runner.
        llm: LLM client handed to the runner unchanged.

    Returns:
        A ``SwarmAgentRunner`` when swarm mode is enabled, otherwise a
        ``SingleAgentRunner``.
    """
    runner_cls = SwarmAgentRunner if env.config.swarm.enabled else SingleAgentRunner
    return runner_cls(env=env, llm=llm)
135
+
136
+
137
def _episode_row(env: OSINTEnvironment, info: dict[str, Any]) -> dict[str, Any]:
    """Flatten one finished episode into a plain-dict row for reports.

    Args:
        env: Environment the episode ran in. When ``env.state`` is ``None``
            only the values available in ``info`` are reported and task
            fields fall back to placeholders.
        info: Episode info dict returned by the runner.

    Returns:
        A dict with task identity, answers, reward/step counters, spawn
        counters, ``graph_f1`` and the predicted/ground-truth edge lists.
        Both the fallback and the full row carry the same set of keys.
    """

    def _edge_dict(edge: Any) -> dict[str, Any]:
        # Serialize one graph edge into plain data.
        return {
            "src": edge.src,
            "rel": edge.rel,
            "dst": edge.dst,
            "confidence": float(edge.confidence),
        }

    raw_agent = info.get("agent_answer")
    raw_task = info.get("task_answer")
    # A missing answer is reported as "" rather than the literal string "None".
    agent_answer = "" if raw_agent is None else str(raw_agent)

    row: dict[str, Any] = {
        "task_id": "unknown",
        "task_type": "unknown",
        "question": "",
        "task_answer": str(info.get("task_answer", "")),
        "agent_answer": agent_answer,
        "graph_f1": 0.0,
        "reward": float(info.get("total_reward", 0.0) or 0.0),
        "steps": int(info.get("step_count", 0) or 0),
        "tool_calls": int(info.get("tool_calls", 0) or 0),
        # Require an actual answer: two missing answers (None == None) must
        # not be counted as a solved task.
        "success": int(bool(agent_answer) and raw_agent == raw_task),
        "reward_components": dict(info.get("reward_components", {})),
        "spawn_count": int(info.get("spawn_count", 0) or 0),
        "spawn_critical_steps": int(info.get("spawn_critical_steps", 0) or 0),
        "pred_edges": [],
        "truth_edges": [],
    }

    if env.state is None:
        return row

    task = env.state.task
    row.update(
        {
            "task_id": task.task_id,
            "task_type": task.task_type,
            "question": task.question,
            "graph_f1": compute_graph_f1(env.memory_graph.edges, task.supporting_edges),
            "pred_edges": [_edge_dict(edge) for edge in env.memory_graph.edges],
            "truth_edges": [_edge_dict(edge) for edge in task.supporting_edges],
        }
    )
    return row
189
 
190
 
191
def _format_action_from_history(item: dict[str, Any]) -> str:
    """Render one action-history entry as a compact call-like string.

    Args:
        item: History entry with a ``"type"`` and a ``"payload"`` mapping.

    Returns:
        * ``answer(<answer>)`` for ``ANSWER`` actions,
        * ``add_edge(<src>,<rel>,<dst>,<confidence:.2f>)`` for ``ADD_EDGE``,
        * ``<tool_name>(<k=v,...>)`` with arguments sorted by key otherwise
          (``<tool_name>()`` when there are no usable arguments).
    """
    action_type = str(item.get("type", "")).upper()

    # A missing, None or malformed payload is treated as empty so that one
    # odd history entry cannot abort step logging.
    try:
        payload = dict(item.get("payload") or {})
    except (TypeError, ValueError):
        payload = {}

    if action_type == "ANSWER":
        return f"answer({str(payload.get('answer', 'unknown')).strip()})"

    if action_type == "ADD_EDGE":
        try:
            conf = float(payload.get("confidence", 1.0))
        except (TypeError, ValueError):
            conf = 1.0
        return (
            "add_edge("
            f"{payload.get('src', '')},"
            f"{payload.get('rel', '')},"
            f"{payload.get('dst', '')},"
            f"{conf:.2f}"
            ")"
        )

    tool_name = str(payload.get("tool_name", "tool")).strip() or "tool"
    args = payload.get("args", {})
    if not isinstance(args, dict) or not args:
        return f"{tool_name}()"
    args_text = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
    return f"{tool_name}({args_text})"
218
+
219
+
220
def _task_targets(env: OSINTEnvironment, episodes: int, task_indices: list[int]) -> list[int | None]:
    """Decide which task each episode should run.

    Args:
        env: Environment whose ``tasks`` length bounds the indices.
        episodes: Number of episodes to run when no indices are given.
        task_indices: Explicit task indices; may be empty.

    Returns:
        With explicit indices: each index wrapped into ``[0, len(env.tasks))``
        via modulo (a task count of 0 is treated as 1). Without indices: a
        list of ``None`` of length ``max(1, episodes)``, meaning "let the
        environment choose".
    """
    if not task_indices:
        return [None] * max(1, episodes)
    task_count = max(1, len(env.tasks))
    return [index % task_count for index in task_indices]
225
+
226
+
227
def _run_with_runner(
    env: OSINTEnvironment,
    runner: SingleAgentRunner | SwarmAgentRunner,
    episodes: int,
    task_indices: list[int],
) -> tuple[dict[str, Any], list[dict[str, Any]], list[float], int]:
    """Run the requested episodes and collect metrics, rows and step logs.

    Args:
        env: Environment the runner operates on.
        runner: Agent runner whose ``run_episode()`` executes one episode.
        episodes: Episode count used when ``task_indices`` is empty.
        task_indices: Explicit task indices to run, if any.

    Returns:
        ``(summary, episode_rows, rewards, steps_taken)`` where ``summary`` is
        ``EvalMetrics.summary()``, ``episode_rows`` has one row per recorded
        episode, ``rewards`` lists every logged step reward in order and
        ``steps_taken`` counts the logged steps across all episodes.
    """
    metrics = EvalMetrics()
    episode_rows: list[dict[str, Any]] = []
    rewards: list[float] = []
    steps_taken = 0

    for task_index in _task_targets(env, episodes, task_indices):
        if task_index is not None:
            # Keep compatibility with explicit task selection from the previous inference script.
            # NOTE(review): writes a private attribute; presumably the
            # environment reads it when the next episode starts — confirm.
            env._task_idx = task_index

        info = runner.run_episode()
        if env.state is None:
            # Nothing to report for this episode; it is left out of the
            # metrics and rows entirely.
            continue

        # Steps are logged after the episode, replayed from the recorded history.
        history = list(env.state.action_history)
        last_index = len(history)
        for idx, action_item in enumerate(history, start=1):
            reward = float(action_item.get("reward", 0.0) or 0.0)
            rewards.append(reward)
            steps_taken += 1
            log_step(
                step=steps_taken,
                action=_format_action_from_history(action_item),
                reward=reward,
                done=idx == last_index,  # only the final recorded action is "done"
                error=None,
            )

        graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)
        metrics.add(info, task_type=env.state.task.task_type, graph_f1=graph_f1)
        episode_rows.append(_episode_row(env, info))

    return metrics.summary(), episode_rows, rewards, steps_taken
266
+
267
+
268
def _maybe_write_artifacts(
    env: OSINTEnvironment,
    summary: dict[str, Any],
    episodes: int,
    episode_rows: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, str | None]:
    """Append a leaderboard record and export the dashboard, if enabled.

    Does nothing when ``WRITE_BENCHMARK_ARTIFACTS`` is false.

    Args:
        env: Environment whose config is recorded alongside the results.
        summary: Aggregated metrics for the run.
        episodes: Episode count stored in the leaderboard record.
        episode_rows: Per-episode rows passed to the dashboard export.

    Returns:
        ``(record, dashboard)`` as returned by ``append_leaderboard_record``
        and ``export_dashboard``, or ``(None, None)`` when artifact writing
        is disabled.
    """
    if not WRITE_BENCHMARK_ARTIFACTS:
        return None, None

    cfg = env.config
    run_config = {
        "seed": cfg.seed,
        "max_steps": cfg.max_steps,
        "swarm_enabled": cfg.swarm.enabled,
        "max_agents": cfg.swarm.max_agents,
        "max_breadth": cfg.swarm.max_breadth,
        "max_width": cfg.swarm.max_width,
        "max_depth": cfg.swarm.max_depth,
        "seeded_questions": len(cfg.seeding.seeded_questions),
        "llm_provider": cfg.llm.provider,
        "llm_model": cfg.llm.model,
    }
    record = append_leaderboard_record(
        path=LEADERBOARD_PATH,
        summary=summary,
        episodes=episodes,
        run_name=RUN_NAME or None,  # empty RUN_NAME is passed on as None
        config=run_config,
    )

    # Reload after appending so the dashboard includes the record just written.
    leaderboard = load_leaderboard(LEADERBOARD_PATH)
    dashboard = export_dashboard(
        env=env,
        evaluation={"summary": summary, "episodes": episode_rows},
        leaderboard_records=leaderboard,
        output_path=DASHBOARD_PATH,
    )
    return record, dashboard
304
+
305
+
306
def main() -> None:
    """Run the local benchmark end to end and print the results as JSON.

    Steps: resolve the configuration, build the LLM client, environment and
    runner, run the episodes (one per entry in ``TASK_INDICES``, or
    ``EPISODES`` when none are given), log ``[START]``/``[END]`` markers,
    optionally write leaderboard/dashboard artifacts, and finally print a
    JSON payload with the summary and per-episode rows.
    """
    env_cfg = _resolve_environment_config()
    llm_client = build_llm_client(env_cfg.llm)
    env = OSINTEnvironment(env_cfg, llm=llm_client)
    runner = _runner_for(env, llm_client)

    log_start(task=TASK_NAME, env=BENCHMARK, model=env_cfg.llm.model)

    # Explicit task indices take precedence over the plain episode count.
    episodes = len(TASK_INDICES) if TASK_INDICES else max(1, EPISODES)
    summary, episode_rows, rewards, steps_taken = _run_with_runner(
        env=env,
        runner=runner,
        episodes=episodes,
        task_indices=TASK_INDICES,
    )

    score = float(summary.get("task_success_rate", 0.0) or 0.0)
    success = score >= SUCCESS_SCORE_THRESHOLD
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    record, dashboard = _maybe_write_artifacts(
        env=env,
        summary=summary,
        episodes=episodes,
        episode_rows=episode_rows,
    )

    payload: dict[str, Any] = {
        "summary": summary,
        "episodes": episode_rows,
    }
    # Artifact entries are only present when artifact writing produced them.
    if record is not None:
        payload["record"] = record
    if dashboard is not None:
        payload["dashboard"] = dashboard

    print(json.dumps(payload, indent=2, sort_keys=True))
343
 
344
 
345
  if __name__ == "__main__":
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv>=0.1.13
2
+ openai>=1.40.0
3
+ fastapi>=0.115.0
4
+ requests>=2.32.3
5
+ uvicorn>=0.30.0
6
+ pytest>=8.0.0