ritishshrirao committed on
Commit
b366696
·
1 Parent(s): 8ceabd3

Updated OpenEnv server

Browse files
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  *.pyc
2
  blueprint.txt
3
  *.egg-info
4
- artifacts/*
 
 
1
  *.pyc
2
  blueprint.txt
3
  *.egg-info
4
+ artifacts/*
5
+ *.html
artifacts/leaderboard.json CHANGED
@@ -80,5 +80,374 @@
80
  },
81
  "run_id": "run_0002",
82
  "run_name": "swarm_seed_smoke"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  }
84
  ]
 
80
  },
81
  "run_id": "run_0002",
82
  "run_name": "swarm_seed_smoke"
83
+ },
84
+ {
85
+ "config": {
86
+ "max_agents": 3,
87
+ "max_breadth": 2,
88
+ "max_depth": 2,
89
+ "max_steps": 18,
90
+ "max_width": 2,
91
+ "seed": 7,
92
+ "seeded_questions": 0,
93
+ "swarm_enabled": true
94
+ },
95
+ "created_at": "2026-04-01T12:25:15+00:00",
96
+ "episodes": 20,
97
+ "metrics": {
98
+ "avg_compactness_reward": 0.0,
99
+ "avg_connectivity_gain_reward": 0.10000000000000002,
100
+ "avg_connectivity_reward": 0.23999999999999994,
101
+ "avg_diversity_reward": 0.08000000000000002,
102
+ "avg_entity_informativeness_reward": -0.00983642442912193,
103
+ "avg_format_reward": 0.14999999999999997,
104
+ "avg_graph_f1": 1.0,
105
+ "avg_knowledge_carrier_reward": 0.5,
106
+ "avg_knowledge_indexing_reward": 0.1125,
107
+ "avg_relation_informativeness_reward": 0.007185245326892638,
108
+ "avg_reward": 3.351267560586956,
109
+ "avg_soft_shaping_reward": 0.14999999999999997,
110
+ "avg_spawn_count": 4.0,
111
+ "avg_spawn_critical_steps": 6.0,
112
+ "avg_steps_to_solution": 9.0,
113
+ "deanonymization_accuracy": 1.0,
114
+ "leaderboard_score": 0.8573187614039594,
115
+ "retrieval_signal": 0.7143750000000001,
116
+ "spawn_completion_rate": 1.0,
117
+ "spawn_signal": 0.6666666666666666,
118
+ "structural_signal": 0.5814697641795541,
119
+ "task_success_rate": 1.0,
120
+ "tool_efficiency": 0.25
121
+ },
122
+ "run_id": "run_0003",
123
+ "run_name": "baseline_swarm"
124
+ },
125
+ {
126
+ "config": {
127
+ "max_agents": 3,
128
+ "max_breadth": 2,
129
+ "max_depth": 2,
130
+ "max_steps": 18,
131
+ "max_width": 2,
132
+ "seed": 7,
133
+ "seeded_questions": 1,
134
+ "swarm_enabled": true
135
+ },
136
+ "created_at": "2026-04-01T17:27:30+00:00",
137
+ "episodes": 1,
138
+ "metrics": {
139
+ "avg_compactness_reward": 0.0,
140
+ "avg_connectivity_gain_reward": 0.1,
141
+ "avg_connectivity_reward": 0.3,
142
+ "avg_diversity_reward": 0.08,
143
+ "avg_entity_informativeness_reward": 0.06128386989162576,
144
+ "avg_format_reward": 0.15,
145
+ "avg_graph_f1": 1.0,
146
+ "avg_knowledge_carrier_reward": 0.5,
147
+ "avg_knowledge_indexing_reward": 0.3,
148
+ "avg_relation_informativeness_reward": 0.12,
149
+ "avg_reward": 3.916035942914144,
150
+ "avg_soft_shaping_reward": 0.15,
151
+ "avg_spawn_count": 4.0,
152
+ "avg_spawn_critical_steps": 6.0,
153
+ "avg_steps_to_solution": 9.0,
154
+ "deanonymization_accuracy": 1.0,
155
+ "leaderboard_score": 0.8718832338515622,
156
+ "retrieval_signal": 0.78,
157
+ "spawn_completion_rate": 1.0,
158
+ "spawn_signal": 0.6666666666666666,
159
+ "structural_signal": 0.6332567739783251,
160
+ "task_success_rate": 1.0,
161
+ "tool_efficiency": 0.25
162
+ },
163
+ "run_id": "run_0004",
164
+ "run_name": "ollama_qwen_smoke"
165
+ },
166
+ {
167
+ "config": {
168
+ "max_agents": 3,
169
+ "max_breadth": 2,
170
+ "max_depth": 2,
171
+ "max_steps": 18,
172
+ "max_width": 2,
173
+ "seed": 7,
174
+ "seeded_questions": 1,
175
+ "swarm_enabled": true
176
+ },
177
+ "created_at": "2026-04-01T17:29:12+00:00",
178
+ "episodes": 1,
179
+ "metrics": {
180
+ "avg_compactness_reward": 0.0,
181
+ "avg_connectivity_gain_reward": 0.1,
182
+ "avg_connectivity_reward": 0.3,
183
+ "avg_diversity_reward": 0.08,
184
+ "avg_entity_informativeness_reward": 0.06128386989162576,
185
+ "avg_format_reward": 0.15,
186
+ "avg_graph_f1": 1.0,
187
+ "avg_knowledge_carrier_reward": 0.5,
188
+ "avg_knowledge_indexing_reward": 0.3,
189
+ "avg_relation_informativeness_reward": 0.12,
190
+ "avg_reward": 4.059369276247478,
191
+ "avg_soft_shaping_reward": 0.15,
192
+ "avg_spawn_count": 4.0,
193
+ "avg_spawn_critical_steps": 6.0,
194
+ "avg_steps_to_solution": 9.0,
195
+ "deanonymization_accuracy": 1.0,
196
+ "leaderboard_score": 0.9020114237119466,
197
+ "retrieval_signal": 0.78,
198
+ "spawn_completion_rate": 1.0,
199
+ "spawn_signal": 0.6666666666666666,
200
+ "structural_signal": 0.6332567739783251,
201
+ "task_success_rate": 1.0,
202
+ "tool_efficiency": 0.5
203
+ },
204
+ "run_id": "run_0005",
205
+ "run_name": "ollama_qwen_smoke2"
206
+ },
207
+ {
208
+ "config": {
209
+ "max_agents": 3,
210
+ "max_breadth": 2,
211
+ "max_depth": 2,
212
+ "max_steps": 18,
213
+ "max_width": 2,
214
+ "seed": 7,
215
+ "seeded_questions": 0,
216
+ "swarm_enabled": true
217
+ },
218
+ "created_at": "2026-04-01T17:39:15+00:00",
219
+ "episodes": 2,
220
+ "metrics": {
221
+ "avg_compactness_reward": 0.0,
222
+ "avg_connectivity_gain_reward": 0.2,
223
+ "avg_connectivity_reward": 0.0,
224
+ "avg_diversity_reward": 0.0683333333333333,
225
+ "avg_entity_informativeness_reward": -0.07397348480982455,
226
+ "avg_format_reward": 0.15,
227
+ "avg_graph_f1": 0.6666666666666667,
228
+ "avg_knowledge_carrier_reward": 0.5,
229
+ "avg_knowledge_indexing_reward": 0.14884615384615385,
230
+ "avg_relation_informativeness_reward": -0.00860389783205907,
231
+ "avg_reward": 4.351764433970379,
232
+ "avg_soft_shaping_reward": 0.3,
233
+ "avg_spawn_count": 4.0,
234
+ "avg_spawn_critical_steps": 6.0,
235
+ "avg_steps_to_solution": 9.0,
236
+ "deanonymization_accuracy": 0.0,
237
+ "leaderboard_score": 0.6973935600514568,
238
+ "retrieval_signal": 0.7270961538461539,
239
+ "spawn_completion_rate": 1.0,
240
+ "spawn_signal": 0.6666666666666666,
241
+ "structural_signal": 0.5137345234716233,
242
+ "task_success_rate": 1.0,
243
+ "tool_efficiency": 0.5
244
+ },
245
+ "run_id": "run_0006",
246
+ "run_name": "high_timeout_shared_ctx"
247
+ },
248
+ {
249
+ "config": {
250
+ "max_agents": 3,
251
+ "max_breadth": 2,
252
+ "max_depth": 2,
253
+ "max_steps": 18,
254
+ "max_width": 2,
255
+ "seed": 7,
256
+ "seeded_questions": 0,
257
+ "swarm_enabled": true
258
+ },
259
+ "created_at": "2026-04-01T18:57:40+00:00",
260
+ "episodes": 3,
261
+ "metrics": {
262
+ "avg_compactness_reward": 0.0,
263
+ "avg_connectivity_gain_reward": 0.13333333333333333,
264
+ "avg_connectivity_reward": 0.09999999999999999,
265
+ "avg_diversity_reward": 0.056666666666666664,
266
+ "avg_entity_informativeness_reward": -0.020478979694240708,
267
+ "avg_format_reward": 0.15,
268
+ "avg_graph_f1": 0.8148148148148149,
269
+ "avg_knowledge_carrier_reward": 0.5,
270
+ "avg_knowledge_indexing_reward": 0.27,
271
+ "avg_relation_informativeness_reward": 0.07174291752145656,
272
+ "avg_reward": 4.0269419367756605,
273
+ "avg_soft_shaping_reward": 0.19999999999999998,
274
+ "avg_spawn_count": 4.0,
275
+ "avg_spawn_critical_steps": 6.0,
276
+ "avg_steps_to_solution": 9.0,
277
+ "deanonymization_accuracy": 0.0,
278
+ "leaderboard_score": 0.7366215569569294,
279
+ "retrieval_signal": 0.7695000000000001,
280
+ "spawn_completion_rate": 1.0,
281
+ "spawn_signal": 0.6666666666666666,
282
+ "structural_signal": 0.5570861208987765,
283
+ "task_success_rate": 1.0,
284
+ "tool_efficiency": 0.5
285
+ },
286
+ "run_id": "run_0007",
287
+ "run_name": "episode_selector_check"
288
+ },
289
+ {
290
+ "config": {
291
+ "max_agents": 3,
292
+ "max_breadth": 2,
293
+ "max_depth": 2,
294
+ "max_steps": 18,
295
+ "max_width": 2,
296
+ "seed": 7,
297
+ "seeded_questions": 15,
298
+ "swarm_enabled": true
299
+ },
300
+ "created_at": "2026-04-01T19:11:44+00:00",
301
+ "episodes": 3,
302
+ "metrics": {
303
+ "avg_compactness_reward": 0.0,
304
+ "avg_connectivity_gain_reward": 0.10000000000000002,
305
+ "avg_connectivity_reward": 0.3,
306
+ "avg_diversity_reward": 0.08,
307
+ "avg_entity_informativeness_reward": -0.02722031691758704,
308
+ "avg_format_reward": 0.15,
309
+ "avg_graph_f1": 1.0,
310
+ "avg_knowledge_carrier_reward": 0.5,
311
+ "avg_knowledge_indexing_reward": 0.0,
312
+ "avg_relation_informativeness_reward": -0.00011920119799207429,
313
+ "avg_reward": 3.444079221573606,
314
+ "avg_soft_shaping_reward": 0.15,
315
+ "avg_spawn_count": 4.0,
316
+ "avg_spawn_critical_steps": 6.0,
317
+ "avg_steps_to_solution": 9.0,
318
+ "deanonymization_accuracy": 1.0,
319
+ "leaderboard_score": 0.8828572592896698,
320
+ "retrieval_signal": 0.675,
321
+ "spawn_completion_rate": 1.0,
322
+ "spawn_signal": 0.6666666666666666,
323
+ "structural_signal": 0.5915320963768841,
324
+ "task_success_rate": 1.0,
325
+ "tool_efficiency": 0.5
326
+ },
327
+ "run_id": "run_0008",
328
+ "run_name": "qwen_rerun"
329
+ },
330
+ {
331
+ "config": {
332
+ "max_agents": 3,
333
+ "max_breadth": 2,
334
+ "max_depth": 2,
335
+ "max_steps": 18,
336
+ "max_width": 2,
337
+ "seed": 7,
338
+ "seeded_questions": 15,
339
+ "swarm_enabled": true
340
+ },
341
+ "created_at": "2026-04-01T19:19:34+00:00",
342
+ "episodes": 3,
343
+ "metrics": {
344
+ "avg_compactness_reward": 0.0,
345
+ "avg_connectivity_gain_reward": 0.10000000000000002,
346
+ "avg_connectivity_reward": 0.3,
347
+ "avg_diversity_reward": 0.08,
348
+ "avg_entity_informativeness_reward": -0.024861029515896544,
349
+ "avg_format_reward": 0.15,
350
+ "avg_graph_f1": 1.0,
351
+ "avg_knowledge_carrier_reward": 0.5,
352
+ "avg_knowledge_indexing_reward": 0.0,
353
+ "avg_relation_informativeness_reward": -0.0024320085090966614,
354
+ "avg_reward": 3.4441257016641917,
355
+ "avg_soft_shaping_reward": 0.15,
356
+ "avg_spawn_count": 4.0,
357
+ "avg_spawn_critical_steps": 6.0,
358
+ "avg_steps_to_solution": 9.0,
359
+ "deanonymization_accuracy": 1.0,
360
+ "leaderboard_score": 0.8828581656226586,
361
+ "retrieval_signal": 0.675,
362
+ "spawn_completion_rate": 1.0,
363
+ "spawn_signal": 0.6666666666666666,
364
+ "structural_signal": 0.5915413923950014,
365
+ "task_success_rate": 1.0,
366
+ "tool_efficiency": 0.5
367
+ },
368
+ "run_id": "run_0009",
369
+ "run_name": "qwen_episode_fix"
370
+ },
371
+ {
372
+ "config": {
373
+ "max_agents": 3,
374
+ "max_breadth": 2,
375
+ "max_depth": 2,
376
+ "max_steps": 18,
377
+ "max_width": 2,
378
+ "seed": 7,
379
+ "seeded_questions": 15,
380
+ "swarm_enabled": true
381
+ },
382
+ "created_at": "2026-04-01T19:24:37+00:00",
383
+ "episodes": 3,
384
+ "metrics": {
385
+ "avg_compactness_reward": 0.0,
386
+ "avg_connectivity_gain_reward": 0.10000000000000002,
387
+ "avg_connectivity_reward": 0.3,
388
+ "avg_diversity_reward": 0.08,
389
+ "avg_entity_informativeness_reward": -0.02722031691758704,
390
+ "avg_format_reward": 0.15,
391
+ "avg_graph_f1": 1.0,
392
+ "avg_knowledge_carrier_reward": 0.5,
393
+ "avg_knowledge_indexing_reward": 0.0,
394
+ "avg_relation_informativeness_reward": -0.0030604289114462002,
395
+ "avg_reward": 3.4411379938601514,
396
+ "avg_soft_shaping_reward": 0.15,
397
+ "avg_spawn_count": 4.0,
398
+ "avg_spawn_critical_steps": 6.0,
399
+ "avg_steps_to_solution": 9.0,
400
+ "deanonymization_accuracy": 1.0,
401
+ "leaderboard_score": 0.8827999009847504,
402
+ "retrieval_signal": 0.675,
403
+ "spawn_completion_rate": 1.0,
404
+ "spawn_signal": 0.6666666666666666,
405
+ "structural_signal": 0.5909438508341933,
406
+ "task_success_rate": 1.0,
407
+ "tool_efficiency": 0.5
408
+ },
409
+ "run_id": "run_0010",
410
+ "run_name": "qwen_rerun_graph_fix"
411
+ },
412
+ {
413
+ "config": {
414
+ "max_agents": 3,
415
+ "max_breadth": 2,
416
+ "max_depth": 2,
417
+ "max_steps": 18,
418
+ "max_width": 2,
419
+ "seed": 7,
420
+ "seeded_questions": 15,
421
+ "swarm_enabled": true
422
+ },
423
+ "created_at": "2026-04-01T19:31:54+00:00",
424
+ "episodes": 15,
425
+ "metrics": {
426
+ "avg_compactness_reward": 0.0,
427
+ "avg_connectivity_gain_reward": 0.16666666666666666,
428
+ "avg_connectivity_reward": 0.16999999999999998,
429
+ "avg_diversity_reward": 0.1157777777777778,
430
+ "avg_entity_informativeness_reward": -0.0181244777358718,
431
+ "avg_format_reward": 0.14999999999999997,
432
+ "avg_graph_f1": 0.8492063492063492,
433
+ "avg_knowledge_carrier_reward": 0.5,
434
+ "avg_knowledge_indexing_reward": 0.012000000000000002,
435
+ "avg_relation_informativeness_reward": 0.05935837081627929,
436
+ "avg_reward": 4.201760569277529,
437
+ "avg_soft_shaping_reward": 0.24999999999999994,
438
+ "avg_spawn_count": 4.0,
439
+ "avg_spawn_critical_steps": 6.0,
440
+ "avg_steps_to_solution": 9.0,
441
+ "deanonymization_accuracy": 1.0,
442
+ "leaderboard_score": 0.8534887252258901,
443
+ "retrieval_signal": 0.6792,
444
+ "spawn_completion_rate": 1.0,
445
+ "spawn_signal": 0.6666666666666666,
446
+ "structural_signal": 0.5847801119494148,
447
+ "task_success_rate": 1.0,
448
+ "tool_efficiency": 0.5
449
+ },
450
+ "run_id": "run_0011",
451
+ "run_name": "qwen_rerun_graph_fix"
452
  }
453
  ]
artifacts/osint_dashboard.html CHANGED
The diff for this file is too large to render. See raw diff
 
datasets/fixed_levels/leaderboard_fixed_levels.json CHANGED
@@ -39,5 +39,87 @@
39
  },
40
  "run_id": "run_0001",
41
  "run_name": "fixed_levels_qwen_swarm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  ]
 
39
  },
40
  "run_id": "run_0001",
41
  "run_name": "fixed_levels_qwen_swarm"
42
+ },
43
+ {
44
+ "config": {
45
+ "max_agents": 3,
46
+ "max_breadth": 2,
47
+ "max_depth": 2,
48
+ "max_steps": 24,
49
+ "max_width": 2,
50
+ "seed": 2026,
51
+ "seeded_questions": 30,
52
+ "swarm_enabled": true
53
+ },
54
+ "created_at": "2026-04-02T09:16:05+00:00",
55
+ "episodes": 30,
56
+ "metrics": {
57
+ "avg_compactness_reward": 0.0,
58
+ "avg_connectivity_gain_reward": 0.2000000000000001,
59
+ "avg_connectivity_reward": 0.12999999999999998,
60
+ "avg_diversity_reward": 0.12433333333333325,
61
+ "avg_entity_informativeness_reward": 0.000700571890338102,
62
+ "avg_format_reward": 0.15,
63
+ "avg_graph_f1": 0.2916528337385394,
64
+ "avg_knowledge_carrier_reward": 0.5,
65
+ "avg_knowledge_indexing_reward": 0.05070078042510192,
66
+ "avg_relation_informativeness_reward": 0.07853375358885142,
67
+ "avg_reward": 4.377456514967488,
68
+ "avg_soft_shaping_reward": 0.3,
69
+ "avg_spawn_count": 4.0,
70
+ "avg_spawn_critical_steps": 6.0,
71
+ "avg_steps_to_solution": 9.0,
72
+ "deanonymization_accuracy": 0.0,
73
+ "leaderboard_score": 0.6241912131110795,
74
+ "retrieval_signal": 0.6927452731487858,
75
+ "spawn_completion_rate": 1.0,
76
+ "spawn_signal": 0.6666666666666666,
77
+ "structural_signal": 0.5869968650958378,
78
+ "task_success_rate": 1.0,
79
+ "tool_efficiency": 0.5
80
+ },
81
+ "run_id": "run_0002",
82
+ "run_name": "fixed_levels_qwen_swarm"
83
+ },
84
+ {
85
+ "config": {
86
+ "max_agents": 3,
87
+ "max_breadth": 2,
88
+ "max_depth": 2,
89
+ "max_steps": 24,
90
+ "max_width": 2,
91
+ "seed": 2026,
92
+ "seeded_questions": 30,
93
+ "swarm_enabled": true
94
+ },
95
+ "created_at": "2026-04-03T13:22:03+00:00",
96
+ "episodes": 3,
97
+ "metrics": {
98
+ "avg_compactness_reward": 0.0,
99
+ "avg_connectivity_gain_reward": 0.20000000000000004,
100
+ "avg_connectivity_reward": -0.06666666666666667,
101
+ "avg_diversity_reward": 0.13444444444444445,
102
+ "avg_entity_informativeness_reward": -0.01010882862863417,
103
+ "avg_format_reward": 0.15,
104
+ "avg_graph_f1": 0.5793650793650794,
105
+ "avg_knowledge_carrier_reward": 0.5,
106
+ "avg_knowledge_indexing_reward": 0.10372960372960373,
107
+ "avg_relation_informativeness_reward": 0.07108687894082726,
108
+ "avg_reward": 4.419313576918165,
109
+ "avg_soft_shaping_reward": 0.3,
110
+ "avg_spawn_count": 4.0,
111
+ "avg_spawn_critical_steps": 6.0,
112
+ "avg_steps_to_solution": 9.0,
113
+ "deanonymization_accuracy": 0.0,
114
+ "leaderboard_score": 0.6797400780463063,
115
+ "retrieval_signal": 0.7113053613053614,
116
+ "spawn_completion_rate": 1.0,
117
+ "spawn_signal": 0.6666666666666666,
118
+ "structural_signal": 0.5356956100624386,
119
+ "task_success_rate": 1.0,
120
+ "tool_efficiency": 0.5
121
+ },
122
+ "run_id": "run_0003",
123
+ "run_name": "fixed_levels_qwen_swarm"
124
  }
125
  ]
inference.py CHANGED
@@ -260,9 +260,14 @@ def _publish_inference_report(summary: dict[str, Any], episodes: list[dict[str,
260
 
261
 
262
  def main() -> None:
263
- api_key = OPENAI_API_KEY or HF_TOKEN or API_KEY
 
 
 
 
 
264
  if not api_key:
265
- raise SystemExit("Set HF_TOKEN, OPENAI_API_KEY, or API_KEY before running inference.py.")
266
  if _looks_like_placeholder_api_key(api_key):
267
  raise SystemExit("Replace the placeholder with your real OpenAI API key.")
268
 
 
260
 
261
 
262
  def main() -> None:
263
+ if not str(API_BASE_URL).strip():
264
+ raise SystemExit("Set API_BASE_URL before running inference.py.")
265
+ if not str(MODEL_NAME).strip():
266
+ raise SystemExit("Set MODEL_NAME before running inference.py.")
267
+
268
+ api_key = HF_TOKEN or OPENAI_API_KEY or API_KEY
269
  if not api_key:
270
+ raise SystemExit("Set HF_TOKEN (or OPENAI_API_KEY/API_KEY) before running inference.py.")
271
  if _looks_like_placeholder_api_key(api_key):
272
  raise SystemExit("Replace the placeholder with your real OpenAI API key.")
273
 
pyproject.toml CHANGED
@@ -19,6 +19,7 @@ dev = [
19
 
20
  [project.scripts]
21
  osint-env = "osint_env.cli:main"
 
22
 
23
  [build-system]
24
  requires = ["setuptools>=68", "wheel"]
 
19
 
20
  [project.scripts]
21
  osint-env = "osint_env.cli:main"
22
+ server = "osint_env.server_entry:main"
23
 
24
  [build-system]
25
  requires = ["setuptools>=68", "wheel"]
server.py CHANGED
@@ -403,9 +403,10 @@ def openenv_tasks() -> list[OpenEnvTaskSummary]:
403
 
404
 
405
  @app.post("/openenv/reset", response_model=OpenEnvResponseEnvelope)
406
- def openenv_reset(request: OpenEnvResetRequest) -> OpenEnvResponseEnvelope:
407
  env = _build_environment()
408
- env._task_idx = _resolve_task_index(env, request)
 
409
  observation = env.reset()
410
  session_id = str(uuid4())
411
  _store_session(session_id, env)
@@ -421,11 +422,14 @@ def openenv_reset(request: OpenEnvResetRequest) -> OpenEnvResponseEnvelope:
421
  @app.post("/openenv/step", response_model=OpenEnvResponseEnvelope)
422
  def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
423
  env = _get_session_env(request.session_id)
 
 
 
424
  try:
425
- action_type = ActionType(str(request.action_type))
426
  except ValueError as exc:
427
- raise HTTPException(status_code=400, detail=f"Unsupported action_type {request.action_type}") from exc
428
- observation, reward, done, info = env.step(Action(action_type, dict(request.payload)))
429
  return OpenEnvResponseEnvelope(
430
  session_id=request.session_id,
431
  observation=_serialize_observation(observation),
 
403
 
404
 
405
  @app.post("/openenv/reset", response_model=OpenEnvResponseEnvelope)
406
+ def openenv_reset(request: OpenEnvResetRequest | None = None) -> OpenEnvResponseEnvelope:
407
  env = _build_environment()
408
+ reset_request = request or OpenEnvResetRequest()
409
+ env._task_idx = _resolve_task_index(env, reset_request)
410
  observation = env.reset()
411
  session_id = str(uuid4())
412
  _store_session(session_id, env)
 
422
  @app.post("/openenv/step", response_model=OpenEnvResponseEnvelope)
423
  def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
424
  env = _get_session_env(request.session_id)
425
+ action_type_raw = request.resolved_action_type().strip()
426
+ if not action_type_raw:
427
+ raise HTTPException(status_code=400, detail="Missing action_type")
428
  try:
429
+ action_type = ActionType(action_type_raw)
430
  except ValueError as exc:
431
+ raise HTTPException(status_code=400, detail=f"Unsupported action_type {action_type_raw}") from exc
432
+ observation, reward, done, info = env.step(Action(action_type=action_type, payload=request.resolved_payload()))
433
  return OpenEnvResponseEnvelope(
434
  session_id=request.session_id,
435
  observation=_serialize_observation(observation),
server/app.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib.util
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import uvicorn
8
+
9
+
10
+ _ROOT_SERVER_PATH = Path(__file__).resolve().parents[1] / "server.py"
11
+ _SPEC = importlib.util.spec_from_file_location("osint_root_server", _ROOT_SERVER_PATH)
12
+ if _SPEC is None or _SPEC.loader is None:
13
+ raise RuntimeError(f"Unable to load server module from {_ROOT_SERVER_PATH}")
14
+
15
+ _MODULE = importlib.util.module_from_spec(_SPEC)
16
+ _SPEC.loader.exec_module(_MODULE)
17
+ app = _MODULE.app
18
+
19
+
20
+ def main() -> None:
21
+ port = int(os.getenv("PORT", "7860"))
22
+ uvicorn.run("server.app:app", host="0.0.0.0", port=port)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
src/osint_env/api/models.py CHANGED
@@ -26,8 +26,27 @@ class OpenEnvResetRequest(BaseModel):
26
 
27
  class OpenEnvActionRequest(BaseModel):
28
  session_id: str
29
- action_type: str = Field(description="One of CALL_TOOL, ADD_EDGE, ANSWER.")
30
  payload: dict[str, Any] = Field(default_factory=dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
  class OpenEnvResponseEnvelope(BaseModel):
 
26
 
27
  class OpenEnvActionRequest(BaseModel):
28
  session_id: str
29
+ action_type: str | None = Field(default=None, description="One of CALL_TOOL, ADD_EDGE, ANSWER.")
30
  payload: dict[str, Any] = Field(default_factory=dict)
31
+ action: dict[str, Any] | None = None
32
+
33
+ def resolved_action_type(self) -> str:
34
+ if self.action_type:
35
+ return str(self.action_type)
36
+ if isinstance(self.action, dict):
37
+ nested = self.action.get("action_type")
38
+ if nested:
39
+ return str(nested)
40
+ return ""
41
+
42
+ def resolved_payload(self) -> dict[str, Any]:
43
+ if self.payload:
44
+ return dict(self.payload)
45
+ if isinstance(self.action, dict):
46
+ nested = self.action.get("payload")
47
+ if isinstance(nested, dict):
48
+ return dict(nested)
49
+ return {}
50
 
51
 
52
  class OpenEnvResponseEnvelope(BaseModel):
src/osint_env/domain/models.py CHANGED
@@ -4,6 +4,8 @@ from dataclasses import dataclass, field
4
  from enum import Enum
5
  from typing import Any
6
 
 
 
7
 
8
  class NodeType(str, Enum):
9
  USER = "user"
@@ -48,18 +50,44 @@ class ToolCall:
48
  args: dict[str, Any]
49
 
50
 
51
- @dataclass(slots=True)
52
- class Action:
 
 
 
53
  action_type: ActionType
54
- payload: dict[str, Any]
55
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- @dataclass(slots=True)
58
- class Observation:
59
- tool_outputs: list[dict[str, Any]]
60
- graph_snapshot: dict[str, Any]
61
- action_history: list[dict[str, Any]]
62
- task: dict[str, Any]
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  @dataclass(slots=True)
 
4
  from enum import Enum
5
  from typing import Any
6
 
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
 
10
  class NodeType(str, Enum):
11
  USER = "user"
 
50
  args: dict[str, Any]
51
 
52
 
53
+ class Action(BaseModel):
54
+ """Structured action payload used by OpenEnv step()."""
55
+
56
+ model_config = ConfigDict(extra="forbid")
57
+
58
  action_type: ActionType
59
+ payload: dict[str, Any] = Field(default_factory=dict)
60
 
61
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
62
+ # Backward-compatible positional form: Action(action_type, payload)
63
+ if args:
64
+ if len(args) != 2:
65
+ raise TypeError("Action() accepts either keyword fields or 2 positional args")
66
+ if "action_type" in kwargs or "payload" in kwargs:
67
+ raise TypeError("Action() cannot mix positional and keyword fields")
68
+ kwargs["action_type"] = args[0]
69
+ kwargs["payload"] = args[1]
70
+ super().__init__(**kwargs)
71
 
72
+
73
+ class Observation(BaseModel):
74
+ """Typed observation payload returned by reset()/step()/state()."""
75
+
76
+ model_config = ConfigDict(extra="forbid")
77
+
78
+ tool_outputs: list[dict[str, Any]] = Field(default_factory=list)
79
+ graph_snapshot: dict[str, Any] = Field(default_factory=dict)
80
+ action_history: list[dict[str, Any]] = Field(default_factory=list)
81
+ task: dict[str, Any] = Field(default_factory=dict)
82
+
83
+
84
+ class Reward(BaseModel):
85
+ """Typed reward payload for structured reward accounting."""
86
+
87
+ model_config = ConfigDict(extra="forbid")
88
+
89
+ value: float = 0.0
90
+ components: dict[str, float] = Field(default_factory=dict)
91
 
92
 
93
  @dataclass(slots=True)
src/osint_env/server_entry.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ import uvicorn
6
+
7
+
8
+ def main() -> None:
9
+ port = int(os.getenv("PORT", "7860"))
10
+ uvicorn.run("server:app", host="0.0.0.0", port=port)
tests/test_server.py CHANGED
@@ -64,6 +64,33 @@ def test_openenv_reset_step_and_state_cycle():
64
  assert "task_answer" in step_body["info"]
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
68
  latest_evaluation = tmp_path / "latest_evaluation.json"
69
  space_dashboard = tmp_path / "space_dashboard.html"
 
64
  assert "task_answer" in step_body["info"]
65
 
66
 
67
+ def test_openenv_reset_accepts_empty_body():
68
+ reset = client.post("/openenv/reset")
69
+ assert reset.status_code == 200
70
+ body = reset.json()
71
+ assert body["done"] is False
72
+ assert "session_id" in body
73
+
74
+
75
+ def test_openenv_step_accepts_nested_action_payload():
76
+ reset = client.post("/openenv/reset", json={"task_index": 0})
77
+ assert reset.status_code == 200
78
+ session_id = reset.json()["session_id"]
79
+
80
+ step = client.post(
81
+ "/openenv/step",
82
+ json={
83
+ "session_id": session_id,
84
+ "action": {
85
+ "action_type": "ANSWER",
86
+ "payload": {"answer": "unknown"},
87
+ },
88
+ },
89
+ )
90
+ assert step.status_code == 200
91
+ assert step.json()["done"] is True
92
+
93
+
94
  def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
95
  latest_evaluation = tmp_path / "latest_evaluation.json"
96
  space_dashboard = tmp_path / "space_dashboard.html"
uv.lock ADDED
The diff for this file is too large to render. See raw diff