Spaces:
Sleeping
Sleeping
Commit ·
5b2fec4
1
Parent(s): 6f6ecdc
fixed inference
Browse files- .gitignore +1 -0
- datasets/fixed_levels/leaderboard_fixed_levels.json +293 -0
- inference.py +278 -283
- requirements.txt +6 -0
.gitignore
CHANGED
|
@@ -3,3 +3,4 @@ blueprint.txt
|
|
| 3 |
*.egg-info
|
| 4 |
artifacts/*
|
| 5 |
*.html
|
|
|
|
|
|
| 3 |
*.egg-info
|
| 4 |
artifacts/*
|
| 5 |
*.html
|
| 6 |
+
.venv/
|
datasets/fixed_levels/leaderboard_fixed_levels.json
CHANGED
|
@@ -121,5 +121,298 @@
|
|
| 121 |
},
|
| 122 |
"run_id": "run_0003",
|
| 123 |
"run_name": "fixed_levels_qwen_swarm"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
}
|
| 125 |
]
|
|
|
|
| 121 |
},
|
| 122 |
"run_id": "run_0003",
|
| 123 |
"run_name": "fixed_levels_qwen_swarm"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"config": {
|
| 127 |
+
"max_agents": 3,
|
| 128 |
+
"max_breadth": 2,
|
| 129 |
+
"max_depth": 2,
|
| 130 |
+
"max_steps": 24,
|
| 131 |
+
"max_width": 2,
|
| 132 |
+
"seed": 2026,
|
| 133 |
+
"seeded_questions": 30,
|
| 134 |
+
"swarm_enabled": true
|
| 135 |
+
},
|
| 136 |
+
"created_at": "2026-04-06T18:29:39+00:00",
|
| 137 |
+
"episodes": 30,
|
| 138 |
+
"metrics": {
|
| 139 |
+
"avg_compactness_reward": 0.0,
|
| 140 |
+
"avg_connectivity_gain_reward": 0.2000000000000001,
|
| 141 |
+
"avg_connectivity_reward": 0.12999999999999998,
|
| 142 |
+
"avg_diversity_reward": 0.12433333333333325,
|
| 143 |
+
"avg_entity_informativeness_reward": -0.02515191749984708,
|
| 144 |
+
"avg_format_reward": 0.15,
|
| 145 |
+
"avg_graph_f1": 0.2916528337385394,
|
| 146 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 147 |
+
"avg_knowledge_indexing_reward": 0.11539120363588044,
|
| 148 |
+
"avg_relation_informativeness_reward": 0.0769903534735767,
|
| 149 |
+
"avg_reward": 4.460667345528021,
|
| 150 |
+
"avg_soft_shaping_reward": 0.3,
|
| 151 |
+
"avg_spawn_count": 4.0,
|
| 152 |
+
"avg_spawn_critical_steps": 6.0,
|
| 153 |
+
"avg_steps_to_solution": 9.0,
|
| 154 |
+
"deanonymization_accuracy": 0.0,
|
| 155 |
+
"leaderboard_score": 0.6269168609961595,
|
| 156 |
+
"retrieval_signal": 0.7153869212725582,
|
| 157 |
+
"spawn_completion_rate": 1.0,
|
| 158 |
+
"spawn_signal": 0.6666666666666666,
|
| 159 |
+
"structural_signal": 0.5815176871947458,
|
| 160 |
+
"task_success_rate": 1.0,
|
| 161 |
+
"tool_efficiency": 0.5
|
| 162 |
+
},
|
| 163 |
+
"run_id": "run_0004",
|
| 164 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"config": {
|
| 168 |
+
"max_agents": 3,
|
| 169 |
+
"max_breadth": 2,
|
| 170 |
+
"max_depth": 2,
|
| 171 |
+
"max_steps": 24,
|
| 172 |
+
"max_width": 2,
|
| 173 |
+
"seed": 2026,
|
| 174 |
+
"seeded_questions": 30,
|
| 175 |
+
"swarm_enabled": true
|
| 176 |
+
},
|
| 177 |
+
"created_at": "2026-04-06T18:33:06+00:00",
|
| 178 |
+
"episodes": 2,
|
| 179 |
+
"metrics": {
|
| 180 |
+
"avg_compactness_reward": 0.0,
|
| 181 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 182 |
+
"avg_connectivity_reward": -0.15,
|
| 183 |
+
"avg_diversity_reward": 0.13833333333333334,
|
| 184 |
+
"avg_entity_informativeness_reward": -0.026628229842114173,
|
| 185 |
+
"avg_format_reward": 0.15,
|
| 186 |
+
"avg_graph_f1": 0.6190476190476191,
|
| 187 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 188 |
+
"avg_knowledge_indexing_reward": 0.10681818181818181,
|
| 189 |
+
"avg_relation_informativeness_reward": 0.048120982127120335,
|
| 190 |
+
"avg_reward": 4.334953339016039,
|
| 191 |
+
"avg_soft_shaping_reward": 0.3,
|
| 192 |
+
"avg_spawn_count": 4.0,
|
| 193 |
+
"avg_spawn_critical_steps": 6.0,
|
| 194 |
+
"avg_steps_to_solution": 9.0,
|
| 195 |
+
"deanonymization_accuracy": 0.0,
|
| 196 |
+
"leaderboard_score": 0.685242999396977,
|
| 197 |
+
"retrieval_signal": 0.7123863636363637,
|
| 198 |
+
"spawn_completion_rate": 1.0,
|
| 199 |
+
"spawn_signal": 0.6666666666666666,
|
| 200 |
+
"structural_signal": 0.5075485504570012,
|
| 201 |
+
"task_success_rate": 1.0,
|
| 202 |
+
"tool_efficiency": 0.5
|
| 203 |
+
},
|
| 204 |
+
"run_id": "run_0005",
|
| 205 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"config": {
|
| 209 |
+
"max_agents": 1,
|
| 210 |
+
"max_breadth": 2,
|
| 211 |
+
"max_depth": 2,
|
| 212 |
+
"max_steps": 24,
|
| 213 |
+
"max_width": 2,
|
| 214 |
+
"seed": 2026,
|
| 215 |
+
"seeded_questions": 30,
|
| 216 |
+
"swarm_enabled": true
|
| 217 |
+
},
|
| 218 |
+
"created_at": "2026-04-06T18:54:52+00:00",
|
| 219 |
+
"episodes": 1,
|
| 220 |
+
"metrics": {
|
| 221 |
+
"avg_compactness_reward": 0.0,
|
| 222 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 223 |
+
"avg_connectivity_reward": -0.3,
|
| 224 |
+
"avg_diversity_reward": 0.08,
|
| 225 |
+
"avg_entity_informativeness_reward": -0.02450859227728558,
|
| 226 |
+
"avg_format_reward": 0.15,
|
| 227 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 228 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 229 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 230 |
+
"avg_relation_informativeness_reward": 0.04353540016904645,
|
| 231 |
+
"avg_reward": 3.037246438342494,
|
| 232 |
+
"avg_soft_shaping_reward": 0.15,
|
| 233 |
+
"avg_spawn_count": 2.0,
|
| 234 |
+
"avg_spawn_critical_steps": 6.0,
|
| 235 |
+
"avg_steps_to_solution": 5.0,
|
| 236 |
+
"deanonymization_accuracy": 0.0,
|
| 237 |
+
"leaderboard_score": 0.6201263424948862,
|
| 238 |
+
"retrieval_signal": 0.7036363636363637,
|
| 239 |
+
"spawn_completion_rate": 1.0,
|
| 240 |
+
"spawn_signal": 0.6666666666666666,
|
| 241 |
+
"structural_signal": 0.45080536157835216,
|
| 242 |
+
"task_success_rate": 1.0,
|
| 243 |
+
"tool_efficiency": 0.5
|
| 244 |
+
},
|
| 245 |
+
"run_id": "run_0006",
|
| 246 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"config": {
|
| 250 |
+
"max_agents": 1,
|
| 251 |
+
"max_breadth": 2,
|
| 252 |
+
"max_depth": 2,
|
| 253 |
+
"max_steps": 24,
|
| 254 |
+
"max_width": 2,
|
| 255 |
+
"seed": 2026,
|
| 256 |
+
"seeded_questions": 30,
|
| 257 |
+
"swarm_enabled": true
|
| 258 |
+
},
|
| 259 |
+
"created_at": "2026-04-06T19:22:57+00:00",
|
| 260 |
+
"episodes": 1,
|
| 261 |
+
"metrics": {
|
| 262 |
+
"avg_compactness_reward": 0.0,
|
| 263 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 264 |
+
"avg_connectivity_reward": -0.3,
|
| 265 |
+
"avg_diversity_reward": 0.08,
|
| 266 |
+
"avg_entity_informativeness_reward": -0.005263146336646693,
|
| 267 |
+
"avg_format_reward": 0.15,
|
| 268 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 269 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 270 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 271 |
+
"avg_relation_informativeness_reward": 0.044276243254877785,
|
| 272 |
+
"avg_reward": 3.057232727368964,
|
| 273 |
+
"avg_soft_shaping_reward": 0.15,
|
| 274 |
+
"avg_spawn_count": 2.0,
|
| 275 |
+
"avg_spawn_critical_steps": 6.0,
|
| 276 |
+
"avg_steps_to_solution": 5.0,
|
| 277 |
+
"deanonymization_accuracy": 0.0,
|
| 278 |
+
"leaderboard_score": 0.6205293479318178,
|
| 279 |
+
"retrieval_signal": 0.7036363636363637,
|
| 280 |
+
"spawn_completion_rate": 1.0,
|
| 281 |
+
"spawn_signal": 0.6666666666666666,
|
| 282 |
+
"structural_signal": 0.4548026193836462,
|
| 283 |
+
"task_success_rate": 1.0,
|
| 284 |
+
"tool_efficiency": 0.5
|
| 285 |
+
},
|
| 286 |
+
"run_id": "run_0007",
|
| 287 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"config": {
|
| 291 |
+
"llm_model": "qwen3:1.7b",
|
| 292 |
+
"llm_provider": "ollama",
|
| 293 |
+
"max_agents": 1,
|
| 294 |
+
"max_breadth": 2,
|
| 295 |
+
"max_depth": 2,
|
| 296 |
+
"max_steps": 24,
|
| 297 |
+
"max_width": 2,
|
| 298 |
+
"seed": 2026,
|
| 299 |
+
"seeded_questions": 30,
|
| 300 |
+
"swarm_enabled": true
|
| 301 |
+
},
|
| 302 |
+
"created_at": "2026-04-06T19:48:33+00:00",
|
| 303 |
+
"episodes": 3,
|
| 304 |
+
"metrics": {
|
| 305 |
+
"avg_compactness_reward": 0.0,
|
| 306 |
+
"avg_connectivity_gain_reward": 0.10000000000000002,
|
| 307 |
+
"avg_connectivity_reward": -0.09999999999999999,
|
| 308 |
+
"avg_diversity_reward": 0.08,
|
| 309 |
+
"avg_entity_informativeness_reward": -0.028683816517602444,
|
| 310 |
+
"avg_format_reward": 0.15,
|
| 311 |
+
"avg_graph_f1": 0.15537340619307835,
|
| 312 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 313 |
+
"avg_knowledge_indexing_reward": 0.07932190760059611,
|
| 314 |
+
"avg_relation_informativeness_reward": 0.044225025032092045,
|
| 315 |
+
"avg_reward": 3.1324990406542437,
|
| 316 |
+
"avg_soft_shaping_reward": 0.15,
|
| 317 |
+
"avg_spawn_count": 2.0,
|
| 318 |
+
"avg_spawn_critical_steps": 6.0,
|
| 319 |
+
"avg_steps_to_solution": 5.0,
|
| 320 |
+
"deanonymization_accuracy": 0.0,
|
| 321 |
+
"leaderboard_score": 0.5890485416309927,
|
| 322 |
+
"retrieval_signal": 0.7027626676602087,
|
| 323 |
+
"spawn_completion_rate": 1.0,
|
| 324 |
+
"spawn_signal": 0.6666666666666666,
|
| 325 |
+
"structural_signal": 0.5001082417028979,
|
| 326 |
+
"task_success_rate": 1.0,
|
| 327 |
+
"tool_efficiency": 0.5
|
| 328 |
+
},
|
| 329 |
+
"run_id": "run_0008",
|
| 330 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"config": {
|
| 334 |
+
"llm_model": "qwen3:1.7b",
|
| 335 |
+
"llm_provider": "ollama",
|
| 336 |
+
"max_agents": 1,
|
| 337 |
+
"max_breadth": 2,
|
| 338 |
+
"max_depth": 2,
|
| 339 |
+
"max_steps": 24,
|
| 340 |
+
"max_width": 2,
|
| 341 |
+
"seed": 2026,
|
| 342 |
+
"seeded_questions": 30,
|
| 343 |
+
"swarm_enabled": true
|
| 344 |
+
},
|
| 345 |
+
"created_at": "2026-04-06T19:55:08+00:00",
|
| 346 |
+
"episodes": 1,
|
| 347 |
+
"metrics": {
|
| 348 |
+
"avg_compactness_reward": 0.0,
|
| 349 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 350 |
+
"avg_connectivity_reward": -0.3,
|
| 351 |
+
"avg_diversity_reward": 0.08,
|
| 352 |
+
"avg_entity_informativeness_reward": -0.005263146336646693,
|
| 353 |
+
"avg_format_reward": 0.15,
|
| 354 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 355 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 356 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 357 |
+
"avg_relation_informativeness_reward": 0.04406984773661544,
|
| 358 |
+
"avg_reward": 3.0570263318507016,
|
| 359 |
+
"avg_soft_shaping_reward": 0.15,
|
| 360 |
+
"avg_spawn_count": 2.0,
|
| 361 |
+
"avg_spawn_critical_steps": 6.0,
|
| 362 |
+
"avg_steps_to_solution": 5.0,
|
| 363 |
+
"deanonymization_accuracy": 0.0,
|
| 364 |
+
"leaderboard_score": 0.6205251901591228,
|
| 365 |
+
"retrieval_signal": 0.7036363636363637,
|
| 366 |
+
"spawn_completion_rate": 1.0,
|
| 367 |
+
"spawn_signal": 0.6666666666666666,
|
| 368 |
+
"structural_signal": 0.45476134027999376,
|
| 369 |
+
"task_success_rate": 1.0,
|
| 370 |
+
"tool_efficiency": 0.5
|
| 371 |
+
},
|
| 372 |
+
"run_id": "run_0009",
|
| 373 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"config": {
|
| 377 |
+
"llm_model": "qwen3:1.7b",
|
| 378 |
+
"llm_provider": "ollama",
|
| 379 |
+
"max_agents": 1,
|
| 380 |
+
"max_breadth": 2,
|
| 381 |
+
"max_depth": 2,
|
| 382 |
+
"max_steps": 24,
|
| 383 |
+
"max_width": 2,
|
| 384 |
+
"seed": 2026,
|
| 385 |
+
"seeded_questions": 30,
|
| 386 |
+
"swarm_enabled": true
|
| 387 |
+
},
|
| 388 |
+
"created_at": "2026-04-06T20:01:34+00:00",
|
| 389 |
+
"episodes": 1,
|
| 390 |
+
"metrics": {
|
| 391 |
+
"avg_compactness_reward": 0.0,
|
| 392 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 393 |
+
"avg_connectivity_reward": -0.3,
|
| 394 |
+
"avg_diversity_reward": 0.08,
|
| 395 |
+
"avg_entity_informativeness_reward": -0.020826953461399098,
|
| 396 |
+
"avg_format_reward": 0.15,
|
| 397 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 398 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 399 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 400 |
+
"avg_relation_informativeness_reward": 0.04348043923536236,
|
| 401 |
+
"avg_reward": 3.040873116224696,
|
| 402 |
+
"avg_soft_shaping_reward": 0.15,
|
| 403 |
+
"avg_spawn_count": 2.0,
|
| 404 |
+
"avg_spawn_critical_steps": 6.0,
|
| 405 |
+
"avg_steps_to_solution": 5.0,
|
| 406 |
+
"deanonymization_accuracy": 0.0,
|
| 407 |
+
"leaderboard_score": 0.6201995296517067,
|
| 408 |
+
"retrieval_signal": 0.7036363636363637,
|
| 409 |
+
"spawn_completion_rate": 1.0,
|
| 410 |
+
"spawn_signal": 0.6666666666666666,
|
| 411 |
+
"structural_signal": 0.45153069715479266,
|
| 412 |
+
"task_success_rate": 1.0,
|
| 413 |
+
"tool_efficiency": 0.5
|
| 414 |
+
},
|
| 415 |
+
"run_id": "run_0010",
|
| 416 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 417 |
}
|
| 418 |
]
|
inference.py
CHANGED
|
@@ -2,34 +2,73 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import os
|
|
|
|
| 5 |
from typing import Any
|
| 6 |
|
| 7 |
-
import
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
-
|
| 11 |
-
from osint_env.
|
|
|
|
|
|
|
| 12 |
from osint_env.eval.metrics import EvalMetrics
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.0"))
|
| 24 |
-
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
|
| 25 |
-
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "90"))
|
| 26 |
-
TASK_INDICES = [int(part.strip()) for part in os.getenv("TASK_INDICES", "0,10,20").split(",") if part.strip()]
|
| 27 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
BENCHMARK = "osint-openenv"
|
| 30 |
TASK_NAME = "fixed_levels_easy_mid_hard"
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def log_start(task: str, env: str, model: str) -> None:
|
| 34 |
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 35 |
|
|
@@ -50,301 +89,257 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
|
|
| 50 |
)
|
| 51 |
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
return True
|
| 57 |
-
placeholder_markers = [
|
| 58 |
-
"your_openai_api_key",
|
| 59 |
-
"your-key",
|
| 60 |
-
"your_key",
|
| 61 |
-
"your real",
|
| 62 |
-
"real-openai-key",
|
| 63 |
-
"replace-me",
|
| 64 |
-
"changeme",
|
| 65 |
-
"example",
|
| 66 |
-
"<api-key>",
|
| 67 |
-
]
|
| 68 |
-
if token.startswith("your_") or token.startswith("sk-your-"):
|
| 69 |
-
return True
|
| 70 |
-
return any(marker in token for marker in placeholder_markers)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def _supports_reasoning_effort_in_chat_completions(model: str) -> bool:
|
| 74 |
-
model_name = str(model).strip().lower()
|
| 75 |
-
if model_name.startswith("gpt-5.4-mini"):
|
| 76 |
-
return False
|
| 77 |
-
return model_name.startswith("gpt-5")
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def _request_kwargs(messages: list[dict[str, Any]], tools: list[dict[str, Any]]) -> dict[str, Any]:
|
| 81 |
-
kwargs: dict[str, Any] = {
|
| 82 |
-
"model": MODEL_NAME,
|
| 83 |
-
"messages": messages,
|
| 84 |
-
"tools": tools,
|
| 85 |
-
"tool_choice": "required",
|
| 86 |
-
"parallel_tool_calls": False,
|
| 87 |
-
}
|
| 88 |
-
if MODEL_NAME.strip().lower().startswith("gpt-5"):
|
| 89 |
-
kwargs["max_completion_tokens"] = MAX_TOKENS
|
| 90 |
-
if _supports_reasoning_effort_in_chat_completions(MODEL_NAME):
|
| 91 |
-
kwargs["reasoning_effort"] = "none"
|
| 92 |
-
else:
|
| 93 |
-
kwargs["temperature"] = TEMPERATURE
|
| 94 |
-
kwargs["max_tokens"] = MAX_TOKENS
|
| 95 |
-
return kwargs
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
def _message_text(message: Any) -> str:
|
| 99 |
-
content = getattr(message, "content", "")
|
| 100 |
-
if isinstance(content, str):
|
| 101 |
-
return content
|
| 102 |
-
if isinstance(content, list):
|
| 103 |
-
parts: list[str] = []
|
| 104 |
-
for item in content:
|
| 105 |
-
if isinstance(item, dict) and item.get("type") == "text":
|
| 106 |
-
parts.append(str(item.get("text", "")))
|
| 107 |
-
return "\n".join(part for part in parts if part)
|
| 108 |
-
return str(content or "")
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def _space_get(path: str) -> dict[str, Any]:
|
| 112 |
-
response = requests.get(f"{SPACE_URL}{path}", timeout=REQUEST_TIMEOUT)
|
| 113 |
-
response.raise_for_status()
|
| 114 |
-
return response.json()
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
def _space_post(path: str, payload: dict[str, Any]) -> dict[str, Any]:
|
| 118 |
-
response = requests.post(f"{SPACE_URL}{path}", json=payload, timeout=REQUEST_TIMEOUT)
|
| 119 |
-
response.raise_for_status()
|
| 120 |
-
return response.json()
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def _decode_action(tool_name: str, args: dict[str, Any]) -> dict[str, Any]:
|
| 124 |
-
if tool_name == "submit_answer":
|
| 125 |
-
return {"action_type": "ANSWER", "payload": {"answer": str(args.get("answer", "")).strip()}}
|
| 126 |
-
if tool_name == "add_edge":
|
| 127 |
-
return {
|
| 128 |
-
"action_type": "ADD_EDGE",
|
| 129 |
-
"payload": {
|
| 130 |
-
"src": str(args.get("src", "")).strip(),
|
| 131 |
-
"rel": str(args.get("rel", "")).strip(),
|
| 132 |
-
"dst": str(args.get("dst", "")).strip(),
|
| 133 |
-
"confidence": float(args.get("confidence", 1.0)),
|
| 134 |
-
},
|
| 135 |
-
}
|
| 136 |
-
return {"action_type": "CALL_TOOL", "payload": {"tool_name": tool_name, "args": dict(args)}}
|
| 137 |
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
if action_type == "ADD_EDGE":
|
| 145 |
-
return (
|
| 146 |
-
"add_edge("
|
| 147 |
-
f"{payload.get('src', '')},"
|
| 148 |
-
f"{payload.get('rel', '')},"
|
| 149 |
-
f"{payload.get('dst', '')},"
|
| 150 |
-
f"{float(payload.get('confidence', 1.0)):.2f}"
|
| 151 |
-
")"
|
| 152 |
-
)
|
| 153 |
-
tool_name = str(payload.get("tool_name", "tool"))
|
| 154 |
-
args = dict(payload.get("args", {}))
|
| 155 |
-
if not args:
|
| 156 |
-
return f"{tool_name}()"
|
| 157 |
-
arg_str = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
|
| 158 |
-
return f"{tool_name}({arg_str})"
|
| 159 |
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
if not tool_calls:
|
| 164 |
-
return None
|
| 165 |
-
tool_call_id = tool_calls[0].get("id")
|
| 166 |
-
return str(tool_call_id) if tool_call_id else None
|
| 167 |
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
return None
|
| 173 |
-
return {
|
| 174 |
-
"role": "tool",
|
| 175 |
-
"tool_call_id": tool_call_id,
|
| 176 |
-
"content": json.dumps(result, sort_keys=True),
|
| 177 |
-
}
|
| 178 |
|
|
|
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
-
"
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
],
|
| 211 |
}
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 215 |
-
return {"action_type": "ANSWER", "payload": {"answer": "unknown"}}, {"role": "assistant", "content": ""}
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
def _episode_row(result: dict[str, Any], task_meta: dict[str, Any]) -> dict[str, Any]:
|
| 219 |
-
info = dict(result.get("info", {}))
|
| 220 |
-
graph_snapshot = dict((result.get("observation") or {}).get("graph_snapshot", {}))
|
| 221 |
-
task_type = str(task_meta.get("task_type", "unknown"))
|
| 222 |
-
task_id = str(task_meta.get("task_id", "unknown"))
|
| 223 |
-
question = str(task_meta.get("question", ""))
|
| 224 |
-
task_answer = str(info.get("task_answer", ""))
|
| 225 |
-
agent_answer = str(info.get("agent_answer", ""))
|
| 226 |
-
graph_f1 = float(info.get("graph_f1", 0.0) or 0.0)
|
| 227 |
return {
|
| 228 |
-
"task_id": task_id,
|
| 229 |
-
"task_type": task_type,
|
| 230 |
-
"question": question,
|
| 231 |
-
"task_answer": task_answer,
|
| 232 |
-
"agent_answer": agent_answer,
|
| 233 |
"graph_f1": graph_f1,
|
| 234 |
"reward": float(info.get("total_reward", 0.0) or 0.0),
|
| 235 |
"steps": int(info.get("step_count", 0) or 0),
|
| 236 |
"tool_calls": int(info.get("tool_calls", 0) or 0),
|
| 237 |
-
"success": int(
|
| 238 |
"reward_components": dict(info.get("reward_components", {})),
|
| 239 |
-
"
|
| 240 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
}
|
| 242 |
|
| 243 |
|
| 244 |
-
def
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
"name": "inference_py_run",
|
| 248 |
-
"model": MODEL_NAME,
|
| 249 |
-
"space_url": SPACE_URL,
|
| 250 |
-
"task_indices": TASK_INDICES,
|
| 251 |
-
"max_steps": MAX_STEPS,
|
| 252 |
-
},
|
| 253 |
-
"summary": summary,
|
| 254 |
-
"episodes": episodes,
|
| 255 |
-
}
|
| 256 |
-
try:
|
| 257 |
-
_space_post("/openenv/report_inference", payload)
|
| 258 |
-
except RequestException as exc:
|
| 259 |
-
print(f"[DEBUG] Failed to publish inference report: {exc}", flush=True)
|
| 260 |
|
|
|
|
|
|
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
metrics = EvalMetrics()
|
|
|
|
|
|
|
| 289 |
steps_taken = 0
|
| 290 |
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
{
|
| 301 |
-
"role": "user",
|
| 302 |
-
"content": json.dumps(result["observation"], indent=2, sort_keys=True),
|
| 303 |
-
},
|
| 304 |
-
]
|
| 305 |
-
|
| 306 |
-
for local_step in range(1, MAX_STEPS + 1):
|
| 307 |
-
if done:
|
| 308 |
-
break
|
| 309 |
-
action, assistant_message = get_model_action(client, messages, tools)
|
| 310 |
-
error = None
|
| 311 |
-
try:
|
| 312 |
-
result = _space_post(
|
| 313 |
-
"/openenv/step",
|
| 314 |
-
{
|
| 315 |
-
"session_id": session_id,
|
| 316 |
-
"action_type": action["action_type"],
|
| 317 |
-
"payload": action["payload"],
|
| 318 |
-
},
|
| 319 |
-
)
|
| 320 |
-
except RequestException as exc:
|
| 321 |
-
error = str(exc)
|
| 322 |
-
result = _space_get(f"/openenv/state/{session_id}")
|
| 323 |
-
reward = float(result.get("reward", 0.0) or 0.0)
|
| 324 |
-
done = bool(result.get("done", False))
|
| 325 |
rewards.append(reward)
|
| 326 |
steps_taken += 1
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 346 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
|
| 350 |
if __name__ == "__main__":
|
|
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
+
from osint_env.agents.single_agent import SingleAgentRunner
|
| 9 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 10 |
+
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
|
| 11 |
+
from osint_env.domain.models import EnvironmentConfig
|
| 12 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 13 |
+
from osint_env.env.reward import compute_graph_f1
|
| 14 |
+
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard
|
| 15 |
from osint_env.eval.metrics import EvalMetrics
|
| 16 |
+
from osint_env.llm import build_llm_client
|
| 17 |
+
from osint_env.viz import export_dashboard
|
| 18 |
|
| 19 |
|
| 20 |
+
# --- Run configuration, read once from the process environment at import ---

# Input files: shared environment config and the seeding file.
CONFIG_PATH = os.environ.get("CONFIG_PATH", "datasets/fixed_levels/shared_config_fixed_levels.json")
SEED_FILE = os.environ.get("SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json")

# Agent / LLM selection. Empty strings mean "no override supplied".
AGENT_MODE = os.environ.get("AGENT_MODE", "swarm")
LLM_PROVIDER = os.environ.get("LLM_PROVIDER", "ollama")
MODEL_NAME = os.environ.get("MODEL_NAME", "qwen3:1.7b")
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# NOTE(review): presumably the *name* of the variable holding the key -- confirm
# against the LLM client that consumes `openai_api_key_env`.
OPENAI_API_KEY_ENV = os.environ.get("OPENAI_API_KEY_ENV", "OPENAI_API_KEY")
# 0 (the default) leaves the configured timeout untouched.
LLM_TIMEOUT_SECONDS = int(os.environ.get("LLM_TIMEOUT_SECONDS", "0"))

# Episode selection and scoring.
EPISODES = int(os.environ.get("EPISODES", "1"))
SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.67"))
# Raw comma-separated string; converted to integers by _parse_task_indices.
TASK_INDICES_RAW = os.environ.get("TASK_INDICES", "")

# Boolean switch: any of 1/true/yes/y/on (case-insensitive) enables it.
WRITE_BENCHMARK_ARTIFACTS = os.environ.get("WRITE_BENCHMARK_ARTIFACTS", "1").strip().lower() in (
    "1",
    "true",
    "yes",
    "y",
    "on",
)
LEADERBOARD_PATH = os.environ.get("LEADERBOARD_PATH", "datasets/fixed_levels/leaderboard_fixed_levels.json")
DASHBOARD_PATH = os.environ.get("DASHBOARD_PATH", "datasets/fixed_levels/dashboard_fixed_levels.html")
RUN_NAME = os.environ.get("RUN_NAME", "fixed_levels_qwen_swarm")

# Fixed identifiers (not overridable through the environment).
BENCHMARK = "osint-openenv"
TASK_NAME = "fixed_levels_easy_mid_hard"
|
| 47 |
|
| 48 |
|
| 49 |
+
def _parse_task_indices(raw: str) -> list[int]:
|
| 50 |
+
out: list[int] = []
|
| 51 |
+
for token in str(raw or "").split(","):
|
| 52 |
+
stripped = token.strip()
|
| 53 |
+
if not stripped:
|
| 54 |
+
continue
|
| 55 |
+
try:
|
| 56 |
+
out.append(int(stripped))
|
| 57 |
+
except ValueError:
|
| 58 |
+
continue
|
| 59 |
+
return out
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _normalize_ollama_base_url(url: str) -> str:
|
| 63 |
+
normalized = str(url or "").strip().rstrip("/")
|
| 64 |
+
if normalized.endswith("/v1"):
|
| 65 |
+
normalized = normalized[:-3].rstrip("/")
|
| 66 |
+
return normalized or "http://127.0.0.1:11434"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
TASK_INDICES = _parse_task_indices(TASK_INDICES_RAW)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line for a run to stdout and flush it."""
    banner = f"[START] task={task} env={env} model={model}"
    print(banner, flush=True)
|
| 74 |
|
|
|
|
| 89 |
)
|
| 90 |
|
| 91 |
|
| 92 |
+
def _resolve_environment_config() -> EnvironmentConfig:
    """Build the environment config for this run.

    Loads the shared config from ``CONFIG_PATH``, takes a copy of its
    ``environment`` section via ``clone_environment_config`` and applies the
    module-level environment-variable overrides to that copy. An override is
    only applied when its value is non-blank (or, for the timeout, positive).

    Returns:
        The adjusted environment config.
    """
    shared = load_shared_config(CONFIG_PATH)
    env_cfg = clone_environment_config(shared.environment)

    # The seeding file is optional: it is only loaded when it exists on disk.
    if SEED_FILE and Path(SEED_FILE).exists():
        env_cfg.seeding = load_seeding_config(SEED_FILE)

    # Any AGENT_MODE other than "single"/"swarm" keeps the configured setting.
    mode = AGENT_MODE.strip().lower()
    if mode == "single":
        env_cfg.swarm.enabled = False
    elif mode == "swarm":
        env_cfg.swarm.enabled = True

    # The literal provider name "config" means "keep what the config file says".
    provider = LLM_PROVIDER.strip().lower()
    if provider and provider != "config":
        env_cfg.llm.provider = provider

    model_name = MODEL_NAME.strip()
    if model_name:
        env_cfg.llm.model = model_name

    if LLM_TIMEOUT_SECONDS > 0:
        env_cfg.llm.timeout_seconds = int(LLM_TIMEOUT_SECONDS)

    # API_BASE_URL takes precedence over OLLAMA_BASE_URL. Both are stripped
    # *before* choosing, so a whitespace-only API_BASE_URL can no longer mask a
    # valid OLLAMA_BASE_URL and fall through to the localhost default.
    ollama_base = os.getenv("API_BASE_URL", "").strip() or OLLAMA_BASE_URL.strip()
    if ollama_base:
        env_cfg.llm.ollama_base_url = _normalize_ollama_base_url(ollama_base)

    openai_base = OPENAI_BASE_URL.strip()
    if openai_base:
        env_cfg.llm.openai_base_url = openai_base

    api_key = OPENAI_API_KEY.strip()
    if api_key:
        env_cfg.llm.openai_api_key = api_key

    api_key_env = OPENAI_API_KEY_ENV.strip()
    if api_key_env:
        env_cfg.llm.openai_api_key_env = api_key_env

    return env_cfg
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _runner_for(env: OSINTEnvironment, llm: Any) -> SingleAgentRunner | SwarmAgentRunner:
    """Instantiate the runner matching ``env.config.swarm.enabled``.

    A swarm runner is returned when the flag is truthy, a single-agent runner
    otherwise; both receive the same ``env`` and ``llm``.
    """
    runner_cls = SwarmAgentRunner if env.config.swarm.enabled else SingleAgentRunner
    return runner_cls(env=env, llm=llm)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _episode_row(env: OSINTEnvironment, info: dict[str, Any]) -> dict[str, Any]:
|
| 138 |
+
if env.state is None:
|
| 139 |
+
return {
|
| 140 |
+
"task_id": "unknown",
|
| 141 |
+
"task_type": "unknown",
|
| 142 |
+
"question": "",
|
| 143 |
+
"task_answer": str(info.get("task_answer", "")),
|
| 144 |
+
"agent_answer": str(info.get("agent_answer", "")),
|
| 145 |
+
"graph_f1": 0.0,
|
| 146 |
+
"reward": float(info.get("total_reward", 0.0) or 0.0),
|
| 147 |
+
"steps": int(info.get("step_count", 0) or 0),
|
| 148 |
+
"tool_calls": int(info.get("tool_calls", 0) or 0),
|
| 149 |
+
"success": int(info.get("agent_answer") == info.get("task_answer")),
|
| 150 |
+
"reward_components": dict(info.get("reward_components", {})),
|
| 151 |
+
"pred_edges": [],
|
| 152 |
+
"truth_edges": [],
|
| 153 |
}
|
| 154 |
+
|
| 155 |
+
graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
return {
|
| 157 |
+
"task_id": env.state.task.task_id,
|
| 158 |
+
"task_type": env.state.task.task_type,
|
| 159 |
+
"question": env.state.task.question,
|
| 160 |
+
"task_answer": str(info.get("task_answer", "")),
|
| 161 |
+
"agent_answer": str(info.get("agent_answer", "")) if info.get("agent_answer") is not None else "",
|
| 162 |
"graph_f1": graph_f1,
|
| 163 |
"reward": float(info.get("total_reward", 0.0) or 0.0),
|
| 164 |
"steps": int(info.get("step_count", 0) or 0),
|
| 165 |
"tool_calls": int(info.get("tool_calls", 0) or 0),
|
| 166 |
+
"success": int(info.get("agent_answer") == info.get("task_answer")),
|
| 167 |
"reward_components": dict(info.get("reward_components", {})),
|
| 168 |
+
"spawn_count": int(info.get("spawn_count", 0) or 0),
|
| 169 |
+
"spawn_critical_steps": int(info.get("spawn_critical_steps", 0) or 0),
|
| 170 |
+
"pred_edges": [
|
| 171 |
+
{
|
| 172 |
+
"src": edge.src,
|
| 173 |
+
"rel": edge.rel,
|
| 174 |
+
"dst": edge.dst,
|
| 175 |
+
"confidence": float(edge.confidence),
|
| 176 |
+
}
|
| 177 |
+
for edge in env.memory_graph.edges
|
| 178 |
+
],
|
| 179 |
+
"truth_edges": [
|
| 180 |
+
{
|
| 181 |
+
"src": edge.src,
|
| 182 |
+
"rel": edge.rel,
|
| 183 |
+
"dst": edge.dst,
|
| 184 |
+
"confidence": float(edge.confidence),
|
| 185 |
+
}
|
| 186 |
+
for edge in env.state.task.supporting_edges
|
| 187 |
+
],
|
| 188 |
}
|
| 189 |
|
| 190 |
|
| 191 |
+
def _format_action_from_history(item: dict[str, Any]) -> str:
    """Render one action-history entry as a compact, call-style string.

    Args:
        item: A history entry. ``item["type"]`` selects the rendering and is
            compared case-insensitively; ``item["payload"]`` carries the
            fields that are rendered.

    Returns:
        ``answer(<text>)`` for ``ANSWER`` entries,
        ``add_edge(<src>,<rel>,<dst>,<confidence>)`` for ``ADD_EDGE`` entries,
        and ``<tool_name>(<key>=<value>,...)`` for every other entry type.
    """
    action_type = str(item.get("type", "")).upper()

    # The ``{}`` default of ``dict.get`` only applies when the key is absent.
    # An entry recorded with ``"payload": None`` (or any other non-mapping
    # value) must still format rather than raise, so treat it as empty.
    raw_payload = item.get("payload")
    try:
        payload = dict(raw_payload) if raw_payload is not None else {}
    except (TypeError, ValueError):
        payload = {}

    if action_type == "ANSWER":
        return f"answer({str(payload.get('answer', 'unknown')).strip()})"

    if action_type == "ADD_EDGE":
        # Confidence may be missing or non-numeric; fall back to 1.0.
        try:
            conf = float(payload.get("confidence", 1.0))
        except (TypeError, ValueError):
            conf = 1.0
        return (
            "add_edge("
            f"{payload.get('src', '')},"
            f"{payload.get('rel', '')},"
            f"{payload.get('dst', '')},"
            f"{conf:.2f}"
            ")"
        )

    # Everything else is rendered as a tool call.
    tool_name = str(payload.get("tool_name", "tool")).strip() or "tool"
    args = payload.get("args", {})
    if not isinstance(args, dict) or not args:
        return f"{tool_name}()"
    # Sort by key so the rendered argument order is deterministic.
    args_text = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
    return f"{tool_name}({args_text})"
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _task_targets(env: OSINTEnvironment, episodes: int, task_indices: list[int]) -> list[int | None]:
    """Return one task selector per episode that should be run.

    Args:
        env: Environment whose ``tasks`` collection bounds explicit indices.
        episodes: Number of episodes to run when no explicit indices are given.
        task_indices: Explicit task indices; may be empty.

    Returns:
        When ``task_indices`` is non-empty, each index reduced modulo the
        number of tasks (the modulus is never below 1). Otherwise a list of
        ``None`` placeholders with ``max(1, episodes)`` entries.
    """
    if not task_indices:
        # No explicit selection: always schedule at least one episode.
        return [None] * max(1, episodes)

    # Wrap out-of-range indices; the floor of 1 avoids a modulo-by-zero.
    modulus = max(1, len(env.tasks))
    return [requested % modulus for requested in task_indices]
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _run_with_runner(
    env: OSINTEnvironment,
    runner: SingleAgentRunner | SwarmAgentRunner,
    episodes: int,
    task_indices: list[int],
) -> tuple[dict[str, Any], list[dict[str, Any]], list[float], int]:
    """Run episodes through ``runner`` and collect metrics, rows and rewards.

    Args:
        env: Environment the runner operates on; its state, memory graph and
            action history are read after each episode.
        runner: Runner whose ``run_episode()`` executes one episode.
        episodes: Episode count, used when ``task_indices`` is empty.
        task_indices: Explicit task indices to run; may be empty.

    Returns:
        A tuple ``(summary, episode_rows, rewards, steps_taken)``: the metrics
        summary, one row per completed episode, the per-step rewards in
        order, and the total number of logged steps across all episodes.
    """
    metrics = EvalMetrics()
    rows: list[dict[str, Any]] = []
    step_rewards: list[float] = []
    total_steps = 0

    for target in _task_targets(env, episodes, task_indices):
        # Keep compatibility with explicit task selection from the previous inference script.
        if target is not None:
            env._task_idx = target

        info = runner.run_episode()
        if env.state is None:
            # Nothing to report for this episode.
            continue

        # Replay the recorded actions as step log lines; the last entry of
        # the episode is the one flagged as done.
        recorded = list(env.state.action_history)
        final_position = len(recorded)
        for position, entry in enumerate(recorded, start=1):
            step_reward = float(entry.get("reward", 0.0) or 0.0)
            step_rewards.append(step_reward)
            total_steps += 1
            log_step(
                step=total_steps,
                action=_format_action_from_history(entry),
                reward=step_reward,
                done=position == final_position,
                error=None,
            )

        task = env.state.task
        episode_f1 = compute_graph_f1(env.memory_graph.edges, task.supporting_edges)
        metrics.add(info, task_type=task.task_type, graph_f1=episode_f1)
        rows.append(_episode_row(env, info))

    return metrics.summary(), rows, step_rewards, total_steps
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _maybe_write_artifacts(
    env: OSINTEnvironment,
    summary: dict[str, Any],
    episodes: int,
    episode_rows: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, str | None]:
    """Append a leaderboard record and export the dashboard, if enabled.

    Args:
        env: Environment whose configuration is recorded with the run.
        summary: Metrics summary for the run.
        episodes: Number of episodes the run covered.
        episode_rows: Per-episode rows passed to the dashboard export.

    Returns:
        ``(None, None)`` when ``WRITE_BENCHMARK_ARTIFACTS`` is falsy;
        otherwise the values returned by ``append_leaderboard_record`` and
        ``export_dashboard``.
    """
    if not WRITE_BENCHMARK_ARTIFACTS:
        return None, None

    # Snapshot of the settings stored alongside the leaderboard record.
    cfg = env.config
    swarm_cfg = cfg.swarm
    run_config = {
        "seed": cfg.seed,
        "max_steps": cfg.max_steps,
        "swarm_enabled": swarm_cfg.enabled,
        "max_agents": swarm_cfg.max_agents,
        "max_breadth": swarm_cfg.max_breadth,
        "max_width": swarm_cfg.max_width,
        "max_depth": swarm_cfg.max_depth,
        "seeded_questions": len(cfg.seeding.seeded_questions),
        "llm_provider": cfg.llm.provider,
        "llm_model": cfg.llm.model,
    }
    record = append_leaderboard_record(
        path=LEADERBOARD_PATH,
        summary=summary,
        episodes=episodes,
        run_name=RUN_NAME or None,
        config=run_config,
    )

    # Load the leaderboard only after appending, then hand it to the export.
    records = load_leaderboard(LEADERBOARD_PATH)
    evaluation = {"summary": summary, "episodes": episode_rows}
    dashboard = export_dashboard(
        env=env,
        evaluation=evaluation,
        leaderboard_records=records,
        output_path=DASHBOARD_PATH,
    )
    return record, dashboard
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def main() -> None:
    """Run the configured episodes, log the outcome and print a JSON report.

    Builds the environment and runner from the resolved configuration, runs
    either the explicitly selected tasks or at least one episode, emits the
    start/end log lines, optionally writes benchmark artifacts, and prints
    the summary and per-episode rows as JSON on stdout.
    """
    config = _resolve_environment_config()
    client = build_llm_client(config.llm)
    env = OSINTEnvironment(config, llm=client)
    runner = _runner_for(env, client)

    log_start(task=TASK_NAME, env=BENCHMARK, model=config.llm.model)

    # Explicit task indices take precedence over the episode count.
    if TASK_INDICES:
        episodes = len(TASK_INDICES)
    else:
        episodes = max(1, EPISODES)

    summary, episode_rows, rewards, steps_taken = _run_with_runner(
        env=env,
        runner=runner,
        episodes=episodes,
        task_indices=TASK_INDICES,
    )

    score = float(summary.get("task_success_rate", 0.0) or 0.0)
    log_end(
        success=score >= SUCCESS_SCORE_THRESHOLD,
        steps=steps_taken,
        score=score,
        rewards=rewards,
    )

    record, dashboard = _maybe_write_artifacts(
        env=env,
        summary=summary,
        episodes=episodes,
        episode_rows=episode_rows,
    )

    # Artifact entries are included only when they were actually produced.
    report: dict[str, Any] = {"summary": summary, "episodes": episode_rows}
    for key, value in (("record", record), ("dashboard", dashboard)):
        if value is not None:
            report[key] = value

    print(json.dumps(report, indent=2, sort_keys=True))
|
| 343 |
|
| 344 |
|
| 345 |
if __name__ == "__main__":
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv>=0.1.13
|
| 2 |
+
openai>=1.40.0
|
| 3 |
+
fastapi>=0.115.0
|
| 4 |
+
requests>=2.32.3
|
| 5 |
+
uvicorn>=0.30.0
|
| 6 |
+
pytest>=8.0.0
|