Spaces:
Sleeping
Sleeping
Commit ·
805fc08
1
Parent(s): 65c0dda
Updated reward to range [0,1]
Browse files
datasets/fixed_levels/leaderboard_fixed_levels.json
CHANGED
|
@@ -414,5 +414,128 @@
|
|
| 414 |
},
|
| 415 |
"run_id": "run_0010",
|
| 416 |
"run_name": "fixed_levels_qwen_swarm"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
}
|
| 418 |
]
|
|
|
|
| 414 |
},
|
| 415 |
"run_id": "run_0010",
|
| 416 |
"run_name": "fixed_levels_qwen_swarm"
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"config": {
|
| 420 |
+
"max_agents": 3,
|
| 421 |
+
"max_breadth": 2,
|
| 422 |
+
"max_depth": 2,
|
| 423 |
+
"max_steps": 24,
|
| 424 |
+
"max_width": 2,
|
| 425 |
+
"seed": 2026,
|
| 426 |
+
"seeded_questions": 30,
|
| 427 |
+
"swarm_enabled": true
|
| 428 |
+
},
|
| 429 |
+
"created_at": "2026-04-06T20:46:11+00:00",
|
| 430 |
+
"episodes": 1,
|
| 431 |
+
"metrics": {
|
| 432 |
+
"avg_compactness_reward": 0.0,
|
| 433 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 434 |
+
"avg_connectivity_reward": -0.15,
|
| 435 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 436 |
+
"avg_entity_informativeness_reward": 0.019629386278697845,
|
| 437 |
+
"avg_format_reward": 0.15,
|
| 438 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 439 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 440 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 441 |
+
"avg_relation_informativeness_reward": 0.08347928023822283,
|
| 442 |
+
"avg_reward": 1.829702015111513,
|
| 443 |
+
"avg_soft_shaping_reward": 0.3,
|
| 444 |
+
"avg_spawn_count": 4.0,
|
| 445 |
+
"avg_spawn_critical_steps": 6.0,
|
| 446 |
+
"avg_steps_to_solution": 9.0,
|
| 447 |
+
"deanonymization_accuracy": 0.0,
|
| 448 |
+
"leaderboard_score": 0.6715432845394145,
|
| 449 |
+
"retrieval_signal": 0.7179545454545455,
|
| 450 |
+
"spawn_completion_rate": 1.0,
|
| 451 |
+
"spawn_signal": 0.6666666666666666,
|
| 452 |
+
"structural_signal": 0.5221217333033842,
|
| 453 |
+
"task_success_rate": 1.0,
|
| 454 |
+
"tool_efficiency": 0.5
|
| 455 |
+
},
|
| 456 |
+
"run_id": "run_0011",
|
| 457 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"config": {
|
| 461 |
+
"max_agents": 3,
|
| 462 |
+
"max_breadth": 2,
|
| 463 |
+
"max_depth": 2,
|
| 464 |
+
"max_steps": 24,
|
| 465 |
+
"max_width": 2,
|
| 466 |
+
"seed": 2026,
|
| 467 |
+
"seeded_questions": 30,
|
| 468 |
+
"swarm_enabled": true
|
| 469 |
+
},
|
| 470 |
+
"created_at": "2026-04-06T20:49:44+00:00",
|
| 471 |
+
"episodes": 1,
|
| 472 |
+
"metrics": {
|
| 473 |
+
"avg_compactness_reward": 0.0,
|
| 474 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 475 |
+
"avg_connectivity_reward": -0.15,
|
| 476 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 477 |
+
"avg_entity_informativeness_reward": 0.019629386278697845,
|
| 478 |
+
"avg_format_reward": 0.15,
|
| 479 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 480 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 481 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 482 |
+
"avg_relation_informativeness_reward": 0.08335372627068136,
|
| 483 |
+
"avg_reward": 0.7139904233885594,
|
| 484 |
+
"avg_soft_shaping_reward": 0.3,
|
| 485 |
+
"avg_spawn_count": 4.0,
|
| 486 |
+
"avg_spawn_critical_steps": 6.0,
|
| 487 |
+
"avg_steps_to_solution": 9.0,
|
| 488 |
+
"deanonymization_accuracy": 0.0,
|
| 489 |
+
"leaderboard_score": 0.6641542345113342,
|
| 490 |
+
"retrieval_signal": 0.7179545454545455,
|
| 491 |
+
"spawn_completion_rate": 1.0,
|
| 492 |
+
"spawn_signal": 0.6666666666666666,
|
| 493 |
+
"structural_signal": 0.5220966225098759,
|
| 494 |
+
"task_success_rate": 1.0,
|
| 495 |
+
"tool_efficiency": 0.5
|
| 496 |
+
},
|
| 497 |
+
"run_id": "run_0012",
|
| 498 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"config": {
|
| 502 |
+
"max_agents": 3,
|
| 503 |
+
"max_breadth": 2,
|
| 504 |
+
"max_depth": 2,
|
| 505 |
+
"max_steps": 24,
|
| 506 |
+
"max_width": 2,
|
| 507 |
+
"seed": 2026,
|
| 508 |
+
"seeded_questions": 30,
|
| 509 |
+
"swarm_enabled": true
|
| 510 |
+
},
|
| 511 |
+
"created_at": "2026-04-06T20:59:43+00:00",
|
| 512 |
+
"episodes": 1,
|
| 513 |
+
"metrics": {
|
| 514 |
+
"avg_compactness_reward": 0.0,
|
| 515 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 516 |
+
"avg_connectivity_reward": -0.15,
|
| 517 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 518 |
+
"avg_entity_informativeness_reward": 0.0036675120354726642,
|
| 519 |
+
"avg_format_reward": 0.15,
|
| 520 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 521 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 522 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 523 |
+
"avg_relation_informativeness_reward": 0.08250745620050208,
|
| 524 |
+
"avg_reward": 0.7138056720677886,
|
| 525 |
+
"avg_soft_shaping_reward": 0.3,
|
| 526 |
+
"avg_spawn_count": 4.0,
|
| 527 |
+
"avg_spawn_critical_steps": 6.0,
|
| 528 |
+
"avg_steps_to_solution": 9.0,
|
| 529 |
+
"deanonymization_accuracy": 0.0,
|
| 530 |
+
"leaderboard_score": 0.6638424503476543,
|
| 531 |
+
"retrieval_signal": 0.7179545454545455,
|
| 532 |
+
"spawn_completion_rate": 1.0,
|
| 533 |
+
"spawn_signal": 0.6666666666666666,
|
| 534 |
+
"structural_signal": 0.518734993647195,
|
| 535 |
+
"task_success_rate": 1.0,
|
| 536 |
+
"tool_efficiency": 0.5
|
| 537 |
+
},
|
| 538 |
+
"run_id": "run_0013",
|
| 539 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 540 |
}
|
| 541 |
]
|
src/osint_env/env/reward.py
CHANGED
|
@@ -170,6 +170,15 @@ def _connectivity_gain(edge: Edge, existing_edges: list[Edge]) -> float:
|
|
| 170 |
return 0.10
|
| 171 |
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
def compute_edge_reward(
|
| 174 |
edge: Edge,
|
| 175 |
task: TaskInstance,
|
|
@@ -207,7 +216,7 @@ def compute_edge_reward(
|
|
| 207 |
# Additional structural utility shaping for KG construction.
|
| 208 |
connectivity_gain = _connectivity_gain(edge, existing_edges)
|
| 209 |
|
| 210 |
-
|
| 211 |
global_accuracy
|
| 212 |
+ soft_shaping
|
| 213 |
+ efficiency
|
|
@@ -216,6 +225,7 @@ def compute_edge_reward(
|
|
| 216 |
+ entity_informativeness
|
| 217 |
+ connectivity_gain
|
| 218 |
)
|
|
|
|
| 219 |
return EdgeRewardBreakdown(
|
| 220 |
total=total,
|
| 221 |
global_accuracy=global_accuracy,
|
|
@@ -383,7 +393,7 @@ def compute_answer_reward(
|
|
| 383 |
# AutoGraph-R1 repetition control variant used in larger models.
|
| 384 |
repetition_penalty = -0.10 * _relation_repetition_ratio(pred_edges)
|
| 385 |
|
| 386 |
-
|
| 387 |
format_reward
|
| 388 |
+ correctness
|
| 389 |
+ knowledge_carrier
|
|
@@ -396,6 +406,7 @@ def compute_answer_reward(
|
|
| 396 |
+ entity_informativeness
|
| 397 |
+ repetition_penalty
|
| 398 |
)
|
|
|
|
| 399 |
return AnswerRewardBreakdown(
|
| 400 |
total=total,
|
| 401 |
format_reward=format_reward,
|
|
|
|
| 170 |
return 0.10
|
| 171 |
|
| 172 |
|
| 173 |
+
def _sigmoid_temperature(value: float, temperature: float = 2.0) -> float:
|
| 174 |
+
scaled = float(value) / max(1e-6, float(temperature))
|
| 175 |
+
if scaled >= 0:
|
| 176 |
+
z = math.exp(-scaled)
|
| 177 |
+
return 1.0 / (1.0 + z)
|
| 178 |
+
z = math.exp(scaled)
|
| 179 |
+
return z / (1.0 + z)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
def compute_edge_reward(
|
| 183 |
edge: Edge,
|
| 184 |
task: TaskInstance,
|
|
|
|
| 216 |
# Additional structural utility shaping for KG construction.
|
| 217 |
connectivity_gain = _connectivity_gain(edge, existing_edges)
|
| 218 |
|
| 219 |
+
raw_total = (
|
| 220 |
global_accuracy
|
| 221 |
+ soft_shaping
|
| 222 |
+ efficiency
|
|
|
|
| 225 |
+ entity_informativeness
|
| 226 |
+ connectivity_gain
|
| 227 |
)
|
| 228 |
+
total = _sigmoid_temperature(raw_total, temperature=2.0)
|
| 229 |
return EdgeRewardBreakdown(
|
| 230 |
total=total,
|
| 231 |
global_accuracy=global_accuracy,
|
|
|
|
| 393 |
# AutoGraph-R1 repetition control variant used in larger models.
|
| 394 |
repetition_penalty = -0.10 * _relation_repetition_ratio(pred_edges)
|
| 395 |
|
| 396 |
+
raw_total = (
|
| 397 |
format_reward
|
| 398 |
+ correctness
|
| 399 |
+ knowledge_carrier
|
|
|
|
| 406 |
+ entity_informativeness
|
| 407 |
+ repetition_penalty
|
| 408 |
)
|
| 409 |
+
total = _sigmoid_temperature(raw_total, temperature=2.0)
|
| 410 |
return AnswerRewardBreakdown(
|
| 411 |
total=total,
|
| 412 |
format_reward=format_reward,
|
src/osint_env/eval/metrics.py
CHANGED
|
@@ -29,6 +29,15 @@ class EvalMetrics:
|
|
| 29 |
total_spawn_finished_subtasks: int = 0
|
| 30 |
total_spawn_critical_steps: int = 0
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def add(self, info: dict, task_type: str, graph_f1: float) -> None:
|
| 33 |
self.episodes += 1
|
| 34 |
ok = info.get("agent_answer") == info.get("task_answer")
|
|
@@ -62,7 +71,8 @@ class EvalMetrics:
|
|
| 62 |
tool_efficiency = 1.0 - (self.total_redundant_tool_calls / max(1, self.total_tool_calls))
|
| 63 |
avg_graph_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
|
| 64 |
deanonymization_accuracy = self.deanonymization_success / max(1, self.deanonymization_total)
|
| 65 |
-
|
|
|
|
| 66 |
avg_knowledge_carrier = self.total_knowledge_carrier / episodes
|
| 67 |
avg_knowledge_indexing = self.total_knowledge_indexing / episodes
|
| 68 |
avg_connectivity = self.total_connectivity / episodes
|
|
@@ -78,7 +88,7 @@ class EvalMetrics:
|
|
| 78 |
spawn_latency_signal = 1.0 / max(1.0, avg_spawn_critical_steps)
|
| 79 |
spawn_signal = max(0.0, min(1.0, 0.6 * spawn_completion + 0.4 * spawn_latency_signal))
|
| 80 |
|
| 81 |
-
reward_norm =
|
| 82 |
retrieval_signal = max(0.0, min(1.0, 0.5 + 0.35 * avg_knowledge_carrier + 0.35 * avg_knowledge_indexing))
|
| 83 |
structural_signal = max(
|
| 84 |
0.0,
|
|
|
|
| 29 |
total_spawn_finished_subtasks: int = 0
|
| 30 |
total_spawn_critical_steps: int = 0
|
| 31 |
|
| 32 |
+
@staticmethod
|
| 33 |
+
def _sigmoid_temperature(value: float, temperature: float = 2.0) -> float:
|
| 34 |
+
scaled = float(value) / max(1e-6, float(temperature))
|
| 35 |
+
if scaled >= 0:
|
| 36 |
+
z = math.exp(-scaled)
|
| 37 |
+
return 1.0 / (1.0 + z)
|
| 38 |
+
z = math.exp(scaled)
|
| 39 |
+
return z / (1.0 + z)
|
| 40 |
+
|
| 41 |
def add(self, info: dict, task_type: str, graph_f1: float) -> None:
|
| 42 |
self.episodes += 1
|
| 43 |
ok = info.get("agent_answer") == info.get("task_answer")
|
|
|
|
| 71 |
tool_efficiency = 1.0 - (self.total_redundant_tool_calls / max(1, self.total_tool_calls))
|
| 72 |
avg_graph_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
|
| 73 |
deanonymization_accuracy = self.deanonymization_success / max(1, self.deanonymization_total)
|
| 74 |
+
avg_reward_raw = self.total_reward / episodes
|
| 75 |
+
avg_reward = self._sigmoid_temperature(avg_reward_raw, temperature=2.0)
|
| 76 |
avg_knowledge_carrier = self.total_knowledge_carrier / episodes
|
| 77 |
avg_knowledge_indexing = self.total_knowledge_indexing / episodes
|
| 78 |
avg_connectivity = self.total_connectivity / episodes
|
|
|
|
| 88 |
spawn_latency_signal = 1.0 / max(1.0, avg_spawn_critical_steps)
|
| 89 |
spawn_signal = max(0.0, min(1.0, 0.6 * spawn_completion + 0.4 * spawn_latency_signal))
|
| 90 |
|
| 91 |
+
reward_norm = avg_reward
|
| 92 |
retrieval_signal = max(0.0, min(1.0, 0.5 + 0.35 * avg_knowledge_carrier + 0.35 * avg_knowledge_indexing))
|
| 93 |
structural_signal = max(
|
| 94 |
0.0,
|