Siddeshwar1625 committed on
Commit
805fc08
·
1 Parent(s): 65c0dda

Updated reward to range [0,1]

Browse files
datasets/fixed_levels/leaderboard_fixed_levels.json CHANGED
@@ -414,5 +414,128 @@
414
  },
415
  "run_id": "run_0010",
416
  "run_name": "fixed_levels_qwen_swarm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  }
418
  ]
 
414
  },
415
  "run_id": "run_0010",
416
  "run_name": "fixed_levels_qwen_swarm"
417
+ },
418
+ {
419
+ "config": {
420
+ "max_agents": 3,
421
+ "max_breadth": 2,
422
+ "max_depth": 2,
423
+ "max_steps": 24,
424
+ "max_width": 2,
425
+ "seed": 2026,
426
+ "seeded_questions": 30,
427
+ "swarm_enabled": true
428
+ },
429
+ "created_at": "2026-04-06T20:46:11+00:00",
430
+ "episodes": 1,
431
+ "metrics": {
432
+ "avg_compactness_reward": 0.0,
433
+ "avg_connectivity_gain_reward": 0.2,
434
+ "avg_connectivity_reward": -0.15,
435
+ "avg_diversity_reward": 0.12666666666666665,
436
+ "avg_entity_informativeness_reward": 0.019629386278697845,
437
+ "avg_format_reward": 0.15,
438
+ "avg_graph_f1": 0.5714285714285715,
439
+ "avg_knowledge_carrier_reward": 0.5,
440
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
441
+ "avg_relation_informativeness_reward": 0.08347928023822283,
442
+ "avg_reward": 1.829702015111513,
443
+ "avg_soft_shaping_reward": 0.3,
444
+ "avg_spawn_count": 4.0,
445
+ "avg_spawn_critical_steps": 6.0,
446
+ "avg_steps_to_solution": 9.0,
447
+ "deanonymization_accuracy": 0.0,
448
+ "leaderboard_score": 0.6715432845394145,
449
+ "retrieval_signal": 0.7179545454545455,
450
+ "spawn_completion_rate": 1.0,
451
+ "spawn_signal": 0.6666666666666666,
452
+ "structural_signal": 0.5221217333033842,
453
+ "task_success_rate": 1.0,
454
+ "tool_efficiency": 0.5
455
+ },
456
+ "run_id": "run_0011",
457
+ "run_name": "fixed_levels_qwen_swarm"
458
+ },
459
+ {
460
+ "config": {
461
+ "max_agents": 3,
462
+ "max_breadth": 2,
463
+ "max_depth": 2,
464
+ "max_steps": 24,
465
+ "max_width": 2,
466
+ "seed": 2026,
467
+ "seeded_questions": 30,
468
+ "swarm_enabled": true
469
+ },
470
+ "created_at": "2026-04-06T20:49:44+00:00",
471
+ "episodes": 1,
472
+ "metrics": {
473
+ "avg_compactness_reward": 0.0,
474
+ "avg_connectivity_gain_reward": 0.2,
475
+ "avg_connectivity_reward": -0.15,
476
+ "avg_diversity_reward": 0.12666666666666665,
477
+ "avg_entity_informativeness_reward": 0.019629386278697845,
478
+ "avg_format_reward": 0.15,
479
+ "avg_graph_f1": 0.5714285714285715,
480
+ "avg_knowledge_carrier_reward": 0.5,
481
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
482
+ "avg_relation_informativeness_reward": 0.08335372627068136,
483
+ "avg_reward": 0.7139904233885594,
484
+ "avg_soft_shaping_reward": 0.3,
485
+ "avg_spawn_count": 4.0,
486
+ "avg_spawn_critical_steps": 6.0,
487
+ "avg_steps_to_solution": 9.0,
488
+ "deanonymization_accuracy": 0.0,
489
+ "leaderboard_score": 0.6641542345113342,
490
+ "retrieval_signal": 0.7179545454545455,
491
+ "spawn_completion_rate": 1.0,
492
+ "spawn_signal": 0.6666666666666666,
493
+ "structural_signal": 0.5220966225098759,
494
+ "task_success_rate": 1.0,
495
+ "tool_efficiency": 0.5
496
+ },
497
+ "run_id": "run_0012",
498
+ "run_name": "fixed_levels_qwen_swarm"
499
+ },
500
+ {
501
+ "config": {
502
+ "max_agents": 3,
503
+ "max_breadth": 2,
504
+ "max_depth": 2,
505
+ "max_steps": 24,
506
+ "max_width": 2,
507
+ "seed": 2026,
508
+ "seeded_questions": 30,
509
+ "swarm_enabled": true
510
+ },
511
+ "created_at": "2026-04-06T20:59:43+00:00",
512
+ "episodes": 1,
513
+ "metrics": {
514
+ "avg_compactness_reward": 0.0,
515
+ "avg_connectivity_gain_reward": 0.2,
516
+ "avg_connectivity_reward": -0.15,
517
+ "avg_diversity_reward": 0.12666666666666665,
518
+ "avg_entity_informativeness_reward": 0.0036675120354726642,
519
+ "avg_format_reward": 0.15,
520
+ "avg_graph_f1": 0.5714285714285715,
521
+ "avg_knowledge_carrier_reward": 0.5,
522
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
523
+ "avg_relation_informativeness_reward": 0.08250745620050208,
524
+ "avg_reward": 0.7138056720677886,
525
+ "avg_soft_shaping_reward": 0.3,
526
+ "avg_spawn_count": 4.0,
527
+ "avg_spawn_critical_steps": 6.0,
528
+ "avg_steps_to_solution": 9.0,
529
+ "deanonymization_accuracy": 0.0,
530
+ "leaderboard_score": 0.6638424503476543,
531
+ "retrieval_signal": 0.7179545454545455,
532
+ "spawn_completion_rate": 1.0,
533
+ "spawn_signal": 0.6666666666666666,
534
+ "structural_signal": 0.518734993647195,
535
+ "task_success_rate": 1.0,
536
+ "tool_efficiency": 0.5
537
+ },
538
+ "run_id": "run_0013",
539
+ "run_name": "fixed_levels_qwen_swarm"
540
  }
541
  ]
src/osint_env/env/reward.py CHANGED
@@ -170,6 +170,15 @@ def _connectivity_gain(edge: Edge, existing_edges: list[Edge]) -> float:
170
  return 0.10
171
 
172
 
 
 
 
 
 
 
 
 
 
173
  def compute_edge_reward(
174
  edge: Edge,
175
  task: TaskInstance,
@@ -207,7 +216,7 @@ def compute_edge_reward(
207
  # Additional structural utility shaping for KG construction.
208
  connectivity_gain = _connectivity_gain(edge, existing_edges)
209
 
210
- total = (
211
  global_accuracy
212
  + soft_shaping
213
  + efficiency
@@ -216,6 +225,7 @@ def compute_edge_reward(
216
  + entity_informativeness
217
  + connectivity_gain
218
  )
 
219
  return EdgeRewardBreakdown(
220
  total=total,
221
  global_accuracy=global_accuracy,
@@ -383,7 +393,7 @@ def compute_answer_reward(
383
  # AutoGraph-R1 repetition control variant used in larger models.
384
  repetition_penalty = -0.10 * _relation_repetition_ratio(pred_edges)
385
 
386
- total = (
387
  format_reward
388
  + correctness
389
  + knowledge_carrier
@@ -396,6 +406,7 @@ def compute_answer_reward(
396
  + entity_informativeness
397
  + repetition_penalty
398
  )
 
399
  return AnswerRewardBreakdown(
400
  total=total,
401
  format_reward=format_reward,
 
170
  return 0.10
171
 
172
 
173
+ def _sigmoid_temperature(value: float, temperature: float = 2.0) -> float:
174
+ scaled = float(value) / max(1e-6, float(temperature))
175
+ if scaled >= 0:
176
+ z = math.exp(-scaled)
177
+ return 1.0 / (1.0 + z)
178
+ z = math.exp(scaled)
179
+ return z / (1.0 + z)
180
+
181
+
182
  def compute_edge_reward(
183
  edge: Edge,
184
  task: TaskInstance,
 
216
  # Additional structural utility shaping for KG construction.
217
  connectivity_gain = _connectivity_gain(edge, existing_edges)
218
 
219
+ raw_total = (
220
  global_accuracy
221
  + soft_shaping
222
  + efficiency
 
225
  + entity_informativeness
226
  + connectivity_gain
227
  )
228
+ total = _sigmoid_temperature(raw_total, temperature=2.0)
229
  return EdgeRewardBreakdown(
230
  total=total,
231
  global_accuracy=global_accuracy,
 
393
  # AutoGraph-R1 repetition control variant used in larger models.
394
  repetition_penalty = -0.10 * _relation_repetition_ratio(pred_edges)
395
 
396
+ raw_total = (
397
  format_reward
398
  + correctness
399
  + knowledge_carrier
 
406
  + entity_informativeness
407
  + repetition_penalty
408
  )
409
+ total = _sigmoid_temperature(raw_total, temperature=2.0)
410
  return AnswerRewardBreakdown(
411
  total=total,
412
  format_reward=format_reward,
src/osint_env/eval/metrics.py CHANGED
@@ -29,6 +29,15 @@ class EvalMetrics:
29
  total_spawn_finished_subtasks: int = 0
30
  total_spawn_critical_steps: int = 0
31
 
 
 
 
 
 
 
 
 
 
32
  def add(self, info: dict, task_type: str, graph_f1: float) -> None:
33
  self.episodes += 1
34
  ok = info.get("agent_answer") == info.get("task_answer")
@@ -62,7 +71,8 @@ class EvalMetrics:
62
  tool_efficiency = 1.0 - (self.total_redundant_tool_calls / max(1, self.total_tool_calls))
63
  avg_graph_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
64
  deanonymization_accuracy = self.deanonymization_success / max(1, self.deanonymization_total)
65
- avg_reward = self.total_reward / episodes
 
66
  avg_knowledge_carrier = self.total_knowledge_carrier / episodes
67
  avg_knowledge_indexing = self.total_knowledge_indexing / episodes
68
  avg_connectivity = self.total_connectivity / episodes
@@ -78,7 +88,7 @@ class EvalMetrics:
78
  spawn_latency_signal = 1.0 / max(1.0, avg_spawn_critical_steps)
79
  spawn_signal = max(0.0, min(1.0, 0.6 * spawn_completion + 0.4 * spawn_latency_signal))
80
 
81
- reward_norm = 1.0 / (1.0 + math.exp(-avg_reward))
82
  retrieval_signal = max(0.0, min(1.0, 0.5 + 0.35 * avg_knowledge_carrier + 0.35 * avg_knowledge_indexing))
83
  structural_signal = max(
84
  0.0,
 
29
  total_spawn_finished_subtasks: int = 0
30
  total_spawn_critical_steps: int = 0
31
 
32
+ @staticmethod
33
+ def _sigmoid_temperature(value: float, temperature: float = 2.0) -> float:
34
+ scaled = float(value) / max(1e-6, float(temperature))
35
+ if scaled >= 0:
36
+ z = math.exp(-scaled)
37
+ return 1.0 / (1.0 + z)
38
+ z = math.exp(scaled)
39
+ return z / (1.0 + z)
40
+
41
  def add(self, info: dict, task_type: str, graph_f1: float) -> None:
42
  self.episodes += 1
43
  ok = info.get("agent_answer") == info.get("task_answer")
 
71
  tool_efficiency = 1.0 - (self.total_redundant_tool_calls / max(1, self.total_tool_calls))
72
  avg_graph_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
73
  deanonymization_accuracy = self.deanonymization_success / max(1, self.deanonymization_total)
74
+ avg_reward_raw = self.total_reward / episodes
75
+ avg_reward = self._sigmoid_temperature(avg_reward_raw, temperature=2.0)
76
  avg_knowledge_carrier = self.total_knowledge_carrier / episodes
77
  avg_knowledge_indexing = self.total_knowledge_indexing / episodes
78
  avg_connectivity = self.total_connectivity / episodes
 
88
  spawn_latency_signal = 1.0 / max(1.0, avg_spawn_critical_steps)
89
  spawn_signal = max(0.0, min(1.0, 0.6 * spawn_completion + 0.4 * spawn_latency_signal))
90
 
91
+ reward_norm = avg_reward
92
  retrieval_signal = max(0.0, min(1.0, 0.5 + 0.35 * avg_knowledge_carrier + 0.35 * avg_knowledge_indexing))
93
  structural_signal = max(
94
  0.0,