Siddeshwar1625 committed on
Commit
49ed720
·
1 Parent(s): 7f9b770

fixed error

Browse files
datasets/fixed_levels/leaderboard_fixed_levels.json CHANGED
@@ -537,5 +537,263 @@
537
  },
538
  "run_id": "run_0013",
539
  "run_name": "fixed_levels_qwen_swarm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  }
541
  ]
 
537
  },
538
  "run_id": "run_0013",
539
  "run_name": "fixed_levels_qwen_swarm"
540
+ },
541
+ {
542
+ "config": {
543
+ "llm_model": "gpt-5.4-mini",
544
+ "llm_provider": "openai",
545
+ "max_agents": 3,
546
+ "max_breadth": 2,
547
+ "max_depth": 2,
548
+ "max_steps": 24,
549
+ "max_width": 2,
550
+ "seed": 2026,
551
+ "seeded_questions": 30,
552
+ "swarm_enabled": true
553
+ },
554
+ "created_at": "2026-04-07T09:44:40+00:00",
555
+ "episodes": 1,
556
+ "metrics": {
557
+ "avg_compactness_reward": 0.0,
558
+ "avg_connectivity_gain_reward": 0.2,
559
+ "avg_connectivity_reward": -0.15,
560
+ "avg_diversity_reward": 0.12666666666666665,
561
+ "avg_entity_informativeness_reward": -0.018704290877944903,
562
+ "avg_format_reward": 0.15,
563
+ "avg_graph_f1": 0.5714285714285715,
564
+ "avg_knowledge_carrier_reward": 0.5,
565
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
566
+ "avg_relation_informativeness_reward": 0.08056039127695382,
567
+ "avg_reward": 0.7135379106634446,
568
+ "avg_soft_shaping_reward": 0.3,
569
+ "avg_spawn_count": 4.0,
570
+ "avg_spawn_critical_steps": 6.0,
571
+ "avg_steps_to_solution": 9.0,
572
+ "deanonymization_accuracy": 0.0,
573
+ "leaderboard_score": 0.6633913226563717,
574
+ "retrieval_signal": 0.7179545454545455,
575
+ "spawn_completion_rate": 1.0,
576
+ "spawn_signal": 0.6666666666666666,
577
+ "structural_signal": 0.5138712200798018,
578
+ "task_success_rate": 1.0,
579
+ "tool_efficiency": 0.5
580
+ },
581
+ "run_id": "run_0014",
582
+ "run_name": "fixed_levels_qwen_swarm"
583
+ },
584
+ {
585
+ "config": {
586
+ "llm_model": "gpt-5.4-mini",
587
+ "llm_provider": "openai",
588
+ "max_agents": 3,
589
+ "max_breadth": 2,
590
+ "max_depth": 2,
591
+ "max_steps": 24,
592
+ "max_width": 2,
593
+ "seed": 2026,
594
+ "seeded_questions": 30,
595
+ "swarm_enabled": true
596
+ },
597
+ "created_at": "2026-04-07T09:55:19+00:00",
598
+ "episodes": 1,
599
+ "metrics": {
600
+ "avg_compactness_reward": 0.0,
601
+ "avg_connectivity_gain_reward": 0.2,
602
+ "avg_connectivity_reward": -0.15,
603
+ "avg_diversity_reward": 0.12666666666666665,
604
+ "avg_entity_informativeness_reward": -0.018704290877944903,
605
+ "avg_format_reward": 0.15,
606
+ "avg_graph_f1": 0.5714285714285715,
607
+ "avg_knowledge_carrier_reward": 0.5,
608
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
609
+ "avg_relation_informativeness_reward": 0.08056039127695382,
610
+ "avg_reward": 0.7135379106634446,
611
+ "avg_soft_shaping_reward": 0.3,
612
+ "avg_spawn_count": 4.0,
613
+ "avg_spawn_critical_steps": 6.0,
614
+ "avg_steps_to_solution": 9.0,
615
+ "deanonymization_accuracy": 0.0,
616
+ "leaderboard_score": 0.6633913226563717,
617
+ "retrieval_signal": 0.7179545454545455,
618
+ "spawn_completion_rate": 1.0,
619
+ "spawn_signal": 0.6666666666666666,
620
+ "structural_signal": 0.5138712200798018,
621
+ "task_success_rate": 1.0,
622
+ "tool_efficiency": 0.5
623
+ },
624
+ "run_id": "run_0015",
625
+ "run_name": "fixed_levels_qwen_swarm"
626
+ },
627
+ {
628
+ "config": {
629
+ "llm_model": "gpt-5.4-mini",
630
+ "llm_provider": "openai",
631
+ "max_agents": 3,
632
+ "max_breadth": 2,
633
+ "max_depth": 2,
634
+ "max_steps": 24,
635
+ "max_width": 2,
636
+ "seed": 2026,
637
+ "seeded_questions": 30,
638
+ "swarm_enabled": true
639
+ },
640
+ "created_at": "2026-04-07T09:56:28+00:00",
641
+ "episodes": 30,
642
+ "metrics": {
643
+ "avg_compactness_reward": 0.0,
644
+ "avg_connectivity_gain_reward": 0.2000000000000001,
645
+ "avg_connectivity_reward": 0.12999999999999998,
646
+ "avg_diversity_reward": 0.12433333333333325,
647
+ "avg_entity_informativeness_reward": -0.02515191749984708,
648
+ "avg_format_reward": 0.15,
649
+ "avg_graph_f1": 0.2916528337385394,
650
+ "avg_knowledge_carrier_reward": 0.5,
651
+ "avg_knowledge_indexing_reward": 0.11539120363588044,
652
+ "avg_relation_informativeness_reward": 0.0769903534735767,
653
+ "avg_reward": 0.7150555461096118,
654
+ "avg_soft_shaping_reward": 0.3,
655
+ "avg_spawn_count": 4.0,
656
+ "avg_spawn_critical_steps": 6.0,
657
+ "avg_steps_to_solution": 9.0,
658
+ "deanonymization_accuracy": 0.0,
659
+ "leaderboard_score": 0.6132407715455404,
660
+ "retrieval_signal": 0.7153869212725582,
661
+ "spawn_completion_rate": 1.0,
662
+ "spawn_signal": 0.6666666666666666,
663
+ "structural_signal": 0.5815176871947458,
664
+ "task_success_rate": 1.0,
665
+ "tool_efficiency": 0.5
666
+ },
667
+ "run_id": "run_0016",
668
+ "run_name": "fixed_levels_qwen_swarm"
669
+ },
670
+ {
671
+ "config": {
672
+ "llm_model": "gpt-5.4-mini",
673
+ "llm_provider": "openai",
674
+ "max_agents": 3,
675
+ "max_breadth": 2,
676
+ "max_depth": 2,
677
+ "max_steps": 24,
678
+ "max_width": 2,
679
+ "seed": 2026,
680
+ "seeded_questions": 30,
681
+ "swarm_enabled": true
682
+ },
683
+ "created_at": "2026-04-07T10:02:32+00:00",
684
+ "episodes": 1,
685
+ "metrics": {
686
+ "avg_compactness_reward": 0.0,
687
+ "avg_connectivity_gain_reward": 0.2,
688
+ "avg_connectivity_reward": -0.15,
689
+ "avg_diversity_reward": 0.12666666666666665,
690
+ "avg_entity_informativeness_reward": -0.018704290877944903,
691
+ "avg_format_reward": 0.15,
692
+ "avg_graph_f1": 0.5714285714285715,
693
+ "avg_knowledge_carrier_reward": 0.5,
694
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
695
+ "avg_relation_informativeness_reward": 0.08056039127695382,
696
+ "avg_reward": 0.7135379106634446,
697
+ "avg_soft_shaping_reward": 0.3,
698
+ "avg_spawn_count": 4.0,
699
+ "avg_spawn_critical_steps": 6.0,
700
+ "avg_steps_to_solution": 9.0,
701
+ "deanonymization_accuracy": 0.0,
702
+ "leaderboard_score": 0.6633913226563717,
703
+ "retrieval_signal": 0.7179545454545455,
704
+ "spawn_completion_rate": 1.0,
705
+ "spawn_signal": 0.6666666666666666,
706
+ "structural_signal": 0.5138712200798018,
707
+ "task_success_rate": 1.0,
708
+ "tool_efficiency": 0.5
709
+ },
710
+ "run_id": "run_0017",
711
+ "run_name": "fixed_levels_qwen_swarm"
712
+ },
713
+ {
714
+ "config": {
715
+ "llm_model": "gpt-5.4-mini",
716
+ "llm_provider": "openai",
717
+ "max_agents": 3,
718
+ "max_breadth": 2,
719
+ "max_depth": 2,
720
+ "max_steps": 24,
721
+ "max_width": 2,
722
+ "seed": 2026,
723
+ "seeded_questions": 30,
724
+ "swarm_enabled": true
725
+ },
726
+ "created_at": "2026-04-07T10:02:49+00:00",
727
+ "episodes": 3,
728
+ "metrics": {
729
+ "avg_compactness_reward": 0.0,
730
+ "avg_connectivity_gain_reward": 0.20000000000000004,
731
+ "avg_connectivity_reward": -0.06666666666666667,
732
+ "avg_diversity_reward": 0.13444444444444445,
733
+ "avg_entity_informativeness_reward": -0.029992009599206938,
734
+ "avg_format_reward": 0.15,
735
+ "avg_graph_f1": 0.5793650793650794,
736
+ "avg_knowledge_carrier_reward": 0.5,
737
+ "avg_knowledge_indexing_reward": 0.10372960372960373,
738
+ "avg_relation_informativeness_reward": 0.06898843512226,
739
+ "avg_reward": 0.7133699465240085,
740
+ "avg_soft_shaping_reward": 0.3,
741
+ "avg_spawn_count": 4.0,
742
+ "avg_spawn_critical_steps": 6.0,
743
+ "avg_steps_to_solution": 9.0,
744
+ "deanonymization_accuracy": 0.0,
745
+ "leaderboard_score": 0.6656078661080486,
746
+ "retrieval_signal": 0.7113053613053614,
747
+ "spawn_completion_rate": 1.0,
748
+ "spawn_signal": 0.6666666666666666,
749
+ "structural_signal": 0.5312992851046106,
750
+ "task_success_rate": 1.0,
751
+ "tool_efficiency": 0.5
752
+ },
753
+ "run_id": "run_0018",
754
+ "run_name": "fixed_levels_qwen_swarm"
755
+ },
756
+ {
757
+ "config": {
758
+ "llm_model": "gpt-5.4-mini",
759
+ "llm_provider": "openai",
760
+ "max_agents": 3,
761
+ "max_breadth": 2,
762
+ "max_depth": 2,
763
+ "max_steps": 24,
764
+ "max_width": 2,
765
+ "seed": 2026,
766
+ "seeded_questions": 30,
767
+ "swarm_enabled": true
768
+ },
769
+ "created_at": "2026-04-07T10:04:53+00:00",
770
+ "episodes": 3,
771
+ "metrics": {
772
+ "avg_compactness_reward": 0.0,
773
+ "avg_connectivity_gain_reward": 0.20000000000000004,
774
+ "avg_connectivity_reward": -0.06666666666666667,
775
+ "avg_diversity_reward": 0.13444444444444445,
776
+ "avg_entity_informativeness_reward": -0.029992009599206938,
777
+ "avg_format_reward": 0.15,
778
+ "avg_graph_f1": 0.5793650793650794,
779
+ "avg_knowledge_carrier_reward": 0.5,
780
+ "avg_knowledge_indexing_reward": 0.10372960372960373,
781
+ "avg_relation_informativeness_reward": 0.06898843512226,
782
+ "avg_reward": 0.7133699465240085,
783
+ "avg_soft_shaping_reward": 0.3,
784
+ "avg_spawn_count": 4.0,
785
+ "avg_spawn_critical_steps": 6.0,
786
+ "avg_steps_to_solution": 9.0,
787
+ "deanonymization_accuracy": 0.0,
788
+ "leaderboard_score": 0.6656078661080486,
789
+ "retrieval_signal": 0.7113053613053614,
790
+ "spawn_completion_rate": 1.0,
791
+ "spawn_signal": 0.6666666666666666,
792
+ "structural_signal": 0.5312992851046106,
793
+ "task_success_rate": 1.0,
794
+ "tool_efficiency": 0.5
795
+ },
796
+ "run_id": "run_0019",
797
+ "run_name": "fixed_levels_qwen_swarm"
798
  }
799
  ]
inference.py CHANGED
@@ -20,12 +20,16 @@ from osint_env.viz import export_dashboard
20
  CONFIG_PATH = os.getenv("CONFIG_PATH", "datasets/fixed_levels/shared_config_fixed_levels.json")
21
  SEED_FILE = os.getenv("SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json")
22
  AGENT_MODE = os.getenv("AGENT_MODE", "swarm")
23
- LLM_PROVIDER = os.getenv("LLM_PROVIDER", "ollama")
24
- MODEL_NAME = os.getenv("MODEL_NAME", "qwen3:1.7b")
25
  OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "")
26
  OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "")
27
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
28
  OPENAI_API_KEY_ENV = os.getenv("OPENAI_API_KEY_ENV", "OPENAI_API_KEY")
 
 
 
 
29
  LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "0"))
30
  EPISODES = int(os.getenv("EPISODES", "1"))
31
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))
@@ -66,6 +70,15 @@ def _normalize_ollama_base_url(url: str) -> str:
66
  return normalized or "http://127.0.0.1:11434"
67
 
68
 
 
 
 
 
 
 
 
 
 
69
  TASK_INDICES = _parse_task_indices(TASK_INDICES_RAW)
70
 
71
 
@@ -180,15 +193,20 @@ def _resolve_environment_config() -> EnvironmentConfig:
180
  if LLM_TIMEOUT_SECONDS > 0:
181
  env_cfg.llm.timeout_seconds = int(LLM_TIMEOUT_SECONDS)
182
 
183
- api_base_override = os.getenv("API_BASE_URL", "")
184
- if api_base_override.strip() or OLLAMA_BASE_URL.strip():
185
- env_cfg.llm.ollama_base_url = _normalize_ollama_base_url(api_base_override or OLLAMA_BASE_URL)
186
-
187
- if OPENAI_BASE_URL.strip():
188
- env_cfg.llm.openai_base_url = OPENAI_BASE_URL.strip()
189
-
190
- if OPENAI_API_KEY.strip():
191
- env_cfg.llm.openai_api_key = OPENAI_API_KEY.strip()
 
 
 
 
 
192
 
193
  if OPENAI_API_KEY_ENV.strip():
194
  env_cfg.llm.openai_api_key_env = OPENAI_API_KEY_ENV.strip()
@@ -387,7 +405,7 @@ def main() -> None:
387
  task_indices=TASK_INDICES,
388
  )
389
 
390
- score = float(summary.get("task_success_rate", 0.0) or 0.0)
391
  success = score >= SUCCESS_SCORE_THRESHOLD
392
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
393
 
 
20
  CONFIG_PATH = os.getenv("CONFIG_PATH", "datasets/fixed_levels/shared_config_fixed_levels.json")
21
  SEED_FILE = os.getenv("SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json")
22
  AGENT_MODE = os.getenv("AGENT_MODE", "swarm")
23
+ LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
24
+ MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.4-mini")
25
  OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "")
26
  OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "")
27
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
28
  OPENAI_API_KEY_ENV = os.getenv("OPENAI_API_KEY_ENV", "OPENAI_API_KEY")
29
+ API_BASE_URL = os.getenv("API_BASE_URL", "")
30
+ API_KEY = os.getenv("API_KEY", "")
31
+ HF_SPACE_URL = os.getenv("HF_SPACE_URL", "")
32
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
33
  LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "0"))
34
  EPISODES = int(os.getenv("EPISODES", "1"))
35
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))
 
70
  return normalized or "http://127.0.0.1:11434"
71
 
72
 
73
+ def _normalize_openai_base_url(url: str) -> str:
74
+ normalized = str(url or "").strip().rstrip("/")
75
+ if not normalized:
76
+ return ""
77
+ if normalized.endswith("/v1"):
78
+ return normalized
79
+ return f"{normalized}/v1"
80
+
81
+
82
  TASK_INDICES = _parse_task_indices(TASK_INDICES_RAW)
83
 
84
 
 
193
  if LLM_TIMEOUT_SECONDS > 0:
194
  env_cfg.llm.timeout_seconds = int(LLM_TIMEOUT_SECONDS)
195
 
196
+ if provider == "openai":
197
+ # Evaluation harnesses often inject API_BASE_URL/API_KEY for proxy enforcement.
198
+ resolved_openai_base = API_BASE_URL.strip() or OPENAI_BASE_URL.strip() or HF_SPACE_URL.strip()
199
+ if resolved_openai_base:
200
+ env_cfg.llm.openai_base_url = _normalize_openai_base_url(resolved_openai_base)
201
+
202
+ if API_KEY.strip():
203
+ env_cfg.llm.openai_api_key = API_KEY.strip()
204
+ elif OPENAI_API_KEY.strip():
205
+ env_cfg.llm.openai_api_key = OPENAI_API_KEY.strip()
206
+ elif HF_TOKEN.strip():
207
+ env_cfg.llm.openai_api_key = HF_TOKEN.strip()
208
+ elif API_BASE_URL.strip() or OLLAMA_BASE_URL.strip():
209
+ env_cfg.llm.ollama_base_url = _normalize_ollama_base_url(API_BASE_URL or OLLAMA_BASE_URL)
210
 
211
  if OPENAI_API_KEY_ENV.strip():
212
  env_cfg.llm.openai_api_key_env = OPENAI_API_KEY_ENV.strip()
 
405
  task_indices=TASK_INDICES,
406
  )
407
 
408
+ score = float(summary.get("avg_reward", 0.0) or 0.0)
409
  success = score >= SUCCESS_SCORE_THRESHOLD
410
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
411