ritishshrirao commited on
Commit
8ad6382
·
1 Parent(s): 55c5f82

Add evaluation, minor updates to HF space

Browse files
.gitignore CHANGED
@@ -1,7 +1,6 @@
1
  *.pyc
2
  blueprint.txt
3
  *.egg-info
4
- artifacts/*
5
  *.html
6
  osint_dashboard.html
7
  .venv/
@@ -10,3 +9,5 @@ metaQA/
10
  .codex
11
  TODO.txt
12
  uv.lock
 
 
 
1
  *.pyc
2
  blueprint.txt
3
  *.egg-info
 
4
  *.html
5
  osint_dashboard.html
6
  .venv/
 
9
  .codex
10
  TODO.txt
11
  uv.lock
12
+ artifacts/hf_downloads/*
13
+ artifacts/local_bin/*
artifacts/latest_evaluation.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "episodes": [
3
+ {
4
+ "agent_answer": "user_bharat",
5
+ "graph_f1": 0.5714285714285715,
6
+ "pred_edges": [
7
+ {
8
+ "confidence": 1.0,
9
+ "dst": "user_ivy",
10
+ "rel": "alias_of",
11
+ "src": "alias_orchidfox"
12
+ },
13
+ {
14
+ "confidence": 1.0,
15
+ "dst": "post_midnight_manifest",
16
+ "rel": "authored_post",
17
+ "src": "alias_orchidfox"
18
+ }
19
+ ],
20
+ "question": "alias_orchidfox -> post_midnight_manifest -> loc_dockyard17 -> connected collaborator on event_project_lantern. Who is it?",
21
+ "reward": 4.375796665887621,
22
+ "reward_components": {
23
+ "compactness": 0.0,
24
+ "connectivity": -0.15,
25
+ "connectivity_gain": 0.2,
26
+ "correctness": 1.15,
27
+ "diversity": 0.12666666666666665,
28
+ "duplicate_edge_penalty": -0.3,
29
+ "efficiency": 0.08833333333333335,
30
+ "entity_informativeness": 0.0036675120354726642,
31
+ "format_reward": 0.15,
32
+ "global_accuracy": 1.7,
33
+ "graph_f1": 0.31428571428571433,
34
+ "invalid_tool_penalty": 0.0,
35
+ "knowledge_carrier": 0.5,
36
+ "knowledge_indexing": 0.12272727272727273,
37
+ "relation_informativeness": 0.08384894230149129,
38
+ "repetition_penalty": 0.0,
39
+ "soft_shaping": 0.3,
40
+ "spawn_auxiliary": 0.3280854063558518,
41
+ "spawn_breadth": 2.0,
42
+ "spawn_count": 4.0,
43
+ "spawn_critical_steps": 6.0,
44
+ "spawn_depth": 2.0,
45
+ "spawn_finished_subtasks": 4.0,
46
+ "tool_novelty": -0.30000000000000004,
47
+ "tool_relevance": 0.05818181818181818,
48
+ "total": 4.589529441349951
49
+ },
50
+ "spawn_count": 4,
51
+ "spawn_critical_steps": 6,
52
+ "steps": 9,
53
+ "success": 1,
54
+ "task_answer": "user_bharat",
55
+ "task_id": "seed_task_0",
56
+ "task_type": "fixed_trace",
57
+ "tool_calls": 4,
58
+ "truth_edges": [
59
+ {
60
+ "confidence": 1.0,
61
+ "dst": "user_ivy",
62
+ "rel": "alias_of",
63
+ "src": "alias_orchidfox"
64
+ },
65
+ {
66
+ "confidence": 1.0,
67
+ "dst": "post_midnight_manifest",
68
+ "rel": "authored_post",
69
+ "src": "alias_orchidfox"
70
+ },
71
+ {
72
+ "confidence": 1.0,
73
+ "dst": "loc_dockyard17",
74
+ "rel": "references",
75
+ "src": "post_midnight_manifest"
76
+ },
77
+ {
78
+ "confidence": 0.95,
79
+ "dst": "user_bharat",
80
+ "rel": "connected_to",
81
+ "src": "user_ivy"
82
+ },
83
+ {
84
+ "confidence": 0.9,
85
+ "dst": "event_project_lantern",
86
+ "rel": "collaborates_on",
87
+ "src": "user_bharat"
88
+ }
89
+ ]
90
+ }
91
+ ],
92
+ "summary": {
93
+ "avg_compactness_reward": 0.0,
94
+ "avg_connectivity_gain_reward": 0.2,
95
+ "avg_connectivity_reward": -0.15,
96
+ "avg_diversity_reward": 0.12666666666666665,
97
+ "avg_entity_informativeness_reward": 0.0036675120354726642,
98
+ "avg_format_reward": 0.15,
99
+ "avg_graph_f1": 0.5714285714285715,
100
+ "avg_knowledge_carrier_reward": 0.5,
101
+ "avg_knowledge_indexing_reward": 0.12272727272727273,
102
+ "avg_relation_informativeness_reward": 0.08384894230149129,
103
+ "avg_reward": 4.375796665887621,
104
+ "avg_soft_shaping_reward": 0.3,
105
+ "avg_spawn_count": 4.0,
106
+ "avg_spawn_critical_steps": 6.0,
107
+ "avg_steps_to_solution": 9.0,
108
+ "deanonymization_accuracy": 0.0,
109
+ "leaderboard_score": 0.6775552197990489,
110
+ "retrieval_signal": 0.7179545454545455,
111
+ "spawn_completion_rate": 1.0,
112
+ "spawn_signal": 0.6666666666666666,
113
+ "structural_signal": 0.5190032908673928,
114
+ "task_success_rate": 1.0,
115
+ "tool_efficiency": 0.5
116
+ }
117
+ }
artifacts/local_hf_eval_round_006/evaluation_context.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "Qwen/Qwen2.5-0.5B-Instruct",
3
+ "dashboard_paths": {
4
+ "finetuned": "artifacts/local_hf_eval_round_006/post_training_benchmark_dashboard.html",
5
+ "original": "artifacts/local_hf_eval_round_006/post_training_benchmark_dashboard_original.html"
6
+ },
7
+ "downloaded_finetuned_model": "artifacts/hf_downloads/self_play_hf_l40s_full/round_006/answerer_train/final_model",
8
+ "env_llm_seeding_enabled": false,
9
+ "environment_llm_provider": "mock",
10
+ "evaluation_path": "artifacts/local_hf_eval_round_006/post_training_evaluation.json",
11
+ "leaderboard_path": "artifacts/local_hf_eval_round_006/post_training_compare_leaderboard.json",
12
+ "pipeline_mode": "swarm_v2",
13
+ "repo_id": "Siddeshwar1625/osint-checkpoints",
14
+ "repo_type": "model",
15
+ "run_prefix": "self_play_hf_l40s_full",
16
+ "summary_path": "artifacts/hf_downloads/self_play_hf_l40s_full/self_play_summary.json"
17
+ }
artifacts/local_hf_eval_round_006/post_training_compare_leaderboard.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "config": {
4
+ "source": "post_training_evaluation"
5
+ },
6
+ "episodes": 24,
7
+ "metrics": {
8
+ "avg_compactness_reward": 0.0,
9
+ "avg_connectivity_gain_reward": 0.0,
10
+ "avg_connectivity_reward": 0.0,
11
+ "avg_diversity_reward": 0.0,
12
+ "avg_entity_informativeness_reward": 0.0,
13
+ "avg_format_reward": 0.0,
14
+ "avg_graph_f1": 0.0,
15
+ "avg_knowledge_carrier_reward": 0.0,
16
+ "avg_knowledge_indexing_reward": 0.0,
17
+ "avg_relation_informativeness_reward": 0.0,
18
+ "avg_reward": 0.8995857577183174,
19
+ "avg_soft_shaping_reward": 0.0,
20
+ "avg_spawn_count": 0.0,
21
+ "avg_spawn_critical_steps": 0.0,
22
+ "avg_steps_to_solution": 0.0,
23
+ "deanonymization_accuracy": 0.0,
24
+ "leaderboard_score": 0.28997928788591587,
25
+ "retrieval_signal": 0.0,
26
+ "spawn_completion_rate": 0.0,
27
+ "spawn_signal": 0.0,
28
+ "structural_signal": 0.0,
29
+ "task_success_rate": 0.875,
30
+ "tool_efficiency": 0.0
31
+ },
32
+ "run_id": "post_train_01",
33
+ "run_name": "finetuned_answerer"
34
+ },
35
+ {
36
+ "config": {
37
+ "source": "post_training_evaluation"
38
+ },
39
+ "episodes": 24,
40
+ "metrics": {
41
+ "avg_compactness_reward": 0.0,
42
+ "avg_connectivity_gain_reward": 0.0,
43
+ "avg_connectivity_reward": 0.0,
44
+ "avg_diversity_reward": 0.0,
45
+ "avg_entity_informativeness_reward": 0.0,
46
+ "avg_format_reward": 0.0,
47
+ "avg_graph_f1": 0.5347222222222222,
48
+ "avg_knowledge_carrier_reward": 0.0,
49
+ "avg_knowledge_indexing_reward": 0.0,
50
+ "avg_relation_informativeness_reward": 0.0,
51
+ "avg_reward": 0.5195708125237718,
52
+ "avg_soft_shaping_reward": 0.0,
53
+ "avg_spawn_count": 0.0,
54
+ "avg_spawn_critical_steps": 0.0,
55
+ "avg_steps_to_solution": 0.0,
56
+ "deanonymization_accuracy": 0.0,
57
+ "leaderboard_score": 0.13292298507063305,
58
+ "retrieval_signal": 0.0,
59
+ "spawn_completion_rate": 0.0,
60
+ "spawn_signal": 0.0,
61
+ "structural_signal": 0.0,
62
+ "task_success_rate": 0.0,
63
+ "tool_efficiency": 0.0
64
+ },
65
+ "run_id": "post_train_02",
66
+ "run_name": "original_answerer"
67
+ }
68
+ ]
artifacts/local_hf_eval_round_006/post_training_eval_generated_tasks.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/local_hf_eval_round_006/post_training_eval_validation_reports.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/local_hf_eval_round_006/post_training_evaluation.json ADDED
@@ -0,0 +1,1685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "answerer_models": {
3
+ "finetuned_answerer": "artifacts/hf_downloads/self_play_hf_l40s_full/round_006/answerer_train/final_model",
4
+ "original_answerer": "Qwen/Qwen2.5-0.5B-Instruct"
5
+ },
6
+ "generated_tasks_path": "artifacts/local_hf_eval_round_006/post_training_eval_generated_tasks.json",
7
+ "generator_model": "artifacts/hf_downloads/self_play_hf_l40s_full/round_006/answerer_train/final_model",
8
+ "model_evaluations": {
9
+ "finetuned_answerer": {
10
+ "episodes": [
11
+ {
12
+ "agent_answer": "user_25",
13
+ "completion_length": 547,
14
+ "graph_f1": 0.0,
15
+ "pred_edges": [],
16
+ "predicted_edge_count": 0,
17
+ "question": "If you start at alias_30_572 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
18
+ "reward": 0.9323133138030703,
19
+ "success": 1,
20
+ "support_edge_count": 2,
21
+ "task_answer": "user_25",
22
+ "task_id": "swarm_v2_r7_0",
23
+ "task_type": "swarm_v2_2hop_trace",
24
+ "truth_edges": [
25
+ {
26
+ "confidence": 1.0,
27
+ "dst": "user_30",
28
+ "rel": "alias_of",
29
+ "src": "alias_30_572"
30
+ },
31
+ {
32
+ "confidence": 0.8,
33
+ "dst": "user_25",
34
+ "rel": "connected_to",
35
+ "src": "user_30"
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "agent_answer": "loc_pune",
41
+ "completion_length": 552,
42
+ "graph_f1": 0.0,
43
+ "pred_edges": [],
44
+ "predicted_edge_count": 0,
45
+ "question": "If you start at user_37 and follow the relation path connected_to -> located_in, which entity do you reach after 2 hops?",
46
+ "reward": 0.9323133138030703,
47
+ "success": 1,
48
+ "support_edge_count": 2,
49
+ "task_answer": "loc_pune",
50
+ "task_id": "swarm_v2_r7_1",
51
+ "task_type": "swarm_v2_2hop_trace",
52
+ "truth_edges": [
53
+ {
54
+ "confidence": 0.8,
55
+ "dst": "user_11",
56
+ "rel": "connected_to",
57
+ "src": "user_37"
58
+ },
59
+ {
60
+ "confidence": 1.0,
61
+ "dst": "loc_pune",
62
+ "rel": "located_in",
63
+ "src": "user_11"
64
+ }
65
+ ]
66
+ },
67
+ {
68
+ "agent_answer": "loc_hyderabad",
69
+ "completion_length": 553,
70
+ "graph_f1": 0.0,
71
+ "pred_edges": [],
72
+ "predicted_edge_count": 0,
73
+ "question": "If you start at alias_24_458 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
74
+ "reward": 0.9323133138030703,
75
+ "success": 1,
76
+ "support_edge_count": 2,
77
+ "task_answer": "loc_hyderabad",
78
+ "task_id": "swarm_v2_r7_2",
79
+ "task_type": "swarm_v2_2hop_trace",
80
+ "truth_edges": [
81
+ {
82
+ "confidence": 1.0,
83
+ "dst": "user_24",
84
+ "rel": "alias_of",
85
+ "src": "alias_24_458"
86
+ },
87
+ {
88
+ "confidence": 1.0,
89
+ "dst": "loc_hyderabad",
90
+ "rel": "located_in",
91
+ "src": "user_24"
92
+ }
93
+ ]
94
+ },
95
+ {
96
+ "agent_answer": "loc_bengaluru",
97
+ "completion_length": 599,
98
+ "graph_f1": 0.0,
99
+ "pred_edges": [],
100
+ "predicted_edge_count": 0,
101
+ "question": "If you start at alias_4_664 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
102
+ "reward": 0.9323133138030703,
103
+ "success": 1,
104
+ "support_edge_count": 2,
105
+ "task_answer": "loc_bengaluru",
106
+ "task_id": "swarm_v2_r7_3",
107
+ "task_type": "swarm_v2_2hop_trace",
108
+ "truth_edges": [
109
+ {
110
+ "confidence": 1.0,
111
+ "dst": "user_4",
112
+ "rel": "alias_of",
113
+ "src": "alias_4_664"
114
+ },
115
+ {
116
+ "confidence": 1.0,
117
+ "dst": "loc_bengaluru",
118
+ "rel": "located_in",
119
+ "src": "user_4"
120
+ }
121
+ ]
122
+ },
123
+ {
124
+ "agent_answer": "user_3",
125
+ "completion_length": 563,
126
+ "graph_f1": 0.0,
127
+ "pred_edges": [],
128
+ "predicted_edge_count": 0,
129
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
130
+ "reward": 0.9323133138030703,
131
+ "success": 1,
132
+ "support_edge_count": 2,
133
+ "task_answer": "user_3",
134
+ "task_id": "swarm_v2_r7_4",
135
+ "task_type": "swarm_v2_2hop_trace",
136
+ "truth_edges": [
137
+ {
138
+ "confidence": 1.0,
139
+ "dst": "user_39",
140
+ "rel": "alias_of",
141
+ "src": "alias_39_951"
142
+ },
143
+ {
144
+ "confidence": 0.8,
145
+ "dst": "user_3",
146
+ "rel": "connected_to",
147
+ "src": "user_39"
148
+ }
149
+ ]
150
+ },
151
+ {
152
+ "agent_answer": "org_northbridge",
153
+ "completion_length": 595,
154
+ "graph_f1": 0.0,
155
+ "pred_edges": [],
156
+ "predicted_edge_count": 0,
157
+ "question": "If you start at alias_20_174 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
158
+ "reward": 0.9323133138030703,
159
+ "success": 1,
160
+ "support_edge_count": 2,
161
+ "task_answer": "org_northbridge",
162
+ "task_id": "swarm_v2_r7_5",
163
+ "task_type": "swarm_v2_2hop_trace",
164
+ "truth_edges": [
165
+ {
166
+ "confidence": 1.0,
167
+ "dst": "user_20",
168
+ "rel": "alias_of",
169
+ "src": "alias_20_174"
170
+ },
171
+ {
172
+ "confidence": 1.0,
173
+ "dst": "org_northbridge",
174
+ "rel": "works_at",
175
+ "src": "user_20"
176
+ }
177
+ ]
178
+ },
179
+ {
180
+ "agent_answer": "org_helios_labs",
181
+ "completion_length": 556,
182
+ "graph_f1": 0.0,
183
+ "pred_edges": [],
184
+ "predicted_edge_count": 0,
185
+ "question": "If you start at alias_34_511 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
186
+ "reward": 0.9323133138030703,
187
+ "success": 1,
188
+ "support_edge_count": 2,
189
+ "task_answer": "org_helios_labs",
190
+ "task_id": "swarm_v2_r7_6",
191
+ "task_type": "swarm_v2_2hop_trace",
192
+ "truth_edges": [
193
+ {
194
+ "confidence": 1.0,
195
+ "dst": "user_34",
196
+ "rel": "alias_of",
197
+ "src": "alias_34_511"
198
+ },
199
+ {
200
+ "confidence": 1.0,
201
+ "dst": "org_helios_labs",
202
+ "rel": "works_at",
203
+ "src": "user_34"
204
+ }
205
+ ]
206
+ },
207
+ {
208
+ "agent_answer": "user_12",
209
+ "completion_length": 577,
210
+ "graph_f1": 0.0,
211
+ "pred_edges": [],
212
+ "predicted_edge_count": 0,
213
+ "question": "If you start at alias_3_544 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
214
+ "reward": 0.9323133138030703,
215
+ "success": 1,
216
+ "support_edge_count": 2,
217
+ "task_answer": "user_12",
218
+ "task_id": "swarm_v2_r7_7",
219
+ "task_type": "swarm_v2_2hop_trace",
220
+ "truth_edges": [
221
+ {
222
+ "confidence": 1.0,
223
+ "dst": "user_3",
224
+ "rel": "alias_of",
225
+ "src": "alias_3_544"
226
+ },
227
+ {
228
+ "confidence": 0.8,
229
+ "dst": "user_12",
230
+ "rel": "connected_to",
231
+ "src": "user_3"
232
+ }
233
+ ]
234
+ },
235
+ {
236
+ "agent_answer": "loc_pune",
237
+ "completion_length": 582,
238
+ "graph_f1": 0.0,
239
+ "pred_edges": [],
240
+ "predicted_edge_count": 0,
241
+ "question": "If you start at alias_3_544 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
242
+ "reward": 0.9323133138030703,
243
+ "success": 1,
244
+ "support_edge_count": 2,
245
+ "task_answer": "loc_pune",
246
+ "task_id": "swarm_v2_r7_8",
247
+ "task_type": "swarm_v2_2hop_trace",
248
+ "truth_edges": [
249
+ {
250
+ "confidence": 1.0,
251
+ "dst": "user_3",
252
+ "rel": "alias_of",
253
+ "src": "alias_3_544"
254
+ },
255
+ {
256
+ "confidence": 1.0,
257
+ "dst": "loc_pune",
258
+ "rel": "located_in",
259
+ "src": "user_3"
260
+ }
261
+ ]
262
+ },
263
+ {
264
+ "agent_answer": "user_12",
265
+ "completion_length": 595,
266
+ "graph_f1": 0.0,
267
+ "pred_edges": [],
268
+ "predicted_edge_count": 0,
269
+ "question": "If you start at user_39 and follow the relation path connected_to -> connected_to, which entity do you reach after 2 hops?",
270
+ "reward": 0.9323133138030703,
271
+ "success": 1,
272
+ "support_edge_count": 2,
273
+ "task_answer": "user_12",
274
+ "task_id": "swarm_v2_r7_9",
275
+ "task_type": "swarm_v2_2hop_trace",
276
+ "truth_edges": [
277
+ {
278
+ "confidence": 0.8,
279
+ "dst": "user_3",
280
+ "rel": "connected_to",
281
+ "src": "user_39"
282
+ },
283
+ {
284
+ "confidence": 0.8,
285
+ "dst": "user_12",
286
+ "rel": "connected_to",
287
+ "src": "user_3"
288
+ }
289
+ ]
290
+ },
291
+ {
292
+ "agent_answer": "user_1",
293
+ "completion_length": 579,
294
+ "graph_f1": 0.0,
295
+ "pred_edges": [],
296
+ "predicted_edge_count": 0,
297
+ "question": "If you start at alias_4_664 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
298
+ "reward": 0.6704928651250475,
299
+ "success": 0,
300
+ "support_edge_count": 3,
301
+ "task_answer": "loc_bengaluru",
302
+ "task_id": "swarm_v2_r7_10",
303
+ "task_type": "swarm_v2_3hop_trace",
304
+ "truth_edges": [
305
+ {
306
+ "confidence": 1.0,
307
+ "dst": "user_4",
308
+ "rel": "alias_of",
309
+ "src": "alias_4_664"
310
+ },
311
+ {
312
+ "confidence": 0.8,
313
+ "dst": "user_13",
314
+ "rel": "connected_to",
315
+ "src": "user_4"
316
+ },
317
+ {
318
+ "confidence": 1.0,
319
+ "dst": "loc_bengaluru",
320
+ "rel": "located_in",
321
+ "src": "user_13"
322
+ }
323
+ ]
324
+ },
325
+ {
326
+ "agent_answer": "org_apex_dynamics",
327
+ "completion_length": 552,
328
+ "graph_f1": 0.0,
329
+ "pred_edges": [],
330
+ "predicted_edge_count": 0,
331
+ "question": "If you start at user_16 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
332
+ "reward": 0.9323133138030703,
333
+ "success": 1,
334
+ "support_edge_count": 2,
335
+ "task_answer": "org_apex_dynamics",
336
+ "task_id": "swarm_v2_r7_11",
337
+ "task_type": "swarm_v2_2hop_trace",
338
+ "truth_edges": [
339
+ {
340
+ "confidence": 0.8,
341
+ "dst": "user_18",
342
+ "rel": "connected_to",
343
+ "src": "user_16"
344
+ },
345
+ {
346
+ "confidence": 1.0,
347
+ "dst": "org_apex_dynamics",
348
+ "rel": "works_at",
349
+ "src": "user_18"
350
+ }
351
+ ]
352
+ },
353
+ {
354
+ "agent_answer": "org_northbridge",
355
+ "completion_length": 555,
356
+ "graph_f1": 0.0,
357
+ "pred_edges": [],
358
+ "predicted_edge_count": 0,
359
+ "question": "If you start at user_3 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
360
+ "reward": 0.9323133138030703,
361
+ "success": 1,
362
+ "support_edge_count": 2,
363
+ "task_answer": "org_northbridge",
364
+ "task_id": "swarm_v2_r7_12",
365
+ "task_type": "swarm_v2_2hop_trace",
366
+ "truth_edges": [
367
+ {
368
+ "confidence": 0.8,
369
+ "dst": "user_12",
370
+ "rel": "connected_to",
371
+ "src": "user_3"
372
+ },
373
+ {
374
+ "confidence": 1.0,
375
+ "dst": "org_northbridge",
376
+ "rel": "works_at",
377
+ "src": "user_12"
378
+ }
379
+ ]
380
+ },
381
+ {
382
+ "agent_answer": "org_northbridge",
383
+ "completion_length": 603,
384
+ "graph_f1": 0.0,
385
+ "pred_edges": [],
386
+ "predicted_edge_count": 0,
387
+ "question": "If you start at user_30 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
388
+ "reward": 0.9323133138030703,
389
+ "success": 1,
390
+ "support_edge_count": 2,
391
+ "task_answer": "org_northbridge",
392
+ "task_id": "swarm_v2_r7_13",
393
+ "task_type": "swarm_v2_2hop_trace",
394
+ "truth_edges": [
395
+ {
396
+ "confidence": 0.8,
397
+ "dst": "user_25",
398
+ "rel": "connected_to",
399
+ "src": "user_30"
400
+ },
401
+ {
402
+ "confidence": 1.0,
403
+ "dst": "org_northbridge",
404
+ "rel": "works_at",
405
+ "src": "user_25"
406
+ }
407
+ ]
408
+ },
409
+ {
410
+ "agent_answer": "org_northbridge",
411
+ "completion_length": 568,
412
+ "graph_f1": 0.0,
413
+ "pred_edges": [],
414
+ "predicted_edge_count": 0,
415
+ "question": "If you start at alias_11_684 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
416
+ "reward": 0.9323133138030703,
417
+ "success": 1,
418
+ "support_edge_count": 2,
419
+ "task_answer": "org_northbridge",
420
+ "task_id": "swarm_v2_r7_14",
421
+ "task_type": "swarm_v2_2hop_trace",
422
+ "truth_edges": [
423
+ {
424
+ "confidence": 1.0,
425
+ "dst": "user_11",
426
+ "rel": "alias_of",
427
+ "src": "alias_11_684"
428
+ },
429
+ {
430
+ "confidence": 1.0,
431
+ "dst": "org_northbridge",
432
+ "rel": "works_at",
433
+ "src": "user_11"
434
+ }
435
+ ]
436
+ },
437
+ {
438
+ "agent_answer": "user_35",
439
+ "completion_length": 547,
440
+ "graph_f1": 0.0,
441
+ "pred_edges": [],
442
+ "predicted_edge_count": 0,
443
+ "question": "If you start at alias_29_495 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
444
+ "reward": 0.6704928651250475,
445
+ "success": 0,
446
+ "support_edge_count": 3,
447
+ "task_answer": "loc_hyderabad",
448
+ "task_id": "swarm_v2_r7_15",
449
+ "task_type": "swarm_v2_3hop_trace",
450
+ "truth_edges": [
451
+ {
452
+ "confidence": 1.0,
453
+ "dst": "user_29",
454
+ "rel": "alias_of",
455
+ "src": "alias_29_495"
456
+ },
457
+ {
458
+ "confidence": 0.8,
459
+ "dst": "user_35",
460
+ "rel": "connected_to",
461
+ "src": "user_29"
462
+ },
463
+ {
464
+ "confidence": 1.0,
465
+ "dst": "loc_hyderabad",
466
+ "rel": "located_in",
467
+ "src": "user_35"
468
+ }
469
+ ]
470
+ },
471
+ {
472
+ "agent_answer": "loc_bengaluru",
473
+ "completion_length": 548,
474
+ "graph_f1": 0.0,
475
+ "pred_edges": [],
476
+ "predicted_edge_count": 0,
477
+ "question": "If you start at alias_38_337 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
478
+ "reward": 0.9323133138030703,
479
+ "success": 1,
480
+ "support_edge_count": 2,
481
+ "task_answer": "loc_bengaluru",
482
+ "task_id": "swarm_v2_r7_16",
483
+ "task_type": "swarm_v2_2hop_trace",
484
+ "truth_edges": [
485
+ {
486
+ "confidence": 1.0,
487
+ "dst": "user_38",
488
+ "rel": "alias_of",
489
+ "src": "alias_38_337"
490
+ },
491
+ {
492
+ "confidence": 1.0,
493
+ "dst": "loc_bengaluru",
494
+ "rel": "located_in",
495
+ "src": "user_38"
496
+ }
497
+ ]
498
+ },
499
+ {
500
+ "agent_answer": "user_12",
501
+ "completion_length": 547,
502
+ "graph_f1": 0.0,
503
+ "pred_edges": [],
504
+ "predicted_edge_count": 0,
505
+ "question": "If you start at user_38 and follow the relation path connected_to -> connected_to, which entity do you reach after 2 hops?",
506
+ "reward": 0.9323133138030703,
507
+ "success": 1,
508
+ "support_edge_count": 2,
509
+ "task_answer": "user_12",
510
+ "task_id": "swarm_v2_r7_17",
511
+ "task_type": "swarm_v2_2hop_trace",
512
+ "truth_edges": [
513
+ {
514
+ "confidence": 0.8,
515
+ "dst": "user_3",
516
+ "rel": "connected_to",
517
+ "src": "user_38"
518
+ },
519
+ {
520
+ "confidence": 0.8,
521
+ "dst": "user_12",
522
+ "rel": "connected_to",
523
+ "src": "user_3"
524
+ }
525
+ ]
526
+ },
527
+ {
528
+ "agent_answer": "org_northbridge",
529
+ "completion_length": 555,
530
+ "graph_f1": 0.0,
531
+ "pred_edges": [],
532
+ "predicted_edge_count": 0,
533
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
534
+ "reward": 0.9323133138030703,
535
+ "success": 1,
536
+ "support_edge_count": 2,
537
+ "task_answer": "org_northbridge",
538
+ "task_id": "swarm_v2_r7_18",
539
+ "task_type": "swarm_v2_2hop_trace",
540
+ "truth_edges": [
541
+ {
542
+ "confidence": 1.0,
543
+ "dst": "user_39",
544
+ "rel": "alias_of",
545
+ "src": "alias_39_951"
546
+ },
547
+ {
548
+ "confidence": 1.0,
549
+ "dst": "org_northbridge",
550
+ "rel": "works_at",
551
+ "src": "user_39"
552
+ }
553
+ ]
554
+ },
555
+ {
556
+ "agent_answer": "loc_pune",
557
+ "completion_length": 555,
558
+ "graph_f1": 0.0,
559
+ "pred_edges": [],
560
+ "predicted_edge_count": 0,
561
+ "question": "If you start at user_34 and follow the relation path connected_to -> located_in, which entity do you reach after 2 hops?",
562
+ "reward": 0.9323133138030703,
563
+ "success": 1,
564
+ "support_edge_count": 2,
565
+ "task_answer": "loc_pune",
566
+ "task_id": "swarm_v2_r7_19",
567
+ "task_type": "swarm_v2_2hop_trace",
568
+ "truth_edges": [
569
+ {
570
+ "confidence": 0.8,
571
+ "dst": "user_6",
572
+ "rel": "connected_to",
573
+ "src": "user_34"
574
+ },
575
+ {
576
+ "confidence": 1.0,
577
+ "dst": "loc_pune",
578
+ "rel": "located_in",
579
+ "src": "user_6"
580
+ }
581
+ ]
582
+ },
583
+ {
584
+ "agent_answer": "loc_bengaluru",
585
+ "completion_length": 599,
586
+ "graph_f1": 0.0,
587
+ "pred_edges": [],
588
+ "predicted_edge_count": 0,
589
+ "question": "If you start at alias_32_394 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
590
+ "reward": 0.9323133138030703,
591
+ "success": 1,
592
+ "support_edge_count": 2,
593
+ "task_answer": "loc_bengaluru",
594
+ "task_id": "swarm_v2_r7_20",
595
+ "task_type": "swarm_v2_2hop_trace",
596
+ "truth_edges": [
597
+ {
598
+ "confidence": 1.0,
599
+ "dst": "user_32",
600
+ "rel": "alias_of",
601
+ "src": "alias_32_394"
602
+ },
603
+ {
604
+ "confidence": 1.0,
605
+ "dst": "loc_bengaluru",
606
+ "rel": "located_in",
607
+ "src": "user_32"
608
+ }
609
+ ]
610
+ },
611
+ {
612
+ "agent_answer": "org_northbridge",
613
+ "completion_length": 561,
614
+ "graph_f1": 0.0,
615
+ "pred_edges": [],
616
+ "predicted_edge_count": 0,
617
+ "question": "If you start at alias_30_572 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
618
+ "reward": 0.9323133138030703,
619
+ "success": 1,
620
+ "support_edge_count": 2,
621
+ "task_answer": "org_northbridge",
622
+ "task_id": "swarm_v2_r7_21",
623
+ "task_type": "swarm_v2_2hop_trace",
624
+ "truth_edges": [
625
+ {
626
+ "confidence": 1.0,
627
+ "dst": "user_30",
628
+ "rel": "alias_of",
629
+ "src": "alias_30_572"
630
+ },
631
+ {
632
+ "confidence": 1.0,
633
+ "dst": "org_northbridge",
634
+ "rel": "works_at",
635
+ "src": "user_30"
636
+ }
637
+ ]
638
+ },
639
+ {
640
+ "agent_answer": "user_3",
641
+ "completion_length": 553,
642
+ "graph_f1": 0.0,
643
+ "pred_edges": [],
644
+ "predicted_edge_count": 0,
645
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
646
+ "reward": 0.6704928651250475,
647
+ "success": 0,
648
+ "support_edge_count": 3,
649
+ "task_answer": "loc_pune",
650
+ "task_id": "swarm_v2_r7_22",
651
+ "task_type": "swarm_v2_3hop_trace",
652
+ "truth_edges": [
653
+ {
654
+ "confidence": 1.0,
655
+ "dst": "user_39",
656
+ "rel": "alias_of",
657
+ "src": "alias_39_951"
658
+ },
659
+ {
660
+ "confidence": 0.8,
661
+ "dst": "user_3",
662
+ "rel": "connected_to",
663
+ "src": "user_39"
664
+ },
665
+ {
666
+ "confidence": 1.0,
667
+ "dst": "loc_pune",
668
+ "rel": "located_in",
669
+ "src": "user_3"
670
+ }
671
+ ]
672
+ },
673
+ {
674
+ "agent_answer": "org_helios_labs",
675
+ "completion_length": 556,
676
+ "graph_f1": 0.0,
677
+ "pred_edges": [],
678
+ "predicted_edge_count": 0,
679
+ "question": "If you start at alias_32_394 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
680
+ "reward": 0.9323133138030703,
681
+ "success": 1,
682
+ "support_edge_count": 2,
683
+ "task_answer": "org_helios_labs",
684
+ "task_id": "swarm_v2_r7_23",
685
+ "task_type": "swarm_v2_2hop_trace",
686
+ "truth_edges": [
687
+ {
688
+ "confidence": 1.0,
689
+ "dst": "user_32",
690
+ "rel": "alias_of",
691
+ "src": "alias_32_394"
692
+ },
693
+ {
694
+ "confidence": 1.0,
695
+ "dst": "org_helios_labs",
696
+ "rel": "works_at",
697
+ "src": "user_32"
698
+ }
699
+ ]
700
+ }
701
+ ],
702
+ "model_path": "artifacts/hf_downloads/self_play_hf_l40s_full/round_006/answerer_train/final_model",
703
+ "summary": {
704
+ "avg_completion_length": 566.5416666666666,
705
+ "avg_graph_f1": 0.0,
706
+ "avg_reward": 0.8995857577183174,
707
+ "episodes": 24,
708
+ "task_success_rate": 0.875
709
+ }
710
+ },
711
+ "original_answerer": {
712
+ "episodes": [
713
+ {
714
+ "agent_answer": "{\"answer\":\"user_30\",\"supporting_edges\":[{\"src\":\"alias_30_572\",\"rel\":\"alias_of\",\"dst\":\"user_30\",\"confidence\":1.0},{\"src\":\"user_30\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"user_30\",\"rel\":\"connected_to\",\"dst\":\"loc_delhi\",\"confidence\":1.0},{\"src\":\"user_30\",\"rel\":\"connected_to\",\"dst\":\"loc_hyderabad\",\"confidence\":1.0},{\"src\":\"user_30\",\"rel\":\"connected_to\",\"dst",
715
+ "completion_length": 388,
716
+ "graph_f1": 0.6666666666666666,
717
+ "pred_edges": [
718
+ {
719
+ "confidence": 1.0,
720
+ "dst": "user_30",
721
+ "rel": "alias_of",
722
+ "src": "alias_30_572"
723
+ }
724
+ ],
725
+ "predicted_edge_count": 1,
726
+ "question": "If you start at alias_30_572 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
727
+ "reward": 0.4462659972989426,
728
+ "success": 0,
729
+ "support_edge_count": 2,
730
+ "task_answer": "user_25",
731
+ "task_id": "swarm_v2_r7_0",
732
+ "task_type": "swarm_v2_2hop_trace",
733
+ "truth_edges": [
734
+ {
735
+ "confidence": 1.0,
736
+ "dst": "user_30",
737
+ "rel": "alias_of",
738
+ "src": "alias_30_572"
739
+ },
740
+ {
741
+ "confidence": 0.8,
742
+ "dst": "user_25",
743
+ "rel": "connected_to",
744
+ "src": "user_30"
745
+ }
746
+ ]
747
+ },
748
+ {
749
+ "agent_answer": "user_3",
750
+ "completion_length": 445,
751
+ "graph_f1": 0.0,
752
+ "pred_edges": [
753
+ {
754
+ "confidence": 1.0,
755
+ "dst": "loc_pune",
756
+ "rel": "connected_to",
757
+ "src": "user_37"
758
+ },
759
+ {
760
+ "confidence": 1.0,
761
+ "dst": "user_3",
762
+ "rel": "located_in",
763
+ "src": "loc_pune"
764
+ }
765
+ ],
766
+ "predicted_edge_count": 2,
767
+ "question": "If you start at user_37 and follow the relation path connected_to -> located_in, which entity do you reach after 2 hops?",
768
+ "reward": 0.5084758288369232,
769
+ "success": 0,
770
+ "support_edge_count": 2,
771
+ "task_answer": "loc_pune",
772
+ "task_id": "swarm_v2_r7_1",
773
+ "task_type": "swarm_v2_2hop_trace",
774
+ "truth_edges": [
775
+ {
776
+ "confidence": 0.8,
777
+ "dst": "user_11",
778
+ "rel": "connected_to",
779
+ "src": "user_37"
780
+ },
781
+ {
782
+ "confidence": 1.0,
783
+ "dst": "loc_pune",
784
+ "rel": "located_in",
785
+ "src": "user_11"
786
+ }
787
+ ]
788
+ },
789
+ {
790
+ "agent_answer": "{\"answer\":\"user_24\",\"supporting_edges\":[{\"src\":\"alias_24_458\",\"rel\":\"alias_of\",\"dst\":\"user_24\",\"confidence\":1.0},{\"src\":\"user_24\",\"rel\":\"located_in\",\"dst\":\"loc_hyderabad\",\"confidence\":1.0},{\"src\":\"loc_hyderabad\",\"rel\":\"located_in\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_24\",\"confidence\":1.0},{\"src\":\"user_24\",\"rel\":\"located_in\",",
791
+ "completion_length": 387,
792
+ "graph_f1": 1.0,
793
+ "pred_edges": [
794
+ {
795
+ "confidence": 1.0,
796
+ "dst": "user_24",
797
+ "rel": "alias_of",
798
+ "src": "alias_24_458"
799
+ },
800
+ {
801
+ "confidence": 1.0,
802
+ "dst": "loc_hyderabad",
803
+ "rel": "located_in",
804
+ "src": "user_24"
805
+ }
806
+ ],
807
+ "predicted_edge_count": 2,
808
+ "question": "If you start at alias_24_458 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
809
+ "reward": 0.5018390028775422,
810
+ "success": 0,
811
+ "support_edge_count": 2,
812
+ "task_answer": "loc_hyderabad",
813
+ "task_id": "swarm_v2_r7_2",
814
+ "task_type": "swarm_v2_2hop_trace",
815
+ "truth_edges": [
816
+ {
817
+ "confidence": 1.0,
818
+ "dst": "user_24",
819
+ "rel": "alias_of",
820
+ "src": "alias_24_458"
821
+ },
822
+ {
823
+ "confidence": 1.0,
824
+ "dst": "loc_hyderabad",
825
+ "rel": "located_in",
826
+ "src": "user_24"
827
+ }
828
+ ]
829
+ },
830
+ {
831
+ "agent_answer": "{\"answer\":\"user_7\",\"supporting_edges\":[{\"src\":\"alias_4_664\",\"rel\":\"alias_of\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"located_in\",\"dst\":\"loc_bengaluru\",\"confidence\":1.0},{\"src\":\"loc_bengaluru\",\"rel\":\"located_in\",\"dst\":\"org_helios_labs\",\"confidence\":1.0},{\"src\":\"org_helios_labs\",\"rel\":\"works_at\",\"dst\":\"user_4\",\"confidence\":1.0},{\"src\":\"user_4\",\"rel\":\"located_in\",",
832
+ "completion_length": 381,
833
+ "graph_f1": 1.0,
834
+ "pred_edges": [
835
+ {
836
+ "confidence": 1.0,
837
+ "dst": "user_4",
838
+ "rel": "alias_of",
839
+ "src": "alias_4_664"
840
+ },
841
+ {
842
+ "confidence": 1.0,
843
+ "dst": "loc_bengaluru",
844
+ "rel": "located_in",
845
+ "src": "user_4"
846
+ }
847
+ ],
848
+ "predicted_edge_count": 2,
849
+ "question": "If you start at alias_4_664 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
850
+ "reward": 0.5000528755950792,
851
+ "success": 0,
852
+ "support_edge_count": 2,
853
+ "task_answer": "loc_bengaluru",
854
+ "task_id": "swarm_v2_r7_3",
855
+ "task_type": "swarm_v2_2hop_trace",
856
+ "truth_edges": [
857
+ {
858
+ "confidence": 1.0,
859
+ "dst": "user_4",
860
+ "rel": "alias_of",
861
+ "src": "alias_4_664"
862
+ },
863
+ {
864
+ "confidence": 1.0,
865
+ "dst": "loc_bengaluru",
866
+ "rel": "located_in",
867
+ "src": "user_4"
868
+ }
869
+ ]
870
+ },
871
+ {
872
+ "agent_answer": "{\"answer\":\"user_3\",\"supporting_edges\":[{\"src\":\"alias_39_951\",\"rel\":\"alias_of\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"loc_pune\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"user_39\",",
873
+ "completion_length": 396,
874
+ "graph_f1": 1.0,
875
+ "pred_edges": [
876
+ {
877
+ "confidence": 1.0,
878
+ "dst": "user_39",
879
+ "rel": "alias_of",
880
+ "src": "alias_39_951"
881
+ },
882
+ {
883
+ "confidence": 0.8,
884
+ "dst": "user_3",
885
+ "rel": "connected_to",
886
+ "src": "user_39"
887
+ }
888
+ ],
889
+ "predicted_edge_count": 2,
890
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
891
+ "reward": 0.5017743415980812,
892
+ "success": 0,
893
+ "support_edge_count": 2,
894
+ "task_answer": "user_3",
895
+ "task_id": "swarm_v2_r7_4",
896
+ "task_type": "swarm_v2_2hop_trace",
897
+ "truth_edges": [
898
+ {
899
+ "confidence": 1.0,
900
+ "dst": "user_39",
901
+ "rel": "alias_of",
902
+ "src": "alias_39_951"
903
+ },
904
+ {
905
+ "confidence": 0.8,
906
+ "dst": "user_3",
907
+ "rel": "connected_to",
908
+ "src": "user_39"
909
+ }
910
+ ]
911
+ },
912
+ {
913
+ "agent_answer": "user_7",
914
+ "completion_length": 387,
915
+ "graph_f1": 0.0,
916
+ "pred_edges": [
917
+ {
918
+ "confidence": 1.0,
919
+ "dst": "user_7",
920
+ "rel": "alias_of",
921
+ "src": "alias_20_174"
922
+ },
923
+ {
924
+ "confidence": 1.0,
925
+ "dst": "org_northbridge",
926
+ "rel": "works_at",
927
+ "src": "user_7"
928
+ }
929
+ ],
930
+ "predicted_edge_count": 2,
931
+ "question": "If you start at alias_20_174 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
932
+ "reward": 0.7089737433490155,
933
+ "success": 0,
934
+ "support_edge_count": 2,
935
+ "task_answer": "org_northbridge",
936
+ "task_id": "swarm_v2_r7_5",
937
+ "task_type": "swarm_v2_2hop_trace",
938
+ "truth_edges": [
939
+ {
940
+ "confidence": 1.0,
941
+ "dst": "user_20",
942
+ "rel": "alias_of",
943
+ "src": "alias_20_174"
944
+ },
945
+ {
946
+ "confidence": 1.0,
947
+ "dst": "org_northbridge",
948
+ "rel": "works_at",
949
+ "src": "user_20"
950
+ }
951
+ ]
952
+ },
953
+ {
954
+ "agent_answer": "{\"answer\":\"user_34\",\"supporting_edges\":[{\"src\":\"alias_34_511\",\"rel\":\"alias_of\",\"dst\":\"user_34\",\"confidence\":1.0},{\"src\":\"user_34\",\"rel\":\"works_at\",\"dst\":\"org_helios_labs\",\"confidence\":1.0},{\"src\":\"org_helios_labs\",\"rel\":\"works_at\",\"dst\":\"user_0\",\"confidence\":1.0},{\"src\":\"user_0\",\"rel\":\"works_at\",\"dst\":\"user_16\",\"confidence\":1.0},{\"src\":\"user_16\",\"rel\":\"works",
955
+ "completion_length": 361,
956
+ "graph_f1": 1.0,
957
+ "pred_edges": [
958
+ {
959
+ "confidence": 1.0,
960
+ "dst": "user_34",
961
+ "rel": "alias_of",
962
+ "src": "alias_34_511"
963
+ },
964
+ {
965
+ "confidence": 1.0,
966
+ "dst": "org_helios_labs",
967
+ "rel": "works_at",
968
+ "src": "user_34"
969
+ }
970
+ ],
971
+ "predicted_edge_count": 2,
972
+ "question": "If you start at alias_34_511 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
973
+ "reward": 0.49966549250864506,
974
+ "success": 0,
975
+ "support_edge_count": 2,
976
+ "task_answer": "org_helios_labs",
977
+ "task_id": "swarm_v2_r7_6",
978
+ "task_type": "swarm_v2_2hop_trace",
979
+ "truth_edges": [
980
+ {
981
+ "confidence": 1.0,
982
+ "dst": "user_34",
983
+ "rel": "alias_of",
984
+ "src": "alias_34_511"
985
+ },
986
+ {
987
+ "confidence": 1.0,
988
+ "dst": "org_helios_labs",
989
+ "rel": "works_at",
990
+ "src": "user_34"
991
+ }
992
+ ]
993
+ },
994
+ {
995
+ "agent_answer": "{\"answer\":\"user_3\",\"supporting_edges\":[{\"src\":\"alias_3_544\",\"rel\":\"alias_of\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"loc_pune\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"loc_pune\",\"confidence\":",
996
+ "completion_length": 408,
997
+ "graph_f1": 0.6666666666666666,
998
+ "pred_edges": [
999
+ {
1000
+ "confidence": 1.0,
1001
+ "dst": "user_3",
1002
+ "rel": "alias_of",
1003
+ "src": "alias_3_544"
1004
+ }
1005
+ ],
1006
+ "predicted_edge_count": 1,
1007
+ "question": "If you start at alias_3_544 and follow the relation path alias_of -> connected_to, which entity do you reach after 2 hops?",
1008
+ "reward": 0.4454139519345446,
1009
+ "success": 0,
1010
+ "support_edge_count": 2,
1011
+ "task_answer": "user_12",
1012
+ "task_id": "swarm_v2_r7_7",
1013
+ "task_type": "swarm_v2_2hop_trace",
1014
+ "truth_edges": [
1015
+ {
1016
+ "confidence": 1.0,
1017
+ "dst": "user_3",
1018
+ "rel": "alias_of",
1019
+ "src": "alias_3_544"
1020
+ },
1021
+ {
1022
+ "confidence": 0.8,
1023
+ "dst": "user_12",
1024
+ "rel": "connected_to",
1025
+ "src": "user_3"
1026
+ }
1027
+ ]
1028
+ },
1029
+ {
1030
+ "agent_answer": "{\"answer\":\"user_3\",\"supporting_edges\":[{\"src\":\"alias_3_544\",\"rel\":\"alias_of\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"located_in\",\"dst\":\"loc_pune\",\"confidence\":1.0},{\"src\":\"loc_pune\",\"rel\":\"located_in\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"located_in\",\"dst\":\"user_6\",\"confidence\":",
1031
+ "completion_length": 398,
1032
+ "graph_f1": 1.0,
1033
+ "pred_edges": [
1034
+ {
1035
+ "confidence": 1.0,
1036
+ "dst": "user_3",
1037
+ "rel": "alias_of",
1038
+ "src": "alias_3_544"
1039
+ },
1040
+ {
1041
+ "confidence": 1.0,
1042
+ "dst": "loc_pune",
1043
+ "rel": "located_in",
1044
+ "src": "user_3"
1045
+ }
1046
+ ],
1047
+ "predicted_edge_count": 2,
1048
+ "question": "If you start at alias_3_544 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
1049
+ "reward": 0.4994635369242434,
1050
+ "success": 0,
1051
+ "support_edge_count": 2,
1052
+ "task_answer": "loc_pune",
1053
+ "task_id": "swarm_v2_r7_8",
1054
+ "task_type": "swarm_v2_2hop_trace",
1055
+ "truth_edges": [
1056
+ {
1057
+ "confidence": 1.0,
1058
+ "dst": "user_3",
1059
+ "rel": "alias_of",
1060
+ "src": "alias_3_544"
1061
+ },
1062
+ {
1063
+ "confidence": 1.0,
1064
+ "dst": "loc_pune",
1065
+ "rel": "located_in",
1066
+ "src": "user_3"
1067
+ }
1068
+ ]
1069
+ },
1070
+ {
1071
+ "agent_answer": "user_3",
1072
+ "completion_length": 423,
1073
+ "graph_f1": 0.5,
1074
+ "pred_edges": [
1075
+ {
1076
+ "confidence": 1.0,
1077
+ "dst": "user_3",
1078
+ "rel": "connected_to",
1079
+ "src": "user_39"
1080
+ },
1081
+ {
1082
+ "confidence": 1.0,
1083
+ "dst": "user_39",
1084
+ "rel": "connected_to",
1085
+ "src": "user_3"
1086
+ }
1087
+ ],
1088
+ "predicted_edge_count": 2,
1089
+ "question": "If you start at user_39 and follow the relation path connected_to -> connected_to, which entity do you reach after 2 hops?",
1090
+ "reward": 0.6989023123676081,
1091
+ "success": 0,
1092
+ "support_edge_count": 2,
1093
+ "task_answer": "user_12",
1094
+ "task_id": "swarm_v2_r7_9",
1095
+ "task_type": "swarm_v2_2hop_trace",
1096
+ "truth_edges": [
1097
+ {
1098
+ "confidence": 0.8,
1099
+ "dst": "user_3",
1100
+ "rel": "connected_to",
1101
+ "src": "user_39"
1102
+ },
1103
+ {
1104
+ "confidence": 0.8,
1105
+ "dst": "user_12",
1106
+ "rel": "connected_to",
1107
+ "src": "user_3"
1108
+ }
1109
+ ]
1110
+ },
1111
+ {
1112
+ "agent_answer": "{\"answer\":\"user_7\",\"supporting_edges\":[{\"src\":\"alias_4_664\",\"rel\":\"alias_of\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"connected_to\",\"dst\":\"loc_bengaluru\",\"confidence\":1.0},{\"src\":\"loc_bengaluru\",\"rel\":\"located_in\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"located_in\",\"dst\":\"org_helios_labs\",\"confidence\":1.0},{\"src\":\"org_helios_labs\",\"rel\":\"works_at\",",
1113
+ "completion_length": 383,
1114
+ "graph_f1": 0.0,
1115
+ "pred_edges": [],
1116
+ "predicted_edge_count": 0,
1117
+ "question": "If you start at alias_4_664 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
1118
+ "reward": 0.39524653939368076,
1119
+ "success": 0,
1120
+ "support_edge_count": 3,
1121
+ "task_answer": "loc_bengaluru",
1122
+ "task_id": "swarm_v2_r7_10",
1123
+ "task_type": "swarm_v2_3hop_trace",
1124
+ "truth_edges": [
1125
+ {
1126
+ "confidence": 1.0,
1127
+ "dst": "user_4",
1128
+ "rel": "alias_of",
1129
+ "src": "alias_4_664"
1130
+ },
1131
+ {
1132
+ "confidence": 0.8,
1133
+ "dst": "user_13",
1134
+ "rel": "connected_to",
1135
+ "src": "user_4"
1136
+ },
1137
+ {
1138
+ "confidence": 1.0,
1139
+ "dst": "loc_bengaluru",
1140
+ "rel": "located_in",
1141
+ "src": "user_13"
1142
+ }
1143
+ ]
1144
+ },
1145
+ {
1146
+ "agent_answer": "{\"answer\":\"org_apex_dynamics\",\"supporting_edges\":[{\"src\":\"user_16\",\"rel\":\"connected_to\",\"dst\":\"org_apex_dynamics\",\"confidence\":1.0},{\"src\":\"org_apex_dynamics\",\"rel\":\"works_at\",\"dst\":\"user_18\",\"confidence\":1.0},{\"src\":\"user_18\",\"rel\":\"connected_to\",\"dst\":\"org_apex_dynamics\",\"confidence\":1.0},{\"src\":\"org_apex_dynamics\",\"rel\":\"works_at\",\"dst\":\"user_1\",\"confidence\":1.0},{\"src\":\"user_1\",\"rel\":\"connected_to\",\"dst",
1147
+ "completion_length": 411,
1148
+ "graph_f1": 1.0,
1149
+ "pred_edges": [
1150
+ {
1151
+ "confidence": 0.8,
1152
+ "dst": "user_18",
1153
+ "rel": "connected_to",
1154
+ "src": "user_16"
1155
+ },
1156
+ {
1157
+ "confidence": 1.0,
1158
+ "dst": "org_apex_dynamics",
1159
+ "rel": "works_at",
1160
+ "src": "user_18"
1161
+ }
1162
+ ],
1163
+ "predicted_edge_count": 2,
1164
+ "question": "If you start at user_16 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
1165
+ "reward": 0.5363758482799911,
1166
+ "success": 0,
1167
+ "support_edge_count": 2,
1168
+ "task_answer": "org_apex_dynamics",
1169
+ "task_id": "swarm_v2_r7_11",
1170
+ "task_type": "swarm_v2_2hop_trace",
1171
+ "truth_edges": [
1172
+ {
1173
+ "confidence": 0.8,
1174
+ "dst": "user_18",
1175
+ "rel": "connected_to",
1176
+ "src": "user_16"
1177
+ },
1178
+ {
1179
+ "confidence": 1.0,
1180
+ "dst": "org_apex_dynamics",
1181
+ "rel": "works_at",
1182
+ "src": "user_18"
1183
+ }
1184
+ ]
1185
+ },
1186
+ {
1187
+ "agent_answer": "{\"answer\":\"org_northbridge\",\"supporting_edges\":[{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"connected_to\",\"dst\":\"org_northbridge\",\"confidence\":",
1188
+ "completion_length": 431,
1189
+ "graph_f1": 0.0,
1190
+ "pred_edges": [],
1191
+ "predicted_edge_count": 0,
1192
+ "question": "If you start at user_3 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
1193
+ "reward": 0.39524653939368076,
1194
+ "success": 0,
1195
+ "support_edge_count": 2,
1196
+ "task_answer": "org_northbridge",
1197
+ "task_id": "swarm_v2_r7_12",
1198
+ "task_type": "swarm_v2_2hop_trace",
1199
+ "truth_edges": [
1200
+ {
1201
+ "confidence": 0.8,
1202
+ "dst": "user_12",
1203
+ "rel": "connected_to",
1204
+ "src": "user_3"
1205
+ },
1206
+ {
1207
+ "confidence": 1.0,
1208
+ "dst": "org_northbridge",
1209
+ "rel": "works_at",
1210
+ "src": "user_12"
1211
+ }
1212
+ ]
1213
+ },
1214
+ {
1215
+ "agent_answer": "user_7",
1216
+ "completion_length": 418,
1217
+ "graph_f1": 0.0,
1218
+ "pred_edges": [
1219
+ {
1220
+ "confidence": 1.0,
1221
+ "dst": "org_northbridge",
1222
+ "rel": "connected_to",
1223
+ "src": "user_30"
1224
+ },
1225
+ {
1226
+ "confidence": 1.0,
1227
+ "dst": "user_7",
1228
+ "rel": "works_at",
1229
+ "src": "org_northbridge"
1230
+ }
1231
+ ],
1232
+ "predicted_edge_count": 2,
1233
+ "question": "If you start at user_30 and follow the relation path connected_to -> works_at, which entity do you reach after 2 hops?",
1234
+ "reward": 0.7410081838633457,
1235
+ "success": 0,
1236
+ "support_edge_count": 2,
1237
+ "task_answer": "org_northbridge",
1238
+ "task_id": "swarm_v2_r7_13",
1239
+ "task_type": "swarm_v2_2hop_trace",
1240
+ "truth_edges": [
1241
+ {
1242
+ "confidence": 0.8,
1243
+ "dst": "user_25",
1244
+ "rel": "connected_to",
1245
+ "src": "user_30"
1246
+ },
1247
+ {
1248
+ "confidence": 1.0,
1249
+ "dst": "org_northbridge",
1250
+ "rel": "works_at",
1251
+ "src": "user_25"
1252
+ }
1253
+ ]
1254
+ },
1255
+ {
1256
+ "agent_answer": "{\"answer\":\"user_7\",\"supporting_edges\":[{\"src\":\"alias_11_684\",\"rel\":\"alias_of\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"works_at\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_10\",\"confidence\":1.0},{\"src\":\"user_10\",\"rel\":\"located_in\",\"dst\":\"loc_pune\",\"confidence\":1.0},{\"src\":\"loc_pune\",\"rel\":\"located_in\",\"dst\":\"org_north",
1257
+ "completion_length": 387,
1258
+ "graph_f1": 0.0,
1259
+ "pred_edges": [],
1260
+ "predicted_edge_count": 0,
1261
+ "question": "If you start at alias_11_684 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
1262
+ "reward": 0.39524653939368076,
1263
+ "success": 0,
1264
+ "support_edge_count": 2,
1265
+ "task_answer": "org_northbridge",
1266
+ "task_id": "swarm_v2_r7_14",
1267
+ "task_type": "swarm_v2_2hop_trace",
1268
+ "truth_edges": [
1269
+ {
1270
+ "confidence": 1.0,
1271
+ "dst": "user_11",
1272
+ "rel": "alias_of",
1273
+ "src": "alias_11_684"
1274
+ },
1275
+ {
1276
+ "confidence": 1.0,
1277
+ "dst": "org_northbridge",
1278
+ "rel": "works_at",
1279
+ "src": "user_11"
1280
+ }
1281
+ ]
1282
+ },
1283
+ {
1284
+ "agent_answer": "{\"answer\":\"user_35\",\"supporting_edges\":[{\"src\":\"alias_29_495\",\"rel\":\"alias_of\",\"dst\":\"user_35\",\"confidence\":1.0},{\"src\":\"user_35\",\"rel\":\"connected_to\",\"dst\":\"loc_hyderabad\",\"confidence\":1.0},{\"src\":\"loc_hyderabad\",\"rel\":\"located_in\",\"dst\":\"user_35\",\"confidence\":1.0},{\"src\":\"user_35\",\"rel\":\"located_in\",\"dst\":\"org_helios_labs\",\"confidence\":1.0},{\"src\":\"org_helios_labs\",\"rel",
1285
+ "completion_length": 375,
1286
+ "graph_f1": 0.5,
1287
+ "pred_edges": [
1288
+ {
1289
+ "confidence": 1.0,
1290
+ "dst": "loc_hyderabad",
1291
+ "rel": "located_in",
1292
+ "src": "user_35"
1293
+ }
1294
+ ],
1295
+ "predicted_edge_count": 1,
1296
+ "question": "If you start at alias_29_495 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
1297
+ "reward": 0.42825354331930193,
1298
+ "success": 0,
1299
+ "support_edge_count": 3,
1300
+ "task_answer": "loc_hyderabad",
1301
+ "task_id": "swarm_v2_r7_15",
1302
+ "task_type": "swarm_v2_3hop_trace",
1303
+ "truth_edges": [
1304
+ {
1305
+ "confidence": 1.0,
1306
+ "dst": "user_29",
1307
+ "rel": "alias_of",
1308
+ "src": "alias_29_495"
1309
+ },
1310
+ {
1311
+ "confidence": 0.8,
1312
+ "dst": "user_35",
1313
+ "rel": "connected_to",
1314
+ "src": "user_29"
1315
+ },
1316
+ {
1317
+ "confidence": 1.0,
1318
+ "dst": "loc_hyderabad",
1319
+ "rel": "located_in",
1320
+ "src": "user_35"
1321
+ }
1322
+ ]
1323
+ },
1324
+ {
1325
+ "agent_answer": "{\"answer\":\"user_7\",\"supporting_edges\":[{\"src\":\"alias_38_337\",\"rel\":\"alias_of\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"located_in\",\"dst\":\"loc_bengaluru\",\"confidence\":1.0},{\"src\":\"loc_bengaluru\",\"rel\":\"located_in\",\"dst\":\"user_1\",\"confidence\":1.0},{\"src\":\"user_1\",\"rel\":\"located_in\",\"dst\":\"user_4\",\"confidence\":1.0},{\"src\":\"user_4\",\"rel\":\"located_in\",\"dst\":\"user_5",
1326
+ "completion_length": 378,
1327
+ "graph_f1": 0.0,
1328
+ "pred_edges": [],
1329
+ "predicted_edge_count": 0,
1330
+ "question": "If you start at alias_38_337 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
1331
+ "reward": 0.39524653939368076,
1332
+ "success": 0,
1333
+ "support_edge_count": 2,
1334
+ "task_answer": "loc_bengaluru",
1335
+ "task_id": "swarm_v2_r7_16",
1336
+ "task_type": "swarm_v2_2hop_trace",
1337
+ "truth_edges": [
1338
+ {
1339
+ "confidence": 1.0,
1340
+ "dst": "user_38",
1341
+ "rel": "alias_of",
1342
+ "src": "alias_38_337"
1343
+ },
1344
+ {
1345
+ "confidence": 1.0,
1346
+ "dst": "loc_bengaluru",
1347
+ "rel": "located_in",
1348
+ "src": "user_38"
1349
+ }
1350
+ ]
1351
+ },
1352
+ {
1353
+ "agent_answer": "user_3",
1354
+ "completion_length": 391,
1355
+ "graph_f1": 0.5,
1356
+ "pred_edges": [
1357
+ {
1358
+ "confidence": 1.0,
1359
+ "dst": "user_3",
1360
+ "rel": "connected_to",
1361
+ "src": "user_38"
1362
+ },
1363
+ {
1364
+ "confidence": 1.0,
1365
+ "dst": "user_38",
1366
+ "rel": "connected_to",
1367
+ "src": "user_3"
1368
+ }
1369
+ ],
1370
+ "predicted_edge_count": 2,
1371
+ "question": "If you start at user_38 and follow the relation path connected_to -> connected_to, which entity do you reach after 2 hops?",
1372
+ "reward": 0.6997365142465289,
1373
+ "success": 0,
1374
+ "support_edge_count": 2,
1375
+ "task_answer": "user_12",
1376
+ "task_id": "swarm_v2_r7_17",
1377
+ "task_type": "swarm_v2_2hop_trace",
1378
+ "truth_edges": [
1379
+ {
1380
+ "confidence": 0.8,
1381
+ "dst": "user_3",
1382
+ "rel": "connected_to",
1383
+ "src": "user_38"
1384
+ },
1385
+ {
1386
+ "confidence": 0.8,
1387
+ "dst": "user_12",
1388
+ "rel": "connected_to",
1389
+ "src": "user_3"
1390
+ }
1391
+ ]
1392
+ },
1393
+ {
1394
+ "agent_answer": "{\"answer\":\"user_39\",\"supporting_edges\":[{\"src\":\"alias_39_951\",\"rel\":\"alias_of\",\"dst\":\"user_39\",\"confidence\":1.0},{\"src\":\"user_39\",\"rel\":\"works_at\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"works_at\",\"dst\":\"user_10\",\"confidence\":1.0},{\"src\":\"user_10\",\"rel\":\"works_at\",\"dst\":",
1395
+ "completion_length": 373,
1396
+ "graph_f1": 1.0,
1397
+ "pred_edges": [
1398
+ {
1399
+ "confidence": 1.0,
1400
+ "dst": "user_39",
1401
+ "rel": "alias_of",
1402
+ "src": "alias_39_951"
1403
+ },
1404
+ {
1405
+ "confidence": 1.0,
1406
+ "dst": "org_northbridge",
1407
+ "rel": "works_at",
1408
+ "src": "user_39"
1409
+ }
1410
+ ],
1411
+ "predicted_edge_count": 2,
1412
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
1413
+ "reward": 0.49900727548994456,
1414
+ "success": 0,
1415
+ "support_edge_count": 2,
1416
+ "task_answer": "org_northbridge",
1417
+ "task_id": "swarm_v2_r7_18",
1418
+ "task_type": "swarm_v2_2hop_trace",
1419
+ "truth_edges": [
1420
+ {
1421
+ "confidence": 1.0,
1422
+ "dst": "user_39",
1423
+ "rel": "alias_of",
1424
+ "src": "alias_39_951"
1425
+ },
1426
+ {
1427
+ "confidence": 1.0,
1428
+ "dst": "org_northbridge",
1429
+ "rel": "works_at",
1430
+ "src": "user_39"
1431
+ }
1432
+ ]
1433
+ },
1434
+ {
1435
+ "agent_answer": "{\"answer\":\"user_3\",\"supporting_edges\":[{\"src\":\"user_34\",\"rel\":\"connected_to\",\"dst\":\"user_6\",\"confidence\":1.0},{\"src\":\"user_6\",\"rel\":\"located_in\",\"dst\":\"loc_pune\",\"confidence\":1.0},{\"src\":\"loc_pune\",\"rel\":\"located_in\",\"dst\":\"org_apex_dynamics\",\"confidence\":1.0},{\"src\":\"org_apex_dynamics\",\"rel\":\"works_at\",\"dst\":\"user_6\",\"confidence\":1.0}],\"orchestrator\":{\"spawn_count\":1,\"finished_subtasks\":0,\"critical_steps\":0",
1436
+ "completion_length": 412,
1437
+ "graph_f1": 1.0,
1438
+ "pred_edges": [
1439
+ {
1440
+ "confidence": 0.8,
1441
+ "dst": "user_6",
1442
+ "rel": "connected_to",
1443
+ "src": "user_34"
1444
+ },
1445
+ {
1446
+ "confidence": 1.0,
1447
+ "dst": "loc_pune",
1448
+ "rel": "located_in",
1449
+ "src": "user_6"
1450
+ }
1451
+ ],
1452
+ "predicted_edge_count": 2,
1453
+ "question": "If you start at user_34 and follow the relation path connected_to -> located_in, which entity do you reach after 2 hops?",
1454
+ "reward": 0.5347671871919794,
1455
+ "success": 0,
1456
+ "support_edge_count": 2,
1457
+ "task_answer": "loc_pune",
1458
+ "task_id": "swarm_v2_r7_19",
1459
+ "task_type": "swarm_v2_2hop_trace",
1460
+ "truth_edges": [
1461
+ {
1462
+ "confidence": 0.8,
1463
+ "dst": "user_6",
1464
+ "rel": "connected_to",
1465
+ "src": "user_34"
1466
+ },
1467
+ {
1468
+ "confidence": 1.0,
1469
+ "dst": "loc_pune",
1470
+ "rel": "located_in",
1471
+ "src": "user_6"
1472
+ }
1473
+ ]
1474
+ },
1475
+ {
1476
+ "agent_answer": "{\"answer\":\"user_7\",\"supporting_edges\":[{\"src\":\"alias_32_394\",\"rel\":\"alias_of\",\"dst\":\"user_7\",\"confidence\":1.0},{\"src\":\"user_7\",\"rel\":\"located_in\",\"dst\":\"loc_bengaluru\",\"confidence\":1.0},{\"src\":\"loc_bengaluru\",\"rel\":\"located_in\",\"dst\":\"user_1\",\"confidence\":1.0},{\"src\":\"user_1\",\"rel\":\"located_in\",\"dst\":\"user_4\",\"confidence\":1.0},{\"src\":\"user_4\",\"rel\":\"located_in\",\"dst\":\"user_5",
1477
+ "completion_length": 378,
1478
+ "graph_f1": 0.0,
1479
+ "pred_edges": [],
1480
+ "predicted_edge_count": 0,
1481
+ "question": "If you start at alias_32_394 and follow the relation path alias_of -> located_in, which entity do you reach after 2 hops?",
1482
+ "reward": 0.39524653939368076,
1483
+ "success": 0,
1484
+ "support_edge_count": 2,
1485
+ "task_answer": "loc_bengaluru",
1486
+ "task_id": "swarm_v2_r7_20",
1487
+ "task_type": "swarm_v2_2hop_trace",
1488
+ "truth_edges": [
1489
+ {
1490
+ "confidence": 1.0,
1491
+ "dst": "user_32",
1492
+ "rel": "alias_of",
1493
+ "src": "alias_32_394"
1494
+ },
1495
+ {
1496
+ "confidence": 1.0,
1497
+ "dst": "loc_bengaluru",
1498
+ "rel": "located_in",
1499
+ "src": "user_32"
1500
+ }
1501
+ ]
1502
+ },
1503
+ {
1504
+ "agent_answer": "{\"answer\":\"user_30\",\"supporting_edges\":[{\"src\":\"alias_30_572\",\"rel\":\"alias_of\",\"dst\":\"user_30\",\"confidence\":1.0},{\"src\":\"user_30\",\"rel\":\"works_at\",\"dst\":\"org_northbridge\",\"confidence\":1.0},{\"src\":\"org_northbridge\",\"rel\":\"works_at\",\"dst\":\"user_3\",\"confidence\":1.0},{\"src\":\"user_3\",\"rel\":\"works_at\",\"dst\":\"user_10\",\"confidence\":1.0},{\"src\":\"user_10\",\"rel\":\"works_at\",\"dst\":",
1505
+ "completion_length": 373,
1506
+ "graph_f1": 1.0,
1507
+ "pred_edges": [
1508
+ {
1509
+ "confidence": 1.0,
1510
+ "dst": "user_30",
1511
+ "rel": "alias_of",
1512
+ "src": "alias_30_572"
1513
+ },
1514
+ {
1515
+ "confidence": 1.0,
1516
+ "dst": "org_northbridge",
1517
+ "rel": "works_at",
1518
+ "src": "user_30"
1519
+ }
1520
+ ],
1521
+ "predicted_edge_count": 2,
1522
+ "question": "If you start at alias_30_572 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
1523
+ "reward": 0.4998888700517808,
1524
+ "success": 0,
1525
+ "support_edge_count": 2,
1526
+ "task_answer": "org_northbridge",
1527
+ "task_id": "swarm_v2_r7_21",
1528
+ "task_type": "swarm_v2_2hop_trace",
1529
+ "truth_edges": [
1530
+ {
1531
+ "confidence": 1.0,
1532
+ "dst": "user_30",
1533
+ "rel": "alias_of",
1534
+ "src": "alias_30_572"
1535
+ },
1536
+ {
1537
+ "confidence": 1.0,
1538
+ "dst": "org_northbridge",
1539
+ "rel": "works_at",
1540
+ "src": "user_30"
1541
+ }
1542
+ ]
1543
+ },
1544
+ {
1545
+ "agent_answer": "user_3",
1546
+ "completion_length": 431,
1547
+ "graph_f1": 0.0,
1548
+ "pred_edges": [
1549
+ {
1550
+ "confidence": 1.0,
1551
+ "dst": "user_3",
1552
+ "rel": "alias_of",
1553
+ "src": "alias_39_951"
1554
+ },
1555
+ {
1556
+ "confidence": 1.0,
1557
+ "dst": "loc_pune",
1558
+ "rel": "connected_to",
1559
+ "src": "user_3"
1560
+ },
1561
+ {
1562
+ "confidence": 1.0,
1563
+ "dst": "user_3",
1564
+ "rel": "located_in",
1565
+ "src": "loc_pune"
1566
+ }
1567
+ ],
1568
+ "predicted_edge_count": 3,
1569
+ "question": "If you start at alias_39_951 and follow the relation path alias_of -> connected_to -> located_in, which entity do you reach after 3 hops?",
1570
+ "reward": 0.468212688181137,
1571
+ "success": 0,
1572
+ "support_edge_count": 3,
1573
+ "task_answer": "loc_pune",
1574
+ "task_id": "swarm_v2_r7_22",
1575
+ "task_type": "swarm_v2_3hop_trace",
1576
+ "truth_edges": [
1577
+ {
1578
+ "confidence": 1.0,
1579
+ "dst": "user_39",
1580
+ "rel": "alias_of",
1581
+ "src": "alias_39_951"
1582
+ },
1583
+ {
1584
+ "confidence": 0.8,
1585
+ "dst": "user_3",
1586
+ "rel": "connected_to",
1587
+ "src": "user_39"
1588
+ },
1589
+ {
1590
+ "confidence": 1.0,
1591
+ "dst": "loc_pune",
1592
+ "rel": "located_in",
1593
+ "src": "user_3"
1594
+ }
1595
+ ]
1596
+ },
1597
+ {
1598
+ "agent_answer": "user_32",
1599
+ "completion_length": 434,
1600
+ "graph_f1": 1.0,
1601
+ "pred_edges": [
1602
+ {
1603
+ "confidence": 1.0,
1604
+ "dst": "user_32",
1605
+ "rel": "alias_of",
1606
+ "src": "alias_32_394"
1607
+ },
1608
+ {
1609
+ "confidence": 1.0,
1610
+ "dst": "org_helios_labs",
1611
+ "rel": "works_at",
1612
+ "src": "user_32"
1613
+ }
1614
+ ],
1615
+ "predicted_edge_count": 2,
1616
+ "question": "If you start at alias_32_394 and follow the relation path alias_of -> works_at, which entity do you reach after 2 hops?",
1617
+ "reward": 0.7753896096874865,
1618
+ "success": 0,
1619
+ "support_edge_count": 2,
1620
+ "task_answer": "org_helios_labs",
1621
+ "task_id": "swarm_v2_r7_23",
1622
+ "task_type": "swarm_v2_2hop_trace",
1623
+ "truth_edges": [
1624
+ {
1625
+ "confidence": 1.0,
1626
+ "dst": "user_32",
1627
+ "rel": "alias_of",
1628
+ "src": "alias_32_394"
1629
+ },
1630
+ {
1631
+ "confidence": 1.0,
1632
+ "dst": "org_helios_labs",
1633
+ "rel": "works_at",
1634
+ "src": "user_32"
1635
+ }
1636
+ ]
1637
+ }
1638
+ ],
1639
+ "model_path": "Qwen/Qwen2.5-0.5B-Instruct",
1640
+ "summary": {
1641
+ "avg_completion_length": 397.875,
1642
+ "avg_graph_f1": 0.5347222222222222,
1643
+ "avg_reward": 0.5195708125237718,
1644
+ "episodes": 24,
1645
+ "task_success_rate": 0.0
1646
+ }
1647
+ }
1648
+ },
1649
+ "pipeline_mode": "swarm_v2",
1650
+ "skipped": false,
1651
+ "summary": {
1652
+ "compared_models": [
1653
+ "finetuned_answerer",
1654
+ "original_answerer"
1655
+ ],
1656
+ "delta_vs_original": {
1657
+ "avg_graph_f1": -0.5347222222222222,
1658
+ "avg_reward": 0.3800149451945456,
1659
+ "task_success_rate": 0.875
1660
+ },
1661
+ "finetuned_answerer": {
1662
+ "avg_completion_length": 566.5416666666666,
1663
+ "avg_graph_f1": 0.0,
1664
+ "avg_reward": 0.8995857577183174,
1665
+ "episodes": 24,
1666
+ "task_success_rate": 0.875
1667
+ },
1668
+ "generated_task_count": 24,
1669
+ "generator_valid_rate": 0.5,
1670
+ "original_answerer": {
1671
+ "avg_completion_length": 397.875,
1672
+ "avg_graph_f1": 0.5347222222222222,
1673
+ "avg_reward": 0.5195708125237718,
1674
+ "episodes": 24,
1675
+ "task_success_rate": 0.0
1676
+ },
1677
+ "top_generator_invalid_reasons": [
1678
+ [
1679
+ "duplicate_or_near_duplicate",
1680
+ 3
1681
+ ]
1682
+ ]
1683
+ },
1684
+ "validation_reports_path": "artifacts/local_hf_eval_round_006/post_training_eval_validation_reports.json"
1685
+ }
scripts/evaluate_hf_checkpoint.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from huggingface_hub import hf_hub_download, snapshot_download
10
+
11
+ from osint_env.config import clone_environment_config, load_shared_config
12
+ from osint_env.env.environment import OSINTEnvironment
13
+ from osint_env.llm import build_llm_client
14
+ from osint_env.training import load_self_play_config
15
+ from osint_env.training.self_play import _run_post_training_evaluation
16
+ from osint_env.viz import export_dashboard
17
+
18
+
19
+ def _build_parser() -> argparse.ArgumentParser:
20
+ parser = argparse.ArgumentParser(
21
+ description=(
22
+ "Download the latest self-play checkpoint from Hugging Face, "
23
+ "generate fresh questions, compare the finetuned checkpoint "
24
+ "against the base model, and export benchmark-style HTML."
25
+ )
26
+ )
27
+ parser.add_argument("--repo-id", required=True, help="HF repo id, for example Siddeshwar1625/osint-checkpoints.")
28
+ parser.add_argument(
29
+ "--run-prefix",
30
+ required=True,
31
+ help="Run folder inside the HF repo, for example self_play_hf_l40s_full.",
32
+ )
33
+ parser.add_argument("--repo-type", default="model", help="HF repo type. Defaults to model.")
34
+ parser.add_argument("--env-config", default="config/shared_config.json", help="Shared environment config.")
35
+ parser.add_argument(
36
+ "--train-config",
37
+ default="config/self_play_training_hf_l40s_full.json",
38
+ help="Self-play training config used for question generation and compare settings.",
39
+ )
40
+ parser.add_argument(
41
+ "--output-dir",
42
+ default="artifacts/hf_checkpoint_eval",
43
+ help="Directory where evaluation JSON and HTML artifacts will be written.",
44
+ )
45
+ parser.add_argument(
46
+ "--download-dir",
47
+ default="artifacts/hf_downloads",
48
+ help="Directory used for local HF downloads and cache materialization.",
49
+ )
50
+ parser.add_argument(
51
+ "--dashboard-name",
52
+ default="post_training_benchmark_dashboard.html",
53
+ help="Filename for the finetuned benchmark-style HTML dashboard.",
54
+ )
55
+ parser.add_argument(
56
+ "--original-dashboard-name",
57
+ default="post_training_benchmark_dashboard_original.html",
58
+ help="Filename for the base-model benchmark-style HTML dashboard.",
59
+ )
60
+ parser.add_argument(
61
+ "--leaderboard-name",
62
+ default="post_training_compare_leaderboard.json",
63
+ help="Filename for the two-row leaderboard JSON used by the HTML dashboards.",
64
+ )
65
+ parser.add_argument(
66
+ "--base-model",
67
+ default="",
68
+ help="Optional base model override. Defaults to the model recorded in self_play_summary.json.",
69
+ )
70
+ parser.add_argument(
71
+ "--finetuned-model-subpath",
72
+ default="",
73
+ help=(
74
+ "Optional HF path to the finetuned model directory inside the repo. "
75
+ "Defaults to the final answerer model recorded in self_play_summary.json."
76
+ ),
77
+ )
78
+ parser.add_argument(
79
+ "--env-llm-provider",
80
+ default="mock",
81
+ help="Provider used only for environment construction. Defaults to mock.",
82
+ )
83
+ parser.add_argument(
84
+ "--allow-env-llm-seeding",
85
+ action="store_true",
86
+ help=(
87
+ "Keep graph/task LLM seeding enabled while constructing the environment. "
88
+ "By default this script disables it to avoid depending on a local LLM server."
89
+ ),
90
+ )
91
+ parser.add_argument(
92
+ "--questions",
93
+ type=int,
94
+ default=0,
95
+ help="Optional override for post_training_eval_questions.",
96
+ )
97
+ parser.add_argument(
98
+ "--generated-task-max-new-tokens",
99
+ type=int,
100
+ default=0,
101
+ help="Optional override for generated_task_max_new_tokens.",
102
+ )
103
+ parser.add_argument(
104
+ "--answer-max-new-tokens",
105
+ type=int,
106
+ default=0,
107
+ help="Optional override for post_training_eval_answer_max_new_tokens.",
108
+ )
109
+ return parser
110
+
111
+
112
+ def _strip_artifacts_prefix(path_value: str) -> str:
113
+ path = Path(str(path_value).strip())
114
+ parts = path.parts
115
+ if parts and parts[0] == "artifacts":
116
+ return Path(*parts[1:]).as_posix()
117
+ return path.as_posix()
118
+
119
+
120
+ def _resolve_finetuned_model_subpath(summary: dict[str, Any], explicit: str) -> str:
121
+ if explicit.strip():
122
+ return explicit.strip().strip("/")
123
+
124
+ final_models = summary.get("final_models", {}) if isinstance(summary, dict) else {}
125
+ candidate = str(final_models.get("answerer") or final_models.get("generator") or "").strip()
126
+ if not candidate:
127
+ raise ValueError("Could not resolve final model path from self_play_summary.json.")
128
+ return _strip_artifacts_prefix(candidate)
129
+
130
+
131
def _load_summary(repo_id: str, repo_type: str, run_prefix: str, download_dir: Path) -> tuple[Path, dict[str, Any]]:
    """Download and parse ``self_play_summary.json`` for one run.

    The file is requested as ``<run_prefix>/self_play_summary.json`` (with
    leading/trailing ``/`` removed from ``run_prefix``) through
    ``hf_hub_download``, using ``download_dir`` as ``local_dir``.

    Args:
        repo_id: Hugging Face repo id.
        repo_type: Hugging Face repo type.
        run_prefix: Run folder inside the repo.
        download_dir: Local directory passed to the download call.

    Returns:
        A ``(local_path, payload)`` tuple: the path returned by the download
        call and the decoded JSON object.

    Raises:
        ValueError: If the file does not decode to a JSON object.
    """
    remote_filename = f"{run_prefix.strip('/')}/self_play_summary.json"
    downloaded = hf_hub_download(
        repo_id=repo_id,
        repo_type=repo_type,
        filename=remote_filename,
        local_dir=str(download_dir),
    )
    summary_file = Path(downloaded)
    parsed = json.loads(summary_file.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return summary_file, parsed
    raise ValueError("self_play_summary.json did not contain a JSON object.")
144
+
145
+
146
+ def _download_model_dir(repo_id: str, repo_type: str, model_subpath: str, download_dir: Path) -> Path:
147
+ normalized = model_subpath.strip().strip("/")
148
+ snapshot_download(
149
+ repo_id=repo_id,
150
+ repo_type=repo_type,
151
+ allow_patterns=[f"{normalized}/*"],
152
+ local_dir=str(download_dir),
153
+ )
154
+ local_model_dir = download_dir / normalized
155
+ if not local_model_dir.exists():
156
+ raise FileNotFoundError(f"Downloaded model folder not found: {local_model_dir}")
157
+ return local_model_dir
158
+
159
+
160
+ def _benchmark_like_summary(summary: dict[str, Any]) -> dict[str, float]:
161
+ task_success_rate = float(summary.get("task_success_rate", 0.0))
162
+ avg_graph_f1 = float(summary.get("avg_graph_f1", 0.0))
163
+ avg_reward = float(summary.get("avg_reward", 0.0))
164
+ leaderboard_score = (
165
+ 0.28 * task_success_rate
166
+ + 0.20 * avg_graph_f1
167
+ + 0.05 * avg_reward
168
+ )
169
+ return {
170
+ "task_success_rate": task_success_rate,
171
+ "tool_efficiency": 0.0,
172
+ "avg_graph_f1": avg_graph_f1,
173
+ "avg_steps_to_solution": 0.0,
174
+ "deanonymization_accuracy": 0.0,
175
+ "avg_reward": avg_reward,
176
+ "avg_knowledge_carrier_reward": 0.0,
177
+ "avg_knowledge_indexing_reward": 0.0,
178
+ "avg_connectivity_reward": 0.0,
179
+ "avg_format_reward": 0.0,
180
+ "avg_relation_informativeness_reward": 0.0,
181
+ "avg_entity_informativeness_reward": 0.0,
182
+ "avg_diversity_reward": 0.0,
183
+ "avg_soft_shaping_reward": 0.0,
184
+ "avg_connectivity_gain_reward": 0.0,
185
+ "avg_compactness_reward": 0.0,
186
+ "avg_spawn_count": 0.0,
187
+ "spawn_completion_rate": 0.0,
188
+ "avg_spawn_critical_steps": 0.0,
189
+ "spawn_signal": 0.0,
190
+ "retrieval_signal": 0.0,
191
+ "structural_signal": 0.0,
192
+ "leaderboard_score": leaderboard_score,
193
+ }
194
+
195
+
196
def _benchmark_like_evaluation(
    payload: dict[str, Any],
    model_label: str,
) -> dict[str, Any]:
    """Convert one model's post-training results into a summary/episodes pair.

    The model's entry is looked up under ``payload["model_evaluations"]``.
    Missing or wrongly typed levels fall back to empty values, and episode
    entries that are not dicts are skipped.

    Args:
        payload: Comparison payload holding per-model evaluations.
        model_label: Key of the model inside ``model_evaluations``.

    Returns:
        ``{"summary": ..., "episodes": [...]}``, where ``summary`` comes from
        ``_benchmark_like_summary`` and each episode carries a fixed key set.
    """
    model_evaluations = payload.get("model_evaluations", {}) if isinstance(payload, dict) else {}
    if isinstance(model_evaluations, dict):
        model_payload = model_evaluations.get(model_label, {})
    else:
        model_payload = {}

    if isinstance(model_payload, dict):
        raw_summary = model_payload.get("summary", {})
        raw_episodes = model_payload.get("episodes", [])
    else:
        raw_summary, raw_episodes = {}, []
    if not isinstance(raw_summary, dict):
        raw_summary = {}
    if not isinstance(raw_episodes, list):
        raw_episodes = []

    def _to_benchmark_episode(record: dict[str, Any]) -> dict[str, Any]:
        # Step/tool/spawn counters and reward components are not read from the
        # record; they are emitted as zero or empty placeholders.
        return {
            "task_id": str(record.get("task_id", "")),
            "task_type": str(record.get("task_type", "")),
            "question": str(record.get("question", "")),
            "task_answer": str(record.get("task_answer", "")),
            "agent_answer": str(record.get("agent_answer", "")),
            "graph_f1": float(record.get("graph_f1", 0.0)),
            "reward": float(record.get("reward", 0.0)),
            "steps": 0,
            "tool_calls": 0,
            "success": int(record.get("success", 0)),
            "reward_components": {},
            "spawn_count": 0,
            "spawn_critical_steps": 0,
            "pred_edges": list(record.get("pred_edges", [])),
            "truth_edges": list(record.get("truth_edges", [])),
        }

    # Episodes are converted before the summary is built.
    converted_episodes = [
        _to_benchmark_episode(record) for record in raw_episodes if isinstance(record, dict)
    ]
    return {
        "summary": _benchmark_like_summary(raw_summary),
        "episodes": converted_episodes,
    }
233
+
234
+
235
def _leaderboard_records(compare_payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Build the two leaderboard rows: finetuned answerer, then original answerer.

    Args:
        compare_payload: Comparison payload holding per-model evaluations.

    Returns:
        One record per model with ``run_id`` (``post_train_01``,
        ``post_train_02``), ``run_name``, ``episodes`` (episode count),
        ``config`` and ``metrics``.
    """
    compared_labels = ("finetuned_answerer", "original_answerer")

    def _record_for(position: int, label: str) -> dict[str, Any]:
        converted = _benchmark_like_evaluation(compare_payload, label)
        return {
            "run_id": f"post_train_{position:02d}",
            "run_name": label,
            "episodes": len(converted.get("episodes", [])),
            "config": {"source": "post_training_evaluation"},
            "metrics": converted.get("summary", {}),
        }

    return [_record_for(position, label) for position, label in enumerate(compared_labels, start=1)]
249
+
250
+
251
def main() -> None:
    """Run the checkpoint evaluation end to end.

    Steps, in order:

    1. Parse arguments and create the download and output directories.
    2. Download ``self_play_summary.json`` and the finetuned model folder.
    3. Load the training config and apply any positive command-line overrides.
    4. Clone the shared environment config, set the LLM provider and, unless
       ``--allow-env-llm-seeding`` is given, turn off LLM graph/task seeding.
    5. Run the post-training evaluation, using the finetuned model as the
       generator and as one of the two answerers.
    6. Write the leaderboard JSON, both HTML dashboards and
       ``evaluation_context.json``, then print a JSON report to stdout.
    """
    args = _build_parser().parse_args()

    download_dir = Path(args.download_dir)
    output_dir = Path(args.output_dir)
    for directory in (download_dir, output_dir):
        directory.mkdir(parents=True, exist_ok=True)

    summary_path, summary = _load_summary(
        repo_id=args.repo_id,
        repo_type=args.repo_type,
        run_prefix=args.run_prefix,
        download_dir=download_dir,
    )
    finetuned_model_dir = _download_model_dir(
        repo_id=args.repo_id,
        repo_type=args.repo_type,
        model_subpath=_resolve_finetuned_model_subpath(summary, args.finetuned_model_subpath),
        download_dir=download_dir,
    )

    train_cfg = load_self_play_config(args.train_config)
    # A value of 0 (the CLI default) means "keep what the training config says".
    cli_overrides = (
        (args.questions, "post_training_eval_questions"),
        (args.generated_task_max_new_tokens, "generated_task_max_new_tokens"),
        (args.answer_max_new_tokens, "post_training_eval_answer_max_new_tokens"),
    )
    for requested, attribute in cli_overrides:
        if requested > 0:
            setattr(train_cfg, attribute, int(requested))

    shared_cfg = load_shared_config(args.env_config)
    env_cfg = clone_environment_config(shared_cfg.environment)
    env_cfg.llm.provider = str(args.env_llm_provider).strip() or "mock"
    if not args.allow_env_llm_seeding:
        env_cfg.seeding.llm_generate_remaining_graph = False
        env_cfg.seeding.llm_generate_remaining_tasks = False

    # Base model precedence: CLI override, then the summary's initial answerer,
    # then its initial generator, then the training config's shared model.
    base_model = str(args.base_model).strip()
    if not base_model:
        initial_models = summary.get("initial_models", {})
        base_model = str(
            initial_models.get("answerer")
            or initial_models.get("generator")
            or train_cfg.shared_model_name_or_path
        )
    pipeline_mode = str(summary.get("pipeline_mode") or train_cfg.pipeline_mode or "swarm_v2")

    finetuned_model = str(finetuned_model_dir)
    compare_payload = _run_post_training_evaluation(
        env_config=env_cfg,
        training_config=train_cfg,
        generator_model=finetuned_model,
        answerer_models={
            "finetuned_answerer": finetuned_model,
            "original_answerer": base_model,
        },
        output_dir=output_dir,
        pipeline_mode=pipeline_mode,
        effective_dry_run=False,
    )

    # This environment instance is the one handed to the dashboard exporter.
    env = OSINTEnvironment(env_cfg, llm=build_llm_client(env_cfg.llm))
    env.reset()

    leaderboard_records = _leaderboard_records(compare_payload)
    leaderboard_path = output_dir / args.leaderboard_name
    leaderboard_path.write_text(json.dumps(leaderboard_records, indent=2, sort_keys=True), encoding="utf-8")

    finetuned_eval = _benchmark_like_evaluation(compare_payload, "finetuned_answerer")
    original_eval = _benchmark_like_evaluation(compare_payload, "original_answerer")

    finetuned_dashboard_path = output_dir / args.dashboard_name
    original_dashboard_path = output_dir / args.original_dashboard_name
    dashboard_jobs = (
        (finetuned_eval, finetuned_dashboard_path),
        (original_eval, original_dashboard_path),
    )
    for evaluation, destination in dashboard_jobs:
        export_dashboard(
            env=env,
            evaluation=evaluation,
            leaderboard_records=leaderboard_records,
            output_path=str(destination),
        )

    context = {
        "repo_id": args.repo_id,
        "repo_type": args.repo_type,
        "run_prefix": args.run_prefix,
        "summary_path": str(summary_path),
        "downloaded_finetuned_model": str(finetuned_model_dir),
        "base_model": base_model,
        "pipeline_mode": pipeline_mode,
        "environment_llm_provider": env_cfg.llm.provider,
        "env_llm_seeding_enabled": bool(args.allow_env_llm_seeding),
        "dashboard_paths": {
            "finetuned": str(finetuned_dashboard_path),
            "original": str(original_dashboard_path),
        },
        "leaderboard_path": str(leaderboard_path),
        "evaluation_path": str(compare_payload.get("path", "")),
    }
    context_path = output_dir / "evaluation_context.json"
    context_path.write_text(json.dumps(context, indent=2, sort_keys=True), encoding="utf-8")

    report = {
        "evaluation_path": compare_payload.get("path", ""),
        "dashboard_path": str(finetuned_dashboard_path),
        "original_dashboard_path": str(original_dashboard_path),
        "leaderboard_path": str(leaderboard_path),
        "summary": compare_payload.get("summary", {}),
    }
    print(json.dumps(report, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
server.py CHANGED
@@ -39,6 +39,8 @@ SPACE_DASHBOARD = Path("artifacts/space_dashboard.html")
39
  LATEST_BASELINE_OUTPUT = Path("artifacts/baselines/openai_fixed_levels_latest.json")
40
  LATEST_EVALUATION_OUTPUT = Path("artifacts/latest_evaluation.json")
41
  OPENENV_SPEC_PATH = Path("openenv.yaml")
 
 
42
 
43
  _SESSION_LOCK = Lock()
44
  _SESSIONS: dict[str, OSINTEnvironment] = {}
@@ -250,6 +252,15 @@ def _preview_snapshot() -> dict[str, Any]:
250
 
251
  def _space_snapshot() -> dict[str, Any]:
252
  snapshot = dict(_base_environment_snapshot())
 
 
 
 
 
 
 
 
 
253
 
254
  baseline_payload = _load_json(LATEST_BASELINE_OUTPUT)
255
  evaluation_payload = _load_json(LATEST_EVALUATION_OUTPUT)
@@ -273,6 +284,8 @@ def _space_snapshot() -> dict[str, Any]:
273
  )
274
  if dashboard_path.exists():
275
  snapshot["dashboard_path"] = str(dashboard_path)
 
 
276
  return snapshot
277
 
278
  env = _build_environment()
@@ -283,10 +296,16 @@ def _space_snapshot() -> dict[str, Any]:
283
  output_path=str(SPACE_DASHBOARD),
284
  )
285
  snapshot["dashboard_path"] = dashboard_path
 
 
286
  return snapshot
287
 
288
  preview = _preview_snapshot()
289
  preview["source"] = "preview"
 
 
 
 
290
  return preview
291
 
292
 
@@ -297,6 +316,16 @@ app = FastAPI(title="OSINT OpenEnv Space", version="0.1.0")
297
  def home() -> str:
298
  snapshot = _space_snapshot()
299
  summary = snapshot["summary"]
 
 
 
 
 
 
 
 
 
 
300
  difficulty_html = "".join(
301
  f"<li><strong>{level}</strong>: {count}</li>"
302
  for level, count in sorted(snapshot["difficulty_counts"].items())
@@ -354,6 +383,9 @@ def home() -> str:
354
  background: var(--brand);
355
  margin-right: 10px;
356
  }}
 
 
 
357
  a.link {{
358
  color: var(--accent);
359
  text-decoration: none;
@@ -379,6 +411,7 @@ def home() -> str:
379
  <p>The Space boots with the fixed-level benchmark so visitors get a stable environment snapshot instead of a different graph every restart.</p>
380
  <a class="button" href="/dashboard">Open Dashboard</a>
381
  <a class="link" href="/api/environment">Environment JSON</a>
 
382
  </section>
383
  <section class="card">
384
  <h2>Included Snapshot</h2>
@@ -558,6 +591,20 @@ def dashboard() -> FileResponse:
558
  return FileResponse(snapshot["dashboard_path"], media_type="text/html")
559
 
560
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  if __name__ == "__main__":
562
  import uvicorn
563
 
 
39
  LATEST_BASELINE_OUTPUT = Path("artifacts/baselines/openai_fixed_levels_latest.json")
40
  LATEST_EVALUATION_OUTPUT = Path("artifacts/latest_evaluation.json")
41
  OPENENV_SPEC_PATH = Path("openenv.yaml")
42
+ COMPARE_FINETUNED_DASHBOARD = Path("artifacts/local_hf_eval_round_006/post_training_benchmark_dashboard.html")
43
+ COMPARE_BASE_DASHBOARD = Path("artifacts/local_hf_eval_round_006/post_training_benchmark_dashboard_original.html")
44
 
45
  _SESSION_LOCK = Lock()
46
  _SESSIONS: dict[str, OSINTEnvironment] = {}
 
252
 
253
  def _space_snapshot() -> dict[str, Any]:
254
  snapshot = dict(_base_environment_snapshot())
255
+ compare_dashboards = {
256
+ "finetuned": str(COMPARE_FINETUNED_DASHBOARD),
257
+ "base": str(COMPARE_BASE_DASHBOARD),
258
+ }
259
+ available_compare_dashboards = {
260
+ name: path for name, path in compare_dashboards.items() if Path(path).exists()
261
+ }
262
+ if available_compare_dashboards:
263
+ snapshot["compare_dashboard_paths"] = available_compare_dashboards
264
 
265
  baseline_payload = _load_json(LATEST_BASELINE_OUTPUT)
266
  evaluation_payload = _load_json(LATEST_EVALUATION_OUTPUT)
 
284
  )
285
  if dashboard_path.exists():
286
  snapshot["dashboard_path"] = str(dashboard_path)
287
+ if COMPARE_FINETUNED_DASHBOARD.exists():
288
+ snapshot["dashboard_path"] = str(COMPARE_FINETUNED_DASHBOARD)
289
  return snapshot
290
 
291
  env = _build_environment()
 
296
  output_path=str(SPACE_DASHBOARD),
297
  )
298
  snapshot["dashboard_path"] = dashboard_path
299
+ if COMPARE_FINETUNED_DASHBOARD.exists():
300
+ snapshot["dashboard_path"] = str(COMPARE_FINETUNED_DASHBOARD)
301
  return snapshot
302
 
303
  preview = _preview_snapshot()
304
  preview["source"] = "preview"
305
+ if available_compare_dashboards:
306
+ preview["compare_dashboard_paths"] = available_compare_dashboards
307
+ if COMPARE_FINETUNED_DASHBOARD.exists():
308
+ preview["dashboard_path"] = str(COMPARE_FINETUNED_DASHBOARD)
309
  return preview
310
 
311
 
 
316
  def home() -> str:
317
  snapshot = _space_snapshot()
318
  summary = snapshot["summary"]
319
+ compare_dashboards = snapshot.get("compare_dashboard_paths", {})
320
+ compare_links_html = ""
321
+ if compare_dashboards:
322
+ finetuned_link = ""
323
+ base_link = ""
324
+ if compare_dashboards.get("finetuned"):
325
+ finetuned_link = '<a class="button" href="/dashboard/finetuned">Finetuned Dashboard</a>'
326
+ if compare_dashboards.get("base"):
327
+ base_link = '<a class="button secondary" href="/dashboard/base">Base Dashboard</a>'
328
+ compare_links_html = f"<div style=\"margin-top:10px\">{finetuned_link}{base_link}</div>"
329
  difficulty_html = "".join(
330
  f"<li><strong>{level}</strong>: {count}</li>"
331
  for level, count in sorted(snapshot["difficulty_counts"].items())
 
383
  background: var(--brand);
384
  margin-right: 10px;
385
  }}
386
+ a.button.secondary {{
387
+ background: var(--accent);
388
+ }}
389
  a.link {{
390
  color: var(--accent);
391
  text-decoration: none;
 
411
  <p>The Space boots with the fixed-level benchmark so visitors get a stable environment snapshot instead of a different graph every restart.</p>
412
  <a class="button" href="/dashboard">Open Dashboard</a>
413
  <a class="link" href="/api/environment">Environment JSON</a>
414
+ {compare_links_html}
415
  </section>
416
  <section class="card">
417
  <h2>Included Snapshot</h2>
 
591
  return FileResponse(snapshot["dashboard_path"], media_type="text/html")
592
 
593
 
594
+ @app.get("/dashboard/finetuned")
595
+ def dashboard_finetuned() -> FileResponse:
596
+ if not COMPARE_FINETUNED_DASHBOARD.exists():
597
+ raise HTTPException(status_code=404, detail="Finetuned dashboard not found")
598
+ return FileResponse(COMPARE_FINETUNED_DASHBOARD, media_type="text/html")
599
+
600
+
601
+ @app.get("/dashboard/base")
602
+ def dashboard_base() -> FileResponse:
603
+ if not COMPARE_BASE_DASHBOARD.exists():
604
+ raise HTTPException(status_code=404, detail="Base dashboard not found")
605
+ return FileResponse(COMPARE_BASE_DASHBOARD, media_type="text/html")
606
+
607
+
608
  if __name__ == "__main__":
609
  import uvicorn
610
 
src/osint_env/training/self_play.py CHANGED
@@ -1664,6 +1664,24 @@ def _run_post_training_evaluation(
1664
  "support_edge_count": len(support_edges),
1665
  "predicted_edge_count": len(pred_edges),
1666
  "completion_length": len(completion_text),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1667
  }
1668
  )
1669
 
 
1664
  "support_edge_count": len(support_edges),
1665
  "predicted_edge_count": len(pred_edges),
1666
  "completion_length": len(completion_text),
1667
+ "pred_edges": [
1668
+ {
1669
+ "src": edge.src,
1670
+ "rel": edge.rel,
1671
+ "dst": edge.dst,
1672
+ "confidence": float(edge.confidence),
1673
+ }
1674
+ for edge in pred_edges
1675
+ ],
1676
+ "truth_edges": [
1677
+ {
1678
+ "src": edge.src,
1679
+ "rel": edge.rel,
1680
+ "dst": edge.dst,
1681
+ "confidence": float(edge.confidence),
1682
+ }
1683
+ for edge in support_edges
1684
+ ],
1685
  }
1686
  )
1687