siddeshwar-kagatikar commited on
Commit
957b4b2
·
1 Parent(s): 8ad6382

added images (hf-space text-only subset)

Browse files
artifacts/baselines/openai_fixed_levels_latest.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/baselines/openai_fixed_levels_leaderboard.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "config": {
4
+ "max_steps": 8,
5
+ "model": "gpt-4.1-nano",
6
+ "provider": "openai",
7
+ "seed": 7,
8
+ "seed_file": "datasets/fixed_levels/seed_fixed_levels.json",
9
+ "shared_config_path": "datasets/fixed_levels/shared_config_fixed_levels.json"
10
+ },
11
+ "created_at": "2026-04-02T05:13:26+00:00",
12
+ "episodes": 3,
13
+ "metrics": {
14
+ "avg_compactness_reward": 0.0,
15
+ "avg_connectivity_gain_reward": 0.0,
16
+ "avg_connectivity_reward": -0.19999999999999998,
17
+ "avg_diversity_reward": 0.0,
18
+ "avg_entity_informativeness_reward": 0.0,
19
+ "avg_format_reward": 0.09999999999999999,
20
+ "avg_graph_f1": 0.0,
21
+ "avg_knowledge_carrier_reward": -0.16666666666666666,
22
+ "avg_knowledge_indexing_reward": 0.09999999999999999,
23
+ "avg_relation_informativeness_reward": 0.0,
24
+ "avg_reward": -0.6288888888888888,
25
+ "avg_soft_shaping_reward": 0.0,
26
+ "avg_spawn_count": 0.0,
27
+ "avg_spawn_critical_steps": 0.0,
28
+ "avg_steps_to_solution": 5.333333333333333,
29
+ "deanonymization_accuracy": 0.0,
30
+ "leaderboard_score": 0.2606214594106657,
31
+ "retrieval_signal": 0.4766666666666666,
32
+ "spawn_completion_rate": 0.0,
33
+ "spawn_signal": 0.4,
34
+ "structural_signal": 0.45,
35
+ "task_success_rate": 0.0,
36
+ "tool_efficiency": 1.0
37
+ },
38
+ "run_id": "run_0001",
39
+ "run_name": "openai_fixed_levels_baseline"
40
+ },
41
+ {
42
+ "config": {
43
+ "max_steps": 8,
44
+ "model": "gpt-4.1-nano",
45
+ "provider": "openai",
46
+ "seed": 7,
47
+ "seed_file": "datasets/fixed_levels/seed_fixed_levels.json",
48
+ "shared_config_path": "datasets/fixed_levels/shared_config_fixed_levels.json"
49
+ },
50
+ "created_at": "2026-04-02T05:47:22+00:00",
51
+ "episodes": 30,
52
+ "metrics": {
53
+ "avg_compactness_reward": 0.0,
54
+ "avg_connectivity_gain_reward": 0.010000000000000002,
55
+ "avg_connectivity_reward": -0.029999999999999995,
56
+ "avg_diversity_reward": 0.008,
57
+ "avg_entity_informativeness_reward": -0.000503934120399963,
58
+ "avg_format_reward": 0.014999999999999998,
59
+ "avg_graph_f1": 0.0,
60
+ "avg_knowledge_carrier_reward": -0.025,
61
+ "avg_knowledge_indexing_reward": 0.003193930313179388,
62
+ "avg_relation_informativeness_reward": -0.0038133420811587043,
63
+ "avg_reward": -0.35025037651763485,
64
+ "avg_soft_shaping_reward": -0.013000000000000001,
65
+ "avg_spawn_count": 0.0,
66
+ "avg_spawn_critical_steps": 0.0,
67
+ "avg_steps_to_solution": 7.9,
68
+ "deanonymization_accuracy": 0.0,
69
+ "leaderboard_score": 0.2523805392887,
70
+ "retrieval_signal": 0.4923678756096128,
71
+ "spawn_completion_rate": 0.0,
72
+ "spawn_signal": 0.4,
73
+ "structural_signal": 0.49383654475968825,
74
+ "task_success_rate": 0.0,
75
+ "tool_efficiency": 0.8528138528138528
76
+ },
77
+ "run_id": "run_0002",
78
+ "run_name": "openai_fixed_levels_baseline"
79
+ },
80
+ {
81
+ "config": {
82
+ "max_steps": 8,
83
+ "model": "gpt-4.1-nano",
84
+ "provider": "openai",
85
+ "seed": 7,
86
+ "seed_file": "datasets/fixed_levels/seed_fixed_levels.json",
87
+ "shared_config_path": "datasets/fixed_levels/shared_config_fixed_levels.json"
88
+ },
89
+ "created_at": "2026-04-02T10:15:11+00:00",
90
+ "episodes": 3,
91
+ "metrics": {
92
+ "avg_compactness_reward": 0.0,
93
+ "avg_connectivity_gain_reward": 0.0,
94
+ "avg_connectivity_reward": -0.19999999999999998,
95
+ "avg_diversity_reward": 0.0,
96
+ "avg_entity_informativeness_reward": 0.0,
97
+ "avg_format_reward": 0.09999999999999999,
98
+ "avg_graph_f1": 0.0,
99
+ "avg_knowledge_carrier_reward": -0.16666666666666666,
100
+ "avg_knowledge_indexing_reward": 0.18776223776223777,
101
+ "avg_relation_informativeness_reward": 0.0,
102
+ "avg_reward": 0.26679098679098673,
103
+ "avg_soft_shaping_reward": 0.0,
104
+ "avg_spawn_count": 0.0,
105
+ "avg_spawn_critical_steps": 0.0,
106
+ "avg_steps_to_solution": 6.666666666666667,
107
+ "deanonymization_accuracy": 0.0,
108
+ "leaderboard_score": 0.3691822627132265,
109
+ "retrieval_signal": 0.5073834498834499,
110
+ "spawn_completion_rate": 0.0,
111
+ "spawn_signal": 0.4,
112
+ "structural_signal": 0.45,
113
+ "task_success_rate": 0.3333333333333333,
114
+ "tool_efficiency": 1.0
115
+ },
116
+ "run_id": "run_0003",
117
+ "run_name": "openai_fixed_levels_baseline"
118
+ },
119
+ {
120
+ "config": {
121
+ "max_steps": 8,
122
+ "model": "gpt-5.4-mini",
123
+ "provider": "openai",
124
+ "seed": 7,
125
+ "seed_file": "datasets/fixed_levels/seed_fixed_levels.json",
126
+ "shared_config_path": "datasets/fixed_levels/shared_config_fixed_levels.json"
127
+ },
128
+ "created_at": "2026-04-02T10:23:56+00:00",
129
+ "episodes": 3,
130
+ "metrics": {
131
+ "avg_compactness_reward": 0.0,
132
+ "avg_connectivity_gain_reward": 0.0,
133
+ "avg_connectivity_reward": -0.3,
134
+ "avg_diversity_reward": 0.0,
135
+ "avg_entity_informativeness_reward": 0.0,
136
+ "avg_format_reward": 0.15,
137
+ "avg_graph_f1": 0.0,
138
+ "avg_knowledge_carrier_reward": -0.25,
139
+ "avg_knowledge_indexing_reward": 0.27412587412587414,
140
+ "avg_relation_informativeness_reward": 0.0,
141
+ "avg_reward": 0.6447878047878046,
142
+ "avg_soft_shaping_reward": 0.0,
143
+ "avg_spawn_count": 0.0,
144
+ "avg_spawn_critical_steps": 0.0,
145
+ "avg_steps_to_solution": 5.666666666666667,
146
+ "deanonymization_accuracy": 0.0,
147
+ "leaderboard_score": 0.46489058185132603,
148
+ "retrieval_signal": 0.508444055944056,
149
+ "spawn_completion_rate": 0.0,
150
+ "spawn_signal": 0.4,
151
+ "structural_signal": 0.425,
152
+ "task_success_rate": 0.6666666666666666,
153
+ "tool_efficiency": 1.0
154
+ },
155
+ "run_id": "run_0004",
156
+ "run_name": "openai_fixed_levels_baseline"
157
+ }
158
+ ]