File size: 3,622 Bytes
8ad6382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
{
  "episodes": [
    {
      "agent_answer": "user_bharat",
      "graph_f1": 0.5714285714285715,
      "pred_edges": [
        {
          "confidence": 1.0,
          "dst": "user_ivy",
          "rel": "alias_of",
          "src": "alias_orchidfox"
        },
        {
          "confidence": 1.0,
          "dst": "post_midnight_manifest",
          "rel": "authored_post",
          "src": "alias_orchidfox"
        }
      ],
      "question": "alias_orchidfox -> post_midnight_manifest -> loc_dockyard17 -> connected collaborator on event_project_lantern. Who is it?",
      "reward": 4.375796665887621,
      "reward_components": {
        "compactness": 0.0,
        "connectivity": -0.15,
        "connectivity_gain": 0.2,
        "correctness": 1.15,
        "diversity": 0.12666666666666665,
        "duplicate_edge_penalty": -0.3,
        "efficiency": 0.08833333333333335,
        "entity_informativeness": 0.0036675120354726642,
        "format_reward": 0.15,
        "global_accuracy": 1.7,
        "graph_f1": 0.31428571428571433,
        "invalid_tool_penalty": 0.0,
        "knowledge_carrier": 0.5,
        "knowledge_indexing": 0.12272727272727273,
        "relation_informativeness": 0.08384894230149129,
        "repetition_penalty": 0.0,
        "soft_shaping": 0.3,
        "spawn_auxiliary": 0.3280854063558518,
        "spawn_breadth": 2.0,
        "spawn_count": 4.0,
        "spawn_critical_steps": 6.0,
        "spawn_depth": 2.0,
        "spawn_finished_subtasks": 4.0,
        "tool_novelty": -0.30000000000000004,
        "tool_relevance": 0.05818181818181818,
        "total": 4.589529441349951
      },
      "spawn_count": 4,
      "spawn_critical_steps": 6,
      "steps": 9,
      "success": 1,
      "task_answer": "user_bharat",
      "task_id": "seed_task_0",
      "task_type": "fixed_trace",
      "tool_calls": 4,
      "truth_edges": [
        {
          "confidence": 1.0,
          "dst": "user_ivy",
          "rel": "alias_of",
          "src": "alias_orchidfox"
        },
        {
          "confidence": 1.0,
          "dst": "post_midnight_manifest",
          "rel": "authored_post",
          "src": "alias_orchidfox"
        },
        {
          "confidence": 1.0,
          "dst": "loc_dockyard17",
          "rel": "references",
          "src": "post_midnight_manifest"
        },
        {
          "confidence": 0.95,
          "dst": "user_bharat",
          "rel": "connected_to",
          "src": "user_ivy"
        },
        {
          "confidence": 0.9,
          "dst": "event_project_lantern",
          "rel": "collaborates_on",
          "src": "user_bharat"
        }
      ]
    }
  ],
  "summary": {
    "avg_compactness_reward": 0.0,
    "avg_connectivity_gain_reward": 0.2,
    "avg_connectivity_reward": -0.15,
    "avg_diversity_reward": 0.12666666666666665,
    "avg_entity_informativeness_reward": 0.0036675120354726642,
    "avg_format_reward": 0.15,
    "avg_graph_f1": 0.5714285714285715,
    "avg_knowledge_carrier_reward": 0.5,
    "avg_knowledge_indexing_reward": 0.12272727272727273,
    "avg_relation_informativeness_reward": 0.08384894230149129,
    "avg_reward": 4.375796665887621,
    "avg_soft_shaping_reward": 0.3,
    "avg_spawn_count": 4.0,
    "avg_spawn_critical_steps": 6.0,
    "avg_steps_to_solution": 9.0,
    "deanonymization_accuracy": 0.0,
    "leaderboard_score": 0.6775552197990489,
    "retrieval_signal": 0.7179545454545455,
    "spawn_completion_rate": 1.0,
    "spawn_signal": 0.6666666666666666,
    "structural_signal": 0.5190032908673928,
    "task_success_rate": 1.0,
    "tool_efficiency": 0.5
  }
}