File size: 651 Bytes
d9175ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
---
# Manifest header for the `tool_use_env` environment.
#
# name        - identifier of the environment.
# description - one-line human-readable summary.
# version     - manifest/environment version label.
# entrypoint  - `module.path:attribute` reference to the served application
#               (presumably an ASGI/WSGI app object; confirm against the
#               loader that consumes this file).
name: tool_use_env
description: Evaluate AI agents on reliable tool usage under uncertainty

# Quoted on purpose: a bare 1.0 is resolved as a float by YAML loaders, so a
# later bump such as 1.10 would silently collapse to 1.1. Version labels are
# strings, not numbers.
version: '1.0'

entrypoint: server.app:app

# Action space, written in a JSON-Schema-like shape: an object whose single
# declared property, `action_type`, is a string limited to the enum values
# below.
actions:
  type: object
  properties:
    action_type:
      type: string
      # The three choices offered to the agent: two tool calls, or answering
      # without a tool.
      enum: [use_calculator, use_search, answer_directly]

# Observation space, written in the same JSON-Schema-like shape as `actions`:
# an object with three string properties.
observations:
  type: object
  properties:
    query: {type: string}
    # `nullable: true` marks tool_output as allowed to be null (an
    # OpenAPI-style keyword; presumably null when no tool was invoked,
    # confirm against the server implementation).
    tool_output: {type: string, nullable: true}
    message: {type: string}

# Two-element reward range; presumably [minimum, maximum] in that order.
# Whether the bounds are inclusive is not stated in this file.
reward_range:
  - 0.0
  - 1.0

# Descriptive labels for the environment. How (or whether) a consumer
# interprets these lists is not visible in this file.
metadata:
  # Difficulty tiers listed for the environment.
  difficulty_levels: [easy, medium, hard]
  # Aspects of agent behaviour / scoring that the environment advertises.
  features:
    [tool_selection, partial_rewards, decision_making, efficiency_penalty]