---
# Manifest for the `tool_use_env` environment.
#
# NOTE(review): the block structure below was reconstructed from a copy in
# which all line breaks had been lost. The top-level grouping (name,
# description, version, entrypoint, actions, observations, reward_range,
# metadata) is the natural reading of the key order -- confirm it against the
# schema of whatever tool consumes this file.

name: tool_use_env
description: Evaluate AI agents on reliable tool usage under uncertainty

# Quoted so the value stays the string "1.0"; unquoted it is parsed as a
# float, which is the wrong type for a version identifier.
version: "1.0"

# Presumably a "<module path>:<attribute>" reference to the server
# application object -- verify against the loader that reads this key.
entrypoint: server.app:app

# Schema of an action: an object whose `action_type` is one of the three
# enumerated strings.
actions:
  type: object
  properties:
    action_type:
      type: string
      enum:
        - use_calculator
        - use_search
        - answer_directly

# Schema of an observation. `tool_output` is the only field marked nullable.
observations:
  type: object
  properties:
    query:
      type: string
    tool_output:
      type: string
      nullable: true
    message:
      type: string

# Two-element [low, high] pair, kept in flow style because it is a short
# leaf value. NOTE(review): presumably inclusive bounds -- confirm.
reward_range: [0.0, 1.0]

metadata:
  difficulty_levels:
    - easy
    - medium
    - hard
  features:
    - tool_selection
    - partial_rewards
    - decision_making
    - efficiency_penalty