# Repository page header captured with the file (not part of the YAML data):
# openenv / openenv.yaml
# AnkushRaheja's picture
# Upload 2 files
# 082b34e verified
# Benchmark identity and descriptive metadata.
name: data-cleaning-benchmark
# Kept quoted so the version is always read as a string.
version: "1.0.0"
# Folded scalar (`>`): the indented lines are joined with spaces into one
# paragraph. The body must be indented deeper than the `description` key,
# otherwise the scalar is empty and the prose becomes a parse error.
description: >
  A multi-task LLM agent benchmark for real-world tabular data cleaning.
  The agent receives a dirty dataset and must apply structured cleaning
  actions to fix duplicates, missing values, format issues, and outliers.
author: "Jayesh"
license: MIT
# Task catalogue: one list entry per cleaning scenario.
#
# Every entry repeats its identifier under both `id` and `task_id`, and the
# same grader reference under `grader`, `grader_fn`, and `grader_path`.
# NOTE(review): presumably these aliases exist so that different consumers can
# each find the key they expect - confirm before removing any of them.
#
# Grader values look like `module.path:function` references - TODO confirm
# how the consuming runtime resolves them.
tasks:
  - id: task1_easy
    task_id: task1_easy
    name: "Basic Customer Data Cleanup"
    difficulty: easy
    max_steps: 20
    description: "Remove duplicates, fill missing values, standardise country names."
    grader: env.graders:grade_task1_easy
    grader_fn: env.graders:grade_task1_easy
    grader_path: env.graders:grade_task1_easy
  - id: task2_medium
    task_id: task2_medium
    name: "E-commerce Orders Normalisation"
    difficulty: medium
    max_steps: 20
    description: "Fix mixed date formats, convert price strings, correct category typos."
    grader: env.graders:grade_task2_medium
    grader_fn: env.graders:grade_task2_medium
    grader_path: env.graders:grade_task2_medium
  - id: task3_hard
    task_id: task3_hard
    name: "Analytics Data Deep Clean"
    difficulty: hard
    max_steps: 20
    description: "Resolve duplicate user IDs, clip session outliers, fix invalid bounce rates."
    grader: env.graders:grade_task3_hard
    grader_fn: env.graders:grade_task3_hard
    grader_path: env.graders:grade_task3_hard
  # NOTE(review): the description says this task shares grading criteria with
  # task2_medium, yet it references its own grader function - confirm that
  # grade_task4_medium_alt applies the same criteria.
  - id: task4_medium_alt
    task_id: task4_medium_alt
    name: "E-commerce Orders Cleanup (Alt)"
    difficulty: medium
    max_steps: 20
    description: "Alternative medium scenario sharing the same grading criteria as task2_medium."
    grader: env.graders:grade_task4_medium_alt
    grader_fn: env.graders:grade_task4_medium_alt
    grader_path: env.graders:grade_task4_medium_alt
  # NOTE(review): same question as above, here relative to task3_hard.
  - id: task5_hard_alt
    task_id: task5_hard_alt
    name: "Analytics Deep Clean (Alt)"
    difficulty: hard
    max_steps: 20
    description: "Alternative hard scenario sharing the same grading criteria as task3_hard."
    grader: env.graders:grade_task5_hard_alt
    grader_fn: env.graders:grade_task5_hard_alt
    grader_path: env.graders:grade_task5_hard_alt
# Shape of the observation handed to the agent: a structured JSON payload
# described by the field names listed below.
observation_space:
  type: structured_json
  fields:
    - task_id
    - task_description
    - table_preview
    - schema_info
    - valid_actions
    # NOTE(review): a single list entry that names two values; confirm whether
    # the payload exposes `step` and `max_steps` as two separate fields.
    - step / max_steps
    - cleaning_log
    - issues_detected
# Actions the agent may issue, each with its list of parameter descriptors.
# Descriptors are free-form strings: a parenthesised suffix appears to give the
# expected type or the allowed values, and a trailing `?` appears to mark an
# optional parameter - TODO confirm against the action parser.
action_space:
  type: structured_json
  actions:
    - name: fill_missing
      params: ["column", "strategy(mean|median|mode|constant)", "value?"]
    - name: standardize_values
      params: ["column", "mapping(dict)"]
    - name: remove_duplicates
      params: []
    - name: remove_row
      params: ["row_id(int)"]
    - name: convert_type
      params: ["column", "target_type(float|int|str|datetime)"]
    - name: clip_outliers
      params: ["column", "lower?", "upper?"]
    # Takes no parameters; the reward description ties the final grader score
    # to this action.
    - name: submit
      params: []
# Reward signal: shaped, with intermediate rewards enabled.
# NOTE(review): the description mentions penalties, while the declared range
# has a positive lower bound (0.01) - confirm whether `range` bounds every
# per-step reward or only the final grader score.
reward:
  type: shaped
  intermediate: true
  range: [0.01, 0.99]
  # Folded scalar: the body must stay indented deeper than `description`.
  description: >
    Positive rewards for correct cleaning steps; small penalties for
    invalid or wasted actions; final grader score awarded on submit().
# HTTP surface of the environment. Each endpoint value is written as
# "<METHOD> <path>" relative to `base_path`.
api:
  base_path: "/"
  endpoints:
    reset: "POST /reset"
    step: "POST /step"
    state: "GET /state"
    # Legacy variants carry the session id as a path segment.
    step_legacy: "POST /step/{session_id}"
    state_legacy: "GET /state/{session_id}"
    health: "GET /health"
    tasks: "GET /tasks"
# Runtime requirements for serving the environment.
runtime:
  language: python
  # Quoted so the interpreter version is read as a string, not a float.
  version: "3.11"
  port: 7860
  framework: fastapi
# Tag list for this benchmark definition.
tags:
  - openenv
  - data-cleaning
  - llm-benchmark
  - tabular