File size: 2,043 Bytes
d2d30e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28070b8
d2d30e9
28070b8
d2d30e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Manifest header: identity and descriptive metadata for this environment.
name: data-cleaning-env
# Quoted, so the version is always loaded as a string.
version: "0.1.0"
# Folded block scalar (`>`): the indented lines below are joined with spaces
# into a single paragraph, with one trailing newline kept.
description: >
  A real-world data cleaning environment where an AI agent fixes missing
  values, duplicate rows, format inconsistencies, outliers, and dtype errors
  across three progressively harder tasks.

author: openenv-hackathon
# Free-form labels for this environment.
# NOTE(review): presumably used for search/discovery by the hosting
# platform - confirm with the consumer of this manifest.
tags:
  - openenv
  - data-cleaning
  - rl
  - real-world

# Task catalogue: three entries ordered easy -> medium -> hard. Every entry
# has the same five keys: id, name, difficulty, max_steps, description.
# The max_steps value grows with difficulty (20 / 30 / 40).
# NOTE(review): max_steps presumably caps the number of actions per episode;
# confirm against the environment implementation.
# Each description is a folded scalar (`>`): its lines are joined with spaces.
tasks:
  # Easy: fill NaN values in the employee records dataset.
  - id: task1
    name: "Fill Missing Values"
    difficulty: easy
    max_steps: 20
    description: >
      Fill all NaN values in an employee records dataset.
      Columns with missing data: age, salary, department.

  # Medium: normalise phone/date formats and de-duplicate a product catalog.
  - id: task2
    name: "Fix Formats and Remove Duplicates"
    difficulty: medium
    max_steps: 30
    description: >
      Standardise phone numbers (NNN-NNN-NNNN) and dates (YYYY-MM-DD)
      in a product catalog, and remove ~15 duplicate rows.

  # Hard: full pipeline on a customer database - missing values, duplicates,
  # outliers, country capitalisation, and mixed date formats.
  - id: task3
    name: "Full Cleaning Pipeline"
    difficulty: hard
    max_steps: 40
    description: >
      End-to-end pipeline on a customer database: fill missing values,
      remove duplicates, drop outliers in purchase_amount, standardise
      country capitalisation, and fix mixed date formats.

# HTTP endpoints of the environment, one "<METHOD> <path>" string per action.
# The values are plain (unquoted) scalars, so the padding *inside* a value is
# part of the string: the two GET entries load as "GET  /health" and
# "GET  /docs" (two spaces). Split on whitespace, not on a single space.
# NOTE(review): `state` is declared as POST although its name suggests a
# read-only query - confirm against the server's actual routes.
api:
  health:  GET  /health
  reset:   POST /reset
  step:    POST /step
  state:   POST /state
  docs:    GET  /docs

# Reward contract advertised by the environment.
reward:
  # Two-element list; both bounds lie strictly inside (0, 1).
  # NOTE(review): presumably [min, max] of the per-step reward - confirm.
  range: [0.001, 0.999]
  # NOTE(review): presumably means partial progress is rewarded before a task
  # is fully solved - confirm against the grader.
  partial: true
  # Declared as zero, i.e. no extra amount is configured here.
  # NOTE(review): presumably a bonus added on episode completion - confirm.
  terminal_bonus: 0.0

# Schema of the observation object. Each field maps to a type name written as
# a plain string; the inline comments describe the expected content and are
# not part of the loaded data.
observation_space:
  type: object
  fields:
    done:            boolean
    reward:          float
    data_preview:    string   # First 10 rows as CSV
    data_shape:      list     # [rows, cols]
    missing_counts:  object   # {column: count}
    duplicate_count: integer
    dtype_issues:    object   # {column: issue_description}
    task_description: string
    message:         string
    step_count:      integer
    # NOTE(review): the inline range below (0.0-1.0) differs from reward.range
    # in this file ([0.001, 0.999]); confirm whether the score is clamped the
    # same way and update whichever is stale.
    current_score:   float    # 0.0–1.0

# Schema of the action object. Each field maps to a type name written as a
# plain string; the inline comments are informational only.
action_space:
  type: object
  fields:
    # The inline comment enumerates the six operation names, separated by `|`.
    operation: string   # fill_missing | drop_duplicates | fix_format | replace_value | drop_outliers | fix_dtype
    # NOTE(review): which operations require `column`, and which keys `params`
    # accepts per operation, are not specified in this file - see the
    # environment implementation.
    column:    string   # optional depending on operation
    params:    object   # optional operation parameters