Spaces:
Sleeping
Sleeping
Commit ·
4719066
0
Parent(s):
changed till step 8
Browse files- .dockerignore +44 -0
- .gitattributes +40 -0
- .gitignore +11 -0
- Dockerfile +26 -0
- README.md +321 -0
- baseline_scores.json +8 -0
- client.py +114 -0
- env.example +23 -0
- inference.py +286 -0
- inference_log.txt +0 -0
- models.py +45 -0
- openenv.yaml +73 -0
- pyproject.toml +26 -0
- requirements.txt +9 -0
- server/__init__.py +0 -0
- server/app.py +176 -0
- server/apps/base_app.py +19 -0
- server/business_rules.py +62 -0
- server/data_generator.py +214 -0
- server/environment.py +141 -0
- server/schema_drift.py +55 -0
- server/tasks/__init__.py +0 -0
- server/tasks/task1_missing.py +39 -0
- server/tasks/task2_format.py +68 -0
- server/tasks/task3_pipeline.py +104 -0
- server/workflow_engine.py +63 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ---- Git ----
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# ---- Python cache ----
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
*.pytest_cache/
|
| 11 |
+
|
| 12 |
+
# ---- Virtual environments ----
|
| 13 |
+
venv/
|
| 14 |
+
.env/
|
| 15 |
+
.venv/
|
| 16 |
+
|
| 17 |
+
# ---- Environment files ----
|
| 18 |
+
.env
|
| 19 |
+
.env.*
|
| 20 |
+
|
| 21 |
+
# ---- OS files ----
|
| 22 |
+
.DS_Store
|
| 23 |
+
Thumbs.db
|
| 24 |
+
|
| 25 |
+
# ---- Logs ----
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# ---- Model / large local files (if any) ----
|
| 29 |
+
checkpoints/
|
| 30 |
+
models/
|
| 31 |
+
*.pt
|
| 32 |
+
*.pth
|
| 33 |
+
*.bin
|
| 34 |
+
|
| 35 |
+
# ---- IDE files ----
|
| 36 |
+
.vscode/
|
| 37 |
+
.idea/
|
| 38 |
+
|
| 39 |
+
# ---- Node (if frontend exists) ----
|
| 40 |
+
node_modules/
|
| 41 |
+
|
| 42 |
+
# ---- Docker ----
|
| 43 |
+
Dockerfile*
|
| 44 |
+
docker-compose.yml
|
.gitattributes
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
training/orgos-training/orgos_lora_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*.egg-info/
|
| 4 |
+
dist/
|
| 5 |
+
build/
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
.env
|
| 9 |
+
*.env
|
| 10 |
+
.DS_Store
|
| 11 |
+
.claude/
|
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Non-root user for HuggingFace Spaces compatibility
|
| 4 |
+
RUN useradd -m -u 1000 appuser
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
# Install dependencies first (layer cache)
|
| 9 |
+
COPY requirements.txt .
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
# Copy project files
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
# Switch to non-root
|
| 16 |
+
RUN chown -R appuser:appuser /app
|
| 17 |
+
USER appuser
|
| 18 |
+
|
| 19 |
+
EXPOSE 8000
|
| 20 |
+
|
| 21 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 22 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 23 |
+
|
| 24 |
+
# server.app:app — runs server/app.py from /app working directory
|
| 25 |
+
# models.py, client.py, inference.py live at /app root (on PYTHONPATH automatically)
|
| 26 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Data Cleaning Environment
|
| 3 |
+
emoji: 🧹
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- rl
|
| 12 |
+
- data-cleaning
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Data Cleaning OpenEnv
|
| 16 |
+
|
| 17 |
+
A **real-world data cleaning environment** for training and evaluating AI agents.
|
| 18 |
+
|
| 19 |
+
An agent interacts with a dirty pandas DataFrame through a standard `reset() / step() / state()` HTTP API, learning to fix common data quality problems — missing values, duplicate rows, inconsistent formats, statistical outliers, and dtype errors — across three progressively harder tasks.
|
| 20 |
+
|
| 21 |
+
🤗 **Live HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
|
| 22 |
+
📖 **Interactive API docs:** https://srishtichugh-openenv-hack.hf.space/docs
|
| 23 |
+
✅ **Health check:** https://srishtichugh-openenv-hack.hf.space/health
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## Environment Description & Motivation
|
| 28 |
+
|
| 29 |
+
Real-world datasets are almost never clean. Data engineers routinely spend 60–80 % of their time on data cleaning tasks: filling missing values with statistically appropriate strategies, removing duplicates, standardising inconsistent formats (phone numbers, dates, country names), and detecting extreme outliers.
|
| 30 |
+
|
| 31 |
+
This environment turns those tasks into a reinforcement learning challenge with:
|
| 32 |
+
|
| 33 |
+
- **Deterministic, programmatic graders** — ground-truth clean DataFrames are generated with a fixed seed; every reward signal is reproducible.
|
| 34 |
+
- **Meaningful partial rewards** — every step emits a delta reward proportional to how much of the dataset it cleaned, so the agent receives useful signal throughout the episode rather than only at the end.
|
| 35 |
+
- **Three difficulty levels** — easy, medium, hard — letting agents learn a curriculum from simple null-filling up to full multi-issue pipelines.
|
| 36 |
+
- **No external data downloads** — all datasets are generated synthetically via `numpy` + `Faker` with `seed=42`.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## Action Space
|
| 41 |
+
|
| 42 |
+
Actions are JSON objects sent to `POST /step`.
|
| 43 |
+
|
| 44 |
+
| `operation` | Required `column` | `params` | Description |
|
| 45 |
+
|---|---|---|---|
|
| 46 |
+
| `fill_missing` | ✅ | `{"strategy": "median\|mean\|mode\|constant", "value": ...}` | Fill NaN values in a column |
|
| 47 |
+
| `drop_duplicates` | ❌ | — | Remove all duplicate rows |
|
| 48 |
+
| `fix_format` | ✅ | — | Standardise phone/date/country format |
|
| 49 |
+
| `replace_value` | ✅ | `{"old": ..., "new": ...}` | Replace a specific value |
|
| 50 |
+
| `drop_outliers` | ✅ | — | Remove IQR outliers from a numeric column |
|
| 51 |
+
| `fix_dtype` | ✅ | `{"dtype": "float\|int\|str"}` | Cast column to correct dtype |
|
| 52 |
+
|
| 53 |
+
**Format rules enforced by `fix_format`:**
|
| 54 |
+
|
| 55 |
+
| Column | Target format |
|
| 56 |
+
|---|---|
|
| 57 |
+
| `phone` | `NNN-NNN-NNNN` |
|
| 58 |
+
| `listed_date` / `signup_date` | `YYYY-MM-DD` |
|
| 59 |
+
| `country` | Title-cased canonical name (`USA`, `UK`, `Canada`, `Australia`, `Germany`) |
|
| 60 |
+
|
| 61 |
+
**Example actions:**
|
| 62 |
+
```json
|
| 63 |
+
{"operation": "fill_missing", "column": "salary", "params": {"strategy": "median"}}
|
| 64 |
+
{"operation": "fill_missing", "column": "department", "params": {"strategy": "mode"}}
|
| 65 |
+
{"operation": "drop_duplicates"}
|
| 66 |
+
{"operation": "fix_format", "column": "phone"}
|
| 67 |
+
{"operation": "fix_format", "column": "signup_date"}
|
| 68 |
+
{"operation": "drop_outliers", "column": "purchase_amount"}
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
## Observation Space
|
| 74 |
+
|
| 75 |
+
Every `POST /reset` and `POST /step` returns:
|
| 76 |
+
```json
|
| 77 |
+
{
|
| 78 |
+
"observation": {
|
| 79 |
+
"done": false,
|
| 80 |
+
"reward": 0.40,
|
| 81 |
+
"data_preview": "name,age,salary,...\n...",
|
| 82 |
+
"data_shape": [100, 5],
|
| 83 |
+
"missing_counts": {"age": 20, "salary": 20, "department": 10},
|
| 84 |
+
"duplicate_count": 0,
|
| 85 |
+
"dtype_issues": {},
|
| 86 |
+
"task_description": "Task 1 (Easy) — Fill Missing Values\n...",
|
| 87 |
+
"message": "Filled 20 missing values in 'age' using median.",
|
| 88 |
+
"step_count": 1,
|
| 89 |
+
"current_score": 0.4000
|
| 90 |
+
},
|
| 91 |
+
"reward": 0.40,
|
| 92 |
+
"done": false,
|
| 93 |
+
"info": {}
|
| 94 |
+
}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
| Field | Type | Description |
|
| 98 |
+
|---|---|---|
|
| 99 |
+
| `done` | bool | Episode finished (score ≥ 0.95 or max steps reached) |
|
| 100 |
+
| `reward` | float | Per-step delta reward (see Reward Function) |
|
| 101 |
+
| `data_preview` | string | First 10 rows of current DataFrame as CSV |
|
| 102 |
+
| `data_shape` | [int, int] | Current `[rows, cols]` |
|
| 103 |
+
| `missing_counts` | object | `{column: null_count}` for columns with NaN |
|
| 104 |
+
| `duplicate_count` | int | Number of duplicate rows |
|
| 105 |
+
| `dtype_issues` | object | `{column: issue_description}` for suspected dtype mismatches |
|
| 106 |
+
| `task_description` | string | Full task instructions with available operations |
|
| 107 |
+
| `message` | string | Human-readable result of the last action |
|
| 108 |
+
| `step_count` | int | Steps taken in this episode |
|
| 109 |
+
| `current_score` | float | Running grader score 0.0 – 1.0 |
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
## State Space
|
| 114 |
+
|
| 115 |
+
`GET /state` returns episode metadata (does not modify state):
|
| 116 |
+
```json
|
| 117 |
+
{
|
| 118 |
+
"episode_id": "a8f026a9-...",
|
| 119 |
+
"task_id": 1,
|
| 120 |
+
"step_count": 2,
|
| 121 |
+
"max_steps": 20,
|
| 122 |
+
"total_errors": 50,
|
| 123 |
+
"errors_remaining": 30
|
| 124 |
+
}
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## Tasks
|
| 130 |
+
|
| 131 |
+
### Task 1 — Fill Missing Values *(Easy)*
|
| 132 |
+
|
| 133 |
+
| Property | Value |
|
| 134 |
+
|---|---|
|
| 135 |
+
| Dataset | 100-row employee records (name, age, salary, department, experience) |
|
| 136 |
+
| Issues | ~20 % NaN in `age`, `salary`; ~10 % NaN in `department` |
|
| 137 |
+
| Goal | Fill all missing values |
|
| 138 |
+
| Valid operations | `fill_missing` |
|
| 139 |
+
| Grader | `1.0 − remaining_nulls / original_nulls` |
|
| 140 |
+
| Max steps | 20 |
|
| 141 |
+
| Optimal steps | 3 (one per affected column) |
|
| 142 |
+
|
| 143 |
+
### Task 2 — Fix Formats + Remove Duplicates *(Medium)*
|
| 144 |
+
|
| 145 |
+
| Property | Value |
|
| 146 |
+
|---|---|
|
| 147 |
+
| Dataset | 215-row product catalog (product_id, price, category, phone, listed_date) |
|
| 148 |
+
| Issues | ~60 % phone numbers in mixed formats, ~60 % dates in mixed formats, 15 duplicate rows |
|
| 149 |
+
| Goal | Standardise all phone/date formats and remove duplicates |
|
| 150 |
+
| Valid operations | `fix_format`, `drop_duplicates` |
|
| 151 |
+
| Grader | `0.35 × phone_score + 0.35 × date_score + 0.30 × dupe_score` |
|
| 152 |
+
| Max steps | 30 |
|
| 153 |
+
| Optimal steps | 3 |
|
| 154 |
+
|
| 155 |
+
### Task 3 — Full Cleaning Pipeline *(Hard)*
|
| 156 |
+
|
| 157 |
+
| Property | Value |
|
| 158 |
+
|---|---|
|
| 159 |
+
| Dataset | 320-row customer database (name, age, purchase_amount, country, email, signup_date) |
|
| 160 |
+
| Issues | Missing values (4 cols), 20 duplicate rows, outliers in `purchase_amount` (~3× normal), mixed country capitalisation, mixed date formats |
|
| 161 |
+
| Goal | Fix all issues end-to-end |
|
| 162 |
+
| Valid operations | All 6 operations |
|
| 163 |
+
| Grader | `0.25×null + 0.20×dupe + 0.20×outlier + 0.175×country + 0.175×date` |
|
| 164 |
+
| Max steps | 40 |
|
| 165 |
+
| Optimal steps | 8 |
|
| 166 |
+
|
| 167 |
+
---
|
| 168 |
+
|
| 169 |
+
## Reward Function
|
| 170 |
+
|
| 171 |
+
| Scenario | Reward |
|
| 172 |
+
|---|---|
|
| 173 |
+
| Score improves (delta > 0) | `new_score − old_score` (positive) |
|
| 174 |
+
| Operation had no effect | `−0.01` |
|
| 175 |
+
| Invalid operation / bad column | `−0.05` |
|
| 176 |
+
| Episode completed (score ≥ 0.95) | `delta + 0.20` terminal bonus |
|
| 177 |
+
|
| 178 |
+
Rewards are bounded to **[−0.05, 1.2]**. A partial reward is emitted on every step, giving the agent dense signal throughout the episode.
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
## API Endpoints
|
| 183 |
+
|
| 184 |
+
| Method | Path | Description |
|
| 185 |
+
|---|---|---|
|
| 186 |
+
| `GET` | `/health` | Health check → `{"status": "healthy"}` |
|
| 187 |
+
| `POST` | `/reset` | Start episode. Body: `{"task_id": 1\|2\|3}` (optional; default: round-robin) |
|
| 188 |
+
| `POST` | `/step` | Execute action. Body: action JSON |
|
| 189 |
+
| `POST` | `/state` | Get episode metadata |
|
| 190 |
+
| `GET` | `/metadata` | Environment name, version, task list |
|
| 191 |
+
| `GET` | `/schema` | Full action / observation / state JSON schemas |
|
| 192 |
+
| `GET` | `/docs` | Interactive Swagger UI |
|
| 193 |
+
|
| 194 |
+
---
|
| 195 |
+
|
| 196 |
+
## Baseline Scores
|
| 197 |
+
|
| 198 |
+
| Task | Difficulty | Score |
|
| 199 |
+
|---|---|---|
|
| 200 |
+
| 1 — Fill Missing Values | Easy | 0.999 |
|
| 201 |
+
| 2 — Fix Formats + Duplicates | Medium | 0.999 |
|
| 202 |
+
| 3 — Full Cleaning Pipeline | Hard | 0.999 |
|
| 203 |
+
| **Average** | — | **0.999** |
|
| 204 |
+
|
| 205 |
+
*Produced by `google/gemma-3-27b-it` via NVIDIA NIM, `temperature=0`. Full step-by-step agent logs: `inference_log.txt`.*
|
| 206 |
+
|
| 207 |
+
---
|
| 208 |
+
|
| 209 |
+
## Setup & Usage
|
| 210 |
+
|
| 211 |
+
### Prerequisites
|
| 212 |
+
|
| 213 |
+
- Python 3.11+
|
| 214 |
+
- Docker (for containerised deployment)
|
| 215 |
+
|
| 216 |
+
### Local — Python
|
| 217 |
+
```bash
|
| 218 |
+
# 1. Clone and install dependencies
|
| 219 |
+
git clone https://github.com/Tanvi51204/openEnv.git
|
| 220 |
+
cd openEnv
|
| 221 |
+
pip install -r requirements.txt
|
| 222 |
+
|
| 223 |
+
# 2. Start the server
|
| 224 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 225 |
+
|
| 226 |
+
# 3. Open Swagger UI
|
| 227 |
+
open http://localhost:8000/docs
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
### Local — Docker
|
| 231 |
+
```bash
|
| 232 |
+
docker build -t data-cleaning-env .
|
| 233 |
+
docker run -p 8000:8000 data-cleaning-env
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
### Quick API test
|
| 237 |
+
```bash
|
| 238 |
+
# Health
|
| 239 |
+
curl http://localhost:8000/health
|
| 240 |
+
|
| 241 |
+
# Start Task 1
|
| 242 |
+
curl -X POST http://localhost:8000/reset \
|
| 243 |
+
-H "Content-Type: application/json" \
|
| 244 |
+
-d '{"task_id": 1}'
|
| 245 |
+
|
| 246 |
+
# Fill missing values
|
| 247 |
+
curl -X POST http://localhost:8000/step \
|
| 248 |
+
-H "Content-Type: application/json" \
|
| 249 |
+
-d '{"operation": "fill_missing", "column": "salary", "params": {"strategy": "median"}}'
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
### Python client
|
| 253 |
+
```python
|
| 254 |
+
from client import DataCleaningEnvClient
|
| 255 |
+
from models import DataCleaningAction
|
| 256 |
+
|
| 257 |
+
with DataCleaningEnvClient("http://localhost:8000") as env:
|
| 258 |
+
result = env.reset(task_id=1)
|
| 259 |
+
print(result.observation.missing_counts) # {'age': 20, 'salary': 20, 'department': 10}
|
| 260 |
+
|
| 261 |
+
action = DataCleaningAction(
|
| 262 |
+
operation="fill_missing",
|
| 263 |
+
column="salary",
|
| 264 |
+
params={"strategy": "median"},
|
| 265 |
+
)
|
| 266 |
+
result = env.step(action)
|
| 267 |
+
print(result.observation.current_score) # 0.4
|
| 268 |
+
print(result.reward) # 0.4
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
### Run baseline inference
|
| 272 |
+
```bash
|
| 273 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 274 |
+
export MODEL_NAME="gpt-4o-mini"
|
| 275 |
+
export HF_TOKEN="sk-..." # your API key
|
| 276 |
+
export ENV_URL="http://localhost:8000"
|
| 277 |
+
|
| 278 |
+
python inference.py
|
| 279 |
+
```
|
| 280 |
+
|
| 281 |
+
Produces `[START]` / `[STEP]` / `[END]` lines to stdout and `baseline_scores.json`.
|
| 282 |
+
|
| 283 |
+
### Environment variables
|
| 284 |
+
|
| 285 |
+
| Variable | Default | Description |
|
| 286 |
+
|---|---|---|
|
| 287 |
+
| `API_BASE_URL` | `https://api.openai.com/v1` | LLM API endpoint (OpenAI-compatible) |
|
| 288 |
+
| `MODEL_NAME` | `gpt-4o-mini` | Model identifier |
|
| 289 |
+
| `HF_TOKEN` | — | API key for LLM calls |
|
| 290 |
+
| `ENV_URL` | `http://localhost:8000` | Environment server URL |
|
| 291 |
+
|
| 292 |
+
---
|
| 293 |
+
|
| 294 |
+
## Project Structure
|
| 295 |
+
```
|
| 296 |
+
openenv-data-cleaning/
|
| 297 |
+
├── models.py Pydantic contracts — Action / Observation / State
|
| 298 |
+
├── client.py Sync HTTP client (reset / step / state / health)
|
| 299 |
+
├── inference.py Baseline LLM agent with [START]/[STEP]/[END] logging
|
| 300 |
+
├── openenv.yaml OpenEnv manifest
|
| 301 |
+
├── Dockerfile python:3.11-slim, non-root user, HEALTHCHECK
|
| 302 |
+
├── requirements.txt pip dependencies
|
| 303 |
+
├── pyproject.toml Python package metadata + openenv-core dependency
|
| 304 |
+
└── server/
|
| 305 |
+
├── app.py FastAPI routes + /metadata + /schema
|
| 306 |
+
├── environment.py reset / step / state logic + 6 operations + rewards
|
| 307 |
+
├── data_generator.py Synthetic dataset generation (seed=42, reproducible)
|
| 308 |
+
└── tasks/
|
| 309 |
+
├── task1_missing.py Easy — fill NaN grader
|
| 310 |
+
├── task2_format.py Medium — format + duplicates grader
|
| 311 |
+
└── task3_pipeline.py Hard — full pipeline grader
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
---
|
| 315 |
+
|
| 316 |
+
## Live Demo
|
| 317 |
+
|
| 318 |
+
🤗 **HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
|
| 319 |
+
|
| 320 |
+
- Health: https://srishtichugh-openenv-hack.hf.space/health
|
| 321 |
+
- Docs: https://srishtichugh-openenv-hack.hf.space/docs
|
baseline_scores.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"scores": {
|
| 3 |
+
"task1": 0.99,
|
| 4 |
+
"task2": 0.99,
|
| 5 |
+
"task3": 0.99
|
| 6 |
+
},
|
| 7 |
+
"average": 0.99
|
| 8 |
+
}
|
client.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synchronous HTTP client for the Data Cleaning OpenEnv environment.
|
| 3 |
+
|
| 4 |
+
Usage
|
| 5 |
+
-----
|
| 6 |
+
from client import DataCleaningEnvClient, DataCleaningAction
|
| 7 |
+
|
| 8 |
+
client = DataCleaningEnvClient(base_url="http://localhost:8000")
|
| 9 |
+
|
| 10 |
+
# Start a new episode (task_id 1/2/3 or omit for round-robin)
|
| 11 |
+
result = client.reset(task_id=1)
|
| 12 |
+
print(result.observation.task_description)
|
| 13 |
+
|
| 14 |
+
# Take a step
|
| 15 |
+
action = DataCleaningAction(
|
| 16 |
+
operation="fill_missing",
|
| 17 |
+
column="salary",
|
| 18 |
+
params={"strategy": "median"},
|
| 19 |
+
)
|
| 20 |
+
result = client.step(action)
|
| 21 |
+
print(result.observation.current_score, result.reward, result.done)
|
| 22 |
+
|
| 23 |
+
# Inspect state
|
| 24 |
+
state = client.state()
|
| 25 |
+
print(state.episode_id, state.errors_remaining)
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from typing import Optional
|
| 29 |
+
import httpx
|
| 30 |
+
from pydantic import BaseModel
|
| 31 |
+
|
| 32 |
+
from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class StepResult(BaseModel):
|
| 36 |
+
"""Returned by reset() and step()."""
|
| 37 |
+
observation: DataCleaningObservation
|
| 38 |
+
reward: float
|
| 39 |
+
done: bool
|
| 40 |
+
info: dict = {}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class DataCleaningEnvClient:
|
| 44 |
+
"""
|
| 45 |
+
Thin synchronous wrapper around the DataCleaning HTTP API.
|
| 46 |
+
|
| 47 |
+
All methods raise httpx.HTTPStatusError on non-2xx responses.
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 30.0):
|
| 51 |
+
self.base_url = base_url.rstrip("/")
|
| 52 |
+
self._client = httpx.Client(base_url=self.base_url, timeout=timeout)
|
| 53 |
+
|
| 54 |
+
# ------------------------------------------------------------------
|
| 55 |
+
# Core API
|
| 56 |
+
# ------------------------------------------------------------------
|
| 57 |
+
|
| 58 |
+
def reset(self, task_id: Optional[int] = None) -> StepResult:
|
| 59 |
+
"""
|
| 60 |
+
Start a new episode.
|
| 61 |
+
|
| 62 |
+
Parameters
|
| 63 |
+
----------
|
| 64 |
+
task_id : int | None
|
| 65 |
+
1 = Easy (fill missing values)
|
| 66 |
+
2 = Medium (fix formats + duplicates)
|
| 67 |
+
3 = Hard (full pipeline)
|
| 68 |
+
None = round-robin (1 → 2 → 3 → 1 …)
|
| 69 |
+
"""
|
| 70 |
+
payload = {"task_id": task_id} if task_id is not None else {}
|
| 71 |
+
resp = self._client.post("/reset", json=payload)
|
| 72 |
+
resp.raise_for_status()
|
| 73 |
+
return StepResult(**resp.json())
|
| 74 |
+
|
| 75 |
+
def step(self, action: DataCleaningAction) -> StepResult:
|
| 76 |
+
"""
|
| 77 |
+
Apply one cleaning operation and return the updated observation.
|
| 78 |
+
|
| 79 |
+
Parameters
|
| 80 |
+
----------
|
| 81 |
+
action : DataCleaningAction
|
| 82 |
+
operation : str – one of fill_missing / drop_duplicates /
|
| 83 |
+
fix_format / replace_value / drop_outliers / fix_dtype
|
| 84 |
+
column : str – target column (optional for drop_duplicates)
|
| 85 |
+
params : dict – operation-specific parameters
|
| 86 |
+
"""
|
| 87 |
+
resp = self._client.post("/step", json=action.model_dump())
|
| 88 |
+
resp.raise_for_status()
|
| 89 |
+
return StepResult(**resp.json())
|
| 90 |
+
|
| 91 |
+
def state(self) -> DataCleaningState:
|
| 92 |
+
"""Return current episode metadata without modifying state."""
|
| 93 |
+
resp = self._client.get("/state")
|
| 94 |
+
resp.raise_for_status()
|
| 95 |
+
return DataCleaningState(**resp.json())
|
| 96 |
+
|
| 97 |
+
def health(self) -> dict:
|
| 98 |
+
"""Ping the server. Returns {"status": "ok"} if healthy."""
|
| 99 |
+
resp = self._client.get("/health")
|
| 100 |
+
resp.raise_for_status()
|
| 101 |
+
return resp.json()
|
| 102 |
+
|
| 103 |
+
# ------------------------------------------------------------------
|
| 104 |
+
# Context manager support
|
| 105 |
+
# ------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
def __enter__(self):
|
| 108 |
+
return self
|
| 109 |
+
|
| 110 |
+
def __exit__(self, *_):
|
| 111 |
+
self.close()
|
| 112 |
+
|
| 113 |
+
def close(self):
|
| 114 |
+
self._client.close()
|
env.example
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================================================
|
| 2 |
+
# Data Cleaning OpenEnv — Environment Variables
|
| 3 |
+
# Copy this file to .env and fill in your values.
|
| 4 |
+
# Never commit your real .env to version control.
|
| 5 |
+
# ============================================================
|
| 6 |
+
|
| 7 |
+
# LLM API endpoint (OpenAI-compatible).
|
| 8 |
+
# Default points to OpenAI; swap for any compatible provider.
|
| 9 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 10 |
+
|
| 11 |
+
# Model identifier to use for baseline inference.
|
| 12 |
+
# Examples: gpt-4o-mini, gpt-4o, mistralai/Mistral-7B-Instruct-v0.2
|
| 13 |
+
MODEL_NAME=gpt-4o-mini
|
| 14 |
+
|
| 15 |
+
# API key for the LLM provider above.
|
| 16 |
+
# For OpenAI: starts with sk-...
|
| 17 |
+
# For HuggingFace Inference: starts with hf_...
|
| 18 |
+
HF_TOKEN=your-api-key-here
|
| 19 |
+
|
| 20 |
+
# Base URL of the running environment server.
|
| 21 |
+
# Use http://localhost:8000 for local development,
|
| 22 |
+
# or your HuggingFace Space URL for remote runs.
|
| 23 |
+
ENV_URL=http://localhost:8000
|
inference.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference script for the Data Cleaning OpenEnv environment.
|
| 3 |
+
Uses the OpenAI client against all 3 tasks and reports scores.
|
| 4 |
+
|
| 5 |
+
Required environment variables:
|
| 6 |
+
API_BASE_URL — LLM API endpoint (OpenAI-compatible)
|
| 7 |
+
MODEL_NAME — model identifier
|
| 8 |
+
HF_TOKEN — API key
|
| 9 |
+
ENV_URL — environment server URL (default: http://localhost:8000)
|
| 10 |
+
|
| 11 |
+
STDOUT FORMAT (OpenEnv spec):
|
| 12 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 13 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
+
[END] task=<task_name> score=<0.00> steps=<n>
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import re
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from typing import List, Optional
|
| 23 |
+
import httpx
|
| 24 |
+
from openai import OpenAI
|
| 25 |
+
|
| 26 |
+
# ------------------------------------------------------------------
|
| 27 |
+
# Config
|
| 28 |
+
# ------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 31 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 32 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 33 |
+
ENV_URL = os.environ.get("ENV_URL", "http://localhost:8000")
|
| 34 |
+
|
| 35 |
+
if not HF_TOKEN:
|
| 36 |
+
print("[WARNING] HF_TOKEN is not set — LLM calls may fail.", file=sys.stderr)
|
| 37 |
+
|
| 38 |
+
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 39 |
+
|
| 40 |
+
SYSTEM_PROMPT = """You are a data cleaning agent. You control a data cleaning environment
|
| 41 |
+
through JSON actions. Each turn you receive an observation JSON describing the current state
|
| 42 |
+
of a dataset (preview, missing counts, duplicate count, dtype issues, current score, etc.)
|
| 43 |
+
and a task description.
|
| 44 |
+
|
| 45 |
+
Your job is to pick the single best action to improve the dataset quality.
|
| 46 |
+
|
| 47 |
+
Respond ONLY with a valid JSON object — no markdown, no explanation, just the JSON.
|
| 48 |
+
|
| 49 |
+
Available operations and their required parameters:
|
| 50 |
+
|
| 51 |
+
1. fill_missing
|
| 52 |
+
{"operation": "fill_missing", "column": "<col>", "params": {"strategy": "median|mean|mode|constant", "value": <only if constant>}}
|
| 53 |
+
|
| 54 |
+
2. drop_duplicates
|
| 55 |
+
{"operation": "drop_duplicates"}
|
| 56 |
+
|
| 57 |
+
3. fix_format
|
| 58 |
+
{"operation": "fix_format", "column": "phone|listed_date|signup_date|country"}
|
| 59 |
+
|
| 60 |
+
4. replace_value
|
| 61 |
+
{"operation": "replace_value", "column": "<col>", "params": {"old": "<val>", "new": "<val>"}}
|
| 62 |
+
|
| 63 |
+
5. drop_outliers
|
| 64 |
+
{"operation": "drop_outliers", "column": "<numeric_col>"}
|
| 65 |
+
|
| 66 |
+
6. fix_dtype
|
| 67 |
+
{"operation": "fix_dtype", "column": "<col>", "params": {"dtype": "float|int|str"}}
|
| 68 |
+
|
| 69 |
+
Rules:
|
| 70 |
+
- Address the highest-impact issues first (missing values > duplicates > outliers > format).
|
| 71 |
+
- Do not repeat an operation that returned no effect (watch the 'message' field).
|
| 72 |
+
- Stop when current_score >= 0.95.
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ------------------------------------------------------------------
|
| 77 |
+
# OpenEnv stdout logging helpers
|
| 78 |
+
# ------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 81 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 85 |
+
error_val = error if error else "null"
|
| 86 |
+
done_val = str(done).lower()
|
| 87 |
+
print(
|
| 88 |
+
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
|
| 89 |
+
flush=True,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def log_end(task_name: str, score: float, steps: int) -> None:
|
| 94 |
+
safe_score = max(0.01, min(0.99, float(score)))
|
| 95 |
+
print(
|
| 96 |
+
f"[END] task={task_name} score={safe_score:.4f} steps={steps}",
|
| 97 |
+
flush=True
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ------------------------------------------------------------------
|
| 102 |
+
# HTTP helpers
|
| 103 |
+
# ------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
def api_post(path: str, payload: dict = None) -> dict:
|
| 106 |
+
url = ENV_URL.rstrip("/") + path
|
| 107 |
+
resp = httpx.post(url, json=payload or {}, timeout=30)
|
| 108 |
+
resp.raise_for_status()
|
| 109 |
+
return resp.json()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def api_get(path: str) -> dict:
|
| 113 |
+
url = ENV_URL.rstrip("/") + path
|
| 114 |
+
resp = httpx.get(url, timeout=10)
|
| 115 |
+
resp.raise_for_status()
|
| 116 |
+
return resp.json()
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ------------------------------------------------------------------
|
| 120 |
+
# Agent loop
|
| 121 |
+
# ------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
def obs_to_text(obs: dict) -> str:
|
| 124 |
+
lines = [
|
| 125 |
+
f"current_score: {obs['current_score']}",
|
| 126 |
+
f"step_count: {obs['step_count']}",
|
| 127 |
+
f"data_shape: {obs['data_shape']}",
|
| 128 |
+
f"duplicate_count: {obs['duplicate_count']}",
|
| 129 |
+
f"missing_counts: {json.dumps(obs['missing_counts'])}",
|
| 130 |
+
f"dtype_issues: {json.dumps(obs['dtype_issues'])}",
|
| 131 |
+
f"message: {obs['message']}",
|
| 132 |
+
"",
|
| 133 |
+
"=== DATA PREVIEW (first 10 rows) ===",
|
| 134 |
+
obs["data_preview"],
|
| 135 |
+
"",
|
| 136 |
+
"=== TASK DESCRIPTION ===",
|
| 137 |
+
obs["task_description"],
|
| 138 |
+
]
|
| 139 |
+
return "\n".join(lines)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def run_task(task_id: int) -> float:
|
| 143 |
+
task_name = f"data-cleaning-task{task_id}"
|
| 144 |
+
|
| 145 |
+
# Human-readable header (stderr so it doesn't interfere with stdout format)
|
| 146 |
+
print(f"\n{'='*60}", file=sys.stderr)
|
| 147 |
+
print(f" Running Task {task_id}", file=sys.stderr)
|
| 148 |
+
print(f"{'='*60}", file=sys.stderr)
|
| 149 |
+
|
| 150 |
+
result = api_post("/reset", {"task_id": task_id})
|
| 151 |
+
obs = result["observation"]
|
| 152 |
+
history = []
|
| 153 |
+
rewards: List[float] = []
|
| 154 |
+
steps_taken = 0
|
| 155 |
+
success = False
|
| 156 |
+
|
| 157 |
+
log_start(task=task_name, env="data-cleaning-openenv", model=MODEL_NAME)
|
| 158 |
+
|
| 159 |
+
try:
|
| 160 |
+
for step_num in range(1, 50):
|
| 161 |
+
if obs["done"]:
|
| 162 |
+
success = obs["current_score"] >= 0.95
|
| 163 |
+
break
|
| 164 |
+
|
| 165 |
+
obs_text = obs_to_text(obs)
|
| 166 |
+
history.append({"role": "user", "content": obs_text})
|
| 167 |
+
|
| 168 |
+
try:
|
| 169 |
+
response = client.chat.completions.create(
|
| 170 |
+
model = MODEL_NAME,
|
| 171 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history,
|
| 172 |
+
temperature = 0.0,
|
| 173 |
+
max_tokens = 256,
|
| 174 |
+
)
|
| 175 |
+
action_str = response.choices[0].message.content.strip()
|
| 176 |
+
except Exception as exc:
|
| 177 |
+
print(f" Step {step_num}: LLM call failed: {exc}", file=sys.stderr)
|
| 178 |
+
log_step(step_num, "null", 0.0, True, str(exc))
|
| 179 |
+
break
|
| 180 |
+
|
| 181 |
+
history.append({"role": "assistant", "content": action_str})
|
| 182 |
+
|
| 183 |
+
# Parse action JSON
|
| 184 |
+
action = None
|
| 185 |
+
try:
|
| 186 |
+
action = json.loads(action_str)
|
| 187 |
+
except json.JSONDecodeError:
|
| 188 |
+
m = re.search(r"\{.*\}", action_str, re.DOTALL)
|
| 189 |
+
if m:
|
| 190 |
+
try:
|
| 191 |
+
action = json.loads(m.group())
|
| 192 |
+
except Exception:
|
| 193 |
+
pass
|
| 194 |
+
|
| 195 |
+
if action is None:
|
| 196 |
+
print(f" Step {step_num}: Could not parse action JSON, skipping.", file=sys.stderr)
|
| 197 |
+
log_step(step_num, action_str, -0.05, False, "json_parse_error")
|
| 198 |
+
break
|
| 199 |
+
|
| 200 |
+
action_label = json.dumps(action, separators=(",", ":"))
|
| 201 |
+
print(
|
| 202 |
+
f" Step {step_num:2d} | score={obs['current_score']:.4f} | action={action_label}",
|
| 203 |
+
file=sys.stderr,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
result = api_post("/step", action)
|
| 207 |
+
obs = result["observation"]
|
| 208 |
+
step_reward = result["reward"]
|
| 209 |
+
done = result["done"]
|
| 210 |
+
error_msg = None if obs["message"].startswith("Fill") or step_reward >= 0 else obs["message"]
|
| 211 |
+
|
| 212 |
+
print(f" -> {obs['message']}", file=sys.stderr)
|
| 213 |
+
|
| 214 |
+
rewards.append(step_reward)
|
| 215 |
+
steps_taken = step_num
|
| 216 |
+
|
| 217 |
+
log_step(
|
| 218 |
+
step = step_num,
|
| 219 |
+
action = action_label,
|
| 220 |
+
reward = step_reward,
|
| 221 |
+
done = done,
|
| 222 |
+
error = error_msg,
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
if done:
|
| 226 |
+
success = obs["current_score"] >= 0.95
|
| 227 |
+
break
|
| 228 |
+
|
| 229 |
+
time.sleep(0.3)
|
| 230 |
+
|
| 231 |
+
finally:
|
| 232 |
+
final = obs.get("current_score", 0.01) if isinstance(obs, dict) else 0.01
|
| 233 |
+
log_end(task_name=task_name, score=final, steps=steps_taken)
|
| 234 |
+
|
| 235 |
+
final_score = obs["current_score"]
|
| 236 |
+
print(
|
| 237 |
+
f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
|
| 238 |
+
file=sys.stderr,
|
| 239 |
+
)
|
| 240 |
+
return final_score
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# ------------------------------------------------------------------
|
| 244 |
+
# Main
|
| 245 |
+
# ------------------------------------------------------------------
|
| 246 |
+
|
| 247 |
+
def main():
|
| 248 |
+
print("Data Cleaning OpenEnv -- Baseline Inference", file=sys.stderr)
|
| 249 |
+
print(f"Model : {MODEL_NAME}", file=sys.stderr)
|
| 250 |
+
print(f"Env : {ENV_URL}", file=sys.stderr)
|
| 251 |
+
|
| 252 |
+
# Smoke-test health endpoint
|
| 253 |
+
try:
|
| 254 |
+
health = api_get("/health")
|
| 255 |
+
assert health.get("status") in ("ok", "healthy"), f"Unexpected status: {health}"
|
| 256 |
+
print("Health check: OK\n", file=sys.stderr)
|
| 257 |
+
except Exception as exc:
|
| 258 |
+
print(f"[ERROR] Environment not reachable at {ENV_URL}: {exc}", file=sys.stderr)
|
| 259 |
+
print("[ERROR] Make sure the server is running and ENV_URL is correct.", file=sys.stderr)
|
| 260 |
+
sys.exit(1)
|
| 261 |
+
|
| 262 |
+
scores = {}
|
| 263 |
+
for task_id in [1, 2, 3]:
|
| 264 |
+
try:
|
| 265 |
+
scores[f"task{task_id}"] = run_task(task_id)
|
| 266 |
+
except Exception as exc:
|
| 267 |
+
print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
|
| 268 |
+
scores[f"task{task_id}"] = 0.01
|
| 269 |
+
|
| 270 |
+
print("\n" + "="*60, file=sys.stderr)
|
| 271 |
+
print(" BASELINE RESULTS", file=sys.stderr)
|
| 272 |
+
print("="*60, file=sys.stderr)
|
| 273 |
+
for k, v in scores.items():
|
| 274 |
+
print(f" {k}: {v:.4f}", file=sys.stderr)
|
| 275 |
+
avg = round(sum(scores.values()) / len(scores), 4)
|
| 276 |
+
print(f" average: {avg:.4f}", file=sys.stderr)
|
| 277 |
+
print("="*60, file=sys.stderr)
|
| 278 |
+
|
| 279 |
+
# Write scores to file for automated validators
|
| 280 |
+
with open("baseline_scores.json", "w") as f:
|
| 281 |
+
json.dump({"scores": scores, "average": avg}, f, indent=2)
|
| 282 |
+
print("\nScores written to baseline_scores.json", file=sys.stderr)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
if __name__ == "__main__":
|
| 286 |
+
main()
|
inference_log.txt
ADDED
|
Binary file (28.1 kB). View file
|
|
|
models.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# models.py
|
| 2 |
+
|
| 3 |
+
class OrgOSAction(BaseModel):
|
| 4 |
+
app: str # "jira" | "zendesk" | "salesforce" | "workday"
|
| 5 |
+
operation: str # app-specific operation name
|
| 6 |
+
args: Dict[str, Any] = {}
|
| 7 |
+
|
| 8 |
+
class RewardBreakdown(BaseModel):
|
| 9 |
+
workflow_completion: float = 0.0 # 0.30 weight
|
| 10 |
+
rule_compliance: float = 0.0 # 0.25 weight
|
| 11 |
+
schema_adaptation: float = 0.0 # 0.20 weight
|
| 12 |
+
efficiency: float = 0.0 # 0.15 weight
|
| 13 |
+
policy_drift_handling: float = 0.0 # 0.10 weight
|
| 14 |
+
|
| 15 |
+
class OrgOSObservation(BaseModel):
|
| 16 |
+
done: bool
|
| 17 |
+
reward: float
|
| 18 |
+
current_score: float
|
| 19 |
+
workflow_id: str # "A", "B", or "C"
|
| 20 |
+
step_count: int
|
| 21 |
+
# Per-app state views (what the agent sees)
|
| 22 |
+
app_states: Dict[str, str] # app_name → CSV/JSON string preview
|
| 23 |
+
# Workflow progress
|
| 24 |
+
workflow_goal: str
|
| 25 |
+
completed_steps: List[str]
|
| 26 |
+
pending_steps: List[str]
|
| 27 |
+
# Schema drift info (partial — agent must probe to discover rest)
|
| 28 |
+
schema_hints: Dict[str, str] # e.g. {"jira.priority": "severity"}
|
| 29 |
+
# Business rules in effect this episode
|
| 30 |
+
active_rules: Dict[str, Any] # {"sla_p0_minutes": 15, "approval_threshold": 5000}
|
| 31 |
+
# Per-step feedback
|
| 32 |
+
rule_violations: List[str] # violations that just occurred
|
| 33 |
+
reward_breakdown: RewardBreakdown
|
| 34 |
+
message: str
|
| 35 |
+
|
| 36 |
+
class OrgOSState(BaseModel):
|
| 37 |
+
episode_id: str
|
| 38 |
+
workflow_id: str
|
| 39 |
+
schema_versions: Dict[str, str] # {"jira": "v2", "zendesk": "v1", ...}
|
| 40 |
+
step_count: int
|
| 41 |
+
max_steps: int
|
| 42 |
+
rule_violation_count: int
|
| 43 |
+
workflow_completion: float
|
| 44 |
+
rule_compliance_rate: float
|
| 45 |
+
policy_drift_active: bool
|
openenv.yaml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: data-cleaning-env
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
description: >
|
| 4 |
+
A real-world data cleaning environment where an AI agent fixes missing
|
| 5 |
+
values, duplicate rows, format inconsistencies, outliers, and dtype errors
|
| 6 |
+
across three progressively harder tasks.
|
| 7 |
+
|
| 8 |
+
author: openenv-hackathon
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- data-cleaning
|
| 12 |
+
- rl
|
| 13 |
+
- real-world
|
| 14 |
+
|
| 15 |
+
tasks:
|
| 16 |
+
- id: task1
|
| 17 |
+
name: "Fill Missing Values"
|
| 18 |
+
difficulty: easy
|
| 19 |
+
max_steps: 20
|
| 20 |
+
description: >
|
| 21 |
+
Fill all NaN values in an employee records dataset.
|
| 22 |
+
Columns with missing data: age, salary, department.
|
| 23 |
+
|
| 24 |
+
- id: task2
|
| 25 |
+
name: "Fix Formats and Remove Duplicates"
|
| 26 |
+
difficulty: medium
|
| 27 |
+
max_steps: 30
|
| 28 |
+
description: >
|
| 29 |
+
Standardise phone numbers (NNN-NNN-NNNN) and dates (YYYY-MM-DD)
|
| 30 |
+
in a product catalog, and remove ~15 duplicate rows.
|
| 31 |
+
|
| 32 |
+
- id: task3
|
| 33 |
+
name: "Full Cleaning Pipeline"
|
| 34 |
+
difficulty: hard
|
| 35 |
+
max_steps: 40
|
| 36 |
+
description: >
|
| 37 |
+
End-to-end pipeline on a customer database: fill missing values,
|
| 38 |
+
remove duplicates, drop outliers in purchase_amount, standardise
|
| 39 |
+
country capitalisation, and fix mixed date formats.
|
| 40 |
+
|
| 41 |
+
api:
|
| 42 |
+
health: GET /health
|
| 43 |
+
reset: POST /reset
|
| 44 |
+
step: POST /step
|
| 45 |
+
state: POST /state
|
| 46 |
+
docs: GET /docs
|
| 47 |
+
|
| 48 |
+
reward:
|
| 49 |
+
range: [0.001, 0.999]
|
| 50 |
+
partial: true
|
| 51 |
+
terminal_bonus: 0.0
|
| 52 |
+
|
| 53 |
+
observation_space:
|
| 54 |
+
type: object
|
| 55 |
+
fields:
|
| 56 |
+
done: boolean
|
| 57 |
+
reward: float
|
| 58 |
+
data_preview: string # First 10 rows as CSV
|
| 59 |
+
data_shape: list # [rows, cols]
|
| 60 |
+
missing_counts: object # {column: count}
|
| 61 |
+
duplicate_count: integer
|
| 62 |
+
dtype_issues: object # {column: issue_description}
|
| 63 |
+
task_description: string
|
| 64 |
+
message: string
|
| 65 |
+
step_count: integer
|
| 66 |
+
current_score: float # 0.0–1.0
|
| 67 |
+
|
| 68 |
+
action_space:
|
| 69 |
+
type: object
|
| 70 |
+
fields:
|
| 71 |
+
operation: string # fill_missing | drop_duplicates | fix_format | replace_value | drop_outliers | fix_dtype
|
| 72 |
+
column: string # optional depending on operation
|
| 73 |
+
params: object # optional operation parameters
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "data-cleaning-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Real-world data cleaning environment for OpenEnv / Scaler hackathon"
|
| 5 |
+
requires-python = ">=3.11"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"fastapi==0.135.2",
|
| 8 |
+
"uvicorn[standard]==0.40.0",
|
| 9 |
+
"pydantic==2.12.5",
|
| 10 |
+
"pandas==2.2.3",
|
| 11 |
+
"numpy==2.2.4",
|
| 12 |
+
"faker==40.12.0",
|
| 13 |
+
"openai==2.15.0",
|
| 14 |
+
"httpx==0.28.1",
|
| 15 |
+
"openenv-core==0.2.3",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.scripts]
|
| 19 |
+
server = "server.app:main"
|
| 20 |
+
|
| 21 |
+
[build-system]
|
| 22 |
+
requires = ["hatchling"]
|
| 23 |
+
build-backend = "hatchling.build"
|
| 24 |
+
|
| 25 |
+
[tool.hatch.build.targets.wheel]
|
| 26 |
+
packages = ["server"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.135.2
|
| 2 |
+
uvicorn[standard]==0.40.0
|
| 3 |
+
pydantic==2.12.5
|
| 4 |
+
pandas==2.2.3
|
| 5 |
+
numpy==2.2.4
|
| 6 |
+
faker==40.12.0
|
| 7 |
+
openai==2.15.0
|
| 8 |
+
httpx==0.28.1
|
| 9 |
+
openenv-core==0.2.3
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application exposing the OpenEnv-compatible HTTP API.
|
| 3 |
+
Endpoints: GET /health, GET /metadata, GET /schema,
|
| 4 |
+
POST /reset, POST /step, GET /state, POST /state, GET /docs
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Any, Dict, Optional
|
| 8 |
+
from fastapi import Body, FastAPI, HTTPException
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
import uvicorn
|
| 11 |
+
|
| 12 |
+
from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
|
| 13 |
+
from server.environment import DataCleaningEnvironment
|
| 14 |
+
|
| 15 |
+
app = FastAPI(
|
| 16 |
+
title="Data Cleaning OpenEnv",
|
| 17 |
+
description="A real-world data cleaning environment for AI agent training.",
|
| 18 |
+
version="0.1.0",
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Single shared environment instance (stateful server)
|
| 22 |
+
env = DataCleaningEnvironment()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# New reset body accepts workflow_id
|
| 26 |
+
class ResetRequest(BaseModel):
|
| 27 |
+
workflow_id: Optional[str] = None # "A", "B", "C", or None for round-robin
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class StepResponse(BaseModel):
|
| 31 |
+
observation: DataCleaningObservation
|
| 32 |
+
reward: float
|
| 33 |
+
done: bool
|
| 34 |
+
info: dict = {}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ------------------------------------------------------------------
|
| 38 |
+
# Routes
|
| 39 |
+
# ------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
@app.get("/health")
|
| 42 |
+
def health():
|
| 43 |
+
return {"status": "healthy"}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@app.get("/metadata")
|
| 47 |
+
def metadata():
|
| 48 |
+
return {
|
| 49 |
+
"name": "data-cleaning-env",
|
| 50 |
+
"description": (
|
| 51 |
+
"A real-world data cleaning environment where an AI agent fixes "
|
| 52 |
+
"missing values, duplicate rows, format inconsistencies, outliers, "
|
| 53 |
+
"and dtype errors across three progressively harder tasks."
|
| 54 |
+
),
|
| 55 |
+
"version": "0.1.0",
|
| 56 |
+
"tags": ["openenv", "data-cleaning", "rl", "real-world"],
|
| 57 |
+
"tasks": [
|
| 58 |
+
{"id": "task1", "name": "Fill Missing Values", "difficulty": "easy"},
|
| 59 |
+
{"id": "task2", "name": "Fix Formats and Remove Duplicates", "difficulty": "medium"},
|
| 60 |
+
{"id": "task3", "name": "Full Cleaning Pipeline", "difficulty": "hard"},
|
| 61 |
+
],
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@app.get("/schema")
|
| 66 |
+
def schema():
|
| 67 |
+
return {
|
| 68 |
+
"action": {
|
| 69 |
+
"type": "object",
|
| 70 |
+
"properties": {
|
| 71 |
+
"operation": {
|
| 72 |
+
"type": "string",
|
| 73 |
+
"enum": [
|
| 74 |
+
"fill_missing",
|
| 75 |
+
"drop_duplicates",
|
| 76 |
+
"fix_format",
|
| 77 |
+
"replace_value",
|
| 78 |
+
"drop_outliers",
|
| 79 |
+
"fix_dtype",
|
| 80 |
+
],
|
| 81 |
+
},
|
| 82 |
+
"column": {"type": "string", "nullable": True},
|
| 83 |
+
"params": {"type": "object", "nullable": True},
|
| 84 |
+
},
|
| 85 |
+
"required": ["operation"],
|
| 86 |
+
},
|
| 87 |
+
"observation": {
|
| 88 |
+
"type": "object",
|
| 89 |
+
"properties": {
|
| 90 |
+
"done": {"type": "boolean"},
|
| 91 |
+
"reward": {"type": "number"},
|
| 92 |
+
"data_preview": {"type": "string"},
|
| 93 |
+
"data_shape": {"type": "array", "items": {"type": "integer"}},
|
| 94 |
+
"missing_counts": {"type": "object"},
|
| 95 |
+
"duplicate_count": {"type": "integer"},
|
| 96 |
+
"dtype_issues": {"type": "object"},
|
| 97 |
+
"task_description": {"type": "string"},
|
| 98 |
+
"message": {"type": "string"},
|
| 99 |
+
"step_count": {"type": "integer"},
|
| 100 |
+
"current_score": {"type": "number"},
|
| 101 |
+
},
|
| 102 |
+
},
|
| 103 |
+
"state": {
|
| 104 |
+
"type": "object",
|
| 105 |
+
"properties": {
|
| 106 |
+
"episode_id": {"type": "string"},
|
| 107 |
+
"task_id": {"type": "integer"},
|
| 108 |
+
"step_count": {"type": "integer"},
|
| 109 |
+
"max_steps": {"type": "integer"},
|
| 110 |
+
"total_errors": {"type": "integer"},
|
| 111 |
+
"errors_remaining": {"type": "integer"},
|
| 112 |
+
},
|
| 113 |
+
},
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@app.post("/reset", response_model=StepResponse)
|
| 118 |
+
def reset(req: ResetRequest = ResetRequest()):
|
| 119 |
+
try:
|
| 120 |
+
obs = env.reset(task_id=req.task_id)
|
| 121 |
+
except ValueError as e:
|
| 122 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 123 |
+
return StepResponse(observation=obs, reward=obs.reward, done=False)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@app.post("/step", response_model=StepResponse)
|
| 127 |
+
async def step(body: Dict[str, Any] = Body(...)):
|
| 128 |
+
"""
|
| 129 |
+
Accept both openenv-core wrapped format:
|
| 130 |
+
{"action": {"operation": "...", ...}, "timeout_s": 15}
|
| 131 |
+
and direct format (for backward compat with our own client/inference):
|
| 132 |
+
{"operation": "...", "column": "...", "params": {...}}
|
| 133 |
+
"""
|
| 134 |
+
action_data = body.get("action", body)
|
| 135 |
+
try:
|
| 136 |
+
action = DataCleaningAction(**action_data)
|
| 137 |
+
obs = env.step(action)
|
| 138 |
+
except (TypeError, KeyError, Exception) as e:
|
| 139 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 140 |
+
return StepResponse(observation=obs, reward=obs.reward, done=obs.done)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@app.get("/state", response_model=DataCleaningState)
|
| 144 |
+
def state_get():
|
| 145 |
+
"""GET /state — openenv-core spec."""
|
| 146 |
+
return env.state()
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@app.post("/state", response_model=DataCleaningState)
|
| 150 |
+
def state_post():
|
| 151 |
+
"""POST /state — backward compatibility."""
|
| 152 |
+
return env.state()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
@app.get("/", response_class=HTMLResponse)
|
| 157 |
+
def ui():
|
| 158 |
+
"""Serve the demo dashboard."""
|
| 159 |
+
return FileResponse("ui/index.html")
|
| 160 |
+
|
| 161 |
+
@app.get("/schema/apps")
|
| 162 |
+
def app_schemas():
|
| 163 |
+
"""Return the canonical action space per app — used by the UI."""
|
| 164 |
+
return {...} # maps app → list of operations + their arg schemas
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ------------------------------------------------------------------
|
| 168 |
+
# Entry point (required by openenv-core and [project.scripts])
|
| 169 |
+
# ------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
def main():
|
| 172 |
+
uvicorn.run("server.app:app", host="0.0.0.0", port=8000)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
|
| 176 |
+
main()
|
server/apps/base_app.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class BaseApp(ABC):
|
| 2 |
+
APP_NAME: str = ""
|
| 3 |
+
|
| 4 |
+
# --- Core interface every app must implement ---
|
| 5 |
+
@abstractmethod
|
| 6 |
+
def initialize(self, records: List[Dict]) -> None:
|
| 7 |
+
"""Load synthetic records into in-memory state."""
|
| 8 |
+
|
| 9 |
+
@abstractmethod
|
| 10 |
+
def execute(self, operation: str, args: Dict) -> Dict:
|
| 11 |
+
"""Execute an operation. Returns {"success": bool, "data": ..., "message": str}"""
|
| 12 |
+
|
| 13 |
+
@abstractmethod
|
| 14 |
+
def get_state_view(self, max_rows: int = 5) -> str:
|
| 15 |
+
"""Return agent-visible snapshot as a compact string."""
|
| 16 |
+
|
| 17 |
+
@abstractmethod
|
| 18 |
+
def count_open_items(self) -> int:
|
| 19 |
+
"""Count pending/open work items (used by grader)."""
|
server/business_rules.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DEFAULT_RULES = {
|
| 2 |
+
"sla_p0_minutes": 30, # P0 tickets: acknowledge within 30 min
|
| 3 |
+
"sla_p1_hours": 4, # P1 tickets: first response within 4h
|
| 4 |
+
"approval_threshold": 10_000, # $ above which manager approval needed
|
| 5 |
+
"max_tickets_per_agent": 10, # RBAC: agent capacity cap
|
| 6 |
+
"gdpr_max_days": 30, # compliance: GDPR ticket resolution
|
| 7 |
+
"rbac": {
|
| 8 |
+
"support": {"salesforce": ["read"], "jira": ["read", "create_issue"]},
|
| 9 |
+
"engineer": {"jira": ["*"], "zendesk": ["read"]},
|
| 10 |
+
"manager": {"*": ["*"]},
|
| 11 |
+
}
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
POLICY_DRIFT_EVENTS = {
|
| 15 |
+
"sla_tighten": {"sla_p0_minutes": 15, "sla_p1_hours": 2},
|
| 16 |
+
"approval_tighten": {"approval_threshold": 5_000},
|
| 17 |
+
"gdpr_expedite": {"gdpr_max_days": 7},
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
class BusinessRuleEngine:
|
| 21 |
+
def __init__(self):
|
| 22 |
+
self.rules = DEFAULT_RULES.copy()
|
| 23 |
+
self._violation_log: List[str] = []
|
| 24 |
+
|
| 25 |
+
def apply_policy_drift(self, event: str) -> None:
|
| 26 |
+
"""Called mid-episode or at episode start to change rules."""
|
| 27 |
+
if event in POLICY_DRIFT_EVENTS:
|
| 28 |
+
self.rules.update(POLICY_DRIFT_EVENTS[event])
|
| 29 |
+
|
| 30 |
+
def check_action(self, action: OrgOSAction, context: Dict) -> Tuple[bool, str, float]:
|
| 31 |
+
"""Returns (allowed, reason, penalty)."""
|
| 32 |
+
violations = []
|
| 33 |
+
|
| 34 |
+
# RBAC check
|
| 35 |
+
role = context.get("agent_role", "support")
|
| 36 |
+
app_perms = self.rules["rbac"].get(role, {})
|
| 37 |
+
allowed_ops = app_perms.get(action.app, app_perms.get("*", []))
|
| 38 |
+
if "*" not in allowed_ops and action.operation not in allowed_ops:
|
| 39 |
+
violations.append(f"RBAC: {role} cannot {action.operation} on {action.app}")
|
| 40 |
+
return False, violations[0], -0.25
|
| 41 |
+
|
| 42 |
+
# Approval threshold check
|
| 43 |
+
if action.operation in ("request_budget_approval", "update_deal_stage"):
|
| 44 |
+
amount = action.args.get("amount", 0)
|
| 45 |
+
if amount > self.rules["approval_threshold"] and not context.get("manager_approved"):
|
| 46 |
+
violations.append(f"Approval required: ${amount} > ${self.rules['approval_threshold']}")
|
| 47 |
+
return False, violations[0], -0.10
|
| 48 |
+
|
| 49 |
+
self._violation_log.extend(violations)
|
| 50 |
+
return True, "", 0.0
|
| 51 |
+
|
| 52 |
+
def check_sla(self, ticket: Dict, elapsed_minutes: float) -> Tuple[bool, float]:
|
| 53 |
+
"""Returns (sla_met, penalty)."""
|
| 54 |
+
priority = ticket.get("priority", ticket.get("urgency", "p2"))
|
| 55 |
+
if priority in ("p0", "critical") and elapsed_minutes > self.rules["sla_p0_minutes"]:
|
| 56 |
+
return False, -0.15
|
| 57 |
+
return True, 0.0
|
| 58 |
+
|
| 59 |
+
def get_violations_this_step(self) -> List[str]:
|
| 60 |
+
v = self._violation_log.copy()
|
| 61 |
+
self._violation_log.clear()
|
| 62 |
+
return v
|
server/data_generator.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synthetic dataset generation with a fixed seed for full reproducibility.
|
| 3 |
+
All datasets are generated purely from numpy/random — no external downloads.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import random
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
SEED = 42
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
# Task 1 — Employee records with missing values
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
|
| 17 |
+
def generate_task1_datasets():
|
| 18 |
+
"""Returns (dirty_df, clean_df) for Task 1."""
|
| 19 |
+
rng = np.random.default_rng(SEED)
|
| 20 |
+
random.seed(SEED)
|
| 21 |
+
|
| 22 |
+
n = 100
|
| 23 |
+
departments = ["Engineering", "Marketing", "Sales", "HR", "Finance"]
|
| 24 |
+
first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
|
| 25 |
+
"Heidi", "Ivan", "Judy", "Karl", "Laura", "Mallory", "Niaj",
|
| 26 |
+
"Oscar", "Peggy", "Quinn", "Romeo", "Sybil", "Trent"]
|
| 27 |
+
last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson", "Davis",
|
| 28 |
+
"Miller", "Anderson", "Thomas", "Jackson"]
|
| 29 |
+
|
| 30 |
+
names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)]
|
| 31 |
+
ages = rng.integers(22, 60, size=n).astype(float)
|
| 32 |
+
salaries = rng.integers(40_000, 120_000, size=n).astype(float)
|
| 33 |
+
depts = rng.choice(departments, size=n)
|
| 34 |
+
experience = rng.integers(0, 30, size=n).astype(float)
|
| 35 |
+
|
| 36 |
+
clean_df = pd.DataFrame({
|
| 37 |
+
"name": names,
|
| 38 |
+
"age": ages,
|
| 39 |
+
"salary": salaries,
|
| 40 |
+
"department": depts,
|
| 41 |
+
"experience": experience,
|
| 42 |
+
})
|
| 43 |
+
|
| 44 |
+
dirty_df = clean_df.copy()
|
| 45 |
+
|
| 46 |
+
# Inject ~20 % NaN into age, salary, department
|
| 47 |
+
for col, frac in [("age", 0.20), ("salary", 0.20), ("department", 0.10)]:
|
| 48 |
+
idx = rng.choice(n, size=int(n * frac), replace=False)
|
| 49 |
+
dirty_df.loc[idx, col] = np.nan
|
| 50 |
+
|
| 51 |
+
return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
# Task 2 — Product catalog with format & duplicate issues
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
|
| 58 |
+
def _scramble_phone(phone: str, rng) -> str:
|
| 59 |
+
digits = phone.replace("-", "")
|
| 60 |
+
fmt = rng.integers(0, 3)
|
| 61 |
+
if fmt == 0:
|
| 62 |
+
return digits # 5551234567
|
| 63 |
+
elif fmt == 1:
|
| 64 |
+
return f"({digits[:3]}){digits[3:]}" # (555)1234567
|
| 65 |
+
else:
|
| 66 |
+
return phone # 555-123-4567 (canonical)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _scramble_date(date_str: str, rng) -> str:
|
| 70 |
+
dt = pd.to_datetime(date_str)
|
| 71 |
+
fmt = rng.integers(0, 3)
|
| 72 |
+
if fmt == 0:
|
| 73 |
+
return dt.strftime("%Y-%m-%d")
|
| 74 |
+
elif fmt == 1:
|
| 75 |
+
return dt.strftime("%b %d %Y")
|
| 76 |
+
else:
|
| 77 |
+
return dt.strftime("%d/%m/%Y")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def generate_task2_datasets():
|
| 81 |
+
"""Returns (dirty_df, clean_df) for Task 2."""
|
| 82 |
+
rng = np.random.default_rng(SEED)
|
| 83 |
+
random.seed(SEED)
|
| 84 |
+
|
| 85 |
+
n = 200
|
| 86 |
+
categories = ["Electronics", "Clothing", "Food", "Books", "Toys"]
|
| 87 |
+
|
| 88 |
+
product_ids = [f"P{str(i).zfill(4)}" for i in range(1, n + 1)]
|
| 89 |
+
product_names = [f"Product_{i}" for i in range(1, n + 1)]
|
| 90 |
+
prices = np.round(rng.uniform(5.0, 500.0, size=n), 2)
|
| 91 |
+
categories_col = rng.choice(categories, size=n)
|
| 92 |
+
phones = [
|
| 93 |
+
f"{rng.integers(100,999)}-{rng.integers(100,999)}-{rng.integers(1000,9999)}"
|
| 94 |
+
for _ in range(n)
|
| 95 |
+
]
|
| 96 |
+
days_offset = rng.integers(0, 1000, size=n)
|
| 97 |
+
dates = [
|
| 98 |
+
(pd.Timestamp("2020-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
|
| 99 |
+
for d in days_offset
|
| 100 |
+
]
|
| 101 |
+
|
| 102 |
+
clean_df = pd.DataFrame({
|
| 103 |
+
"product_id": product_ids,
|
| 104 |
+
"product_name": product_names,
|
| 105 |
+
"price": prices,
|
| 106 |
+
"category": categories_col,
|
| 107 |
+
"phone": phones,
|
| 108 |
+
"listed_date": dates,
|
| 109 |
+
})
|
| 110 |
+
|
| 111 |
+
dirty_df = clean_df.copy()
|
| 112 |
+
|
| 113 |
+
# Scramble ~60 % of phone formats
|
| 114 |
+
phone_idx = rng.choice(n, size=int(n * 0.6), replace=False)
|
| 115 |
+
dirty_df.loc[phone_idx, "phone"] = [
|
| 116 |
+
_scramble_phone(dirty_df.loc[i, "phone"], rng) for i in phone_idx
|
| 117 |
+
]
|
| 118 |
+
|
| 119 |
+
# Scramble ~60 % of date formats
|
| 120 |
+
date_idx = rng.choice(n, size=int(n * 0.6), replace=False)
|
| 121 |
+
dirty_df.loc[date_idx, "listed_date"] = [
|
| 122 |
+
_scramble_date(dirty_df.loc[i, "listed_date"], rng) for i in date_idx
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
# Add 15 duplicate rows
|
| 126 |
+
dup_idx = rng.choice(n, size=15, replace=False)
|
| 127 |
+
dup_rows = dirty_df.iloc[dup_idx].copy()
|
| 128 |
+
dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True)
|
| 129 |
+
|
| 130 |
+
return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# ---------------------------------------------------------------------------
|
| 134 |
+
# Task 3 — Customer database: full pipeline
|
| 135 |
+
# ---------------------------------------------------------------------------
|
| 136 |
+
|
| 137 |
+
def generate_task3_datasets():
|
| 138 |
+
"""Returns (dirty_df, clean_df) for Task 3."""
|
| 139 |
+
rng = np.random.default_rng(SEED)
|
| 140 |
+
random.seed(SEED)
|
| 141 |
+
|
| 142 |
+
n = 300
|
| 143 |
+
countries = ["USA", "UK", "Canada", "Australia", "Germany"]
|
| 144 |
+
first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
|
| 145 |
+
"Heidi", "Ivan", "Judy"]
|
| 146 |
+
last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson"]
|
| 147 |
+
|
| 148 |
+
names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)]
|
| 149 |
+
ages = rng.integers(18, 75, size=n).astype(float)
|
| 150 |
+
purchase_amounts = np.round(rng.uniform(10.0, 500.0, size=n), 2)
|
| 151 |
+
countries_col = rng.choice(countries, size=n)
|
| 152 |
+
emails = [f"user{i}@example.com" for i in range(1, n + 1)]
|
| 153 |
+
days_offset = rng.integers(0, 730, size=n)
|
| 154 |
+
signup_dates = [
|
| 155 |
+
(pd.Timestamp("2022-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
|
| 156 |
+
for d in days_offset
|
| 157 |
+
]
|
| 158 |
+
|
| 159 |
+
clean_df = pd.DataFrame({
|
| 160 |
+
"name": names,
|
| 161 |
+
"age": ages,
|
| 162 |
+
"purchase_amount": purchase_amounts,
|
| 163 |
+
"country": countries_col,
|
| 164 |
+
"email": emails,
|
| 165 |
+
"signup_date": signup_dates,
|
| 166 |
+
})
|
| 167 |
+
|
| 168 |
+
dirty_df = clean_df.copy()
|
| 169 |
+
|
| 170 |
+
# Missing values (~15 % in age, purchase_amount, country, signup_date)
|
| 171 |
+
for col, frac in [("age", 0.15), ("purchase_amount", 0.15),
|
| 172 |
+
("country", 0.10), ("signup_date", 0.10)]:
|
| 173 |
+
idx = rng.choice(n, size=int(n * frac), replace=False)
|
| 174 |
+
dirty_df.loc[idx, col] = np.nan
|
| 175 |
+
|
| 176 |
+
# Outliers in purchase_amount (~3 %)
|
| 177 |
+
out_idx = rng.choice(n, size=int(n * 0.03), replace=False)
|
| 178 |
+
dirty_df.loc[out_idx, "purchase_amount"] = (
|
| 179 |
+
dirty_df.loc[out_idx, "purchase_amount"] * 10
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# Mixed case in country (~40 %)
|
| 183 |
+
case_idx = rng.choice(n, size=int(n * 0.40), replace=False)
|
| 184 |
+
dirty_df.loc[case_idx, "country"] = dirty_df.loc[case_idx, "country"].str.lower()
|
| 185 |
+
|
| 186 |
+
# Mixed date formats (~50 %) — only scramble non-null entries
|
| 187 |
+
date_idx = rng.choice(n, size=int(n * 0.50), replace=False)
|
| 188 |
+
valid_date_idx = [i for i in date_idx if pd.notna(dirty_df.loc[i, "signup_date"])]
|
| 189 |
+
for i in valid_date_idx:
|
| 190 |
+
dirty_df.loc[i, "signup_date"] = _scramble_date(dirty_df.loc[i, "signup_date"], rng)
|
| 191 |
+
|
| 192 |
+
# 20 duplicate rows
|
| 193 |
+
dup_idx = rng.choice(n, size=20, replace=False)
|
| 194 |
+
dup_rows = dirty_df.iloc[dup_idx].copy()
|
| 195 |
+
dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True)
|
| 196 |
+
|
| 197 |
+
return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def generate_jira_records(n=50, seed=42) -> List[Dict]:
|
| 201 |
+
"""50 engineering tickets with priority, assignee, status, linked_ticket."""
|
| 202 |
+
|
| 203 |
+
def generate_zendesk_records(n=40, seed=42) -> List[Dict]:
|
| 204 |
+
"""40 support tickets with urgency, agent_email, state, customer_id."""
|
| 205 |
+
|
| 206 |
+
def generate_salesforce_records(n=30, seed=42) -> List[Dict]:
|
| 207 |
+
"""30 accounts with deal_stage, health, owner_name, arr."""
|
| 208 |
+
|
| 209 |
+
def generate_workday_records(n=20, seed=42) -> List[Dict]:
|
| 210 |
+
"""20 employee/HR records with level, manager_id, resolution."""
|
| 211 |
+
|
| 212 |
+
def generate_episode_data(workflow_id: str, seed: int = 42) -> Dict[str, List[Dict]]:
|
| 213 |
+
"""Generate correlated data for a full episode across all 4 apps.
|
| 214 |
+
Ensures tickets in Zendesk reference customers in Salesforce, etc."""
|
server/environment.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class OrgOSEnvironment:
|
| 2 |
+
MAX_STEPS = {"A": 15, "B": 20, "C": 18}
|
| 3 |
+
WORKFLOWS = ["A", "B", "C"]
|
| 4 |
+
|
| 5 |
+
def __init__(self):
|
| 6 |
+
self._drift = SchemaDriftEngine(seed=42)
|
| 7 |
+
self._rules = BusinessRuleEngine()
|
| 8 |
+
self._workflow = WorkflowEngine()
|
| 9 |
+
self._apps: Dict[str, BaseApp] = {
|
| 10 |
+
"jira": JiraApp(self._drift),
|
| 11 |
+
"zendesk": ZendeskApp(self._drift),
|
| 12 |
+
"salesforce": SalesforceApp(self._drift),
|
| 13 |
+
"workday": WorkdayApp(self._drift),
|
| 14 |
+
}
|
| 15 |
+
self._episode_num = 0
|
| 16 |
+
self._episode_id = ""
|
| 17 |
+
self._workflow_id = "A"
|
| 18 |
+
self._step_count = 0
|
| 19 |
+
self._last_score = 0.001
|
| 20 |
+
self._policy_drift_applied = False
|
| 21 |
+
|
| 22 |
+
# Reward component trackers
|
| 23 |
+
self._wf_score = 0.0 # workflow completion
|
| 24 |
+
self._rule_score = 1.0 # compliance (starts perfect, penalized on violation)
|
| 25 |
+
self._schema_score = 0.0 # schema adaptation successes
|
| 26 |
+
self._efficiency = 1.0 # degrades with no-ops
|
| 27 |
+
self._policy_score = 0.0 # policy drift handling
|
| 28 |
+
|
| 29 |
+
def reset(self, workflow_id: Optional[str] = None) -> OrgOSObservation:
|
| 30 |
+
self._episode_num += 1
|
| 31 |
+
self._episode_id = str(uuid.uuid4())
|
| 32 |
+
self._workflow_id = workflow_id or self.WORKFLOWS[(self._episode_num - 1) % 3]
|
| 33 |
+
self._step_count = 0
|
| 34 |
+
self._last_score = 0.001
|
| 35 |
+
self._rule_score = 1.0
|
| 36 |
+
self._wf_score = 0.0
|
| 37 |
+
self._schema_score = 0.0
|
| 38 |
+
self._efficiency = 1.0
|
| 39 |
+
self._policy_score = 0.0
|
| 40 |
+
self._policy_drift_applied = False
|
| 41 |
+
|
| 42 |
+
# Sample schema versions for this episode
|
| 43 |
+
self._drift.sample_for_episode(self._episode_num)
|
| 44 |
+
|
| 45 |
+
# Possibly activate policy drift (every 3rd episode)
|
| 46 |
+
self._rules = BusinessRuleEngine()
|
| 47 |
+
if self._episode_num % 3 == 0:
|
| 48 |
+
self._rules.apply_policy_drift("sla_tighten")
|
| 49 |
+
self._policy_drift_applied = True
|
| 50 |
+
|
| 51 |
+
# Load fresh synthetic data into each app
|
| 52 |
+
records = generate_episode_data(self._workflow_id, seed=42 + self._episode_num)
|
| 53 |
+
for app_name, app in self._apps.items():
|
| 54 |
+
app.initialize(records[app_name])
|
| 55 |
+
|
| 56 |
+
# Start workflow tracking
|
| 57 |
+
self._workflow.start(self._workflow_id)
|
| 58 |
+
|
| 59 |
+
return self._build_obs(0.001, False, "Episode started. Study the workflow goal and schema hints.")
|
| 60 |
+
|
| 61 |
+
def step(self, action: OrgOSAction) -> OrgOSObservation:
|
| 62 |
+
self._step_count += 1
|
| 63 |
+
old_score = self._last_score
|
| 64 |
+
extra_penalty = 0.0
|
| 65 |
+
|
| 66 |
+
# 1. Validate app exists
|
| 67 |
+
if action.app not in self._apps:
|
| 68 |
+
return self._build_obs(old_score - 0.05, False, f"Unknown app '{action.app}'")
|
| 69 |
+
|
| 70 |
+
# 2. Business rule check (RBAC, approvals)
|
| 71 |
+
ctx = {"agent_role": "support", "manager_approved": False}
|
| 72 |
+
allowed, reason, rule_penalty = self._rules.check_action(action, ctx)
|
| 73 |
+
if not allowed:
|
| 74 |
+
self._rule_score = max(0.0, self._rule_score - 0.08)
|
| 75 |
+
extra_penalty = rule_penalty
|
| 76 |
+
return self._build_obs(
|
| 77 |
+
max(-0.25, old_score + extra_penalty),
|
| 78 |
+
False, f"Rule violation: {reason}"
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
# 3. Execute on app
|
| 82 |
+
result = self._apps[action.app].execute(action.operation, action.args)
|
| 83 |
+
if not result["success"]:
|
| 84 |
+
self._efficiency -= 0.02 # penalize failed/no-op actions
|
| 85 |
+
return self._build_obs(old_score - 0.01, False, result["message"])
|
| 86 |
+
|
| 87 |
+
# 4. Check schema drift adaptation
|
| 88 |
+
# If agent used canonical field names on a v2/v3 schema → penalize
|
| 89 |
+
if result.get("schema_error"):
|
| 90 |
+
extra_penalty -= 0.20
|
| 91 |
+
return self._build_obs(old_score - 0.20, False,
|
| 92 |
+
f"Stale schema: field '{result['schema_error']}' not found in current schema")
|
| 93 |
+
elif result.get("schema_adapted"):
|
| 94 |
+
# Agent correctly used drifted field name → bonus
|
| 95 |
+
self._schema_score = min(1.0, self._schema_score + 0.1)
|
| 96 |
+
|
| 97 |
+
# 5. Re-evaluate workflow completion
|
| 98 |
+
self._wf_score = self._workflow.evaluate(self._apps)
|
| 99 |
+
|
| 100 |
+
# 6. Check SLA violations
|
| 101 |
+
sla_ok, sla_pen = self._rules.check_sla(result.get("ticket", {}),
|
| 102 |
+
self._step_count * 2.5) # 2.5 min per step
|
| 103 |
+
if not sla_ok:
|
| 104 |
+
extra_penalty += sla_pen
|
| 105 |
+
self._rule_score = max(0.0, self._rule_score - 0.05)
|
| 106 |
+
|
| 107 |
+
# 7. Compute composite score
|
| 108 |
+
new_score = self._compute_score()
|
| 109 |
+
delta = new_score - old_score + extra_penalty
|
| 110 |
+
self._last_score = max(0.001, min(0.999, new_score))
|
| 111 |
+
|
| 112 |
+
# 8. Terminal condition
|
| 113 |
+
done = (self._wf_score >= 0.95 or
|
| 114 |
+
self._step_count >= self.MAX_STEPS[self._workflow_id])
|
| 115 |
+
if done and self._wf_score >= 0.95:
|
| 116 |
+
delta += 0.20 # terminal bonus
|
| 117 |
+
|
| 118 |
+
return self._build_obs(delta, done, result["message"])
|
| 119 |
+
|
| 120 |
+
def _compute_score(self) -> float:
|
| 121 |
+
raw = (
|
| 122 |
+
0.30 * self._wf_score +
|
| 123 |
+
0.25 * self._rule_score +
|
| 124 |
+
0.20 * self._schema_score +
|
| 125 |
+
0.15 * self._efficiency +
|
| 126 |
+
0.10 * self._policy_score
|
| 127 |
+
)
|
| 128 |
+
return max(0.001, min(0.999, raw))
|
| 129 |
+
|
| 130 |
+
def state(self) -> OrgOSState:
|
| 131 |
+
return OrgOSState(
|
| 132 |
+
episode_id=self._episode_id,
|
| 133 |
+
workflow_id=self._workflow_id,
|
| 134 |
+
schema_versions=self._drift._versions,
|
| 135 |
+
step_count=self._step_count,
|
| 136 |
+
max_steps=self.MAX_STEPS.get(self._workflow_id, 15),
|
| 137 |
+
rule_violation_count=len(self._rules._violation_log),
|
| 138 |
+
workflow_completion=self._wf_score,
|
| 139 |
+
rule_compliance_rate=self._rule_score,
|
| 140 |
+
policy_drift_active=self._policy_drift_applied,
|
| 141 |
+
)
|
server/schema_drift.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Canonical → actual field names per app per schema version
|
| 2 |
+
SCHEMA_MAP = {
|
| 3 |
+
"jira": {
|
| 4 |
+
"v1": {"priority": "priority", "assignee": "assignee", "status": "status"},
|
| 5 |
+
"v2": {"priority": "severity", "assignee": "owner", "status": "state"},
|
| 6 |
+
"v3": {"priority": "urgency_level", "assignee": "assigned_to", "status": "current_state",
|
| 7 |
+
"sla_deadline": "due_by"}, # v3 adds a new field
|
| 8 |
+
},
|
| 9 |
+
"zendesk": {
|
| 10 |
+
"v1": {"urgency": "urgency", "agent_email": "agent_email", "state": "state"},
|
| 11 |
+
"v2": {"urgency": "priority", "agent_email": "handler", "state": "ticket_state"},
|
| 12 |
+
"v3": {"urgency": "impact_level", "agent_email": "assigned_agent","state": "resolution_status"},
|
| 13 |
+
},
|
| 14 |
+
"salesforce": {
|
| 15 |
+
"v1": {"deal_stage": "deal_stage", "health": "health", "owner": "owner_name"},
|
| 16 |
+
"v2": {"deal_stage": "pipeline_stage","health": "account_health", "owner": "account_owner"},
|
| 17 |
+
"v3": {"deal_stage": "stage", "health": "risk_score", "owner": "rep_email",
|
| 18 |
+
"arr": "annual_recurring_revenue"},
|
| 19 |
+
},
|
| 20 |
+
"workday": {
|
| 21 |
+
"v1": {"level": "level", "manager_id": "manager_id", "status": "resolution"},
|
| 22 |
+
"v2": {"level": "job_level", "manager_id": "reports_to", "status": "request_status"},
|
| 23 |
+
"v3": {"level": "seniority", "manager_id": "direct_manager","status": "approval_state"},
|
| 24 |
+
},
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
class SchemaDriftEngine:
|
| 28 |
+
def __init__(self, seed: int = 42):
|
| 29 |
+
self._seed = seed
|
| 30 |
+
self._versions: Dict[str, str] = {} # app → "v1"/"v2"/"v3"
|
| 31 |
+
|
| 32 |
+
def sample_for_episode(self, episode_num: int) -> None:
|
| 33 |
+
"""Sample schema versions deterministically per episode."""
|
| 34 |
+
rng = random.Random(self._seed + episode_num)
|
| 35 |
+
self._versions = {app: rng.choice(["v1", "v2", "v3"]) for app in SCHEMA_MAP}
|
| 36 |
+
|
| 37 |
+
def translate_record(self, record: Dict, app: str) -> Dict:
|
| 38 |
+
"""Rename canonical field names → current schema's field names."""
|
| 39 |
+
version = self._versions.get(app, "v1")
|
| 40 |
+
mapping = SCHEMA_MAP[app][version]
|
| 41 |
+
return {mapping.get(k, k): v for k, v in record.items()}
|
| 42 |
+
|
| 43 |
+
def get_hints(self) -> Dict[str, str]:
|
| 44 |
+
"""Return partial schema hints visible in observation.
|
| 45 |
+
Only reveal 1 random field per app (agent must probe for the rest)."""
|
| 46 |
+
hints = {}
|
| 47 |
+
rng = random.Random(self._seed)
|
| 48 |
+
for app, version in self._versions.items():
|
| 49 |
+
mapping = SCHEMA_MAP[app][version]
|
| 50 |
+
# Reveal only fields that actually changed (v2/v3)
|
| 51 |
+
changed = {f"{app}.{k}": v for k, v in mapping.items() if k != v}
|
| 52 |
+
if changed:
|
| 53 |
+
key = rng.choice(list(changed.keys()))
|
| 54 |
+
hints[key] = changed[key]
|
| 55 |
+
return hints
|
server/tasks/__init__.py
ADDED
|
File without changes
|
server/tasks/task1_missing.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 1 — Easy: Fill Missing Values
|
| 3 |
+
Objective: Fill all NaN values in the employee records DataFrame.
|
| 4 |
+
Score: 1.0 - (remaining_nulls / original_nulls)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from server.data_generator import generate_task1_datasets
|
| 8 |
+
|
| 9 |
+
TASK_ID = 1
|
| 10 |
+
MAX_STEPS = 20
|
| 11 |
+
DESCRIPTION = (
|
| 12 |
+
"Task 1 (Easy) — Fill Missing Values\n"
|
| 13 |
+
"You have an employee records dataset with missing values (NaN) in "
|
| 14 |
+
"'age', 'salary', and 'department' columns. "
|
| 15 |
+
"Your goal is to fill all missing values so the dataset is complete.\n\n"
|
| 16 |
+
"Available operation: fill_missing\n"
|
| 17 |
+
" params.strategy: 'median' | 'mean' | 'mode' | 'constant'\n"
|
| 18 |
+
" params.value: (required when strategy='constant') the fill value\n"
|
| 19 |
+
"Example action: {\"operation\": \"fill_missing\", \"column\": \"age\", \"params\": {\"strategy\": \"median\"}}"
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load():
|
| 24 |
+
"""Return (dirty_df, clean_df, original_null_count)."""
|
| 25 |
+
dirty, clean = generate_task1_datasets()
|
| 26 |
+
original_nulls = int(dirty.isnull().sum().sum())
|
| 27 |
+
return dirty.copy(), clean, original_nulls
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def score(current_df, original_nulls: int) -> float:
|
| 31 |
+
"""Score in [0, 1]: fraction of nulls filled."""
|
| 32 |
+
if original_nulls == 0:
|
| 33 |
+
return 0.99
|
| 34 |
+
remaining = int(current_df.isnull().sum().sum())
|
| 35 |
+
return round(max(0.01, min(0.99, 1.0 - remaining / original_nulls)), 4)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def count_errors(current_df) -> int:
|
| 39 |
+
return int(current_df.isnull().sum().sum())
|
server/tasks/task2_format.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 2 — Medium: Fix Formats + Remove Duplicates
|
| 3 |
+
Objective: Standardise phone & date formats and drop duplicate rows.
|
| 4 |
+
Score: weighted average of format_score (0.7) + dupe_score (0.3)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from server.data_generator import generate_task2_datasets
|
| 10 |
+
|
| 11 |
+
TASK_ID = 2
|
| 12 |
+
MAX_STEPS = 30
|
| 13 |
+
DESCRIPTION = (
|
| 14 |
+
"Task 2 (Medium) — Fix Formats and Remove Duplicates\n"
|
| 15 |
+
"You have a product catalog with:\n"
|
| 16 |
+
" • Phone numbers in mixed formats (need: NNN-NNN-NNNN)\n"
|
| 17 |
+
" • Dates in mixed formats (need: YYYY-MM-DD)\n"
|
| 18 |
+
" • Duplicate rows (~15)\n\n"
|
| 19 |
+
"Available operations:\n"
|
| 20 |
+
" fix_format — column: 'phone' | 'listed_date'\n"
|
| 21 |
+
" drop_duplicates — no column needed\n\n"
|
| 22 |
+
"Example actions:\n"
|
| 23 |
+
' {"operation": "fix_format", "column": "phone"}\n'
|
| 24 |
+
' {"operation": "fix_format", "column": "listed_date"}\n'
|
| 25 |
+
' {"operation": "drop_duplicates"}'
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")
|
| 29 |
+
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def load():
|
| 33 |
+
dirty, clean = generate_task2_datasets()
|
| 34 |
+
original_phone_issues = int((~dirty["phone"].str.match(PHONE_RE)).sum())
|
| 35 |
+
original_date_issues = int((~dirty["listed_date"].apply(
|
| 36 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 37 |
+
)).sum())
|
| 38 |
+
original_dupes = len(dirty) - len(dirty.drop_duplicates())
|
| 39 |
+
meta = {
|
| 40 |
+
"orig_phone": original_phone_issues,
|
| 41 |
+
"orig_date": original_date_issues,
|
| 42 |
+
"orig_dupes": original_dupes,
|
| 43 |
+
}
|
| 44 |
+
return dirty.copy(), clean, meta
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def score(current_df, meta: dict) -> float:
|
| 48 |
+
phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
|
| 49 |
+
date_issues = int((~current_df["listed_date"].apply(
|
| 50 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 51 |
+
)).sum())
|
| 52 |
+
dupes = len(current_df) - len(current_df.drop_duplicates())
|
| 53 |
+
|
| 54 |
+
phone_score = 1.0 - phone_issues / max(meta["orig_phone"], 1)
|
| 55 |
+
date_score = 1.0 - date_issues / max(meta["orig_date"], 1)
|
| 56 |
+
dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
|
| 57 |
+
|
| 58 |
+
combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
|
| 59 |
+
return round(max(0.01, min(0.99, combined)), 4)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def count_errors(current_df, meta: dict) -> int:
|
| 63 |
+
phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
|
| 64 |
+
date_issues = int((~current_df["listed_date"].apply(
|
| 65 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 66 |
+
)).sum())
|
| 67 |
+
dupes = len(current_df) - len(current_df.drop_duplicates())
|
| 68 |
+
return phone_issues + date_issues + dupes
|
server/tasks/task3_pipeline.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 3 — Hard: Full Cleaning Pipeline
|
| 3 |
+
Objective: Fix missing values, remove duplicates, handle outliers, standardise
|
| 4 |
+
country capitalisation and date formats.
|
| 5 |
+
Score: equal-weight average of 4 sub-scores.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from server.data_generator import generate_task3_datasets
|
| 12 |
+
|
| 13 |
+
TASK_ID = 3
|
| 14 |
+
MAX_STEPS = 40
|
| 15 |
+
DESCRIPTION = (
|
| 16 |
+
"Task 3 (Hard) — Full Cleaning Pipeline\n"
|
| 17 |
+
"You have a customer database with multiple issues:\n"
|
| 18 |
+
" 1. Missing values in 'age', 'purchase_amount', 'country', 'signup_date'\n"
|
| 19 |
+
" 2. ~20 duplicate rows\n"
|
| 20 |
+
" 3. Outliers in 'purchase_amount' (injected values ~10x normal)\n"
|
| 21 |
+
" 4. Mixed case in 'country' (need: title case, e.g. 'Usa' → 'USA')\n"
|
| 22 |
+
" 5. Mixed date formats in 'signup_date' (need: YYYY-MM-DD)\n\n"
|
| 23 |
+
"Available operations:\n"
|
| 24 |
+
" fill_missing — column + params.strategy ('median'|'mean'|'mode'|'constant')\n"
|
| 25 |
+
" drop_duplicates — no column needed\n"
|
| 26 |
+
" drop_outliers — column (numeric); uses IQR method\n"
|
| 27 |
+
" fix_format — column: 'country' | 'signup_date'\n"
|
| 28 |
+
" fix_dtype — column + params.dtype ('float'|'int'|'str')\n\n"
|
| 29 |
+
"Example actions:\n"
|
| 30 |
+
' {"operation": "fill_missing", "column": "age", "params": {"strategy": "median"}}\n'
|
| 31 |
+
' {"operation": "drop_duplicates"}\n'
|
| 32 |
+
' {"operation": "drop_outliers", "column": "purchase_amount"}\n'
|
| 33 |
+
' {"operation": "fix_format", "column": "signup_date"}\n'
|
| 34 |
+
' {"operation": "fix_format", "column": "country"}'
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
|
| 38 |
+
VALID_COUNTRIES = {"USA", "UK", "Canada", "Australia", "Germany"}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def load():
|
| 42 |
+
dirty, clean = generate_task3_datasets()
|
| 43 |
+
orig_nulls = int(dirty.isnull().sum().sum())
|
| 44 |
+
orig_dupes = len(dirty) - len(dirty.drop_duplicates())
|
| 45 |
+
|
| 46 |
+
# Outlier baseline: count rows where purchase_amount > Q3 + 3*IQR
|
| 47 |
+
pa = dirty["purchase_amount"].dropna()
|
| 48 |
+
q1, q3 = pa.quantile(0.25), pa.quantile(0.75)
|
| 49 |
+
iqr = q3 - q1
|
| 50 |
+
orig_outliers = int((pa > q3 + 3 * iqr).sum())
|
| 51 |
+
|
| 52 |
+
orig_country_issues = int((~dirty["country"].isin(VALID_COUNTRIES) &
|
| 53 |
+
dirty["country"].notna()).sum())
|
| 54 |
+
orig_date_issues = int((~dirty["signup_date"].apply(
|
| 55 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 56 |
+
)).sum())
|
| 57 |
+
|
| 58 |
+
meta = {
|
| 59 |
+
"orig_nulls": orig_nulls,
|
| 60 |
+
"orig_dupes": orig_dupes,
|
| 61 |
+
"orig_outliers": max(orig_outliers, 1),
|
| 62 |
+
"orig_country_issues": max(orig_country_issues, 1),
|
| 63 |
+
"orig_date_issues": max(orig_date_issues, 1),
|
| 64 |
+
"q1": q1, "q3": q3, "iqr": iqr,
|
| 65 |
+
}
|
| 66 |
+
return dirty.copy(), clean, meta
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def score(current_df, meta: dict) -> float:
|
| 70 |
+
remaining_nulls = int(current_df.isnull().sum().sum())
|
| 71 |
+
remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
|
| 72 |
+
|
| 73 |
+
pa = current_df["purchase_amount"].dropna()
|
| 74 |
+
remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
|
| 75 |
+
|
| 76 |
+
remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
|
| 77 |
+
current_df["country"].notna()).sum())
|
| 78 |
+
remaining_dates = int((~current_df["signup_date"].apply(
|
| 79 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 80 |
+
)).sum())
|
| 81 |
+
|
| 82 |
+
null_score = 1.0 - remaining_nulls / max(meta["orig_nulls"], 1)
|
| 83 |
+
dupe_score = 1.0 - remaining_dupes / max(meta["orig_dupes"], 1)
|
| 84 |
+
outlier_score = 1.0 - remaining_outliers / meta["orig_outliers"]
|
| 85 |
+
country_score = 1.0 - remaining_country / meta["orig_country_issues"]
|
| 86 |
+
date_score = 1.0 - remaining_dates / meta["orig_date_issues"]
|
| 87 |
+
|
| 88 |
+
combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
|
| 89 |
+
+ 0.175 * country_score + 0.175 * date_score
|
| 90 |
+
return round(max(0.01, min(0.99, combined)), 4)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def count_errors(current_df, meta: dict) -> int:
|
| 94 |
+
remaining_nulls = int(current_df.isnull().sum().sum())
|
| 95 |
+
remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
|
| 96 |
+
pa = current_df["purchase_amount"].dropna()
|
| 97 |
+
remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
|
| 98 |
+
remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
|
| 99 |
+
current_df["country"].notna()).sum())
|
| 100 |
+
remaining_dates = int((~current_df["signup_date"].apply(
|
| 101 |
+
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
|
| 102 |
+
)).sum())
|
| 103 |
+
return remaining_nulls + remaining_dupes + remaining_outliers + \
|
| 104 |
+
remaining_country + remaining_dates
|
server/workflow_engine.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@dataclass
|
| 2 |
+
class WorkflowStep:
|
| 3 |
+
step_id: str
|
| 4 |
+
description: str
|
| 5 |
+
app: str
|
| 6 |
+
operation: str
|
| 7 |
+
# Callable that checks if this step was completed given the app states
|
| 8 |
+
completion_check: Callable[[Dict[str, "BaseApp"]], bool]
|
| 9 |
+
|
| 10 |
+
# Workflow A: Customer Bug → Engineering Fix
|
| 11 |
+
WORKFLOW_A_STEPS = [
|
| 12 |
+
WorkflowStep("A1", "Acknowledge ticket in Zendesk",
|
| 13 |
+
"zendesk", "acknowledge_ticket",
|
| 14 |
+
lambda apps: apps["zendesk"].ticket_acknowledged()),
|
| 15 |
+
|
| 16 |
+
WorkflowStep("A2", "Escalate to Jira — create linked issue",
|
| 17 |
+
"jira", "create_issue",
|
| 18 |
+
lambda apps: apps["jira"].has_linked_issue()),
|
| 19 |
+
|
| 20 |
+
WorkflowStep("A3", "Check if customer is paying (Salesforce lookup)",
|
| 21 |
+
"salesforce", "get_account",
|
| 22 |
+
lambda apps: apps["salesforce"].account_checked()),
|
| 23 |
+
|
| 24 |
+
WorkflowStep("A4", "Assign correct engineer in Jira based on priority",
|
| 25 |
+
"jira", "assign_owner",
|
| 26 |
+
lambda apps: apps["jira"].issue_assigned()),
|
| 27 |
+
|
| 28 |
+
WorkflowStep("A5", "Log SLA status in Workday",
|
| 29 |
+
"workday", "log_sla_event",
|
| 30 |
+
lambda apps: apps["workday"].sla_logged()),
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
# Workflow B: Employee Onboarding
|
| 34 |
+
WORKFLOW_B_STEPS = [
|
| 35 |
+
WorkflowStep("B1", "Create employee record in Workday", ...),
|
| 36 |
+
WorkflowStep("B2", "Provision Jira access based on role", ...),
|
| 37 |
+
WorkflowStep("B3", "Add to Salesforce team by territory", ...),
|
| 38 |
+
WorkflowStep("B4", "Create Zendesk support profile if customer-facing", ...),
|
| 39 |
+
]
|
| 40 |
+
|
| 41 |
+
# Workflow C: Churn Risk Alert
|
| 42 |
+
WORKFLOW_C_STEPS = [
|
| 43 |
+
WorkflowStep("C1", "Flag at-risk account in Salesforce", ...),
|
| 44 |
+
WorkflowStep("C2", "Query recent support volume in Zendesk", ...),
|
| 45 |
+
WorkflowStep("C3", "Check outstanding bugs in Jira", ...),
|
| 46 |
+
WorkflowStep("C4", "Synthesize churn score and assign intervention owner", ...),
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
class WorkflowEngine:
|
| 50 |
+
WORKFLOWS = {"A": WORKFLOW_A_STEPS, "B": WORKFLOW_B_STEPS, "C": WORKFLOW_C_STEPS}
|
| 51 |
+
|
| 52 |
+
def start(self, workflow_id: str) -> None:
|
| 53 |
+
self._steps = self.WORKFLOWS[workflow_id].copy()
|
| 54 |
+
self._completed: List[str] = []
|
| 55 |
+
|
| 56 |
+
def evaluate(self, apps: Dict) -> float:
|
| 57 |
+
"""Check all steps and return completion ratio (0.0-1.0)."""
|
| 58 |
+
completed = sum(1 for s in self._steps if s.completion_check(apps))
|
| 59 |
+
self._completed = [s.step_id for s in self._steps if s.completion_check(apps)]
|
| 60 |
+
return completed / len(self._steps)
|
| 61 |
+
|
| 62 |
+
def get_pending(self) -> List[str]:
|
| 63 |
+
return [s.description for s in self._steps if s.step_id not in self._completed]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|