Taniieeee83 commited on
Commit
4719066
·
0 Parent(s):

changed till step 8

Browse files
.dockerignore ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---- Git ----
2
+ .git
3
+ .gitignore
4
+
5
+ # ---- Python cache ----
6
+ __pycache__/
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+ *.pytest_cache/
11
+
12
+ # ---- Virtual environments ----
13
+ venv/
14
+ .env/
15
+ .venv/
16
+
17
+ # ---- Environment files ----
18
+ .env
19
+ .env.*
20
+
21
+ # ---- OS files ----
22
+ .DS_Store
23
+ Thumbs.db
24
+
25
+ # ---- Logs ----
26
+ *.log
27
+
28
+ # ---- Model / large local files (if any) ----
29
+ checkpoints/
30
+ models/
31
+ *.pt
32
+ *.pth
33
+ *.bin
34
+
35
+ # ---- IDE files ----
36
+ .vscode/
37
+ .idea/
38
+
39
+ # ---- Node (if frontend exists) ----
40
+ node_modules/
41
+
42
+ # ---- Docker ----
43
+ Dockerfile*
44
+ docker-compose.yml
.gitattributes ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ training/orgos-training/orgos_lora_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.jpg filter=lfs diff=lfs merge=lfs -text
39
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
40
+ *.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ venv/
8
+ .env
9
+ *.env
10
+ .DS_Store
11
+ .claude/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Non-root user for HuggingFace Spaces compatibility
4
+ RUN useradd -m -u 1000 appuser
5
+
6
+ WORKDIR /app
7
+
8
+ # Install dependencies first (layer cache)
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy project files
13
+ COPY . .
14
+
15
+ # Switch to non-root
16
+ RUN chown -R appuser:appuser /app
17
+ USER appuser
18
+
19
+ EXPOSE 8000
20
+
21
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
22
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
23
+
24
+ # server.app:app — runs server/app.py from /app working directory
25
+ # models.py, client.py, inference.py live at /app root (on PYTHONPATH automatically)
26
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Data Cleaning Environment
3
+ emoji: 🧹
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ tags:
10
+ - openenv
11
+ - rl
12
+ - data-cleaning
13
+ ---
14
+
15
+ # Data Cleaning OpenEnv
16
+
17
+ A **real-world data cleaning environment** for training and evaluating AI agents.
18
+
19
+ An agent interacts with a dirty pandas DataFrame through a standard `reset() / step() / state()` HTTP API, learning to fix common data quality problems — missing values, duplicate rows, inconsistent formats, statistical outliers, and dtype errors — across three progressively harder tasks.
20
+
21
+ 🤗 **Live HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
22
+ 📖 **Interactive API docs:** https://srishtichugh-openenv-hack.hf.space/docs
23
+ ✅ **Health check:** https://srishtichugh-openenv-hack.hf.space/health
24
+
25
+ ---
26
+
27
+ ## Environment Description & Motivation
28
+
29
+ Real-world datasets are almost never clean. Data engineers routinely spend 60–80 % of their time on data cleaning tasks: filling missing values with statistically appropriate strategies, removing duplicates, standardising inconsistent formats (phone numbers, dates, country names), and detecting extreme outliers.
30
+
31
+ This environment turns those tasks into a reinforcement learning challenge with:
32
+
33
+ - **Deterministic, programmatic graders** — ground-truth clean DataFrames are generated with a fixed seed; every reward signal is reproducible.
34
+ - **Meaningful partial rewards** — every step emits a delta reward proportional to how much of the dataset it cleaned, so the agent receives useful signal throughout the episode rather than only at the end.
35
+ - **Three difficulty levels** — easy, medium, hard — letting agents learn a curriculum from simple null-filling up to full multi-issue pipelines.
36
+ - **No external data downloads** — all datasets are generated synthetically via `numpy` + `Faker` with `seed=42`.
37
+
38
+ ---
39
+
40
+ ## Action Space
41
+
42
+ Actions are JSON objects sent to `POST /step`.
43
+
44
+ | `operation` | Required `column` | `params` | Description |
45
+ |---|---|---|---|
46
+ | `fill_missing` | ✅ | `{"strategy": "median\|mean\|mode\|constant", "value": ...}` | Fill NaN values in a column |
47
+ | `drop_duplicates` | ❌ | — | Remove all duplicate rows |
48
+ | `fix_format` | ✅ | — | Standardise phone/date/country format |
49
+ | `replace_value` | ✅ | `{"old": ..., "new": ...}` | Replace a specific value |
50
+ | `drop_outliers` | ✅ | — | Remove IQR outliers from a numeric column |
51
+ | `fix_dtype` | ✅ | `{"dtype": "float\|int\|str"}` | Cast column to correct dtype |
52
+
53
+ **Format rules enforced by `fix_format`:**
54
+
55
+ | Column | Target format |
56
+ |---|---|
57
+ | `phone` | `NNN-NNN-NNNN` |
58
+ | `listed_date` / `signup_date` | `YYYY-MM-DD` |
59
+ | `country` | Title-cased canonical name (`USA`, `UK`, `Canada`, `Australia`, `Germany`) |
60
+
61
+ **Example actions:**
62
+ ```json
63
+ {"operation": "fill_missing", "column": "salary", "params": {"strategy": "median"}}
64
+ {"operation": "fill_missing", "column": "department", "params": {"strategy": "mode"}}
65
+ {"operation": "drop_duplicates"}
66
+ {"operation": "fix_format", "column": "phone"}
67
+ {"operation": "fix_format", "column": "signup_date"}
68
+ {"operation": "drop_outliers", "column": "purchase_amount"}
69
+ ```
70
+
71
+ ---
72
+
73
+ ## Observation Space
74
+
75
+ Every `POST /reset` and `POST /step` returns:
76
+ ```json
77
+ {
78
+ "observation": {
79
+ "done": false,
80
+ "reward": 0.40,
81
+ "data_preview": "name,age,salary,...\n...",
82
+ "data_shape": [100, 5],
83
+ "missing_counts": {"age": 20, "salary": 20, "department": 10},
84
+ "duplicate_count": 0,
85
+ "dtype_issues": {},
86
+ "task_description": "Task 1 (Easy) — Fill Missing Values\n...",
87
+ "message": "Filled 20 missing values in 'age' using median.",
88
+ "step_count": 1,
89
+ "current_score": 0.4000
90
+ },
91
+ "reward": 0.40,
92
+ "done": false,
93
+ "info": {}
94
+ }
95
+ ```
96
+
97
+ | Field | Type | Description |
98
+ |---|---|---|
99
+ | `done` | bool | Episode finished (score ≥ 0.95 or max steps reached) |
100
+ | `reward` | float | Per-step delta reward (see Reward Function) |
101
+ | `data_preview` | string | First 10 rows of current DataFrame as CSV |
102
+ | `data_shape` | [int, int] | Current `[rows, cols]` |
103
+ | `missing_counts` | object | `{column: null_count}` for columns with NaN |
104
+ | `duplicate_count` | int | Number of duplicate rows |
105
+ | `dtype_issues` | object | `{column: issue_description}` for suspected dtype mismatches |
106
+ | `task_description` | string | Full task instructions with available operations |
107
+ | `message` | string | Human-readable result of the last action |
108
+ | `step_count` | int | Steps taken in this episode |
109
+ | `current_score` | float | Running grader score 0.0 – 1.0 |
110
+
111
+ ---
112
+
113
+ ## State Space
114
+
115
+ `GET /state` returns episode metadata (does not modify state):
116
+ ```json
117
+ {
118
+ "episode_id": "a8f026a9-...",
119
+ "task_id": 1,
120
+ "step_count": 2,
121
+ "max_steps": 20,
122
+ "total_errors": 50,
123
+ "errors_remaining": 30
124
+ }
125
+ ```
126
+
127
+ ---
128
+
129
+ ## Tasks
130
+
131
+ ### Task 1 — Fill Missing Values *(Easy)*
132
+
133
+ | Property | Value |
134
+ |---|---|
135
+ | Dataset | 100-row employee records (name, age, salary, department, experience) |
136
+ | Issues | ~20 % NaN in `age`, `salary`; ~10 % NaN in `department` |
137
+ | Goal | Fill all missing values |
138
+ | Valid operations | `fill_missing` |
139
+ | Grader | `1.0 − remaining_nulls / original_nulls` |
140
+ | Max steps | 20 |
141
+ | Optimal steps | 3 (one per affected column) |
142
+
143
+ ### Task 2 — Fix Formats + Remove Duplicates *(Medium)*
144
+
145
+ | Property | Value |
146
+ |---|---|
147
+ | Dataset | 215-row product catalog (product_id, price, category, phone, listed_date) |
148
+ | Issues | ~60 % phone numbers in mixed formats, ~60 % dates in mixed formats, 15 duplicate rows |
149
+ | Goal | Standardise all phone/date formats and remove duplicates |
150
+ | Valid operations | `fix_format`, `drop_duplicates` |
151
+ | Grader | `0.35 × phone_score + 0.35 × date_score + 0.30 × dupe_score` |
152
+ | Max steps | 30 |
153
+ | Optimal steps | 3 |
154
+
155
+ ### Task 3 — Full Cleaning Pipeline *(Hard)*
156
+
157
+ | Property | Value |
158
+ |---|---|
159
+ | Dataset | 320-row customer database (name, age, purchase_amount, country, email, signup_date) |
160
+ | Issues | Missing values (4 cols), 20 duplicate rows, outliers in `purchase_amount` (~3× normal), mixed country capitalisation, mixed date formats |
161
+ | Goal | Fix all issues end-to-end |
162
+ | Valid operations | All 6 operations |
163
+ | Grader | `0.25×null + 0.20×dupe + 0.20×outlier + 0.175×country + 0.175×date` |
164
+ | Max steps | 40 |
165
+ | Optimal steps | 8 |
166
+
167
+ ---
168
+
169
+ ## Reward Function
170
+
171
+ | Scenario | Reward |
172
+ |---|---|
173
+ | Score improves (delta > 0) | `new_score − old_score` (positive) |
174
+ | Operation had no effect | `−0.01` |
175
+ | Invalid operation / bad column | `−0.05` |
176
+ | Episode completed (score ≥ 0.95) | `delta + 0.20` terminal bonus |
177
+
178
+ Rewards are bounded to **[−0.05, 1.2]**. A partial reward is emitted on every step, giving the agent dense signal throughout the episode.
179
+
180
+ ---
181
+
182
+ ## API Endpoints
183
+
184
+ | Method | Path | Description |
185
+ |---|---|---|
186
+ | `GET` | `/health` | Health check → `{"status": "healthy"}` |
187
+ | `POST` | `/reset` | Start episode. Body: `{"task_id": 1\|2\|3}` (optional; default: round-robin) |
188
+ | `POST` | `/step` | Execute action. Body: action JSON |
189
+ | `POST` | `/state` | Get episode metadata |
190
+ | `GET` | `/metadata` | Environment name, version, task list |
191
+ | `GET` | `/schema` | Full action / observation / state JSON schemas |
192
+ | `GET` | `/docs` | Interactive Swagger UI |
193
+
194
+ ---
195
+
196
+ ## Baseline Scores
197
+
198
+ | Task | Difficulty | Score |
199
+ |---|---|---|
200
+ | 1 — Fill Missing Values | Easy | 0.999 |
201
+ | 2 — Fix Formats + Duplicates | Medium | 0.999 |
202
+ | 3 — Full Cleaning Pipeline | Hard | 0.999 |
203
+ | **Average** | — | **0.999** |
204
+
205
+ *Produced by `google/gemma-3-27b-it` via NVIDIA NIM, `temperature=0`. Full step-by-step agent logs: `inference_log.txt`.*
206
+
207
+ ---
208
+
209
+ ## Setup & Usage
210
+
211
+ ### Prerequisites
212
+
213
+ - Python 3.11+
214
+ - Docker (for containerised deployment)
215
+
216
+ ### Local — Python
217
+ ```bash
218
+ # 1. Clone and install dependencies
219
+ git clone https://github.com/Tanvi51204/openEnv.git
220
+ cd openEnv
221
+ pip install -r requirements.txt
222
+
223
+ # 2. Start the server
224
+ uvicorn server.app:app --host 0.0.0.0 --port 8000
225
+
226
+ # 3. Open Swagger UI
227
+ open http://localhost:8000/docs
228
+ ```
229
+
230
+ ### Local — Docker
231
+ ```bash
232
+ docker build -t data-cleaning-env .
233
+ docker run -p 8000:8000 data-cleaning-env
234
+ ```
235
+
236
+ ### Quick API test
237
+ ```bash
238
+ # Health
239
+ curl http://localhost:8000/health
240
+
241
+ # Start Task 1
242
+ curl -X POST http://localhost:8000/reset \
243
+ -H "Content-Type: application/json" \
244
+ -d '{"task_id": 1}'
245
+
246
+ # Fill missing values
247
+ curl -X POST http://localhost:8000/step \
248
+ -H "Content-Type: application/json" \
249
+ -d '{"operation": "fill_missing", "column": "salary", "params": {"strategy": "median"}}'
250
+ ```
251
+
252
+ ### Python client
253
+ ```python
254
+ from client import DataCleaningEnvClient
255
+ from models import DataCleaningAction
256
+
257
+ with DataCleaningEnvClient("http://localhost:8000") as env:
258
+ result = env.reset(task_id=1)
259
+ print(result.observation.missing_counts) # {'age': 20, 'salary': 20, 'department': 10}
260
+
261
+ action = DataCleaningAction(
262
+ operation="fill_missing",
263
+ column="salary",
264
+ params={"strategy": "median"},
265
+ )
266
+ result = env.step(action)
267
+ print(result.observation.current_score) # 0.4
268
+ print(result.reward) # 0.4
269
+ ```
270
+
271
+ ### Run baseline inference
272
+ ```bash
273
+ export API_BASE_URL="https://api.openai.com/v1"
274
+ export MODEL_NAME="gpt-4o-mini"
275
+ export HF_TOKEN="sk-..." # your API key
276
+ export ENV_URL="http://localhost:8000"
277
+
278
+ python inference.py
279
+ ```
280
+
281
+ Produces `[START]` / `[STEP]` / `[END]` lines to stdout and `baseline_scores.json`.
282
+
283
+ ### Environment variables
284
+
285
+ | Variable | Default | Description |
286
+ |---|---|---|
287
+ | `API_BASE_URL` | `https://api.openai.com/v1` | LLM API endpoint (OpenAI-compatible) |
288
+ | `MODEL_NAME` | `gpt-4o-mini` | Model identifier |
289
+ | `HF_TOKEN` | — | API key for LLM calls |
290
+ | `ENV_URL` | `http://localhost:8000` | Environment server URL |
291
+
292
+ ---
293
+
294
+ ## Project Structure
295
+ ```
296
+ openenv-data-cleaning/
297
+ ├── models.py Pydantic contracts — Action / Observation / State
298
+ ├── client.py Sync HTTP client (reset / step / state / health)
299
+ ├── inference.py Baseline LLM agent with [START]/[STEP]/[END] logging
300
+ ├── openenv.yaml OpenEnv manifest
301
+ ├── Dockerfile python:3.11-slim, non-root user, HEALTHCHECK
302
+ ├── requirements.txt pip dependencies
303
+ ├── pyproject.toml Python package metadata + openenv-core dependency
304
+ └── server/
305
+ ├── app.py FastAPI routes + /metadata + /schema
306
+ ├── environment.py reset / step / state logic + 6 operations + rewards
307
+ ├── data_generator.py Synthetic dataset generation (seed=42, reproducible)
308
+ └── tasks/
309
+ ├── task1_missing.py Easy — fill NaN grader
310
+ ├── task2_format.py Medium — format + duplicates grader
311
+ └── task3_pipeline.py Hard — full pipeline grader
312
+ ```
313
+
314
+ ---
315
+
316
+ ## Live Demo
317
+
318
+ 🤗 **HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
319
+
320
+ - Health: https://srishtichugh-openenv-hack.hf.space/health
321
+ - Docs: https://srishtichugh-openenv-hack.hf.space/docs
baseline_scores.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "scores": {
3
+ "task1": 0.99,
4
+ "task2": 0.99,
5
+ "task3": 0.99
6
+ },
7
+ "average": 0.99
8
+ }
client.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synchronous HTTP client for the Data Cleaning OpenEnv environment.
3
+
4
+ Usage
5
+ -----
6
+ from client import DataCleaningEnvClient, DataCleaningAction
7
+
8
+ client = DataCleaningEnvClient(base_url="http://localhost:8000")
9
+
10
+ # Start a new episode (task_id 1/2/3 or omit for round-robin)
11
+ result = client.reset(task_id=1)
12
+ print(result.observation.task_description)
13
+
14
+ # Take a step
15
+ action = DataCleaningAction(
16
+ operation="fill_missing",
17
+ column="salary",
18
+ params={"strategy": "median"},
19
+ )
20
+ result = client.step(action)
21
+ print(result.observation.current_score, result.reward, result.done)
22
+
23
+ # Inspect state
24
+ state = client.state()
25
+ print(state.episode_id, state.errors_remaining)
26
+ """
27
+
28
+ from typing import Optional
29
+ import httpx
30
+ from pydantic import BaseModel
31
+
32
+ from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
33
+
34
+
35
+ class StepResult(BaseModel):
36
+ """Returned by reset() and step()."""
37
+ observation: DataCleaningObservation
38
+ reward: float
39
+ done: bool
40
+ info: dict = {}
41
+
42
+
43
+ class DataCleaningEnvClient:
44
+ """
45
+ Thin synchronous wrapper around the DataCleaning HTTP API.
46
+
47
+ All methods raise httpx.HTTPStatusError on non-2xx responses.
48
+ """
49
+
50
+ def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 30.0):
51
+ self.base_url = base_url.rstrip("/")
52
+ self._client = httpx.Client(base_url=self.base_url, timeout=timeout)
53
+
54
+ # ------------------------------------------------------------------
55
+ # Core API
56
+ # ------------------------------------------------------------------
57
+
58
+ def reset(self, task_id: Optional[int] = None) -> StepResult:
59
+ """
60
+ Start a new episode.
61
+
62
+ Parameters
63
+ ----------
64
+ task_id : int | None
65
+ 1 = Easy (fill missing values)
66
+ 2 = Medium (fix formats + duplicates)
67
+ 3 = Hard (full pipeline)
68
+ None = round-robin (1 → 2 → 3 → 1 …)
69
+ """
70
+ payload = {"task_id": task_id} if task_id is not None else {}
71
+ resp = self._client.post("/reset", json=payload)
72
+ resp.raise_for_status()
73
+ return StepResult(**resp.json())
74
+
75
+ def step(self, action: DataCleaningAction) -> StepResult:
76
+ """
77
+ Apply one cleaning operation and return the updated observation.
78
+
79
+ Parameters
80
+ ----------
81
+ action : DataCleaningAction
82
+ operation : str – one of fill_missing / drop_duplicates /
83
+ fix_format / replace_value / drop_outliers / fix_dtype
84
+ column : str – target column (optional for drop_duplicates)
85
+ params : dict – operation-specific parameters
86
+ """
87
+ resp = self._client.post("/step", json=action.model_dump())
88
+ resp.raise_for_status()
89
+ return StepResult(**resp.json())
90
+
91
+ def state(self) -> DataCleaningState:
92
+ """Return current episode metadata without modifying state."""
93
+ resp = self._client.get("/state")
94
+ resp.raise_for_status()
95
+ return DataCleaningState(**resp.json())
96
+
97
+ def health(self) -> dict:
98
+ """Ping the server. Returns {"status": "ok"} if healthy."""
99
+ resp = self._client.get("/health")
100
+ resp.raise_for_status()
101
+ return resp.json()
102
+
103
+ # ------------------------------------------------------------------
104
+ # Context manager support
105
+ # ------------------------------------------------------------------
106
+
107
+ def __enter__(self):
108
+ return self
109
+
110
+ def __exit__(self, *_):
111
+ self.close()
112
+
113
+ def close(self):
114
+ self._client.close()
env.example ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Data Cleaning OpenEnv — Environment Variables
3
+ # Copy this file to .env and fill in your values.
4
+ # Never commit your real .env to version control.
5
+ # ============================================================
6
+
7
+ # LLM API endpoint (OpenAI-compatible).
8
+ # Default points to OpenAI; swap for any compatible provider.
9
+ API_BASE_URL=https://api.openai.com/v1
10
+
11
+ # Model identifier to use for baseline inference.
12
+ # Examples: gpt-4o-mini, gpt-4o, mistralai/Mistral-7B-Instruct-v0.2
13
+ MODEL_NAME=gpt-4o-mini
14
+
15
+ # API key for the LLM provider above.
16
+ # For OpenAI: starts with sk-...
17
+ # For HuggingFace Inference: starts with hf_...
18
+ HF_TOKEN=your-api-key-here
19
+
20
+ # Base URL of the running environment server.
21
+ # Use http://localhost:8000 for local development,
22
+ # or your HuggingFace Space URL for remote runs.
23
+ ENV_URL=http://localhost:8000
inference.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Baseline inference script for the Data Cleaning OpenEnv environment.
3
+ Uses the OpenAI client against all 3 tasks and reports scores.
4
+
5
+ Required environment variables:
6
+ API_BASE_URL — LLM API endpoint (OpenAI-compatible)
7
+ MODEL_NAME — model identifier
8
+ HF_TOKEN — API key
9
+ ENV_URL — environment server URL (default: http://localhost:8000)
10
+
11
+ STDOUT FORMAT (OpenEnv spec):
12
+ [START] task=<task_name> env=<benchmark> model=<model_name>
13
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
14
+ [END] task=<task_name> score=<0.00> steps=<n>
15
+ """
16
+
17
+ import json
18
+ import os
19
+ import re
20
+ import sys
21
+ import time
22
+ from typing import List, Optional
23
+ import httpx
24
+ from openai import OpenAI
25
+
26
+ # ------------------------------------------------------------------
27
+ # Config
28
+ # ------------------------------------------------------------------
29
+
30
+ API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
31
+ MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
32
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
33
+ ENV_URL = os.environ.get("ENV_URL", "http://localhost:8000")
34
+
35
+ if not HF_TOKEN:
36
+ print("[WARNING] HF_TOKEN is not set — LLM calls may fail.", file=sys.stderr)
37
+
38
+ client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
39
+
40
+ SYSTEM_PROMPT = """You are a data cleaning agent. You control a data cleaning environment
41
+ through JSON actions. Each turn you receive an observation JSON describing the current state
42
+ of a dataset (preview, missing counts, duplicate count, dtype issues, current score, etc.)
43
+ and a task description.
44
+
45
+ Your job is to pick the single best action to improve the dataset quality.
46
+
47
+ Respond ONLY with a valid JSON object — no markdown, no explanation, just the JSON.
48
+
49
+ Available operations and their required parameters:
50
+
51
+ 1. fill_missing
52
+ {"operation": "fill_missing", "column": "<col>", "params": {"strategy": "median|mean|mode|constant", "value": <only if constant>}}
53
+
54
+ 2. drop_duplicates
55
+ {"operation": "drop_duplicates"}
56
+
57
+ 3. fix_format
58
+ {"operation": "fix_format", "column": "phone|listed_date|signup_date|country"}
59
+
60
+ 4. replace_value
61
+ {"operation": "replace_value", "column": "<col>", "params": {"old": "<val>", "new": "<val>"}}
62
+
63
+ 5. drop_outliers
64
+ {"operation": "drop_outliers", "column": "<numeric_col>"}
65
+
66
+ 6. fix_dtype
67
+ {"operation": "fix_dtype", "column": "<col>", "params": {"dtype": "float|int|str"}}
68
+
69
+ Rules:
70
+ - Address the highest-impact issues first (missing values > duplicates > outliers > format).
71
+ - Do not repeat an operation that returned no effect (watch the 'message' field).
72
+ - Stop when current_score >= 0.95.
73
+ """
74
+
75
+
76
+ # ------------------------------------------------------------------
77
+ # OpenEnv stdout logging helpers
78
+ # ------------------------------------------------------------------
79
+
80
+ def log_start(task: str, env: str, model: str) -> None:
81
+ print(f"[START] task={task} env={env} model={model}", flush=True)
82
+
83
+
84
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
85
+ error_val = error if error else "null"
86
+ done_val = str(done).lower()
87
+ print(
88
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
89
+ flush=True,
90
+ )
91
+
92
+
93
+ def log_end(task_name: str, score: float, steps: int) -> None:
94
+ safe_score = max(0.01, min(0.99, float(score)))
95
+ print(
96
+ f"[END] task={task_name} score={safe_score:.4f} steps={steps}",
97
+ flush=True
98
+ )
99
+
100
+
101
+ # ------------------------------------------------------------------
102
+ # HTTP helpers
103
+ # ------------------------------------------------------------------
104
+
105
+ def api_post(path: str, payload: dict = None) -> dict:
106
+ url = ENV_URL.rstrip("/") + path
107
+ resp = httpx.post(url, json=payload or {}, timeout=30)
108
+ resp.raise_for_status()
109
+ return resp.json()
110
+
111
+
112
+ def api_get(path: str) -> dict:
113
+ url = ENV_URL.rstrip("/") + path
114
+ resp = httpx.get(url, timeout=10)
115
+ resp.raise_for_status()
116
+ return resp.json()
117
+
118
+
119
+ # ------------------------------------------------------------------
120
+ # Agent loop
121
+ # ------------------------------------------------------------------
122
+
123
+ def obs_to_text(obs: dict) -> str:
124
+ lines = [
125
+ f"current_score: {obs['current_score']}",
126
+ f"step_count: {obs['step_count']}",
127
+ f"data_shape: {obs['data_shape']}",
128
+ f"duplicate_count: {obs['duplicate_count']}",
129
+ f"missing_counts: {json.dumps(obs['missing_counts'])}",
130
+ f"dtype_issues: {json.dumps(obs['dtype_issues'])}",
131
+ f"message: {obs['message']}",
132
+ "",
133
+ "=== DATA PREVIEW (first 10 rows) ===",
134
+ obs["data_preview"],
135
+ "",
136
+ "=== TASK DESCRIPTION ===",
137
+ obs["task_description"],
138
+ ]
139
+ return "\n".join(lines)
140
+
141
+
142
+ def run_task(task_id: int) -> float:
143
+ task_name = f"data-cleaning-task{task_id}"
144
+
145
+ # Human-readable header (stderr so it doesn't interfere with stdout format)
146
+ print(f"\n{'='*60}", file=sys.stderr)
147
+ print(f" Running Task {task_id}", file=sys.stderr)
148
+ print(f"{'='*60}", file=sys.stderr)
149
+
150
+ result = api_post("/reset", {"task_id": task_id})
151
+ obs = result["observation"]
152
+ history = []
153
+ rewards: List[float] = []
154
+ steps_taken = 0
155
+ success = False
156
+
157
+ log_start(task=task_name, env="data-cleaning-openenv", model=MODEL_NAME)
158
+
159
+ try:
160
+ for step_num in range(1, 50):
161
+ if obs["done"]:
162
+ success = obs["current_score"] >= 0.95
163
+ break
164
+
165
+ obs_text = obs_to_text(obs)
166
+ history.append({"role": "user", "content": obs_text})
167
+
168
+ try:
169
+ response = client.chat.completions.create(
170
+ model = MODEL_NAME,
171
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}] + history,
172
+ temperature = 0.0,
173
+ max_tokens = 256,
174
+ )
175
+ action_str = response.choices[0].message.content.strip()
176
+ except Exception as exc:
177
+ print(f" Step {step_num}: LLM call failed: {exc}", file=sys.stderr)
178
+ log_step(step_num, "null", 0.0, True, str(exc))
179
+ break
180
+
181
+ history.append({"role": "assistant", "content": action_str})
182
+
183
+ # Parse action JSON
184
+ action = None
185
+ try:
186
+ action = json.loads(action_str)
187
+ except json.JSONDecodeError:
188
+ m = re.search(r"\{.*\}", action_str, re.DOTALL)
189
+ if m:
190
+ try:
191
+ action = json.loads(m.group())
192
+ except Exception:
193
+ pass
194
+
195
+ if action is None:
196
+ print(f" Step {step_num}: Could not parse action JSON, skipping.", file=sys.stderr)
197
+ log_step(step_num, action_str, -0.05, False, "json_parse_error")
198
+ break
199
+
200
+ action_label = json.dumps(action, separators=(",", ":"))
201
+ print(
202
+ f" Step {step_num:2d} | score={obs['current_score']:.4f} | action={action_label}",
203
+ file=sys.stderr,
204
+ )
205
+
206
+ result = api_post("/step", action)
207
+ obs = result["observation"]
208
+ step_reward = result["reward"]
209
+ done = result["done"]
210
+ error_msg = None if obs["message"].startswith("Fill") or step_reward >= 0 else obs["message"]
211
+
212
+ print(f" -> {obs['message']}", file=sys.stderr)
213
+
214
+ rewards.append(step_reward)
215
+ steps_taken = step_num
216
+
217
+ log_step(
218
+ step = step_num,
219
+ action = action_label,
220
+ reward = step_reward,
221
+ done = done,
222
+ error = error_msg,
223
+ )
224
+
225
+ if done:
226
+ success = obs["current_score"] >= 0.95
227
+ break
228
+
229
+ time.sleep(0.3)
230
+
231
+ finally:
232
+ final = obs.get("current_score", 0.01) if isinstance(obs, dict) else 0.01
233
+ log_end(task_name=task_name, score=final, steps=steps_taken)
234
+
235
+ final_score = obs["current_score"]
236
+ print(
237
+ f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
238
+ file=sys.stderr,
239
+ )
240
+ return final_score
241
+
242
+
243
+ # ------------------------------------------------------------------
244
+ # Main
245
+ # ------------------------------------------------------------------
246
+
247
+ def main():
248
+ print("Data Cleaning OpenEnv -- Baseline Inference", file=sys.stderr)
249
+ print(f"Model : {MODEL_NAME}", file=sys.stderr)
250
+ print(f"Env : {ENV_URL}", file=sys.stderr)
251
+
252
+ # Smoke-test health endpoint
253
+ try:
254
+ health = api_get("/health")
255
+ assert health.get("status") in ("ok", "healthy"), f"Unexpected status: {health}"
256
+ print("Health check: OK\n", file=sys.stderr)
257
+ except Exception as exc:
258
+ print(f"[ERROR] Environment not reachable at {ENV_URL}: {exc}", file=sys.stderr)
259
+ print("[ERROR] Make sure the server is running and ENV_URL is correct.", file=sys.stderr)
260
+ sys.exit(1)
261
+
262
+ scores = {}
263
+ for task_id in [1, 2, 3]:
264
+ try:
265
+ scores[f"task{task_id}"] = run_task(task_id)
266
+ except Exception as exc:
267
+ print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
268
+ scores[f"task{task_id}"] = 0.01
269
+
270
+ print("\n" + "="*60, file=sys.stderr)
271
+ print(" BASELINE RESULTS", file=sys.stderr)
272
+ print("="*60, file=sys.stderr)
273
+ for k, v in scores.items():
274
+ print(f" {k}: {v:.4f}", file=sys.stderr)
275
+ avg = round(sum(scores.values()) / len(scores), 4)
276
+ print(f" average: {avg:.4f}", file=sys.stderr)
277
+ print("="*60, file=sys.stderr)
278
+
279
+ # Write scores to file for automated validators
280
+ with open("baseline_scores.json", "w") as f:
281
+ json.dump({"scores": scores, "average": avg}, f, indent=2)
282
+ print("\nScores written to baseline_scores.json", file=sys.stderr)
283
+
284
+
285
+ if __name__ == "__main__":
286
+ main()
inference_log.txt ADDED
Binary file (28.1 kB). View file
 
models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models.py
2
+
3
+ class OrgOSAction(BaseModel):
4
+ app: str # "jira" | "zendesk" | "salesforce" | "workday"
5
+ operation: str # app-specific operation name
6
+ args: Dict[str, Any] = {}
7
+
8
+ class RewardBreakdown(BaseModel):
9
+ workflow_completion: float = 0.0 # 0.30 weight
10
+ rule_compliance: float = 0.0 # 0.25 weight
11
+ schema_adaptation: float = 0.0 # 0.20 weight
12
+ efficiency: float = 0.0 # 0.15 weight
13
+ policy_drift_handling: float = 0.0 # 0.10 weight
14
+
15
+ class OrgOSObservation(BaseModel):
16
+ done: bool
17
+ reward: float
18
+ current_score: float
19
+ workflow_id: str # "A", "B", or "C"
20
+ step_count: int
21
+ # Per-app state views (what the agent sees)
22
+ app_states: Dict[str, str] # app_name → CSV/JSON string preview
23
+ # Workflow progress
24
+ workflow_goal: str
25
+ completed_steps: List[str]
26
+ pending_steps: List[str]
27
+ # Schema drift info (partial — agent must probe to discover rest)
28
+ schema_hints: Dict[str, str] # e.g. {"jira.priority": "severity"}
29
+ # Business rules in effect this episode
30
+ active_rules: Dict[str, Any] # {"sla_p0_minutes": 15, "approval_threshold": 5000}
31
+ # Per-step feedback
32
+ rule_violations: List[str] # violations that just occurred
33
+ reward_breakdown: RewardBreakdown
34
+ message: str
35
+
36
+ class OrgOSState(BaseModel):
37
+ episode_id: str
38
+ workflow_id: str
39
+ schema_versions: Dict[str, str] # {"jira": "v2", "zendesk": "v1", ...}
40
+ step_count: int
41
+ max_steps: int
42
+ rule_violation_count: int
43
+ workflow_completion: float
44
+ rule_compliance_rate: float
45
+ policy_drift_active: bool
openenv.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: data-cleaning-env
2
+ version: "0.1.0"
3
+ description: >
4
+ A real-world data cleaning environment where an AI agent fixes missing
5
+ values, duplicate rows, format inconsistencies, outliers, and dtype errors
6
+ across three progressively harder tasks.
7
+
8
+ author: openenv-hackathon
9
+ tags:
10
+ - openenv
11
+ - data-cleaning
12
+ - rl
13
+ - real-world
14
+
15
+ tasks:
16
+ - id: task1
17
+ name: "Fill Missing Values"
18
+ difficulty: easy
19
+ max_steps: 20
20
+ description: >
21
+ Fill all NaN values in an employee records dataset.
22
+ Columns with missing data: age, salary, department.
23
+
24
+ - id: task2
25
+ name: "Fix Formats and Remove Duplicates"
26
+ difficulty: medium
27
+ max_steps: 30
28
+ description: >
29
+ Standardise phone numbers (NNN-NNN-NNNN) and dates (YYYY-MM-DD)
30
+ in a product catalog, and remove ~15 duplicate rows.
31
+
32
+ - id: task3
33
+ name: "Full Cleaning Pipeline"
34
+ difficulty: hard
35
+ max_steps: 40
36
+ description: >
37
+ End-to-end pipeline on a customer database: fill missing values,
38
+ remove duplicates, drop outliers in purchase_amount, standardise
39
+ country capitalisation, and fix mixed date formats.
40
+
41
+ api:
42
+ health: GET /health
43
+ reset: POST /reset
44
+ step: POST /step
45
+ state: POST /state
46
+ docs: GET /docs
47
+
48
+ reward:
49
+ range: [0.001, 0.999]
50
+ partial: true
51
+ terminal_bonus: 0.0
52
+
53
+ observation_space:
54
+ type: object
55
+ fields:
56
+ done: boolean
57
+ reward: float
58
+ data_preview: string # First 10 rows as CSV
59
+ data_shape: list # [rows, cols]
60
+ missing_counts: object # {column: count}
61
+ duplicate_count: integer
62
+ dtype_issues: object # {column: issue_description}
63
+ task_description: string
64
+ message: string
65
+ step_count: integer
66
+ current_score: float # 0.0–1.0
67
+
68
+ action_space:
69
+ type: object
70
+ fields:
71
+ operation: string # fill_missing | drop_duplicates | fix_format | replace_value | drop_outliers | fix_dtype
72
+ column: string # optional depending on operation
73
+ params: object # optional operation parameters
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "data-cleaning-env"
3
+ version = "0.1.0"
4
+ description = "Real-world data cleaning environment for OpenEnv / Scaler hackathon"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi==0.135.2",
8
+ "uvicorn[standard]==0.40.0",
9
+ "pydantic==2.12.5",
10
+ "pandas==2.2.3",
11
+ "numpy==2.2.4",
12
+ "faker==40.12.0",
13
+ "openai==2.15.0",
14
+ "httpx==0.28.1",
15
+ "openenv-core==0.2.3",
16
+ ]
17
+
18
+ [project.scripts]
19
+ server = "server.app:main"
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["server"]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.135.2
2
+ uvicorn[standard]==0.40.0
3
+ pydantic==2.12.5
4
+ pandas==2.2.3
5
+ numpy==2.2.4
6
+ faker==40.12.0
7
+ openai==2.15.0
8
+ httpx==0.28.1
9
+ openenv-core==0.2.3
server/__init__.py ADDED
File without changes
server/app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application exposing the OpenEnv-compatible HTTP API.
3
+ Endpoints: GET /health, GET /metadata, GET /schema,
4
+ POST /reset, POST /step, GET /state, POST /state, GET /docs
5
+ """
6
+
7
+ from typing import Any, Dict, Optional
8
+ from fastapi import Body, FastAPI, HTTPException
9
+ from pydantic import BaseModel
10
+ import uvicorn
11
+
12
+ from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
13
+ from server.environment import DataCleaningEnvironment
14
+
15
+ app = FastAPI(
16
+ title="Data Cleaning OpenEnv",
17
+ description="A real-world data cleaning environment for AI agent training.",
18
+ version="0.1.0",
19
+ )
20
+
21
+ # Single shared environment instance (stateful server)
22
+ env = DataCleaningEnvironment()
23
+
24
+
25
+ # New reset body accepts workflow_id
26
+ class ResetRequest(BaseModel):
27
+ workflow_id: Optional[str] = None # "A", "B", "C", or None for round-robin
28
+
29
+
30
+ class StepResponse(BaseModel):
31
+ observation: DataCleaningObservation
32
+ reward: float
33
+ done: bool
34
+ info: dict = {}
35
+
36
+
37
+ # ------------------------------------------------------------------
38
+ # Routes
39
+ # ------------------------------------------------------------------
40
+
41
+ @app.get("/health")
42
+ def health():
43
+ return {"status": "healthy"}
44
+
45
+
46
+ @app.get("/metadata")
47
+ def metadata():
48
+ return {
49
+ "name": "data-cleaning-env",
50
+ "description": (
51
+ "A real-world data cleaning environment where an AI agent fixes "
52
+ "missing values, duplicate rows, format inconsistencies, outliers, "
53
+ "and dtype errors across three progressively harder tasks."
54
+ ),
55
+ "version": "0.1.0",
56
+ "tags": ["openenv", "data-cleaning", "rl", "real-world"],
57
+ "tasks": [
58
+ {"id": "task1", "name": "Fill Missing Values", "difficulty": "easy"},
59
+ {"id": "task2", "name": "Fix Formats and Remove Duplicates", "difficulty": "medium"},
60
+ {"id": "task3", "name": "Full Cleaning Pipeline", "difficulty": "hard"},
61
+ ],
62
+ }
63
+
64
+
65
+ @app.get("/schema")
66
+ def schema():
67
+ return {
68
+ "action": {
69
+ "type": "object",
70
+ "properties": {
71
+ "operation": {
72
+ "type": "string",
73
+ "enum": [
74
+ "fill_missing",
75
+ "drop_duplicates",
76
+ "fix_format",
77
+ "replace_value",
78
+ "drop_outliers",
79
+ "fix_dtype",
80
+ ],
81
+ },
82
+ "column": {"type": "string", "nullable": True},
83
+ "params": {"type": "object", "nullable": True},
84
+ },
85
+ "required": ["operation"],
86
+ },
87
+ "observation": {
88
+ "type": "object",
89
+ "properties": {
90
+ "done": {"type": "boolean"},
91
+ "reward": {"type": "number"},
92
+ "data_preview": {"type": "string"},
93
+ "data_shape": {"type": "array", "items": {"type": "integer"}},
94
+ "missing_counts": {"type": "object"},
95
+ "duplicate_count": {"type": "integer"},
96
+ "dtype_issues": {"type": "object"},
97
+ "task_description": {"type": "string"},
98
+ "message": {"type": "string"},
99
+ "step_count": {"type": "integer"},
100
+ "current_score": {"type": "number"},
101
+ },
102
+ },
103
+ "state": {
104
+ "type": "object",
105
+ "properties": {
106
+ "episode_id": {"type": "string"},
107
+ "task_id": {"type": "integer"},
108
+ "step_count": {"type": "integer"},
109
+ "max_steps": {"type": "integer"},
110
+ "total_errors": {"type": "integer"},
111
+ "errors_remaining": {"type": "integer"},
112
+ },
113
+ },
114
+ }
115
+
116
+
117
+ @app.post("/reset", response_model=StepResponse)
118
+ def reset(req: ResetRequest = ResetRequest()):
119
+ try:
120
+ obs = env.reset(task_id=req.task_id)
121
+ except ValueError as e:
122
+ raise HTTPException(status_code=400, detail=str(e))
123
+ return StepResponse(observation=obs, reward=obs.reward, done=False)
124
+
125
+
126
+ @app.post("/step", response_model=StepResponse)
127
+ async def step(body: Dict[str, Any] = Body(...)):
128
+ """
129
+ Accept both openenv-core wrapped format:
130
+ {"action": {"operation": "...", ...}, "timeout_s": 15}
131
+ and direct format (for backward compat with our own client/inference):
132
+ {"operation": "...", "column": "...", "params": {...}}
133
+ """
134
+ action_data = body.get("action", body)
135
+ try:
136
+ action = DataCleaningAction(**action_data)
137
+ obs = env.step(action)
138
+ except (TypeError, KeyError, Exception) as e:
139
+ raise HTTPException(status_code=400, detail=str(e))
140
+ return StepResponse(observation=obs, reward=obs.reward, done=obs.done)
141
+
142
+
143
+ @app.get("/state", response_model=DataCleaningState)
144
+ def state_get():
145
+ """GET /state — openenv-core spec."""
146
+ return env.state()
147
+
148
+
149
+ @app.post("/state", response_model=DataCleaningState)
150
+ def state_post():
151
+ """POST /state — backward compatibility."""
152
+ return env.state()
153
+
154
+
155
+
156
+ @app.get("/", response_class=HTMLResponse)
157
+ def ui():
158
+ """Serve the demo dashboard."""
159
+ return FileResponse("ui/index.html")
160
+
161
+ @app.get("/schema/apps")
162
+ def app_schemas():
163
+ """Return the canonical action space per app — used by the UI."""
164
+ return {...} # maps app → list of operations + their arg schemas
165
+
166
+
167
+ # ------------------------------------------------------------------
168
+ # Entry point (required by openenv-core and [project.scripts])
169
+ # ------------------------------------------------------------------
170
+
171
+ def main():
172
+ uvicorn.run("server.app:app", host="0.0.0.0", port=8000)
173
+
174
+
175
+ if __name__ == "__main__":
176
+ main()
server/apps/base_app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class BaseApp(ABC):
2
+ APP_NAME: str = ""
3
+
4
+ # --- Core interface every app must implement ---
5
+ @abstractmethod
6
+ def initialize(self, records: List[Dict]) -> None:
7
+ """Load synthetic records into in-memory state."""
8
+
9
+ @abstractmethod
10
+ def execute(self, operation: str, args: Dict) -> Dict:
11
+ """Execute an operation. Returns {"success": bool, "data": ..., "message": str}"""
12
+
13
+ @abstractmethod
14
+ def get_state_view(self, max_rows: int = 5) -> str:
15
+ """Return agent-visible snapshot as a compact string."""
16
+
17
+ @abstractmethod
18
+ def count_open_items(self) -> int:
19
+ """Count pending/open work items (used by grader)."""
server/business_rules.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEFAULT_RULES = {
2
+ "sla_p0_minutes": 30, # P0 tickets: acknowledge within 30 min
3
+ "sla_p1_hours": 4, # P1 tickets: first response within 4h
4
+ "approval_threshold": 10_000, # $ above which manager approval needed
5
+ "max_tickets_per_agent": 10, # RBAC: agent capacity cap
6
+ "gdpr_max_days": 30, # compliance: GDPR ticket resolution
7
+ "rbac": {
8
+ "support": {"salesforce": ["read"], "jira": ["read", "create_issue"]},
9
+ "engineer": {"jira": ["*"], "zendesk": ["read"]},
10
+ "manager": {"*": ["*"]},
11
+ }
12
+ }
13
+
14
+ POLICY_DRIFT_EVENTS = {
15
+ "sla_tighten": {"sla_p0_minutes": 15, "sla_p1_hours": 2},
16
+ "approval_tighten": {"approval_threshold": 5_000},
17
+ "gdpr_expedite": {"gdpr_max_days": 7},
18
+ }
19
+
20
+ class BusinessRuleEngine:
21
+ def __init__(self):
22
+ self.rules = DEFAULT_RULES.copy()
23
+ self._violation_log: List[str] = []
24
+
25
+ def apply_policy_drift(self, event: str) -> None:
26
+ """Called mid-episode or at episode start to change rules."""
27
+ if event in POLICY_DRIFT_EVENTS:
28
+ self.rules.update(POLICY_DRIFT_EVENTS[event])
29
+
30
+ def check_action(self, action: OrgOSAction, context: Dict) -> Tuple[bool, str, float]:
31
+ """Returns (allowed, reason, penalty)."""
32
+ violations = []
33
+
34
+ # RBAC check
35
+ role = context.get("agent_role", "support")
36
+ app_perms = self.rules["rbac"].get(role, {})
37
+ allowed_ops = app_perms.get(action.app, app_perms.get("*", []))
38
+ if "*" not in allowed_ops and action.operation not in allowed_ops:
39
+ violations.append(f"RBAC: {role} cannot {action.operation} on {action.app}")
40
+ return False, violations[0], -0.25
41
+
42
+ # Approval threshold check
43
+ if action.operation in ("request_budget_approval", "update_deal_stage"):
44
+ amount = action.args.get("amount", 0)
45
+ if amount > self.rules["approval_threshold"] and not context.get("manager_approved"):
46
+ violations.append(f"Approval required: ${amount} > ${self.rules['approval_threshold']}")
47
+ return False, violations[0], -0.10
48
+
49
+ self._violation_log.extend(violations)
50
+ return True, "", 0.0
51
+
52
+ def check_sla(self, ticket: Dict, elapsed_minutes: float) -> Tuple[bool, float]:
53
+ """Returns (sla_met, penalty)."""
54
+ priority = ticket.get("priority", ticket.get("urgency", "p2"))
55
+ if priority in ("p0", "critical") and elapsed_minutes > self.rules["sla_p0_minutes"]:
56
+ return False, -0.15
57
+ return True, 0.0
58
+
59
+ def get_violations_this_step(self) -> List[str]:
60
+ v = self._violation_log.copy()
61
+ self._violation_log.clear()
62
+ return v
server/data_generator.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Synthetic dataset generation with a fixed seed for full reproducibility.
3
+ All datasets are generated purely from numpy/random — no external downloads.
4
+ """
5
+
6
+ import random
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ SEED = 42
11
+
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Task 1 — Employee records with missing values
15
+ # ---------------------------------------------------------------------------
16
+
17
+ def generate_task1_datasets():
18
+ """Returns (dirty_df, clean_df) for Task 1."""
19
+ rng = np.random.default_rng(SEED)
20
+ random.seed(SEED)
21
+
22
+ n = 100
23
+ departments = ["Engineering", "Marketing", "Sales", "HR", "Finance"]
24
+ first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
25
+ "Heidi", "Ivan", "Judy", "Karl", "Laura", "Mallory", "Niaj",
26
+ "Oscar", "Peggy", "Quinn", "Romeo", "Sybil", "Trent"]
27
+ last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson", "Davis",
28
+ "Miller", "Anderson", "Thomas", "Jackson"]
29
+
30
+ names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)]
31
+ ages = rng.integers(22, 60, size=n).astype(float)
32
+ salaries = rng.integers(40_000, 120_000, size=n).astype(float)
33
+ depts = rng.choice(departments, size=n)
34
+ experience = rng.integers(0, 30, size=n).astype(float)
35
+
36
+ clean_df = pd.DataFrame({
37
+ "name": names,
38
+ "age": ages,
39
+ "salary": salaries,
40
+ "department": depts,
41
+ "experience": experience,
42
+ })
43
+
44
+ dirty_df = clean_df.copy()
45
+
46
+ # Inject ~20 % NaN into age, salary, department
47
+ for col, frac in [("age", 0.20), ("salary", 0.20), ("department", 0.10)]:
48
+ idx = rng.choice(n, size=int(n * frac), replace=False)
49
+ dirty_df.loc[idx, col] = np.nan
50
+
51
+ return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Task 2 — Product catalog with format & duplicate issues
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def _scramble_phone(phone: str, rng) -> str:
59
+ digits = phone.replace("-", "")
60
+ fmt = rng.integers(0, 3)
61
+ if fmt == 0:
62
+ return digits # 5551234567
63
+ elif fmt == 1:
64
+ return f"({digits[:3]}){digits[3:]}" # (555)1234567
65
+ else:
66
+ return phone # 555-123-4567 (canonical)
67
+
68
+
69
+ def _scramble_date(date_str: str, rng) -> str:
70
+ dt = pd.to_datetime(date_str)
71
+ fmt = rng.integers(0, 3)
72
+ if fmt == 0:
73
+ return dt.strftime("%Y-%m-%d")
74
+ elif fmt == 1:
75
+ return dt.strftime("%b %d %Y")
76
+ else:
77
+ return dt.strftime("%d/%m/%Y")
78
+
79
+
80
+ def generate_task2_datasets():
81
+ """Returns (dirty_df, clean_df) for Task 2."""
82
+ rng = np.random.default_rng(SEED)
83
+ random.seed(SEED)
84
+
85
+ n = 200
86
+ categories = ["Electronics", "Clothing", "Food", "Books", "Toys"]
87
+
88
+ product_ids = [f"P{str(i).zfill(4)}" for i in range(1, n + 1)]
89
+ product_names = [f"Product_{i}" for i in range(1, n + 1)]
90
+ prices = np.round(rng.uniform(5.0, 500.0, size=n), 2)
91
+ categories_col = rng.choice(categories, size=n)
92
+ phones = [
93
+ f"{rng.integers(100,999)}-{rng.integers(100,999)}-{rng.integers(1000,9999)}"
94
+ for _ in range(n)
95
+ ]
96
+ days_offset = rng.integers(0, 1000, size=n)
97
+ dates = [
98
+ (pd.Timestamp("2020-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
99
+ for d in days_offset
100
+ ]
101
+
102
+ clean_df = pd.DataFrame({
103
+ "product_id": product_ids,
104
+ "product_name": product_names,
105
+ "price": prices,
106
+ "category": categories_col,
107
+ "phone": phones,
108
+ "listed_date": dates,
109
+ })
110
+
111
+ dirty_df = clean_df.copy()
112
+
113
+ # Scramble ~60 % of phone formats
114
+ phone_idx = rng.choice(n, size=int(n * 0.6), replace=False)
115
+ dirty_df.loc[phone_idx, "phone"] = [
116
+ _scramble_phone(dirty_df.loc[i, "phone"], rng) for i in phone_idx
117
+ ]
118
+
119
+ # Scramble ~60 % of date formats
120
+ date_idx = rng.choice(n, size=int(n * 0.6), replace=False)
121
+ dirty_df.loc[date_idx, "listed_date"] = [
122
+ _scramble_date(dirty_df.loc[i, "listed_date"], rng) for i in date_idx
123
+ ]
124
+
125
+ # Add 15 duplicate rows
126
+ dup_idx = rng.choice(n, size=15, replace=False)
127
+ dup_rows = dirty_df.iloc[dup_idx].copy()
128
+ dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True)
129
+
130
+ return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
131
+
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Task 3 — Customer database: full pipeline
135
+ # ---------------------------------------------------------------------------
136
+
137
+ def generate_task3_datasets():
138
+ """Returns (dirty_df, clean_df) for Task 3."""
139
+ rng = np.random.default_rng(SEED)
140
+ random.seed(SEED)
141
+
142
+ n = 300
143
+ countries = ["USA", "UK", "Canada", "Australia", "Germany"]
144
+ first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
145
+ "Heidi", "Ivan", "Judy"]
146
+ last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson"]
147
+
148
+ names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)]
149
+ ages = rng.integers(18, 75, size=n).astype(float)
150
+ purchase_amounts = np.round(rng.uniform(10.0, 500.0, size=n), 2)
151
+ countries_col = rng.choice(countries, size=n)
152
+ emails = [f"user{i}@example.com" for i in range(1, n + 1)]
153
+ days_offset = rng.integers(0, 730, size=n)
154
+ signup_dates = [
155
+ (pd.Timestamp("2022-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
156
+ for d in days_offset
157
+ ]
158
+
159
+ clean_df = pd.DataFrame({
160
+ "name": names,
161
+ "age": ages,
162
+ "purchase_amount": purchase_amounts,
163
+ "country": countries_col,
164
+ "email": emails,
165
+ "signup_date": signup_dates,
166
+ })
167
+
168
+ dirty_df = clean_df.copy()
169
+
170
+ # Missing values (~15 % in age, purchase_amount, country, signup_date)
171
+ for col, frac in [("age", 0.15), ("purchase_amount", 0.15),
172
+ ("country", 0.10), ("signup_date", 0.10)]:
173
+ idx = rng.choice(n, size=int(n * frac), replace=False)
174
+ dirty_df.loc[idx, col] = np.nan
175
+
176
+ # Outliers in purchase_amount (~3 %)
177
+ out_idx = rng.choice(n, size=int(n * 0.03), replace=False)
178
+ dirty_df.loc[out_idx, "purchase_amount"] = (
179
+ dirty_df.loc[out_idx, "purchase_amount"] * 10
180
+ )
181
+
182
+ # Mixed case in country (~40 %)
183
+ case_idx = rng.choice(n, size=int(n * 0.40), replace=False)
184
+ dirty_df.loc[case_idx, "country"] = dirty_df.loc[case_idx, "country"].str.lower()
185
+
186
+ # Mixed date formats (~50 %) — only scramble non-null entries
187
+ date_idx = rng.choice(n, size=int(n * 0.50), replace=False)
188
+ valid_date_idx = [i for i in date_idx if pd.notna(dirty_df.loc[i, "signup_date"])]
189
+ for i in valid_date_idx:
190
+ dirty_df.loc[i, "signup_date"] = _scramble_date(dirty_df.loc[i, "signup_date"], rng)
191
+
192
+ # 20 duplicate rows
193
+ dup_idx = rng.choice(n, size=20, replace=False)
194
+ dup_rows = dirty_df.iloc[dup_idx].copy()
195
+ dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True)
196
+
197
+ return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
198
+
199
+
200
+ def generate_jira_records(n=50, seed=42) -> List[Dict]:
201
+ """50 engineering tickets with priority, assignee, status, linked_ticket."""
202
+
203
+ def generate_zendesk_records(n=40, seed=42) -> List[Dict]:
204
+ """40 support tickets with urgency, agent_email, state, customer_id."""
205
+
206
+ def generate_salesforce_records(n=30, seed=42) -> List[Dict]:
207
+ """30 accounts with deal_stage, health, owner_name, arr."""
208
+
209
+ def generate_workday_records(n=20, seed=42) -> List[Dict]:
210
+ """20 employee/HR records with level, manager_id, resolution."""
211
+
212
+ def generate_episode_data(workflow_id: str, seed: int = 42) -> Dict[str, List[Dict]]:
213
+ """Generate correlated data for a full episode across all 4 apps.
214
+ Ensures tickets in Zendesk reference customers in Salesforce, etc."""
server/environment.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class OrgOSEnvironment:
2
+ MAX_STEPS = {"A": 15, "B": 20, "C": 18}
3
+ WORKFLOWS = ["A", "B", "C"]
4
+
5
+ def __init__(self):
6
+ self._drift = SchemaDriftEngine(seed=42)
7
+ self._rules = BusinessRuleEngine()
8
+ self._workflow = WorkflowEngine()
9
+ self._apps: Dict[str, BaseApp] = {
10
+ "jira": JiraApp(self._drift),
11
+ "zendesk": ZendeskApp(self._drift),
12
+ "salesforce": SalesforceApp(self._drift),
13
+ "workday": WorkdayApp(self._drift),
14
+ }
15
+ self._episode_num = 0
16
+ self._episode_id = ""
17
+ self._workflow_id = "A"
18
+ self._step_count = 0
19
+ self._last_score = 0.001
20
+ self._policy_drift_applied = False
21
+
22
+ # Reward component trackers
23
+ self._wf_score = 0.0 # workflow completion
24
+ self._rule_score = 1.0 # compliance (starts perfect, penalized on violation)
25
+ self._schema_score = 0.0 # schema adaptation successes
26
+ self._efficiency = 1.0 # degrades with no-ops
27
+ self._policy_score = 0.0 # policy drift handling
28
+
29
+ def reset(self, workflow_id: Optional[str] = None) -> OrgOSObservation:
30
+ self._episode_num += 1
31
+ self._episode_id = str(uuid.uuid4())
32
+ self._workflow_id = workflow_id or self.WORKFLOWS[(self._episode_num - 1) % 3]
33
+ self._step_count = 0
34
+ self._last_score = 0.001
35
+ self._rule_score = 1.0
36
+ self._wf_score = 0.0
37
+ self._schema_score = 0.0
38
+ self._efficiency = 1.0
39
+ self._policy_score = 0.0
40
+ self._policy_drift_applied = False
41
+
42
+ # Sample schema versions for this episode
43
+ self._drift.sample_for_episode(self._episode_num)
44
+
45
+ # Possibly activate policy drift (every 3rd episode)
46
+ self._rules = BusinessRuleEngine()
47
+ if self._episode_num % 3 == 0:
48
+ self._rules.apply_policy_drift("sla_tighten")
49
+ self._policy_drift_applied = True
50
+
51
+ # Load fresh synthetic data into each app
52
+ records = generate_episode_data(self._workflow_id, seed=42 + self._episode_num)
53
+ for app_name, app in self._apps.items():
54
+ app.initialize(records[app_name])
55
+
56
+ # Start workflow tracking
57
+ self._workflow.start(self._workflow_id)
58
+
59
+ return self._build_obs(0.001, False, "Episode started. Study the workflow goal and schema hints.")
60
+
61
+ def step(self, action: OrgOSAction) -> OrgOSObservation:
62
+ self._step_count += 1
63
+ old_score = self._last_score
64
+ extra_penalty = 0.0
65
+
66
+ # 1. Validate app exists
67
+ if action.app not in self._apps:
68
+ return self._build_obs(old_score - 0.05, False, f"Unknown app '{action.app}'")
69
+
70
+ # 2. Business rule check (RBAC, approvals)
71
+ ctx = {"agent_role": "support", "manager_approved": False}
72
+ allowed, reason, rule_penalty = self._rules.check_action(action, ctx)
73
+ if not allowed:
74
+ self._rule_score = max(0.0, self._rule_score - 0.08)
75
+ extra_penalty = rule_penalty
76
+ return self._build_obs(
77
+ max(-0.25, old_score + extra_penalty),
78
+ False, f"Rule violation: {reason}"
79
+ )
80
+
81
+ # 3. Execute on app
82
+ result = self._apps[action.app].execute(action.operation, action.args)
83
+ if not result["success"]:
84
+ self._efficiency -= 0.02 # penalize failed/no-op actions
85
+ return self._build_obs(old_score - 0.01, False, result["message"])
86
+
87
+ # 4. Check schema drift adaptation
88
+ # If agent used canonical field names on a v2/v3 schema → penalize
89
+ if result.get("schema_error"):
90
+ extra_penalty -= 0.20
91
+ return self._build_obs(old_score - 0.20, False,
92
+ f"Stale schema: field '{result['schema_error']}' not found in current schema")
93
+ elif result.get("schema_adapted"):
94
+ # Agent correctly used drifted field name → bonus
95
+ self._schema_score = min(1.0, self._schema_score + 0.1)
96
+
97
+ # 5. Re-evaluate workflow completion
98
+ self._wf_score = self._workflow.evaluate(self._apps)
99
+
100
+ # 6. Check SLA violations
101
+ sla_ok, sla_pen = self._rules.check_sla(result.get("ticket", {}),
102
+ self._step_count * 2.5) # 2.5 min per step
103
+ if not sla_ok:
104
+ extra_penalty += sla_pen
105
+ self._rule_score = max(0.0, self._rule_score - 0.05)
106
+
107
+ # 7. Compute composite score
108
+ new_score = self._compute_score()
109
+ delta = new_score - old_score + extra_penalty
110
+ self._last_score = max(0.001, min(0.999, new_score))
111
+
112
+ # 8. Terminal condition
113
+ done = (self._wf_score >= 0.95 or
114
+ self._step_count >= self.MAX_STEPS[self._workflow_id])
115
+ if done and self._wf_score >= 0.95:
116
+ delta += 0.20 # terminal bonus
117
+
118
+ return self._build_obs(delta, done, result["message"])
119
+
120
+ def _compute_score(self) -> float:
121
+ raw = (
122
+ 0.30 * self._wf_score +
123
+ 0.25 * self._rule_score +
124
+ 0.20 * self._schema_score +
125
+ 0.15 * self._efficiency +
126
+ 0.10 * self._policy_score
127
+ )
128
+ return max(0.001, min(0.999, raw))
129
+
130
+ def state(self) -> OrgOSState:
131
+ return OrgOSState(
132
+ episode_id=self._episode_id,
133
+ workflow_id=self._workflow_id,
134
+ schema_versions=self._drift._versions,
135
+ step_count=self._step_count,
136
+ max_steps=self.MAX_STEPS.get(self._workflow_id, 15),
137
+ rule_violation_count=len(self._rules._violation_log),
138
+ workflow_completion=self._wf_score,
139
+ rule_compliance_rate=self._rule_score,
140
+ policy_drift_active=self._policy_drift_applied,
141
+ )
server/schema_drift.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Canonical → actual field names per app per schema version
2
+ SCHEMA_MAP = {
3
+ "jira": {
4
+ "v1": {"priority": "priority", "assignee": "assignee", "status": "status"},
5
+ "v2": {"priority": "severity", "assignee": "owner", "status": "state"},
6
+ "v3": {"priority": "urgency_level", "assignee": "assigned_to", "status": "current_state",
7
+ "sla_deadline": "due_by"}, # v3 adds a new field
8
+ },
9
+ "zendesk": {
10
+ "v1": {"urgency": "urgency", "agent_email": "agent_email", "state": "state"},
11
+ "v2": {"urgency": "priority", "agent_email": "handler", "state": "ticket_state"},
12
+ "v3": {"urgency": "impact_level", "agent_email": "assigned_agent","state": "resolution_status"},
13
+ },
14
+ "salesforce": {
15
+ "v1": {"deal_stage": "deal_stage", "health": "health", "owner": "owner_name"},
16
+ "v2": {"deal_stage": "pipeline_stage","health": "account_health", "owner": "account_owner"},
17
+ "v3": {"deal_stage": "stage", "health": "risk_score", "owner": "rep_email",
18
+ "arr": "annual_recurring_revenue"},
19
+ },
20
+ "workday": {
21
+ "v1": {"level": "level", "manager_id": "manager_id", "status": "resolution"},
22
+ "v2": {"level": "job_level", "manager_id": "reports_to", "status": "request_status"},
23
+ "v3": {"level": "seniority", "manager_id": "direct_manager","status": "approval_state"},
24
+ },
25
+ }
26
+
27
+ class SchemaDriftEngine:
28
+ def __init__(self, seed: int = 42):
29
+ self._seed = seed
30
+ self._versions: Dict[str, str] = {} # app → "v1"/"v2"/"v3"
31
+
32
+ def sample_for_episode(self, episode_num: int) -> None:
33
+ """Sample schema versions deterministically per episode."""
34
+ rng = random.Random(self._seed + episode_num)
35
+ self._versions = {app: rng.choice(["v1", "v2", "v3"]) for app in SCHEMA_MAP}
36
+
37
+ def translate_record(self, record: Dict, app: str) -> Dict:
38
+ """Rename canonical field names → current schema's field names."""
39
+ version = self._versions.get(app, "v1")
40
+ mapping = SCHEMA_MAP[app][version]
41
+ return {mapping.get(k, k): v for k, v in record.items()}
42
+
43
+ def get_hints(self) -> Dict[str, str]:
44
+ """Return partial schema hints visible in observation.
45
+ Only reveal 1 random field per app (agent must probe for the rest)."""
46
+ hints = {}
47
+ rng = random.Random(self._seed)
48
+ for app, version in self._versions.items():
49
+ mapping = SCHEMA_MAP[app][version]
50
+ # Reveal only fields that actually changed (v2/v3)
51
+ changed = {f"{app}.{k}": v for k, v in mapping.items() if k != v}
52
+ if changed:
53
+ key = rng.choice(list(changed.keys()))
54
+ hints[key] = changed[key]
55
+ return hints
server/tasks/__init__.py ADDED
File without changes
server/tasks/task1_missing.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 1 — Easy: Fill Missing Values
3
+ Objective: Fill all NaN values in the employee records DataFrame.
4
+ Score: 1.0 - (remaining_nulls / original_nulls)
5
+ """
6
+
7
+ from server.data_generator import generate_task1_datasets
8
+
9
+ TASK_ID = 1
10
+ MAX_STEPS = 20
11
+ DESCRIPTION = (
12
+ "Task 1 (Easy) — Fill Missing Values\n"
13
+ "You have an employee records dataset with missing values (NaN) in "
14
+ "'age', 'salary', and 'department' columns. "
15
+ "Your goal is to fill all missing values so the dataset is complete.\n\n"
16
+ "Available operation: fill_missing\n"
17
+ " params.strategy: 'median' | 'mean' | 'mode' | 'constant'\n"
18
+ " params.value: (required when strategy='constant') the fill value\n"
19
+ "Example action: {\"operation\": \"fill_missing\", \"column\": \"age\", \"params\": {\"strategy\": \"median\"}}"
20
+ )
21
+
22
+
23
+ def load():
24
+ """Return (dirty_df, clean_df, original_null_count)."""
25
+ dirty, clean = generate_task1_datasets()
26
+ original_nulls = int(dirty.isnull().sum().sum())
27
+ return dirty.copy(), clean, original_nulls
28
+
29
+
30
+ def score(current_df, original_nulls: int) -> float:
31
+ """Score in [0, 1]: fraction of nulls filled."""
32
+ if original_nulls == 0:
33
+ return 0.99
34
+ remaining = int(current_df.isnull().sum().sum())
35
+ return round(max(0.01, min(0.99, 1.0 - remaining / original_nulls)), 4)
36
+
37
+
38
+ def count_errors(current_df) -> int:
39
+ return int(current_df.isnull().sum().sum())
server/tasks/task2_format.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 2 — Medium: Fix Formats + Remove Duplicates
3
+ Objective: Standardise phone & date formats and drop duplicate rows.
4
+ Score: weighted average of format_score (0.7) + dupe_score (0.3)
5
+ """
6
+
7
+ import re
8
+ import pandas as pd
9
+ from server.data_generator import generate_task2_datasets
10
+
11
+ TASK_ID = 2
12
+ MAX_STEPS = 30
13
+ DESCRIPTION = (
14
+ "Task 2 (Medium) — Fix Formats and Remove Duplicates\n"
15
+ "You have a product catalog with:\n"
16
+ " • Phone numbers in mixed formats (need: NNN-NNN-NNNN)\n"
17
+ " • Dates in mixed formats (need: YYYY-MM-DD)\n"
18
+ " • Duplicate rows (~15)\n\n"
19
+ "Available operations:\n"
20
+ " fix_format — column: 'phone' | 'listed_date'\n"
21
+ " drop_duplicates — no column needed\n\n"
22
+ "Example actions:\n"
23
+ ' {"operation": "fix_format", "column": "phone"}\n'
24
+ ' {"operation": "fix_format", "column": "listed_date"}\n'
25
+ ' {"operation": "drop_duplicates"}'
26
+ )
27
+
28
+ PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")
29
+ DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
30
+
31
+
32
+ def load():
33
+ dirty, clean = generate_task2_datasets()
34
+ original_phone_issues = int((~dirty["phone"].str.match(PHONE_RE)).sum())
35
+ original_date_issues = int((~dirty["listed_date"].apply(
36
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
37
+ )).sum())
38
+ original_dupes = len(dirty) - len(dirty.drop_duplicates())
39
+ meta = {
40
+ "orig_phone": original_phone_issues,
41
+ "orig_date": original_date_issues,
42
+ "orig_dupes": original_dupes,
43
+ }
44
+ return dirty.copy(), clean, meta
45
+
46
+
47
+ def score(current_df, meta: dict) -> float:
48
+ phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
49
+ date_issues = int((~current_df["listed_date"].apply(
50
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
51
+ )).sum())
52
+ dupes = len(current_df) - len(current_df.drop_duplicates())
53
+
54
+ phone_score = 1.0 - phone_issues / max(meta["orig_phone"], 1)
55
+ date_score = 1.0 - date_issues / max(meta["orig_date"], 1)
56
+ dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
57
+
58
+ combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
59
+ return round(max(0.01, min(0.99, combined)), 4)
60
+
61
+
62
+ def count_errors(current_df, meta: dict) -> int:
63
+ phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
64
+ date_issues = int((~current_df["listed_date"].apply(
65
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
66
+ )).sum())
67
+ dupes = len(current_df) - len(current_df.drop_duplicates())
68
+ return phone_issues + date_issues + dupes
server/tasks/task3_pipeline.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 3 — Hard: Full Cleaning Pipeline
3
+ Objective: Fix missing values, remove duplicates, handle outliers, standardise
4
+ country capitalisation and date formats.
5
+ Score: equal-weight average of 4 sub-scores.
6
+ """
7
+
8
+ import re
9
+ import numpy as np
10
+ import pandas as pd
11
+ from server.data_generator import generate_task3_datasets
12
+
13
+ TASK_ID = 3
14
+ MAX_STEPS = 40
15
+ DESCRIPTION = (
16
+ "Task 3 (Hard) — Full Cleaning Pipeline\n"
17
+ "You have a customer database with multiple issues:\n"
18
+ " 1. Missing values in 'age', 'purchase_amount', 'country', 'signup_date'\n"
19
+ " 2. ~20 duplicate rows\n"
20
+ " 3. Outliers in 'purchase_amount' (injected values ~10x normal)\n"
21
+ " 4. Mixed case in 'country' (need: title case, e.g. 'Usa' → 'USA')\n"
22
+ " 5. Mixed date formats in 'signup_date' (need: YYYY-MM-DD)\n\n"
23
+ "Available operations:\n"
24
+ " fill_missing — column + params.strategy ('median'|'mean'|'mode'|'constant')\n"
25
+ " drop_duplicates — no column needed\n"
26
+ " drop_outliers — column (numeric); uses IQR method\n"
27
+ " fix_format — column: 'country' | 'signup_date'\n"
28
+ " fix_dtype — column + params.dtype ('float'|'int'|'str')\n\n"
29
+ "Example actions:\n"
30
+ ' {"operation": "fill_missing", "column": "age", "params": {"strategy": "median"}}\n'
31
+ ' {"operation": "drop_duplicates"}\n'
32
+ ' {"operation": "drop_outliers", "column": "purchase_amount"}\n'
33
+ ' {"operation": "fix_format", "column": "signup_date"}\n'
34
+ ' {"operation": "fix_format", "column": "country"}'
35
+ )
36
+
37
+ DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
38
+ VALID_COUNTRIES = {"USA", "UK", "Canada", "Australia", "Germany"}
39
+
40
+
41
+ def load():
42
+ dirty, clean = generate_task3_datasets()
43
+ orig_nulls = int(dirty.isnull().sum().sum())
44
+ orig_dupes = len(dirty) - len(dirty.drop_duplicates())
45
+
46
+ # Outlier baseline: count rows where purchase_amount > Q3 + 3*IQR
47
+ pa = dirty["purchase_amount"].dropna()
48
+ q1, q3 = pa.quantile(0.25), pa.quantile(0.75)
49
+ iqr = q3 - q1
50
+ orig_outliers = int((pa > q3 + 3 * iqr).sum())
51
+
52
+ orig_country_issues = int((~dirty["country"].isin(VALID_COUNTRIES) &
53
+ dirty["country"].notna()).sum())
54
+ orig_date_issues = int((~dirty["signup_date"].apply(
55
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
56
+ )).sum())
57
+
58
+ meta = {
59
+ "orig_nulls": orig_nulls,
60
+ "orig_dupes": orig_dupes,
61
+ "orig_outliers": max(orig_outliers, 1),
62
+ "orig_country_issues": max(orig_country_issues, 1),
63
+ "orig_date_issues": max(orig_date_issues, 1),
64
+ "q1": q1, "q3": q3, "iqr": iqr,
65
+ }
66
+ return dirty.copy(), clean, meta
67
+
68
+
69
+ def score(current_df, meta: dict) -> float:
70
+ remaining_nulls = int(current_df.isnull().sum().sum())
71
+ remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
72
+
73
+ pa = current_df["purchase_amount"].dropna()
74
+ remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
75
+
76
+ remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
77
+ current_df["country"].notna()).sum())
78
+ remaining_dates = int((~current_df["signup_date"].apply(
79
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
80
+ )).sum())
81
+
82
+ null_score = 1.0 - remaining_nulls / max(meta["orig_nulls"], 1)
83
+ dupe_score = 1.0 - remaining_dupes / max(meta["orig_dupes"], 1)
84
+ outlier_score = 1.0 - remaining_outliers / meta["orig_outliers"]
85
+ country_score = 1.0 - remaining_country / meta["orig_country_issues"]
86
+ date_score = 1.0 - remaining_dates / meta["orig_date_issues"]
87
+
88
+ combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
89
+ + 0.175 * country_score + 0.175 * date_score
90
+ return round(max(0.01, min(0.99, combined)), 4)
91
+
92
+
93
+ def count_errors(current_df, meta: dict) -> int:
94
+ remaining_nulls = int(current_df.isnull().sum().sum())
95
+ remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
96
+ pa = current_df["purchase_amount"].dropna()
97
+ remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
98
+ remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
99
+ current_df["country"].notna()).sum())
100
+ remaining_dates = int((~current_df["signup_date"].apply(
101
+ lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
102
+ )).sum())
103
+ return remaining_nulls + remaining_dupes + remaining_outliers + \
104
+ remaining_country + remaining_dates
server/workflow_engine.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @dataclass
2
+ class WorkflowStep:
3
+ step_id: str
4
+ description: str
5
+ app: str
6
+ operation: str
7
+ # Callable that checks if this step was completed given the app states
8
+ completion_check: Callable[[Dict[str, "BaseApp"]], bool]
9
+
10
+ # Workflow A: Customer Bug → Engineering Fix
11
+ WORKFLOW_A_STEPS = [
12
+ WorkflowStep("A1", "Acknowledge ticket in Zendesk",
13
+ "zendesk", "acknowledge_ticket",
14
+ lambda apps: apps["zendesk"].ticket_acknowledged()),
15
+
16
+ WorkflowStep("A2", "Escalate to Jira — create linked issue",
17
+ "jira", "create_issue",
18
+ lambda apps: apps["jira"].has_linked_issue()),
19
+
20
+ WorkflowStep("A3", "Check if customer is paying (Salesforce lookup)",
21
+ "salesforce", "get_account",
22
+ lambda apps: apps["salesforce"].account_checked()),
23
+
24
+ WorkflowStep("A4", "Assign correct engineer in Jira based on priority",
25
+ "jira", "assign_owner",
26
+ lambda apps: apps["jira"].issue_assigned()),
27
+
28
+ WorkflowStep("A5", "Log SLA status in Workday",
29
+ "workday", "log_sla_event",
30
+ lambda apps: apps["workday"].sla_logged()),
31
+ ]
32
+
33
+ # Workflow B: Employee Onboarding
34
+ WORKFLOW_B_STEPS = [
35
+ WorkflowStep("B1", "Create employee record in Workday", ...),
36
+ WorkflowStep("B2", "Provision Jira access based on role", ...),
37
+ WorkflowStep("B3", "Add to Salesforce team by territory", ...),
38
+ WorkflowStep("B4", "Create Zendesk support profile if customer-facing", ...),
39
+ ]
40
+
41
+ # Workflow C: Churn Risk Alert
42
+ WORKFLOW_C_STEPS = [
43
+ WorkflowStep("C1", "Flag at-risk account in Salesforce", ...),
44
+ WorkflowStep("C2", "Query recent support volume in Zendesk", ...),
45
+ WorkflowStep("C3", "Check outstanding bugs in Jira", ...),
46
+ WorkflowStep("C4", "Synthesize churn score and assign intervention owner", ...),
47
+ ]
48
+
49
+ class WorkflowEngine:
50
+ WORKFLOWS = {"A": WORKFLOW_A_STEPS, "B": WORKFLOW_B_STEPS, "C": WORKFLOW_C_STEPS}
51
+
52
+ def start(self, workflow_id: str) -> None:
53
+ self._steps = self.WORKFLOWS[workflow_id].copy()
54
+ self._completed: List[str] = []
55
+
56
+ def evaluate(self, apps: Dict) -> float:
57
+ """Check all steps and return completion ratio (0.0-1.0)."""
58
+ completed = sum(1 for s in self._steps if s.completion_check(apps))
59
+ self._completed = [s.step_id for s in self._steps if s.completion_check(apps)]
60
+ return completed / len(self._steps)
61
+
62
+ def get_pending(self) -> List[str]:
63
+ return [s.description for s in self._steps if s.step_id not in self._completed]
uv.lock ADDED
The diff for this file is too large to render. See raw diff