Spaces:

srishtichugh
/

orgOS

Sleeping

App Files Files Community

Taniieeee83 committed on Apr 21

Commit

4719066

0 Parent(s):

changed till step 8

Browse files

Files changed (27) hide show

.dockerignore +44 -0
.gitattributes +40 -0
.gitignore +11 -0
Dockerfile +26 -0
README.md +321 -0
baseline_scores.json +8 -0
client.py +114 -0
env.example +23 -0
inference.py +286 -0
inference_log.txt +0 -0
models.py +45 -0
openenv.yaml +73 -0
pyproject.toml +26 -0
requirements.txt +9 -0
server/__init__.py +0 -0
server/app.py +176 -0
server/apps/base_app.py +19 -0
server/business_rules.py +62 -0
server/data_generator.py +214 -0
server/environment.py +141 -0
server/schema_drift.py +55 -0
server/tasks/__init__.py +0 -0
server/tasks/task1_missing.py +39 -0
server/tasks/task2_format.py +68 -0
server/tasks/task3_pipeline.py +104 -0
server/workflow_engine.py +63 -0
uv.lock +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,44 @@

+# ---- Git ----
+.git
+.gitignore
+# ---- Python cache ----
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pytest_cache/
+# ---- Virtual environments ----
+venv/
+.env/
+.venv/
+# ---- Environment files ----
+.env
+.env.*
+# ---- OS files ----
+.DS_Store
+Thumbs.db
+# ---- Logs ----
+*.log
+# ---- Model / large local files (if any) ----
+checkpoints/
+models/
+*.pt
+*.pth
+*.bin
+# ---- IDE files ----
+.vscode/
+.idea/
+# ---- Node (if frontend exists) ----
+node_modules/
+# ---- Docker ----
+Dockerfile*
+docker-compose.yml

.gitattributes ADDED Viewed

	@@ -0,0 +1,40 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+training/orgos-training/orgos_lora_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.venv/
+venv/
+.env
+*.env
+.DS_Store
+.claude/

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.11-slim
+# Non-root user for HuggingFace Spaces compatibility
+RUN useradd -m -u 1000 appuser
+WORKDIR /app
+# Install dependencies first (layer cache)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy project files
+COPY . .
+# Switch to non-root
+RUN chown -R appuser:appuser /app
+USER appuser
+EXPOSE 8000
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+# server.app:app — runs server/app.py from /app working directory
+# models.py, client.py, inference.py live at /app root (on PYTHONPATH automatically)
+CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]

README.md ADDED Viewed

	@@ -0,0 +1,321 @@

+---
+title: Data Cleaning Environment
+emoji: 🧹
+colorFrom: blue
+colorTo: green
+sdk: docker
+pinned: false
+app_port: 8000
+tags:
+  - openenv
+  - rl
+  - data-cleaning
+---
+# Data Cleaning OpenEnv
+A **real-world data cleaning environment** for training and evaluating AI agents.
+An agent interacts with a dirty pandas DataFrame through a standard `reset() / step() / state()` HTTP API, learning to fix common data quality problems — missing values, duplicate rows, inconsistent formats, statistical outliers, and dtype errors — across three progressively harder tasks.
+🤗 **Live HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
+📖 **Interactive API docs:** https://srishtichugh-openenv-hack.hf.space/docs
+✅ **Health check:** https://srishtichugh-openenv-hack.hf.space/health
+---
+## Environment Description & Motivation
+Real-world datasets are almost never clean. Data engineers routinely spend 60–80 % of their time on data cleaning tasks: filling missing values with statistically appropriate strategies, removing duplicates, standardising inconsistent formats (phone numbers, dates, country names), and detecting extreme outliers.
+This environment turns those tasks into a reinforcement learning challenge with:
+- **Deterministic, programmatic graders** — ground-truth clean DataFrames are generated with a fixed seed; every reward signal is reproducible.
+- **Meaningful partial rewards** — every step emits a delta reward proportional to how much of the dataset it cleaned, so the agent receives useful signal throughout the episode rather than only at the end.
+- **Three difficulty levels** — easy, medium, hard — letting agents learn a curriculum from simple null-filling up to full multi-issue pipelines.
+- **No external data downloads** — all datasets are generated synthetically via `numpy` + `Faker` with `seed=42`.
+---
+## Action Space
+Actions are JSON objects sent to `POST /step`.
+| `operation` | Required `column` | `params` | Description |
+|---|---|---|---|
+| `fill_missing` | ✅ | `{"strategy": "median\|mean\|mode\|constant", "value": ...}` | Fill NaN values in a column |
+| `drop_duplicates` | ❌ | — | Remove all duplicate rows |
+| `fix_format` | ✅ | — | Standardise phone/date/country format |
+| `replace_value` | ✅ | `{"old": ..., "new": ...}` | Replace a specific value |
+| `drop_outliers` | ✅ | — | Remove IQR outliers from a numeric column |
+| `fix_dtype` | ✅ | `{"dtype": "float\|int\|str"}` | Cast column to correct dtype |
+**Format rules enforced by `fix_format`:**
+| Column | Target format |
+|---|---|
+| `phone` | `NNN-NNN-NNNN` |
+| `listed_date` / `signup_date` | `YYYY-MM-DD` |
+| `country` | Canonical name — upper-case abbreviation (`USA`, `UK`) or title-cased full name (`Canada`, `Australia`, `Germany`) |
+**Example actions:**
+```json
+{"operation": "fill_missing",    "column": "salary",          "params": {"strategy": "median"}}
+{"operation": "fill_missing",    "column": "department",      "params": {"strategy": "mode"}}
+{"operation": "drop_duplicates"}
+{"operation": "fix_format",      "column": "phone"}
+{"operation": "fix_format",      "column": "signup_date"}
+{"operation": "drop_outliers",   "column": "purchase_amount"}
+```
+---
+## Observation Space
+Every `POST /reset` and `POST /step` returns:
+```json
+{
+  "observation": {
+    "done":             false,
+    "reward":           0.40,
+    "data_preview":     "name,age,salary,...\n...",
+    "data_shape":       [100, 5],
+    "missing_counts":   {"age": 20, "salary": 20, "department": 10},
+    "duplicate_count":  0,
+    "dtype_issues":     {},
+    "task_description": "Task 1 (Easy) — Fill Missing Values\n...",
+    "message":          "Filled 20 missing values in 'age' using median.",
+    "step_count":       1,
+    "current_score":    0.4000
+  },
+  "reward": 0.40,
+  "done":   false,
+  "info":   {}
+}
+```
+| Field | Type | Description |
+|---|---|---|
+| `done` | bool | Episode finished (score ≥ 0.95 or max steps reached) |
+| `reward` | float | Per-step delta reward (see Reward Function) |
+| `data_preview` | string | First 10 rows of current DataFrame as CSV |
+| `data_shape` | [int, int] | Current `[rows, cols]` |
+| `missing_counts` | object | `{column: null_count}` for columns with NaN |
+| `duplicate_count` | int | Number of duplicate rows |
+| `dtype_issues` | object | `{column: issue_description}` for suspected dtype mismatches |
+| `task_description` | string | Full task instructions with available operations |
+| `message` | string | Human-readable result of the last action |
+| `step_count` | int | Steps taken in this episode |
+| `current_score` | float | Running grader score 0.0 – 1.0 |
+---
+## State Space
+`GET /state` returns episode metadata (does not modify state):
+```json
+{
+  "episode_id":      "a8f026a9-...",
+  "task_id":         1,
+  "step_count":      2,
+  "max_steps":       20,
+  "total_errors":    50,
+  "errors_remaining": 30
+}
+```
+---
+## Tasks
+### Task 1 — Fill Missing Values *(Easy)*
+| Property | Value |
+|---|---|
+| Dataset | 100-row employee records (name, age, salary, department, experience) |
+| Issues | ~20 % NaN in `age`, `salary`; ~10 % NaN in `department` |
+| Goal | Fill all missing values |
+| Valid operations | `fill_missing` |
+| Grader | `1.0 − remaining_nulls / original_nulls` |
+| Max steps | 20 |
+| Optimal steps | 3 (one per affected column) |
+### Task 2 — Fix Formats + Remove Duplicates *(Medium)*
+| Property | Value |
+|---|---|
+| Dataset | 215-row product catalog (product_id, price, category, phone, listed_date) |
+| Issues | ~60 % phone numbers in mixed formats, ~60 % dates in mixed formats, 15 duplicate rows |
+| Goal | Standardise all phone/date formats and remove duplicates |
+| Valid operations | `fix_format`, `drop_duplicates` |
+| Grader | `0.35 × phone_score + 0.35 × date_score + 0.30 × dupe_score` |
+| Max steps | 30 |
+| Optimal steps | 3 |
+### Task 3 — Full Cleaning Pipeline *(Hard)*
+| Property | Value |
+|---|---|
+| Dataset | 320-row customer database (name, age, purchase_amount, country, email, signup_date) |
+| Issues | Missing values (4 cols), 20 duplicate rows, outliers in `purchase_amount` (~3× normal), mixed country capitalisation, mixed date formats |
+| Goal | Fix all issues end-to-end |
+| Valid operations | All 6 operations |
+| Grader | `0.25×null + 0.20×dupe + 0.20×outlier + 0.175×country + 0.175×date` |
+| Max steps | 40 |
+| Optimal steps | 8 |
+---
+## Reward Function
+| Scenario | Reward |
+|---|---|
+| Score improves (delta > 0) | `new_score − old_score` (positive) |
+| Operation had no effect | `−0.01` |
+| Invalid operation / bad column | `−0.05` |
+| Episode completed (score ≥ 0.95) | `delta + 0.20` terminal bonus |
+Rewards are bounded to **[−0.05, 1.2]**. A partial reward is emitted on every step, giving the agent dense signal throughout the episode.
+---
+## API Endpoints
+| Method | Path | Description |
+|---|---|---|
+| `GET` | `/health` | Health check → `{"status": "healthy"}` |
+| `POST` | `/reset` | Start episode. Body: `{"task_id": 1\|2\|3}` (optional; default: round-robin) |
+| `POST` | `/step` | Execute action. Body: action JSON |
+| `GET` | `/state` | Get episode metadata (does not modify state) |
+| `GET` | `/metadata` | Environment name, version, task list |
+| `GET` | `/schema` | Full action / observation / state JSON schemas |
+| `GET` | `/docs` | Interactive Swagger UI |
+---
+## Baseline Scores
+| Task | Difficulty | Score |
+|---|---|---|
+| 1 — Fill Missing Values | Easy | 0.999 |
+| 2 — Fix Formats + Duplicates | Medium | 0.999 |
+| 3 — Full Cleaning Pipeline | Hard | 0.999 |
+| **Average** | — | **0.999** |
+*Produced by `google/gemma-3-27b-it` via NVIDIA NIM, `temperature=0`. Full step-by-step agent logs: `inference_log.txt`.*
+---
+## Setup & Usage
+### Prerequisites
+- Python 3.11+
+- Docker (for containerised deployment)
+### Local — Python
+```bash
+# 1. Clone and install dependencies
+git clone https://github.com/Tanvi51204/openEnv.git
+cd openEnv
+pip install -r requirements.txt
+# 2. Start the server
+uvicorn server.app:app --host 0.0.0.0 --port 8000
+# 3. Open Swagger UI
+open http://localhost:8000/docs
+```
+### Local — Docker
+```bash
+docker build -t data-cleaning-env .
+docker run -p 8000:8000 data-cleaning-env
+```
+### Quick API test
+```bash
+# Health
+curl http://localhost:8000/health
+# Start Task 1
+curl -X POST http://localhost:8000/reset \
+  -H "Content-Type: application/json" \
+  -d '{"task_id": 1}'
+# Fill missing values
+curl -X POST http://localhost:8000/step \
+  -H "Content-Type: application/json" \
+  -d '{"operation": "fill_missing", "column": "salary", "params": {"strategy": "median"}}'
+```
+### Python client
+```python
+from client import DataCleaningEnvClient
+from models import DataCleaningAction
+with DataCleaningEnvClient("http://localhost:8000") as env:
+    result = env.reset(task_id=1)
+    print(result.observation.missing_counts)   # {'age': 20, 'salary': 20, 'department': 10}
+    action = DataCleaningAction(
+        operation="fill_missing",
+        column="salary",
+        params={"strategy": "median"},
+    )
+    result = env.step(action)
+    print(result.observation.current_score)    # 0.4
+    print(result.reward)                       # 0.4
+```
+### Run baseline inference
+```bash
+export API_BASE_URL="https://api.openai.com/v1"
+export MODEL_NAME="gpt-4o-mini"
+export HF_TOKEN="sk-..."          # your API key
+export ENV_URL="http://localhost:8000"
+python inference.py
+```
+Prints `[START]` / `[STEP]` / `[END]` lines to stdout and writes the final scores to `baseline_scores.json`.
+### Environment variables
+| Variable | Default | Description |
+|---|---|---|
+| `API_BASE_URL` | `https://api.openai.com/v1` | LLM API endpoint (OpenAI-compatible) |
+| `MODEL_NAME` | `gpt-4o-mini` | Model identifier |
+| `HF_TOKEN` | — | API key for LLM calls |
+| `ENV_URL` | `http://localhost:8000` | Environment server URL |
+---
+## Project Structure
+```
+openenv-data-cleaning/
+├── models.py              Pydantic contracts — Action / Observation / State
+├── client.py              Sync HTTP client (reset / step / state / health)
+├── inference.py           Baseline LLM agent with [START]/[STEP]/[END] logging
+├── openenv.yaml           OpenEnv manifest
+├── Dockerfile             python:3.11-slim, non-root user, HEALTHCHECK
+├── requirements.txt       pip dependencies
+├── pyproject.toml         Python package metadata + openenv-core dependency
+└── server/
+    ├── app.py             FastAPI routes + /metadata + /schema
+    ├── environment.py     reset / step / state logic + 6 operations + rewards
+    ├── data_generator.py  Synthetic dataset generation (seed=42, reproducible)
+    └── tasks/
+        ├── task1_missing.py    Easy  — fill NaN grader
+        ├── task2_format.py     Medium — format + duplicates grader
+        └── task3_pipeline.py   Hard  — full pipeline grader
+```
+---
+## Live Demo
+🤗 **HuggingFace Space:** https://srishtichugh-openenv-hack.hf.space
+- Health: https://srishtichugh-openenv-hack.hf.space/health
+- Docs:   https://srishtichugh-openenv-hack.hf.space/docs

baseline_scores.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "scores": {
+    "task1": 0.99,
+    "task2": 0.99,
+    "task3": 0.99
+  },
+  "average": 0.99
+}

client.py ADDED Viewed

	@@ -0,0 +1,114 @@

+"""
+Synchronous HTTP client for the Data Cleaning OpenEnv environment.
+Usage
+-----
+    from client import DataCleaningEnvClient, DataCleaningAction
+    client = DataCleaningEnvClient(base_url="http://localhost:8000")
+    # Start a new episode (task_id 1/2/3 or omit for round-robin)
+    result = client.reset(task_id=1)
+    print(result.observation.task_description)
+    # Take a step
+    action = DataCleaningAction(
+        operation="fill_missing",
+        column="salary",
+        params={"strategy": "median"},
+    )
+    result = client.step(action)
+    print(result.observation.current_score, result.reward, result.done)
+    # Inspect state
+    state = client.state()
+    print(state.episode_id, state.errors_remaining)
+"""
+from typing import Optional
+import httpx
+from pydantic import BaseModel
+from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
+class StepResult(BaseModel):
+    """Returned by reset() and step()."""
+    observation: DataCleaningObservation
+    reward: float
+    done: bool
+    info: dict = {}
+class DataCleaningEnvClient:
+    """
+    Thin synchronous wrapper around the DataCleaning HTTP API.
+    All methods raise httpx.HTTPStatusError on non-2xx responses.
+    """
+    def __init__(self, base_url: str = "http://localhost:8000", timeout: float = 30.0):
+        self.base_url = base_url.rstrip("/")
+        self._client   = httpx.Client(base_url=self.base_url, timeout=timeout)
+    # ------------------------------------------------------------------
+    # Core API
+    # ------------------------------------------------------------------
+    def reset(self, task_id: Optional[int] = None) -> StepResult:
+        """
+        Start a new episode.
+        Parameters
+        ----------
+        task_id : int | None
+            1 = Easy   (fill missing values)
+            2 = Medium (fix formats + duplicates)
+            3 = Hard   (full pipeline)
+            None = round-robin (1 → 2 → 3 → 1 …)
+        """
+        payload = {"task_id": task_id} if task_id is not None else {}
+        resp    = self._client.post("/reset", json=payload)
+        resp.raise_for_status()
+        return StepResult(**resp.json())
+    def step(self, action: DataCleaningAction) -> StepResult:
+        """
+        Apply one cleaning operation and return the updated observation.
+        Parameters
+        ----------
+        action : DataCleaningAction
+            operation : str   – one of fill_missing / drop_duplicates /
+                                fix_format / replace_value / drop_outliers / fix_dtype
+            column    : str   – target column (optional for drop_duplicates)
+            params    : dict  – operation-specific parameters
+        """
+        resp = self._client.post("/step", json=action.model_dump())
+        resp.raise_for_status()
+        return StepResult(**resp.json())
+    def state(self) -> DataCleaningState:
+        """Return current episode metadata without modifying state."""
+        resp = self._client.get("/state")
+        resp.raise_for_status()
+        return DataCleaningState(**resp.json())
+    def health(self) -> dict:
+        """Ping the server and return the parsed JSON body of its /health response."""
+        resp = self._client.get("/health")
+        resp.raise_for_status()
+        return resp.json()
+    # ------------------------------------------------------------------
+    # Context manager support
+    # ------------------------------------------------------------------
+    def __enter__(self):
+        return self
+    def __exit__(self, *_):
+        self.close()
+    def close(self):
+        self._client.close()

env.example ADDED Viewed

	@@ -0,0 +1,23 @@

+# ============================================================
+#  Data Cleaning OpenEnv — Environment Variables
+#  Copy this file to .env and fill in your values.
+#  Never commit your real .env to version control.
+# ============================================================
+# LLM API endpoint (OpenAI-compatible).
+# Default points to OpenAI; swap for any compatible provider.
+API_BASE_URL=https://api.openai.com/v1
+# Model identifier to use for baseline inference.
+# Examples: gpt-4o-mini, gpt-4o, mistralai/Mistral-7B-Instruct-v0.2
+MODEL_NAME=gpt-4o-mini
+# API key for the LLM provider above.
+# For OpenAI: starts with sk-...
+# For HuggingFace Inference: starts with hf_...
+HF_TOKEN=your-api-key-here
+# Base URL of the running environment server.
+# Use http://localhost:8000 for local development,
+# or your HuggingFace Space URL for remote runs.
+ENV_URL=http://localhost:8000

inference.py ADDED Viewed

	@@ -0,0 +1,286 @@

+"""
+Baseline inference script for the Data Cleaning OpenEnv environment.
+Uses the OpenAI client against all 3 tasks and reports scores.
+Environment variables (all have defaults; HF_TOKEN should be set or LLM calls may fail):
+    API_BASE_URL   — LLM API endpoint (OpenAI-compatible)
+    MODEL_NAME     — model identifier
+    HF_TOKEN       — API key
+    ENV_URL        — environment server URL (default: http://localhost:8000)
+STDOUT FORMAT (OpenEnv spec):
+    [START] task=<task_name> env=<benchmark> model=<model_name>
+    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    [END]   task=<task_name> score=<0.00> steps=<n>
+"""
+import json
+import os
+import re
+import sys
+import time
+from typing import List, Optional
+import httpx
+from openai import OpenAI
+# ------------------------------------------------------------------
+# Config
+# ------------------------------------------------------------------
+API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
+MODEL_NAME   = os.environ.get("MODEL_NAME",   "gpt-4o-mini")
+HF_TOKEN     = os.environ.get("HF_TOKEN",     "")
+ENV_URL      = os.environ.get("ENV_URL",      "http://localhost:8000")
+if not HF_TOKEN:
+    print("[WARNING] HF_TOKEN is not set — LLM calls may fail.", file=sys.stderr)
+client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
+SYSTEM_PROMPT = """You are a data cleaning agent. You control a data cleaning environment
+through JSON actions. Each turn you receive an observation JSON describing the current state
+of a dataset (preview, missing counts, duplicate count, dtype issues, current score, etc.)
+and a task description.
+Your job is to pick the single best action to improve the dataset quality.
+Respond ONLY with a valid JSON object — no markdown, no explanation, just the JSON.
+Available operations and their required parameters:
+1. fill_missing
+   {"operation": "fill_missing", "column": "<col>", "params": {"strategy": "median|mean|mode|constant", "value": <only if constant>}}
+2. drop_duplicates
+   {"operation": "drop_duplicates"}
+3. fix_format
+   {"operation": "fix_format", "column": "phone|listed_date|signup_date|country"}
+4. replace_value
+   {"operation": "replace_value", "column": "<col>", "params": {"old": "<val>", "new": "<val>"}}
+5. drop_outliers
+   {"operation": "drop_outliers", "column": "<numeric_col>"}
+6. fix_dtype
+   {"operation": "fix_dtype", "column": "<col>", "params": {"dtype": "float|int|str"}}
+Rules:
+- Address the highest-impact issues first (missing values > duplicates > outliers > format).
+- Do not repeat an operation that returned no effect (watch the 'message' field).
+- Stop when current_score >= 0.95.
+"""
+# ------------------------------------------------------------------
+# OpenEnv stdout logging helpers
+# ------------------------------------------------------------------
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    error_val = error if error else "null"
+    done_val  = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+def log_end(task_name: str, score: float, steps: int) -> None:
+    safe_score = max(0.01, min(0.99, float(score)))
+    print(
+        f"[END] task={task_name} score={safe_score:.4f} steps={steps}",
+        flush=True
+    )
+# ------------------------------------------------------------------
+# HTTP helpers
+# ------------------------------------------------------------------
+def api_post(path: str, payload: dict = None) -> dict:
+    url  = ENV_URL.rstrip("/") + path
+    resp = httpx.post(url, json=payload or {}, timeout=30)
+    resp.raise_for_status()
+    return resp.json()
+def api_get(path: str) -> dict:
+    url  = ENV_URL.rstrip("/") + path
+    resp = httpx.get(url, timeout=10)
+    resp.raise_for_status()
+    return resp.json()
+# ------------------------------------------------------------------
+# Agent loop
+# ------------------------------------------------------------------
+def obs_to_text(obs: dict) -> str:
+    lines = [
+        f"current_score: {obs['current_score']}",
+        f"step_count:    {obs['step_count']}",
+        f"data_shape:    {obs['data_shape']}",
+        f"duplicate_count: {obs['duplicate_count']}",
+        f"missing_counts: {json.dumps(obs['missing_counts'])}",
+        f"dtype_issues:   {json.dumps(obs['dtype_issues'])}",
+        f"message:        {obs['message']}",
+        "",
+        "=== DATA PREVIEW (first 10 rows) ===",
+        obs["data_preview"],
+        "",
+        "=== TASK DESCRIPTION ===",
+        obs["task_description"],
+    ]
+    return "\n".join(lines)
+def run_task(task_id: int) -> float:
+    task_name = f"data-cleaning-task{task_id}"
+    # Human-readable header (stderr so it doesn't interfere with stdout format)
+    print(f"\n{'='*60}", file=sys.stderr)
+    print(f"  Running Task {task_id}", file=sys.stderr)
+    print(f"{'='*60}", file=sys.stderr)
+    result  = api_post("/reset", {"task_id": task_id})
+    obs     = result["observation"]
+    history = []
+    rewards: List[float] = []
+    steps_taken = 0
+    success = False
+    log_start(task=task_name, env="data-cleaning-openenv", model=MODEL_NAME)
+    try:
+        for step_num in range(1, 50):
+            if obs["done"]:
+                success = obs["current_score"] >= 0.95
+                break
+            obs_text = obs_to_text(obs)
+            history.append({"role": "user", "content": obs_text})
+            try:
+                response = client.chat.completions.create(
+                    model       = MODEL_NAME,
+                    messages    = [{"role": "system", "content": SYSTEM_PROMPT}] + history,
+                    temperature = 0.0,
+                    max_tokens  = 256,
+                )
+                action_str = response.choices[0].message.content.strip()
+            except Exception as exc:
+                print(f"  Step {step_num}: LLM call failed: {exc}", file=sys.stderr)
+                log_step(step_num, "null", 0.0, True, str(exc))
+                break
+            history.append({"role": "assistant", "content": action_str})
+            # Parse action JSON
+            action = None
+            try:
+                action = json.loads(action_str)
+            except json.JSONDecodeError:
+                m = re.search(r"\{.*\}", action_str, re.DOTALL)
+                if m:
+                    try:
+                        action = json.loads(m.group())
+                    except Exception:
+                        pass
+            if action is None:
+                print(f"  Step {step_num}: Could not parse action JSON, skipping.", file=sys.stderr)
+                log_step(step_num, action_str, -0.05, False, "json_parse_error")
+                break
+            action_label = json.dumps(action, separators=(",", ":"))
+            print(
+                f"  Step {step_num:2d} | score={obs['current_score']:.4f} | action={action_label}",
+                file=sys.stderr,
+            )
+            result      = api_post("/step", action)
+            obs         = result["observation"]
+            step_reward = result["reward"]
+            done        = result["done"]
+            error_msg   = None if obs["message"].startswith("Fill") or step_reward >= 0 else obs["message"]
+            print(f"           -> {obs['message']}", file=sys.stderr)
+            rewards.append(step_reward)
+            steps_taken = step_num
+            log_step(
+                step   = step_num,
+                action = action_label,
+                reward = step_reward,
+                done   = done,
+                error  = error_msg,
+            )
+            if done:
+                success = obs["current_score"] >= 0.95
+                break
+            time.sleep(0.3)
+    finally:
+        final = obs.get("current_score", 0.01) if isinstance(obs, dict) else 0.01
+        log_end(task_name=task_name, score=final, steps=steps_taken)
+    final_score = obs["current_score"]
+    print(
+        f"\n  Task {task_id} final score: {final_score:.4f}  (steps used: {obs['step_count']})",
+        file=sys.stderr,
+    )
+    return final_score
+# ------------------------------------------------------------------
+# Main
+# ------------------------------------------------------------------
+def main():
+    print("Data Cleaning OpenEnv -- Baseline Inference", file=sys.stderr)
+    print(f"Model : {MODEL_NAME}", file=sys.stderr)
+    print(f"Env   : {ENV_URL}", file=sys.stderr)
+    # Smoke-test health endpoint
+    try:
+        health = api_get("/health")
+        assert health.get("status") in ("ok", "healthy"), f"Unexpected status: {health}"
+        print("Health check: OK\n", file=sys.stderr)
+    except Exception as exc:
+        print(f"[ERROR] Environment not reachable at {ENV_URL}: {exc}", file=sys.stderr)
+        print("[ERROR] Make sure the server is running and ENV_URL is correct.", file=sys.stderr)
+        sys.exit(1)
+    scores = {}
+    for task_id in [1, 2, 3]:
+        try:
+            scores[f"task{task_id}"] = run_task(task_id)
+        except Exception as exc:
+            print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
+            scores[f"task{task_id}"] = 0.01
+    print("\n" + "="*60, file=sys.stderr)
+    print("  BASELINE RESULTS", file=sys.stderr)
+    print("="*60, file=sys.stderr)
+    for k, v in scores.items():
+        print(f"  {k}: {v:.4f}", file=sys.stderr)
+    avg = round(sum(scores.values()) / len(scores), 4)
+    print(f"  average: {avg:.4f}", file=sys.stderr)
+    print("="*60, file=sys.stderr)
+    # Write scores to file for automated validators
+    with open("baseline_scores.json", "w") as f:
+        json.dump({"scores": scores, "average": avg}, f, indent=2)
+    print("\nScores written to baseline_scores.json", file=sys.stderr)
+if __name__ == "__main__":
+    main()

inference_log.txt ADDED Viewed

Binary file (28.1 kB). View file

models.py ADDED Viewed

	@@ -0,0 +1,45 @@

# models.py
"""Pydantic models describing OrgOS actions, observations and episode state."""
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class OrgOSAction(BaseModel):
    """One agent action: a named operation on a single app, with arguments."""

    app: str           # "jira" | "zendesk" | "salesforce" | "workday"
    operation: str     # app-specific operation name
    # default_factory makes the "fresh dict per instance" intent explicit
    args: Dict[str, Any] = Field(default_factory=dict)


class RewardBreakdown(BaseModel):
    """Per-component reward values; the weights are listed next to each field."""

    workflow_completion: float = 0.0   # 0.30 weight
    rule_compliance: float = 0.0       # 0.25 weight
    schema_adaptation: float = 0.0     # 0.20 weight
    efficiency: float = 0.0            # 0.15 weight
    policy_drift_handling: float = 0.0 # 0.10 weight


class OrgOSObservation(BaseModel):
    """What the agent receives after a reset or a step."""

    done: bool
    reward: float
    current_score: float
    workflow_id: str               # "A", "B", or "C"
    step_count: int
    # Per-app state views (what the agent sees)
    app_states: Dict[str, str]     # app_name → CSV/JSON string preview
    # Workflow progress
    workflow_goal: str
    completed_steps: List[str]
    pending_steps: List[str]
    # Schema drift info (partial — agent must probe to discover rest)
    schema_hints: Dict[str, str]   # e.g. {"jira.priority": "severity"}
    # Business rules in effect this episode
    active_rules: Dict[str, Any]   # {"sla_p0_minutes": 15, "approval_threshold": 5000}
    # Per-step feedback
    rule_violations: List[str]     # violations that just occurred
    reward_breakdown: RewardBreakdown
    message: str


class OrgOSState(BaseModel):
    """Episode-level bookkeeping snapshot."""

    episode_id: str
    workflow_id: str
    schema_versions: Dict[str, str]     # {"jira": "v2", "zendesk": "v1", ...}
    step_count: int
    max_steps: int
    rule_violation_count: int
    workflow_completion: float
    rule_compliance_rate: float
    policy_drift_active: bool

openenv.yaml ADDED Viewed

	@@ -0,0 +1,73 @@

+name: data-cleaning-env
+version: "0.1.0"
+description: >
+  A real-world data cleaning environment where an AI agent fixes missing
+  values, duplicate rows, format inconsistencies, outliers, and dtype errors
+  across three progressively harder tasks.
+author: openenv-hackathon
+tags:
+  - openenv
+  - data-cleaning
+  - rl
+  - real-world
+tasks:
+  - id: task1
+    name: "Fill Missing Values"
+    difficulty: easy
+    max_steps: 20
+    description: >
+      Fill all NaN values in an employee records dataset.
+      Columns with missing data: age, salary, department.
+  - id: task2
+    name: "Fix Formats and Remove Duplicates"
+    difficulty: medium
+    max_steps: 30
+    description: >
+      Standardise phone numbers (NNN-NNN-NNNN) and dates (YYYY-MM-DD)
+      in a product catalog, and remove ~15 duplicate rows.
+  - id: task3
+    name: "Full Cleaning Pipeline"
+    difficulty: hard
+    max_steps: 40
+    description: >
+      End-to-end pipeline on a customer database: fill missing values,
+      remove duplicates, drop outliers in purchase_amount, standardise
+      country capitalisation, and fix mixed date formats.
+api:
+  health:  GET  /health
+  reset:   POST /reset
+  step:    POST /step
+  state:   POST /state
+  docs:    GET  /docs
+reward:
+  range: [0.001, 0.999]
+  partial: true
+  terminal_bonus: 0.0
+observation_space:
+  type: object
+  fields:
+    done:            boolean
+    reward:          float
+    data_preview:    string   # First 10 rows as CSV
+    data_shape:      list     # [rows, cols]
+    missing_counts:  object   # {column: count}
+    duplicate_count: integer
+    dtype_issues:    object   # {column: issue_description}
+    task_description: string
+    message:         string
+    step_count:      integer
+    current_score:   float    # 0.0–1.0
+action_space:
+  type: object
+  fields:
+    operation: string   # fill_missing | drop_duplicates | fix_format | replace_value | drop_outliers | fix_dtype
+    column:    string   # optional depending on operation
+    params:    object   # optional operation parameters

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[project]
+name = "data-cleaning-env"
+version = "0.1.0"
+description = "Real-world data cleaning environment for OpenEnv / Scaler hackathon"
+requires-python = ">=3.11"
+dependencies = [
+    "fastapi==0.135.2",
+    "uvicorn[standard]==0.40.0",
+    "pydantic==2.12.5",
+    "pandas==2.2.3",
+    "numpy==2.2.4",
+    "faker==40.12.0",
+    "openai==2.15.0",
+    "httpx==0.28.1",
+    "openenv-core==0.2.3",
+]
+[project.scripts]
+server = "server.app:main"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["server"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi==0.135.2
+uvicorn[standard]==0.40.0
+pydantic==2.12.5
+pandas==2.2.3
+numpy==2.2.4
+faker==40.12.0
+openai==2.15.0
+httpx==0.28.1
+openenv-core==0.2.3

server/__init__.py ADDED Viewed

File without changes

server/app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+"""
+FastAPI application exposing the OpenEnv-compatible HTTP API.
+Endpoints: GET /health, GET /metadata, GET /schema,
+           POST /reset, POST /step, GET /state, POST /state, GET /docs
+"""
from typing import Any, Dict, Optional

import uvicorn
from fastapi import Body, FastAPI, HTTPException
from fastapi.responses import FileResponse, HTMLResponse
from pydantic import BaseModel

from models import DataCleaningAction, DataCleaningObservation, DataCleaningState
from server.environment import DataCleaningEnvironment
+app = FastAPI(
+    title="Data Cleaning OpenEnv",
+    description="A real-world data cleaning environment for AI agent training.",
+    version="0.1.0",
+)
+# Single shared environment instance (stateful server)
+env = DataCleaningEnvironment()
# Body accepted by POST /reset; carries the workflow to run next
class ResetRequest(BaseModel):
    """Optional request body for ``POST /reset``."""

    # "A", "B", "C", or None for round-robin
    workflow_id: Optional[str] = None
class StepResponse(BaseModel):
    """Envelope returned by ``/reset`` and ``/step``."""

    # Full observation produced by the environment
    observation: DataCleaningObservation
    # Reward and termination flag, repeated at the top level of the payload
    reward: float
    done: bool
    # Free-form extra information; empty unless a route fills it in
    info: dict = {}
+# ------------------------------------------------------------------
+# Routes
+# ------------------------------------------------------------------
@app.get("/health")
def health():
    """Liveness probe: always reports a healthy status."""
    payload = {"status": "healthy"}
    return payload
@app.get("/metadata")
def metadata():
    """Return static environment metadata: name, summary, version, tags, tasks."""
    summary = (
        "A real-world data cleaning environment where an AI agent fixes "
        "missing values, duplicate rows, format inconsistencies, outliers, "
        "and dtype errors across three progressively harder tasks."
    )
    # (id, display name, difficulty) for each advertised task
    task_rows = (
        ("task1", "Fill Missing Values", "easy"),
        ("task2", "Fix Formats and Remove Duplicates", "medium"),
        ("task3", "Full Cleaning Pipeline", "hard"),
    )
    tasks = [
        {"id": task_id, "name": title, "difficulty": level}
        for task_id, title, level in task_rows
    ]
    return {
        "name": "data-cleaning-env",
        "description": summary,
        "version": "0.1.0",
        "tags": ["openenv", "data-cleaning", "rl", "real-world"],
        "tasks": tasks,
    }
@app.get("/schema")
def schema():
    """Return JSON-schema-like descriptions of the action, observation and state."""

    def typed(kind):
        # Shorthand for the ubiquitous {"type": ...} leaf
        return {"type": kind}

    operations = [
        "fill_missing",
        "drop_duplicates",
        "fix_format",
        "replace_value",
        "drop_outliers",
        "fix_dtype",
    ]
    action_schema = {
        "type": "object",
        "properties": {
            "operation": {"type": "string", "enum": operations},
            "column": {"type": "string", "nullable": True},
            "params": {"type": "object", "nullable": True},
        },
        "required": ["operation"],
    }
    observation_schema = {
        "type": "object",
        "properties": {
            "done":             typed("boolean"),
            "reward":           typed("number"),
            "data_preview":     typed("string"),
            "data_shape":       {"type": "array", "items": typed("integer")},
            "missing_counts":   typed("object"),
            "duplicate_count":  typed("integer"),
            "dtype_issues":     typed("object"),
            "task_description": typed("string"),
            "message":          typed("string"),
            "step_count":       typed("integer"),
            "current_score":    typed("number"),
        },
    }
    state_schema = {
        "type": "object",
        "properties": {
            "episode_id":       typed("string"),
            "task_id":          typed("integer"),
            "step_count":       typed("integer"),
            "max_steps":        typed("integer"),
            "total_errors":     typed("integer"),
            "errors_remaining": typed("integer"),
        },
    }
    return {
        "action": action_schema,
        "observation": observation_schema,
        "state": state_schema,
    }
@app.post("/reset", response_model=StepResponse)
def reset(req: ResetRequest = ResetRequest()):
    """Start a new episode and return its first observation.

    The request body may carry ``workflow_id``; when it is omitted the
    environment receives ``None``.

    Raises:
        HTTPException: 400 when the environment rejects the request with
            ``ValueError``.
    """
    try:
        # ResetRequest only declares `workflow_id`; the previous `req.task_id`
        # raised AttributeError on every call.
        # NOTE(review): assumes env.reset accepts `workflow_id`, as
        # OrgOSEnvironment.reset does — confirm for the environment class in use.
        obs = env.reset(workflow_id=req.workflow_id)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return StepResponse(observation=obs, reward=obs.reward, done=False)
@app.post("/step", response_model=StepResponse)
async def step(body: Dict[str, Any] = Body(...)):
    """
    Accept both openenv-core wrapped format:
        {"action": {"operation": "...", ...}, "timeout_s": 15}
    and direct format (for backward compat with our own client/inference):
        {"operation": "...", "column": "...", "params": {...}}

    Raises:
        HTTPException: 400 when the action payload is not a JSON object, fails
            model construction, or the environment raises while applying it.
    """
    action_data = body.get("action", body)
    if not isinstance(action_data, dict):
        # `**action_data` on a non-mapping would fail with an opaque TypeError
        raise HTTPException(status_code=400, detail="'action' must be a JSON object")
    try:
        action = DataCleaningAction(**action_data)
        obs = env.step(action)
    except Exception as exc:
        # Same coverage as the former (TypeError, KeyError, Exception) tuple:
        # Exception already includes the other two.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return StepResponse(observation=obs, reward=obs.reward, done=obs.done)
@app.get("/state", response_model=DataCleaningState)
def state_get():
    """GET /state — openenv-core spec."""
    snapshot = env.state()
    return snapshot
@app.post("/state", response_model=DataCleaningState)
def state_post():
    """POST /state — backward compatibility."""
    snapshot = env.state()
    return snapshot
@app.get("/", response_class=HTMLResponse)
def ui():
    """Serve the demo dashboard.

    Raises:
        HTTPException: 404 when ``ui/index.html`` does not exist relative to
            the process working directory.
    """
    import os

    index_path = "ui/index.html"
    # Fail with a clear 404 instead of an error raised while sending the file
    if not os.path.isfile(index_path):
        raise HTTPException(status_code=404, detail="Dashboard not found: ui/index.html")
    return FileResponse(index_path)
@app.get("/schema/apps")
def app_schemas():
    """Return the canonical action space per app — used by the UI.

    Intended to map app → list of operations + their arg schemas. That mapping
    is not defined yet, so the route answers 501 explicitly.

    Raises:
        HTTPException: always, with status 501.
    """
    # The previous body returned the placeholder `{...}` — a set containing
    # Ellipsis — which cannot be serialised to JSON.
    raise HTTPException(
        status_code=501,
        detail="Per-app action schemas are not implemented yet",
    )
+# ------------------------------------------------------------------
+# Entry point (required by openenv-core and [project.scripts])
+# ------------------------------------------------------------------
def main(host: str = "0.0.0.0", port: int = 8000):
    """Run the API with uvicorn.

    Args:
        host: Interface to bind; the default listens on all interfaces.
        port: TCP port to listen on.
    """
    uvicorn.run("server.app:app", host=host, port=port)
+if __name__ == "__main__":
+    main()

server/apps/base_app.py ADDED Viewed

	@@ -0,0 +1,19 @@

from abc import ABC, abstractmethod
from typing import Dict, List


class BaseApp(ABC):
    """Abstract interface for a simulated app.

    Subclasses must implement all four abstract methods before they can be
    instantiated.
    """

    APP_NAME: str = ""

    # --- Core interface every app must implement ---
    @abstractmethod
    def initialize(self, records: List[Dict]) -> None:
        """Load synthetic records into in-memory state."""

    @abstractmethod
    def execute(self, operation: str, args: Dict) -> Dict:
        """Execute an operation. Returns {"success": bool, "data": ..., "message": str}"""

    @abstractmethod
    def get_state_view(self, max_rows: int = 5) -> str:
        """Return agent-visible snapshot as a compact string."""

    @abstractmethod
    def count_open_items(self) -> int:
        """Count pending/open work items (used by grader)."""

server/business_rules.py ADDED Viewed

	@@ -0,0 +1,62 @@

import copy
from typing import TYPE_CHECKING, Dict, List, Tuple

if TYPE_CHECKING:  # annotation-only import; avoids a hard runtime dependency
    from models import OrgOSAction

DEFAULT_RULES = {
    "sla_p0_minutes": 30,          # P0 tickets: acknowledge within 30 min
    "sla_p1_hours": 4,             # P1 tickets: first response within 4h
    "approval_threshold": 10_000,  # $ above which manager approval needed
    "max_tickets_per_agent": 10,   # RBAC: agent capacity cap
    "gdpr_max_days": 30,           # compliance: GDPR ticket resolution
    "rbac": {
        "support": {"salesforce": ["read"], "jira": ["read", "create_issue"]},
        "engineer": {"jira": ["*"], "zendesk": ["read"]},
        "manager": {"*": ["*"]},
    }
}
POLICY_DRIFT_EVENTS = {
    "sla_tighten":          {"sla_p0_minutes": 15, "sla_p1_hours": 2},
    "approval_tighten":     {"approval_threshold": 5_000},
    "gdpr_expedite":        {"gdpr_max_days": 7},
}


class BusinessRuleEngine:
    """Evaluates actions against RBAC, approval and SLA rules.

    Each engine owns a private copy of ``DEFAULT_RULES``; policy-drift events
    overwrite individual rule values on that copy.
    """

    def __init__(self):
        # Deep copy: a shallow copy would share the nested "rbac" table with
        # DEFAULT_RULES and with every other engine instance.
        self.rules = copy.deepcopy(DEFAULT_RULES)
        self._violation_log: List[str] = []

    def apply_policy_drift(self, event: str) -> None:
        """Called mid-episode or at episode start to change rules.

        Unknown event names are ignored.
        """
        if event in POLICY_DRIFT_EVENTS:
            self.rules.update(POLICY_DRIFT_EVENTS[event])

    def _deny(self, reason: str, penalty: float) -> Tuple[bool, str, float]:
        """Record a violation and build the (False, reason, penalty) result."""
        self._violation_log.append(reason)
        return False, reason, penalty

    def check_action(self, action: "OrgOSAction", context: Dict) -> Tuple[bool, str, float]:
        """Returns (allowed, reason, penalty).

        Args:
            action: Object exposing ``app``, ``operation`` and ``args``.
            context: May carry ``agent_role`` (default ``"support"``) and
                ``manager_approved``.

        Every denial is appended to the violation log. Previously both denial
        paths returned before the log was updated, so
        ``get_violations_this_step`` was always empty.
        """
        # RBAC check: a per-app entry wins, otherwise fall back to the "*" entry
        role = context.get("agent_role", "support")
        app_perms = self.rules["rbac"].get(role, {})
        allowed_ops = app_perms.get(action.app, app_perms.get("*", []))
        if "*" not in allowed_ops and action.operation not in allowed_ops:
            return self._deny(
                f"RBAC: {role} cannot {action.operation} on {action.app}", -0.25
            )
        # Approval threshold check
        if action.operation in ("request_budget_approval", "update_deal_stage"):
            amount = action.args.get("amount", 0)
            threshold = self.rules["approval_threshold"]
            if amount > threshold and not context.get("manager_approved"):
                return self._deny(f"Approval required: ${amount} > ${threshold}", -0.10)
        return True, "", 0.0

    def check_sla(self, ticket: Dict, elapsed_minutes: float) -> Tuple[bool, float]:
        """Returns (sla_met, penalty).

        Only the P0/critical acknowledgement window is enforced. The priority
        is read from ``priority`` or ``urgency`` and compared case-insensitively.
        """
        priority = str(ticket.get("priority", ticket.get("urgency", "p2"))).lower()
        if priority in ("p0", "critical") and elapsed_minutes > self.rules["sla_p0_minutes"]:
            return False, -0.15
        return True, 0.0

    def get_violations_this_step(self) -> List[str]:
        """Return the violations logged since the last call and clear the log."""
        drained = self._violation_log.copy()
        self._violation_log.clear()
        return drained

server/data_generator.py ADDED Viewed

	@@ -0,0 +1,214 @@

+"""
+Synthetic dataset generation with a fixed seed for full reproducibility.
+All datasets are generated purely from numpy/random — no external downloads.
+"""
+import random
+import numpy as np
+import pandas as pd
+SEED = 42
+# ---------------------------------------------------------------------------
+# Task 1 — Employee records with missing values
+# ---------------------------------------------------------------------------
def generate_task1_datasets():
    """Build the Task 1 employee dataset.

    Returns:
        (dirty_df, clean_df): two 100-row frames with columns name, age,
        salary, department and experience. ``dirty_df`` is a copy of
        ``clean_df`` with NaN injected into 20 % of ``age``, 20 % of ``salary``
        and 10 % of ``department``.
    """
    rng = np.random.default_rng(SEED)
    random.seed(SEED)
    row_count = 100

    dept_pool = ["Engineering", "Marketing", "Sales", "HR", "Finance"]
    given_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
                   "Heidi", "Ivan", "Judy", "Karl", "Laura", "Mallory", "Niaj",
                   "Oscar", "Peggy", "Quinn", "Romeo", "Sybil", "Trent"]
    family_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson", "Davis",
                    "Miller", "Anderson", "Thomas", "Jackson"]

    # Names come from the stdlib RNG: first name, then last name, per row
    full_names = []
    for _ in range(row_count):
        first = random.choice(given_names)
        last = random.choice(family_names)
        full_names.append(f"{first} {last}")

    # Dict values are evaluated top to bottom, which fixes the numpy draw order
    clean_df = pd.DataFrame({
        "name":       full_names,
        "age":        rng.integers(22, 60, size=row_count).astype(float),
        "salary":     rng.integers(40_000, 120_000, size=row_count).astype(float),
        "department": rng.choice(dept_pool, size=row_count),
        "experience": rng.integers(0, 30, size=row_count).astype(float),
    })

    dirty_df = clean_df.copy()
    # (column, fraction of rows to blank out)
    nan_plan = (("age", 0.20), ("salary", 0.20), ("department", 0.10))
    for column, fraction in nan_plan:
        hit_rows = rng.choice(row_count, size=int(row_count * fraction), replace=False)
        dirty_df.loc[hit_rows, column] = np.nan

    return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
+# ---------------------------------------------------------------------------
+# Task 2 — Product catalog with format & duplicate issues
+# ---------------------------------------------------------------------------
+def _scramble_phone(phone: str, rng) -> str:
+    digits = phone.replace("-", "")
+    fmt = rng.integers(0, 3)
+    if fmt == 0:
+        return digits                          # 5551234567
+    elif fmt == 1:
+        return f"({digits[:3]}){digits[3:]}"   # (555)1234567
+    else:
+        return phone                           # 555-123-4567  (canonical)
+def _scramble_date(date_str: str, rng) -> str:
+    dt = pd.to_datetime(date_str)
+    fmt = rng.integers(0, 3)
+    if fmt == 0:
+        return dt.strftime("%Y-%m-%d")
+    elif fmt == 1:
+        return dt.strftime("%b %d %Y")
+    else:
+        return dt.strftime("%d/%m/%Y")
def generate_task2_datasets():
    """Build the Task 2 product catalog.

    Returns:
        (dirty_df, clean_df): ``clean_df`` has 200 rows with canonical phone
        (NNN-NNN-NNNN) and date (YYYY-MM-DD) strings. ``dirty_df`` passes 60 %
        of phones and 60 % of dates through the scramble helpers (which may
        leave a value canonical) and appends 15 duplicated rows.
    """
    rng = np.random.default_rng(SEED)
    random.seed(SEED)
    row_count = 200

    category_pool = ["Electronics", "Clothing", "Food", "Books", "Toys"]
    ids = [f"P{k:04d}" for k in range(1, row_count + 1)]
    labels = [f"Product_{k}" for k in range(1, row_count + 1)]

    # numpy draw order: prices, categories, phone parts, date offsets
    price_col = np.round(rng.uniform(5.0, 500.0, size=row_count), 2)
    category_col = rng.choice(category_pool, size=row_count)
    phone_col = []
    for _ in range(row_count):
        area = rng.integers(100, 999)
        exchange = rng.integers(100, 999)
        line = rng.integers(1000, 9999)
        phone_col.append(f"{area}-{exchange}-{line}")
    offsets = rng.integers(0, 1000, size=row_count)
    start = pd.Timestamp("2020-01-01")
    date_col = [
        (start + pd.Timedelta(days=int(off))).strftime("%Y-%m-%d")
        for off in offsets
    ]

    clean_df = pd.DataFrame({
        "product_id":   ids,
        "product_name": labels,
        "price":        price_col,
        "category":     category_col,
        "phone":        phone_col,
        "listed_date":  date_col,
    })
    dirty_df = clean_df.copy()

    # Phone formats: pick the rows, build the scrambled values, then assign
    phone_rows = rng.choice(row_count, size=int(row_count * 0.6), replace=False)
    scrambled_phones = [
        _scramble_phone(dirty_df.loc[r, "phone"], rng) for r in phone_rows
    ]
    dirty_df.loc[phone_rows, "phone"] = scrambled_phones

    # Date formats: same procedure on an independently drawn row sample
    date_rows = rng.choice(row_count, size=int(row_count * 0.6), replace=False)
    scrambled_dates = [
        _scramble_date(dirty_df.loc[r, "listed_date"], rng) for r in date_rows
    ]
    dirty_df.loc[date_rows, "listed_date"] = scrambled_dates

    # Append 15 copies of already-dirtied rows
    dup_rows = rng.choice(row_count, size=15, replace=False)
    extras = dirty_df.iloc[dup_rows].copy()
    dirty_df = pd.concat([dirty_df, extras], ignore_index=True)

    return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
+# ---------------------------------------------------------------------------
+# Task 3 — Customer database: full pipeline
+# ---------------------------------------------------------------------------
def generate_task3_datasets():
    """Build the Task 3 customer database.

    Returns:
        (dirty_df, clean_df): ``clean_df`` has 300 rows. ``dirty_df`` adds, in
        this order: NaN in age/purchase_amount (15 % each) and
        country/signup_date (10 % each), purchase_amount multiplied by 10 on
        3 % of rows, lower-cased country on 40 % of rows, re-formatted
        signup_date on the non-null part of a 50 % sample, and 20 duplicated
        rows appended at the end.
    """
    rng = np.random.default_rng(SEED)
    random.seed(SEED)
    row_count = 300

    country_pool = ["USA", "UK", "Canada", "Australia", "Germany"]
    given_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace",
                   "Heidi", "Ivan", "Judy"]
    family_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson"]

    # Stdlib RNG for names: first name, then last name, per row
    full_names = []
    for _ in range(row_count):
        first = random.choice(given_names)
        last = random.choice(family_names)
        full_names.append(f"{first} {last}")

    # numpy draw order: ages, amounts, countries, date offsets
    age_col = rng.integers(18, 75, size=row_count).astype(float)
    amount_col = np.round(rng.uniform(10.0, 500.0, size=row_count), 2)
    country_col = rng.choice(country_pool, size=row_count)
    email_col = [f"user{k}@example.com" for k in range(1, row_count + 1)]
    offsets = rng.integers(0, 730, size=row_count)
    start = pd.Timestamp("2022-01-01")
    signup_col = [
        (start + pd.Timedelta(days=int(off))).strftime("%Y-%m-%d")
        for off in offsets
    ]

    clean_df = pd.DataFrame({
        "name":            full_names,
        "age":             age_col,
        "purchase_amount": amount_col,
        "country":         country_col,
        "email":           email_col,
        "signup_date":     signup_col,
    })
    dirty_df = clean_df.copy()

    # Missing values: (column, fraction of rows to blank out)
    nan_plan = (("age", 0.15), ("purchase_amount", 0.15),
                ("country", 0.10), ("signup_date", 0.10))
    for column, fraction in nan_plan:
        hit_rows = rng.choice(row_count, size=int(row_count * fraction), replace=False)
        dirty_df.loc[hit_rows, column] = np.nan

    # Outliers: scale ~3 % of purchase amounts by 10 (NaN rows stay NaN)
    outlier_rows = rng.choice(row_count, size=int(row_count * 0.03), replace=False)
    inflated = dirty_df.loc[outlier_rows, "purchase_amount"] * 10
    dirty_df.loc[outlier_rows, "purchase_amount"] = inflated

    # Mixed case: lower-case ~40 % of country values
    case_rows = rng.choice(row_count, size=int(row_count * 0.40), replace=False)
    lowered = dirty_df.loc[case_rows, "country"].str.lower()
    dirty_df.loc[case_rows, "country"] = lowered

    # Mixed date formats on a ~50 % sample; null dates are skipped so the
    # scrambler is only called (and the RNG only advanced) for real values
    date_rows = rng.choice(row_count, size=int(row_count * 0.50), replace=False)
    for r in date_rows:
        if pd.isna(dirty_df.loc[r, "signup_date"]):
            continue
        dirty_df.loc[r, "signup_date"] = _scramble_date(dirty_df.loc[r, "signup_date"], rng)

    # Append 20 copies of already-dirtied rows
    dup_rows = rng.choice(row_count, size=20, replace=False)
    extras = dirty_df.iloc[dup_rows].copy()
    dirty_df = pd.concat([dirty_df, extras], ignore_index=True)

    return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True)
# NOTE(review): the generators below were docstring-only stubs returning None.
# The value sets (priorities, stages, names, ...) are placeholder choices made
# to satisfy the documented field lists — confirm against the app simulators.
def generate_jira_records(n=50, seed=42) -> list[dict]:
    """50 engineering tickets with priority, assignee, status, linked_ticket.

    Deterministic for a given ``(n, seed)``. ``linked_ticket`` is either None
    or a Zendesk-style id in the default Zendesk range.
    """
    rng = random.Random(seed)
    records = []
    for i in range(1, n + 1):
        priority = rng.choice(("p0", "p1", "p2", "p3"))
        assignee = rng.choice(("alice", "bob", "carol", "david", "eve"))
        status = rng.choice(("open", "in_progress", "done"))
        linked = f"ZD-{rng.randint(1, 40):04d}" if rng.random() < 0.5 else None
        records.append({
            "id": f"JIRA-{i:04d}",
            "priority": priority,
            "assignee": assignee,
            "status": status,
            "linked_ticket": linked,
        })
    return records


def generate_zendesk_records(n=40, seed=42) -> list[dict]:
    """40 support tickets with urgency, agent_email, state, customer_id.

    Deterministic for a given ``(n, seed)``. ``customer_id`` is a
    Salesforce-style id in the default Salesforce range.
    """
    rng = random.Random(seed)
    records = []
    for i in range(1, n + 1):
        urgency = rng.choice(("critical", "high", "normal", "low"))
        agent = rng.choice(("frank", "grace", "heidi", "ivan"))
        state = rng.choice(("new", "open", "pending", "solved"))
        customer = f"SF-{rng.randint(1, 30):04d}"
        records.append({
            "id": f"ZD-{i:04d}",
            "urgency": urgency,
            "agent_email": f"{agent}@example.com",
            "state": state,
            "customer_id": customer,
        })
    return records


def generate_salesforce_records(n=30, seed=42) -> list[dict]:
    """30 accounts with deal_stage, health, owner_name, arr.

    Deterministic for a given ``(n, seed)``.
    """
    rng = random.Random(seed)
    records = []
    for i in range(1, n + 1):
        stage = rng.choice(("prospecting", "negotiation", "closed_won", "closed_lost"))
        health = rng.choice(("green", "yellow", "red"))
        owner = rng.choice(("Judy Smith", "Karl Jones", "Laura Brown"))
        arr = rng.randrange(1_000, 50_001, 500)
        records.append({
            "id": f"SF-{i:04d}",
            "deal_stage": stage,
            "health": health,
            "owner_name": owner,
            "arr": arr,
        })
    return records


def generate_workday_records(n=20, seed=42) -> list[dict]:
    """20 employee/HR records with level, manager_id, resolution.

    Deterministic for a given ``(n, seed)``. ``manager_id`` always refers to an
    id within the generated range.
    """
    rng = random.Random(seed)
    records = []
    for i in range(1, n + 1):
        level = rng.choice(("L1", "L2", "L3", "L4", "L5"))
        manager = f"EMP-{rng.randint(1, n):04d}"
        resolution = rng.choice(("pending", "approved", "rejected"))
        records.append({
            "id": f"EMP-{i:04d}",
            "level": level,
            "manager_id": manager,
            "resolution": resolution,
        })
    return records


def generate_episode_data(workflow_id: str, seed: int = 42) -> dict[str, list[dict]]:
    """Generate correlated data for a full episode across all 4 apps.
    Ensures tickets in Zendesk reference customers in Salesforce, etc.

    Returns:
        A dict keyed by "jira", "zendesk", "salesforce" and "workday". Every
        Zendesk ``customer_id`` is an id of a returned Salesforce account, and
        every non-null Jira ``linked_ticket`` is an id of a returned Zendesk
        ticket. The result is deterministic for ``(workflow_id, seed)``.
    """
    accounts = generate_salesforce_records(seed=seed)
    tickets = generate_zendesk_records(seed=seed + 1)
    issues = generate_jira_records(seed=seed + 2)
    employees = generate_workday_records(seed=seed + 3)

    # Separate RNG for cross-app links so the workflow id influences them
    link_rng = random.Random(f"{workflow_id}:{seed}")
    account_ids = [account["id"] for account in accounts]
    for ticket in tickets:
        ticket["customer_id"] = link_rng.choice(account_ids)
    ticket_ids = [ticket["id"] for ticket in tickets]
    for issue in issues:
        if issue["linked_ticket"] is not None:
            issue["linked_ticket"] = link_rng.choice(ticket_ids)

    return {
        "jira": issues,
        "zendesk": tickets,
        "salesforce": accounts,
        "workday": employees,
    }

server/environment.py ADDED Viewed

	@@ -0,0 +1,141 @@

class OrgOSEnvironment:
    """Episode driver that routes agent actions to four simulated apps.

    It tracks five reward components (workflow completion, rule compliance,
    schema adaptation, efficiency, policy-drift handling) and combines them
    into a composite score clamped to [0.001, 0.999].

    NOTE(review): this class calls ``self._build_obs`` and uses
    ``SchemaDriftEngine``, ``BusinessRuleEngine``, ``WorkflowEngine``, the four
    ``*App`` classes and ``generate_episode_data``; none of them are defined or
    imported in the visible part of this module — confirm they are provided.
    """

    MAX_STEPS = {"A": 15, "B": 20, "C": 18}
    WORKFLOWS = ["A", "B", "C"]

    def __init__(self):
        self._drift    = SchemaDriftEngine(seed=42)
        self._rules    = BusinessRuleEngine()
        self._workflow = WorkflowEngine()
        # All apps share the same drift engine
        self._apps = {
            "jira":        JiraApp(self._drift),
            "zendesk":     ZendeskApp(self._drift),
            "salesforce":  SalesforceApp(self._drift),
            "workday":     WorkdayApp(self._drift),
        }
        self._episode_num   = 0
        self._episode_id    = ""
        self._workflow_id   = "A"
        self._step_count    = 0
        self._last_score    = 0.001
        self._policy_drift_applied = False
        # Reward component trackers
        self._wf_score      = 0.0   # workflow completion
        self._rule_score    = 1.0   # compliance (starts perfect, penalized on violation)
        self._schema_score  = 0.0   # schema adaptation successes
        self._efficiency    = 1.0   # degrades with no-ops
        self._policy_score  = 0.0   # policy drift handling

    def reset(self, workflow_id: "str | None" = None) -> "OrgOSObservation":
        """Start a new episode.

        Args:
            workflow_id: One of ``WORKFLOWS``; a falsy value selects the next
                workflow round-robin by episode number.

        Raises:
            ValueError: if ``workflow_id`` is given but has no ``MAX_STEPS``
                entry. Raised before any state changes, so a rejected reset
                does not consume an episode number.
        """
        import uuid

        if workflow_id and workflow_id not in self.MAX_STEPS:
            raise ValueError(
                f"Unknown workflow_id '{workflow_id}'; expected one of {self.WORKFLOWS}"
            )
        self._episode_num += 1
        self._episode_id = str(uuid.uuid4())
        round_robin = self.WORKFLOWS[(self._episode_num - 1) % len(self.WORKFLOWS)]
        self._workflow_id = workflow_id or round_robin
        self._step_count  = 0
        self._last_score  = 0.001
        self._rule_score  = 1.0
        self._wf_score    = 0.0
        self._schema_score = 0.0
        self._efficiency  = 1.0
        self._policy_score = 0.0
        self._policy_drift_applied = False
        # Sample schema versions for this episode
        self._drift.sample_for_episode(self._episode_num)
        # Possibly activate policy drift (every 3rd episode)
        self._rules = BusinessRuleEngine()
        if self._episode_num % 3 == 0:
            self._rules.apply_policy_drift("sla_tighten")
            self._policy_drift_applied = True
        # Load fresh synthetic data into each app
        records = generate_episode_data(self._workflow_id, seed=42 + self._episode_num)
        for app_name, app in self._apps.items():
            app.initialize(records[app_name])
        # Start workflow tracking
        self._workflow.start(self._workflow_id)
        return self._build_obs(0.001, False, "Episode started. Study the workflow goal and schema hints.")

    def step(self, action: "OrgOSAction") -> "OrgOSObservation":
        """Apply one action and return the resulting observation.

        Rejected, failed and stale-schema actions return early with the same
        reward values as before, but they now also report ``done=True`` once
        the step budget is exhausted. Previously only successful actions could
        end an episode.
        """
        self._step_count += 1
        old_score = self._last_score
        extra_penalty = 0.0
        # The step budget applies to every outcome, including rejected actions
        out_of_steps = self._step_count >= self.MAX_STEPS[self._workflow_id]

        # 1. Validate app exists
        if action.app not in self._apps:
            return self._build_obs(old_score - 0.05, out_of_steps, f"Unknown app '{action.app}'")

        # 2. Business rule check (RBAC, approvals)
        ctx = {"agent_role": "support", "manager_approved": False}
        allowed, reason, rule_penalty = self._rules.check_action(action, ctx)
        if not allowed:
            self._rule_score = max(0.0, self._rule_score - 0.08)
            return self._build_obs(
                max(-0.25, old_score + rule_penalty),
                out_of_steps, f"Rule violation: {reason}"
            )

        # 3. Execute on app
        result = self._apps[action.app].execute(action.operation, action.args)
        if not result["success"]:
            # penalize failed/no-op actions, never dropping below zero
            self._efficiency = max(0.0, self._efficiency - 0.02)
            return self._build_obs(old_score - 0.01, out_of_steps, result["message"])

        # 4. Check schema drift adaptation
        # If agent used canonical field names on a v2/v3 schema → penalize
        if result.get("schema_error"):
            return self._build_obs(old_score - 0.20, out_of_steps,
                f"Stale schema: field '{result['schema_error']}' not found in current schema")
        if result.get("schema_adapted"):
            # Agent correctly used drifted field name → bonus
            self._schema_score = min(1.0, self._schema_score + 0.1)

        # 5. Re-evaluate workflow completion
        self._wf_score = self._workflow.evaluate(self._apps)

        # 6. Check SLA violations (each step counts as 2.5 minutes)
        sla_ok, sla_pen = self._rules.check_sla(result.get("ticket", {}),
                                                self._step_count * 2.5)
        if not sla_ok:
            extra_penalty += sla_pen
            self._rule_score = max(0.0, self._rule_score - 0.05)

        # 7. Compute composite score
        new_score = self._compute_score()
        delta = new_score - old_score + extra_penalty
        self._last_score = max(0.001, min(0.999, new_score))

        # 8. Terminal condition
        workflow_finished = self._wf_score >= 0.95
        done = workflow_finished or out_of_steps
        if workflow_finished:
            delta += 0.20  # terminal bonus
        return self._build_obs(delta, done, result["message"])

    def _compute_score(self) -> float:
        """Weighted sum of the five components, clamped to [0.001, 0.999]."""
        raw = (
            0.30 * self._wf_score +
            0.25 * self._rule_score +
            0.20 * self._schema_score +
            0.15 * self._efficiency +
            0.10 * self._policy_score
        )
        return max(0.001, min(0.999, raw))

    def state(self) -> "OrgOSState":
        """Return an ``OrgOSState`` built from the current episode counters."""
        return OrgOSState(
            episode_id=self._episode_id,
            workflow_id=self._workflow_id,
            schema_versions=self._drift._versions,
            step_count=self._step_count,
            max_steps=self.MAX_STEPS.get(self._workflow_id, 15),
            rule_violation_count=len(self._rules._violation_log),
            workflow_completion=self._wf_score,
            rule_compliance_rate=self._rule_score,
            policy_drift_active=self._policy_drift_applied,
        )

server/schema_drift.py ADDED Viewed

	@@ -0,0 +1,55 @@

import random

# Canonical → actual field names per app per schema version
SCHEMA_MAP = {
    "jira": {
        "v1": {"priority": "priority",      "assignee": "assignee",       "status": "status"},
        "v2": {"priority": "severity",       "assignee": "owner",          "status": "state"},
        "v3": {"priority": "urgency_level",  "assignee": "assigned_to",    "status": "current_state",
               "sla_deadline": "due_by"},  # v3 adds a new field
    },
    "zendesk": {
        "v1": {"urgency": "urgency",         "agent_email": "agent_email", "state": "state"},
        "v2": {"urgency": "priority",        "agent_email": "handler",     "state": "ticket_state"},
        "v3": {"urgency": "impact_level",    "agent_email": "assigned_agent","state": "resolution_status"},
    },
    "salesforce": {
        "v1": {"deal_stage": "deal_stage",   "health": "health",           "owner": "owner_name"},
        "v2": {"deal_stage": "pipeline_stage","health": "account_health",  "owner": "account_owner"},
        "v3": {"deal_stage": "stage",        "health": "risk_score",       "owner": "rep_email",
               "arr": "annual_recurring_revenue"},
    },
    "workday": {
        "v1": {"level": "level",             "manager_id": "manager_id",   "status": "resolution"},
        "v2": {"level": "job_level",         "manager_id": "reports_to",   "status": "request_status"},
        "v3": {"level": "seniority",         "manager_id": "direct_manager","status": "approval_state"},
    },
}


class SchemaDriftEngine:
    """Chooses a schema version per app and renames record fields accordingly."""

    def __init__(self, seed: int = 42):
        self._seed = seed
        self._versions: dict[str, str] = {}  # app → "v1"/"v2"/"v3"

    def sample_for_episode(self, episode_num: int) -> None:
        """Sample schema versions deterministically per episode.

        The RNG is seeded with ``seed + episode_num``, so the same engine seed
        and episode number always yield the same versions.
        """
        rng = random.Random(self._seed + episode_num)
        self._versions = {app: rng.choice(["v1", "v2", "v3"]) for app in SCHEMA_MAP}

    def translate_record(self, record: dict, app: str) -> dict:
        """Rename canonical field names → current schema's field names.

        Apps without a sampled version are treated as "v1". Keys with no
        mapping are kept as they are. An app that has no entry in
        ``SCHEMA_MAP`` gets an unchanged copy of ``record`` instead of a
        KeyError.
        """
        versions = SCHEMA_MAP.get(app)
        if versions is None:
            return dict(record)
        mapping = versions[self._versions.get(app, "v1")]
        return {mapping.get(key, key): value for key, value in record.items()}

    def get_hints(self) -> dict[str, str]:
        """Return partial schema hints visible in observation.
        Only reveal 1 random field per app (agent must probe for the rest).

        The hint RNG is seeded with the engine seed only, so the pick is
        stable for a given set of versions.
        """
        hints = {}
        rng = random.Random(self._seed)
        for app, version in self._versions.items():
            mapping = SCHEMA_MAP[app][version]
            # Candidates are fields whose actual name differs from the canonical one
            changed = {f"{app}.{k}": v for k, v in mapping.items() if k != v}
            if changed:
                key = rng.choice(list(changed.keys()))
                hints[key] = changed[key]
        return hints

server/tasks/__init__.py ADDED Viewed

File without changes

server/tasks/task1_missing.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""
+Task 1 — Easy: Fill Missing Values
+Objective: Fill all NaN values in the employee records DataFrame.
+Score: 1.0 - (remaining_nulls / original_nulls)
+"""
+from server.data_generator import generate_task1_datasets
+TASK_ID = 1
+MAX_STEPS = 20
+DESCRIPTION = (
+    "Task 1 (Easy) — Fill Missing Values\n"
+    "You have an employee records dataset with missing values (NaN) in "
+    "'age', 'salary', and 'department' columns. "
+    "Your goal is to fill all missing values so the dataset is complete.\n\n"
+    "Available operation: fill_missing\n"
+    "  params.strategy: 'median' | 'mean' | 'mode' | 'constant'\n"
+    "  params.value: (required when strategy='constant') the fill value\n"
+    "Example action: {\"operation\": \"fill_missing\", \"column\": \"age\", \"params\": {\"strategy\": \"median\"}}"
+)
def load():
    """Generate the Task 1 data.

    Returns:
        (dirty_df, clean_df, original_null_count): a working copy of the dirty
        frame, the clean reference frame, and the number of NaN cells in the
        dirty frame.
    """
    dirty, clean = generate_task1_datasets()
    working_copy = dirty.copy()
    null_total = int(dirty.isnull().sum().sum())
    return working_copy, clean, null_total
def score(current_df, original_nulls: int) -> float:
    """Fraction of the original nulls that have been filled.

    The result is clamped to [0.01, 0.99] and rounded to 4 decimals. When the
    dataset had no nulls to begin with, the score is 0.99.
    """
    if original_nulls == 0:
        return 0.99
    still_missing = int(current_df.isnull().sum().sum())
    fraction_filled = 1.0 - still_missing / original_nulls
    capped = min(0.99, fraction_filled)
    floored = max(0.01, capped)
    return round(floored, 4)
def count_errors(current_df) -> int:
    """Return the total number of null cells across all columns."""
    per_column = current_df.isnull().sum()
    return int(per_column.sum())

server/tasks/task2_format.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Task 2 — Medium: Fix Formats + Remove Duplicates
+Objective: Standardise phone & date formats and drop duplicate rows.
+Score: weighted average of format_score (0.7) + dupe_score (0.3)
+"""
+import re
+import pandas as pd
+from server.data_generator import generate_task2_datasets
+# Numeric identifier of this task.
+TASK_ID = 2
+# Step budget for the task — presumably enforced by the environment; verify against caller.
+MAX_STEPS = 30
+# Task brief text: states the target phone and date formats, mentions the
+# duplicate rows, and lists the two available operations with examples.
+DESCRIPTION = (
+    "Task 2 (Medium) — Fix Formats and Remove Duplicates\n"
+    "You have a product catalog with:\n"
+    "  • Phone numbers in mixed formats (need: NNN-NNN-NNNN)\n"
+    "  • Dates in mixed formats (need: YYYY-MM-DD)\n"
+    "  • Duplicate rows (~15)\n\n"
+    "Available operations:\n"
+    "  fix_format  — column: 'phone' | 'listed_date'\n"
+    "  drop_duplicates — no column needed\n\n"
+    "Example actions:\n"
+    '  {"operation": "fix_format", "column": "phone"}\n'
+    '  {"operation": "fix_format", "column": "listed_date"}\n'
+    '  {"operation": "drop_duplicates"}'
+)
+# Anchored pattern for the target phone shape: 3 digits, dash, 3 digits, dash, 4 digits.
+PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")
+# Anchored pattern for the YYYY-MM-DD shape; it checks digit layout only,
+# not whether the month/day values form a real calendar date.
+DATE_RE  = re.compile(r"^\d{4}-\d{2}-\d{2}$")
+def load():
+    dirty, clean = generate_task2_datasets()
+    original_phone_issues = int((~dirty["phone"].str.match(PHONE_RE)).sum())
+    original_date_issues  = int((~dirty["listed_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    original_dupes = len(dirty) - len(dirty.drop_duplicates())
+    meta = {
+        "orig_phone": original_phone_issues,
+        "orig_date":  original_date_issues,
+        "orig_dupes": original_dupes,
+    }
+    return dirty.copy(), clean, meta
+def score(current_df, meta: dict) -> float:
+    phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
+    date_issues  = int((~current_df["listed_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    dupes        = len(current_df) - len(current_df.drop_duplicates())
+    phone_score = 1.0 - phone_issues / max(meta["orig_phone"], 1)
+    date_score  = 1.0 - date_issues  / max(meta["orig_date"],  1)
+    dupe_score  = 1.0 - dupes        / max(meta["orig_dupes"], 1)
+    combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
+    return round(max(0.01, min(0.99, combined)), 4)
+def count_errors(current_df, meta: dict) -> int:
+    phone_issues = int((~current_df["phone"].str.match(PHONE_RE)).sum())
+    date_issues  = int((~current_df["listed_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    dupes = len(current_df) - len(current_df.drop_duplicates())
+    return phone_issues + date_issues + dupes

server/tasks/task3_pipeline.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+Task 3 — Hard: Full Cleaning Pipeline
+Objective: Fix missing values, remove duplicates, handle outliers, standardise
+           country capitalisation and date formats.
+Score: weighted average of 5 sub-scores (nulls 0.25, duplicates 0.20, outliers 0.20, country 0.175, dates 0.175).
+"""
+import re
+import numpy as np
+import pandas as pd
+from server.data_generator import generate_task3_datasets
+# Numeric identifier of this task.
+TASK_ID = 3
+# Step budget for the task — presumably enforced by the environment; verify against caller.
+MAX_STEPS = 40
+# Task brief text: lists the five issue categories and the available
+# operations with example action payloads.
+# NOTE(review): item 4 asks for "title case" but its example maps 'Usa' → 'USA',
+# and VALID_COUNTRIES below contains upper-case 'USA' and 'UK' — confirm the
+# intended wording.
+DESCRIPTION = (
+    "Task 3 (Hard) — Full Cleaning Pipeline\n"
+    "You have a customer database with multiple issues:\n"
+    "  1. Missing values in 'age', 'purchase_amount', 'country', 'signup_date'\n"
+    "  2. ~20 duplicate rows\n"
+    "  3. Outliers in 'purchase_amount' (injected values ~10x normal)\n"
+    "  4. Mixed case in 'country' (need: title case, e.g. 'Usa' → 'USA')\n"
+    "  5. Mixed date formats in 'signup_date' (need: YYYY-MM-DD)\n\n"
+    "Available operations:\n"
+    "  fill_missing    — column + params.strategy ('median'|'mean'|'mode'|'constant')\n"
+    "  drop_duplicates — no column needed\n"
+    "  drop_outliers   — column (numeric); uses IQR method\n"
+    "  fix_format      — column: 'country' | 'signup_date'\n"
+    "  fix_dtype       — column + params.dtype ('float'|'int'|'str')\n\n"
+    "Example actions:\n"
+    '  {"operation": "fill_missing",    "column": "age",             "params": {"strategy": "median"}}\n'
+    '  {"operation": "drop_duplicates"}\n'
+    '  {"operation": "drop_outliers",   "column": "purchase_amount"}\n'
+    '  {"operation": "fix_format",      "column": "signup_date"}\n'
+    '  {"operation": "fix_format",      "column": "country"}'
+)
+# Anchored pattern for the YYYY-MM-DD shape; it checks digit layout only,
+# not whether the month/day values form a real calendar date.
+DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
+# Exact spellings accepted in the 'country' column; any other non-null value
+# is counted as a country issue by load/score/count_errors.
+VALID_COUNTRIES = {"USA", "UK", "Canada", "Australia", "Germany"}
+def load():
+    dirty, clean = generate_task3_datasets()
+    orig_nulls = int(dirty.isnull().sum().sum())
+    orig_dupes = len(dirty) - len(dirty.drop_duplicates())
+    # Outlier baseline: count rows where purchase_amount > Q3 + 3*IQR
+    pa = dirty["purchase_amount"].dropna()
+    q1, q3 = pa.quantile(0.25), pa.quantile(0.75)
+    iqr = q3 - q1
+    orig_outliers = int((pa > q3 + 3 * iqr).sum())
+    orig_country_issues = int((~dirty["country"].isin(VALID_COUNTRIES) &
+                               dirty["country"].notna()).sum())
+    orig_date_issues    = int((~dirty["signup_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    meta = {
+        "orig_nulls":           orig_nulls,
+        "orig_dupes":           orig_dupes,
+        "orig_outliers":        max(orig_outliers, 1),
+        "orig_country_issues":  max(orig_country_issues, 1),
+        "orig_date_issues":     max(orig_date_issues, 1),
+        "q1": q1, "q3": q3, "iqr": iqr,
+    }
+    return dirty.copy(), clean, meta
+def score(current_df, meta: dict) -> float:
+    remaining_nulls = int(current_df.isnull().sum().sum())
+    remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
+    pa = current_df["purchase_amount"].dropna()
+    remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
+    remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
+                              current_df["country"].notna()).sum())
+    remaining_dates   = int((~current_df["signup_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    null_score     = 1.0 - remaining_nulls    / max(meta["orig_nulls"],    1)
+    dupe_score     = 1.0 - remaining_dupes    / max(meta["orig_dupes"],    1)
+    outlier_score  = 1.0 - remaining_outliers / meta["orig_outliers"]
+    country_score  = 1.0 - remaining_country  / meta["orig_country_issues"]
+    date_score     = 1.0 - remaining_dates    / meta["orig_date_issues"]
+    combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
+             + 0.175 * country_score + 0.175 * date_score
+    return round(max(0.01, min(0.99, combined)), 4)
+def count_errors(current_df, meta: dict) -> int:
+    remaining_nulls = int(current_df.isnull().sum().sum())
+    remaining_dupes = len(current_df) - len(current_df.drop_duplicates())
+    pa = current_df["purchase_amount"].dropna()
+    remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
+    remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) &
+                              current_df["country"].notna()).sum())
+    remaining_dates   = int((~current_df["signup_date"].apply(
+        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
+    )).sum())
+    return remaining_nulls + remaining_dupes + remaining_outliers + \
+           remaining_country + remaining_dates

server/workflow_engine.py ADDED Viewed

	@@ -0,0 +1,63 @@

+@dataclass
+class WorkflowStep:
+    """One step of a workflow, paired with a predicate that reports whether it is complete."""
+    # Short identifier of the step, e.g. "A1".
+    step_id: str
+    # Human-readable summary of what the step does.
+    description: str
+    # Name of the app the step targets, e.g. "zendesk".
+    app: str
+    # Name of the operation performed in that app, e.g. "acknowledge_ticket".
+    operation: str
+    # Callable that checks if this step was completed given the app states
+    completion_check: Callable[[Dict[str, "BaseApp"]], bool]
+# Workflow A: Customer Bug → Engineering Fix
+# Each step's completion check receives the mapping of app name -> app object
+# and calls a single status method on one of those apps.
+WORKFLOW_A_STEPS = [
+    WorkflowStep("A1", "Acknowledge ticket in Zendesk",
+                 "zendesk", "acknowledge_ticket",
+                 lambda apps: apps["zendesk"].ticket_acknowledged()),
+    WorkflowStep("A2", "Escalate to Jira — create linked issue",
+                 "jira", "create_issue",
+                 lambda apps: apps["jira"].has_linked_issue()),
+    WorkflowStep("A3", "Check if customer is paying (Salesforce lookup)",
+                 "salesforce", "get_account",
+                 lambda apps: apps["salesforce"].account_checked()),
+    WorkflowStep("A4", "Assign correct engineer in Jira based on priority",
+                 "jira", "assign_owner",
+                 lambda apps: apps["jira"].issue_assigned()),
+    WorkflowStep("A5", "Log SLA status in Workday",
+                 "workday", "log_sla_event",
+                 lambda apps: apps["workday"].sla_logged()),
+]
+# Workflow B: Employee Onboarding
+# NOTE(review): the entries in workflows B and C pass the Ellipsis literal
+# (...) as the third positional argument and omit `operation` and
+# `completion_check`. WorkflowStep declares five fields without defaults, so
+# these calls raise TypeError when this module is imported. Replace each `...`
+# with the real app name, operation and completion check.
+WORKFLOW_B_STEPS = [
+    WorkflowStep("B1", "Create employee record in Workday", ...),
+    WorkflowStep("B2", "Provision Jira access based on role", ...),
+    WorkflowStep("B3", "Add to Salesforce team by territory", ...),
+    WorkflowStep("B4", "Create Zendesk support profile if customer-facing", ...),
+]
+# Workflow C: Churn Risk Alert
+WORKFLOW_C_STEPS = [
+    WorkflowStep("C1", "Flag at-risk account in Salesforce", ...),
+    WorkflowStep("C2", "Query recent support volume in Zendesk", ...),
+    WorkflowStep("C3", "Check outstanding bugs in Jira", ...),
+    WorkflowStep("C4", "Synthesize churn score and assign intervention owner", ...),
+]
+class WorkflowEngine:
+    WORKFLOWS = {"A": WORKFLOW_A_STEPS, "B": WORKFLOW_B_STEPS, "C": WORKFLOW_C_STEPS}
+    def start(self, workflow_id: str) -> None:
+        self._steps = self.WORKFLOWS[workflow_id].copy()
+        self._completed: List[str] = []
+    def evaluate(self, apps: Dict) -> float:
+        """Check all steps and return completion ratio (0.0-1.0)."""
+        completed = sum(1 for s in self._steps if s.completion_check(apps))
+        self._completed = [s.step_id for s in self._steps if s.completion_check(apps)]
+        return completed / len(self._steps)
+    def get_pending(self) -> List[str]:
+        return [s.description for s in self._steps if s.step_id not in self._completed]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff