# OpenEnv_hack/server/tasks/task3_pipeline.py
"""
Task 3 β€” Hard: Full Cleaning Pipeline
Objective: Fix missing values, remove duplicates, handle outliers, standardise
country capitalisation and date formats.
Score: equal-weight average of 4 sub-scores.
"""
import re
import numpy as np
import pandas as pd
from server.data_generator import generate_task3_datasets
# Numeric id of this task ("Task 3" in DESCRIPTION below).
TASK_ID = 3
# Episode step budget — presumably enforced by the environment loop; it is
# not referenced anywhere in this file. TODO confirm against the server.
MAX_STEPS = 40
# Prompt text handed to the agent: lists the dataset's issues and the actions
# it may emit. Item 4 is worded as "canonical names" rather than "title case"
# because the valid set (VALID_COUNTRIES) includes all-caps names like
# "USA" and "UK"; the example 'Usa' -> 'USA' already reflects that.
DESCRIPTION = (
    "Task 3 (Hard) — Full Cleaning Pipeline\n"
    "You have a customer database with multiple issues:\n"
    " 1. Missing values in 'age', 'purchase_amount', 'country', 'signup_date'\n"
    " 2. ~20 duplicate rows\n"
    " 3. Outliers in 'purchase_amount' (injected values ~10x normal)\n"
    " 4. Mixed case in 'country' (need: canonical names, e.g. 'Usa' → 'USA')\n"
    " 5. Mixed date formats in 'signup_date' (need: YYYY-MM-DD)\n\n"
    "Available operations:\n"
    " fill_missing — column + params.strategy ('median'|'mean'|'mode'|'constant')\n"
    " drop_duplicates — no column needed\n"
    " drop_outliers — column (numeric); uses IQR method\n"
    " fix_format — column: 'country' | 'signup_date'\n"
    " fix_dtype — column + params.dtype ('float'|'int'|'str')\n\n"
    "Example actions:\n"
    ' {"operation": "fill_missing", "column": "age", "params": {"strategy": "median"}}\n'
    ' {"operation": "drop_duplicates"}\n'
    ' {"operation": "drop_outliers", "column": "purchase_amount"}\n'
    ' {"operation": "fix_format", "column": "signup_date"}\n'
    ' {"operation": "fix_format", "column": "country"}'
)
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
VALID_COUNTRIES = {"USA", "UK", "Canada", "Australia", "Germany"}
# Cache at module load β€” seed=42 makes output identical every time
def _build_meta(dirty):
orig_nulls = int(dirty.isnull().sum().sum())
orig_dupes = len(dirty) - len(dirty.drop_duplicates())
pa = dirty["purchase_amount"].dropna()
q1, q3 = pa.quantile(0.25), pa.quantile(0.75)
iqr = q3 - q1
orig_outliers = int((pa > q3 + 3 * iqr).sum())
orig_country_issues = int((~dirty["country"].isin(VALID_COUNTRIES) &
dirty["country"].notna()).sum())
orig_date_issues = int((~dirty["signup_date"].apply(
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
)).sum())
return {
"orig_nulls": orig_nulls,
"orig_dupes": orig_dupes,
"orig_outliers": max(orig_outliers, 1),
"orig_country_issues": max(orig_country_issues, 1),
"orig_date_issues": max(orig_date_issues, 1),
"q1": q1, "q3": q3, "iqr": iqr,
}
# Built once at import time and reused for every episode; per the generator's
# comment the output is seeded (seed=42), so the template is identical on
# every load — presumably deterministic, TODO confirm in data_generator.
_DIRTY_TEMPLATE, _CLEAN_DF = generate_task3_datasets()
_META_TEMPLATE = _build_meta(_DIRTY_TEMPLATE)
def load():
    """Hand out a (dirty_df, clean_df, meta) triple from the module cache.

    The dirty frame is copied and the meta dict is shallow-copied so callers
    may mutate them freely; the clean frame is returned without copying.
    """
    dirty = _DIRTY_TEMPLATE.copy()
    meta = dict(_META_TEMPLATE)
    return dirty, _CLEAN_DF, meta
def _remaining_issues(df, meta: dict):
    """Count the problems still present in *df*, one int per category.

    Returns (nulls, dupes, outliers, country, dates). The outlier threshold
    uses the ORIGINAL q3/iqr frozen in *meta*, so it does not drift as the
    agent removes rows.
    """
    nulls = int(df.isnull().sum().sum())
    dupes = len(df) - len(df.drop_duplicates())
    pa = df["purchase_amount"].dropna()
    outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum())
    country = int((~df["country"].isin(VALID_COUNTRIES) &
                   df["country"].notna()).sum())
    dates = int((~df["signup_date"].apply(
        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
    )).sum())
    return nulls, dupes, outliers, country, dates
def score(current_df, meta: dict) -> float:
    """Score cleaning progress in [0.01, 0.99].

    Each sub-score is the fraction of the original issues that were fixed;
    they are combined with weights nulls 25%, dupes 20%, outliers 20%,
    country 17.5%, dates 17.5%, then clamped and rounded to 4 places.
    """
    nulls, dupes, outliers, country, dates = _remaining_issues(current_df, meta)
    # meta's outlier/country/date denominators are already floored at 1 by
    # _build_meta; nulls and dupes are floored here for the same reason.
    null_score = 1.0 - nulls / max(meta["orig_nulls"], 1)
    dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
    outlier_score = 1.0 - outliers / meta["orig_outliers"]
    country_score = 1.0 - country / meta["orig_country_issues"]
    date_score = 1.0 - dates / meta["orig_date_issues"]
    combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
        + 0.175 * country_score + 0.175 * date_score
    return round(max(0.01, min(0.99, combined)), 4)
def count_errors(current_df, meta: dict) -> int:
    """Total number of remaining issues across all five categories."""
    return sum(_remaining_issues(current_df, meta))