Spaces:
Sleeping
Sleeping
| """ | |
| Task 3 β Hard: Full Cleaning Pipeline | |
| Objective: Fix missing values, remove duplicates, handle outliers, standardise | |
| country capitalisation and date formats. | |
| Score: equal-weight average of 4 sub-scores. | |
| """ | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| from server.data_generator import generate_task3_datasets | |
| TASK_ID = 3 | |
| MAX_STEPS = 40 | |
| DESCRIPTION = ( | |
| "Task 3 (Hard) β Full Cleaning Pipeline\n" | |
| "You have a customer database with multiple issues:\n" | |
| " 1. Missing values in 'age', 'purchase_amount', 'country', 'signup_date'\n" | |
| " 2. ~20 duplicate rows\n" | |
| " 3. Outliers in 'purchase_amount' (injected values ~10x normal)\n" | |
| " 4. Mixed case in 'country' (need: title case, e.g. 'Usa' β 'USA')\n" | |
| " 5. Mixed date formats in 'signup_date' (need: YYYY-MM-DD)\n\n" | |
| "Available operations:\n" | |
| " fill_missing β column + params.strategy ('median'|'mean'|'mode'|'constant')\n" | |
| " drop_duplicates β no column needed\n" | |
| " drop_outliers β column (numeric); uses IQR method\n" | |
| " fix_format β column: 'country' | 'signup_date'\n" | |
| " fix_dtype β column + params.dtype ('float'|'int'|'str')\n\n" | |
| "Example actions:\n" | |
| ' {"operation": "fill_missing", "column": "age", "params": {"strategy": "median"}}\n' | |
| ' {"operation": "drop_duplicates"}\n' | |
| ' {"operation": "drop_outliers", "column": "purchase_amount"}\n' | |
| ' {"operation": "fix_format", "column": "signup_date"}\n' | |
| ' {"operation": "fix_format", "column": "country"}' | |
| ) | |
| DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$") | |
| VALID_COUNTRIES = {"USA", "UK", "Canada", "Australia", "Germany"} | |
| # Cache at module load β seed=42 makes output identical every time | |
| def _build_meta(dirty): | |
| orig_nulls = int(dirty.isnull().sum().sum()) | |
| orig_dupes = len(dirty) - len(dirty.drop_duplicates()) | |
| pa = dirty["purchase_amount"].dropna() | |
| q1, q3 = pa.quantile(0.25), pa.quantile(0.75) | |
| iqr = q3 - q1 | |
| orig_outliers = int((pa > q3 + 3 * iqr).sum()) | |
| orig_country_issues = int((~dirty["country"].isin(VALID_COUNTRIES) & | |
| dirty["country"].notna()).sum()) | |
| orig_date_issues = int((~dirty["signup_date"].apply( | |
| lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False | |
| )).sum()) | |
| return { | |
| "orig_nulls": orig_nulls, | |
| "orig_dupes": orig_dupes, | |
| "orig_outliers": max(orig_outliers, 1), | |
| "orig_country_issues": max(orig_country_issues, 1), | |
| "orig_date_issues": max(orig_date_issues, 1), | |
| "q1": q1, "q3": q3, "iqr": iqr, | |
| } | |
| _DIRTY_TEMPLATE, _CLEAN_DF = generate_task3_datasets() | |
| _META_TEMPLATE = _build_meta(_DIRTY_TEMPLATE) | |
| def load(): | |
| """Return (dirty_df, clean_df, meta) β uses cached template.""" | |
| return _DIRTY_TEMPLATE.copy(), _CLEAN_DF, dict(_META_TEMPLATE) | |
| def score(current_df, meta: dict) -> float: | |
| remaining_nulls = int(current_df.isnull().sum().sum()) | |
| remaining_dupes = len(current_df) - len(current_df.drop_duplicates()) | |
| pa = current_df["purchase_amount"].dropna() | |
| remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum()) | |
| remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) & | |
| current_df["country"].notna()).sum()) | |
| remaining_dates = int((~current_df["signup_date"].apply( | |
| lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False | |
| )).sum()) | |
| null_score = 1.0 - remaining_nulls / max(meta["orig_nulls"], 1) | |
| dupe_score = 1.0 - remaining_dupes / max(meta["orig_dupes"], 1) | |
| outlier_score = 1.0 - remaining_outliers / meta["orig_outliers"] | |
| country_score = 1.0 - remaining_country / meta["orig_country_issues"] | |
| date_score = 1.0 - remaining_dates / meta["orig_date_issues"] | |
| combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \ | |
| + 0.175 * country_score + 0.175 * date_score | |
| return round(max(0.01, min(0.99, combined)), 4) | |
| def count_errors(current_df, meta: dict) -> int: | |
| remaining_nulls = int(current_df.isnull().sum().sum()) | |
| remaining_dupes = len(current_df) - len(current_df.drop_duplicates()) | |
| pa = current_df["purchase_amount"].dropna() | |
| remaining_outliers = int((pa > meta["q3"] + 3 * meta["iqr"]).sum()) | |
| remaining_country = int((~current_df["country"].isin(VALID_COUNTRIES) & | |
| current_df["country"].notna()).sum()) | |
| remaining_dates = int((~current_df["signup_date"].apply( | |
| lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False | |
| )).sum()) | |
| return remaining_nulls + remaining_dupes + remaining_outliers + \ | |
| remaining_country + remaining_dates |