| |
| """Data Cleaning Plugin""" |
| import pandas as pd |
| from typing import Dict, Any |
|
|
class DataCleaner:
    """Clean and standardize messy data for analytics.

    Both public methods work on a copy of the frame they are given, so the
    caller's DataFrame is never modified.
    """

    # Exact (case-sensitive) strings that count as "missing" once a cell has
    # been stripped of surrounding whitespace.
    _NULL_TOKENS = frozenset(
        {'', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A', '-', '?', 'unknown'}
    )
    # Substrings of a column name that select the conversion applied to it.
    _DATETIME_KEYWORDS = ('date', 'time')
    _NUMERIC_KEYWORDS = ('amount', 'price', 'cost', 'value', 'count')

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of *df*.

        Steps, in order:

        1. Column names are cast to ``str``, stripped, lower-cased, spaces
           become underscores, and every character outside ``[a-z0-9_]`` is
           removed.
        2. In object/string columns, string cells are stripped and cells
           equal to a known null placeholder become ``pd.NA``. Cells that are
           not strings (numbers, ``None``, ``NaN``, ...) are left untouched.
        3. Rows, then columns, that are entirely missing are dropped.
        4. Duplicate rows are dropped, keeping the first occurrence.

        Args:
            df: The frame to clean.

        Returns:
            A new DataFrame; the original row index labels are kept.
        """
        df = df.copy()
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )

        null_tokens = self._NULL_TOKENS

        def normalize_cell(value):
            # Only genuine strings are touched: casting the whole column to
            # str would turn NaN into the literal text 'nan' and stringify
            # any non-string object stored in the column.
            if isinstance(value, str):
                value = value.strip()
                if value in null_tokens:
                    return pd.NA
            return value

        for col in df.select_dtypes(include=['object', 'string']).columns:
            df[col] = df[col].map(normalize_cell)

        # Placeholders must already be NA here, otherwise rows/columns made
        # only of '', 'N/A', '-' ... would not be recognised as empty.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
        return df.drop_duplicates()

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with columns coerced based on their names.

        A column whose name contains ``date`` or ``time`` is passed through
        ``pd.to_datetime``; otherwise, a column whose name contains
        ``amount``, ``price``, ``cost``, ``value`` or ``count`` is passed
        through ``pd.to_numeric``. Both use ``errors='coerce'``, so
        unparseable cells become missing values. Matching is
        case-insensitive and non-string column labels are compared via
        ``str(label)``.

        Conversion is best-effort: if converting a column raises, the column
        is left unchanged and a warning is logged.

        Args:
            df: The frame whose columns should be coerced.

        Returns:
            A new DataFrame with the same columns as *df*.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        logger = logging.getLogger(__name__)
        df = df.copy()
        for col in df.columns:
            name = str(col).lower()
            if any(kw in name for kw in self._DATETIME_KEYWORDS):
                converter = pd.to_datetime
            elif any(kw in name for kw in self._NUMERIC_KEYWORDS):
                converter = pd.to_numeric
            else:
                continue
            try:
                df[col] = converter(df[col], errors='coerce')
            except Exception:
                # Keep the per-column best-effort contract, but make the
                # failure visible instead of silently swallowing it.
                logger.warning(
                    "Could not coerce column %r with %s; leaving it unchanged",
                    col,
                    converter.__name__,
                    exc_info=True,
                )
        return df
|
|