Upload 4 files
Browse files
plugins/processors/data_cleaner.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Data Cleaning Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
class DataCleaner:
    """Clean and standardize messy data for analytics."""

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of *df*.

        Steps: snake_case the column names, drop all-empty rows/columns,
        map common null-marker strings to ``pd.NA`` in text columns, and
        drop duplicate rows. The input frame is never mutated.
        """
        df = df.copy()
        # Normalize headers: "Order ID " -> "order_id"; strip any char
        # that is not [a-z0-9_] so names are safe as identifiers.
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_')
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )
        # Remove rows and columns that carry no data at all.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
        # Common textual stand-ins for "missing". 'nan'/'NaN'/'<NA>' are
        # included because astype(str) below stringifies real NaN/NA cells;
        # without them, genuine missing values would survive as junk strings.
        null_values = ['', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A',
                       '-', '?', 'unknown', 'nan', 'NaN', '<NA>']
        for col in df.select_dtypes(include=['object', 'string']).columns:
            df[col] = df[col].astype(str).str.strip().replace(null_values, pd.NA)
        df = df.drop_duplicates()
        return df

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Coerce column dtypes from name heuristics (best effort).

        Columns whose name contains 'date'/'time' become datetimes; names
        containing amount/price/cost/value/count become numeric. Cells that
        cannot be parsed become NaT/NaN; a column that cannot be coerced at
        all is left unchanged. Returns a new frame; *df* is not mutated.
        """
        df = df.copy()
        numeric_keywords = ('amount', 'price', 'cost', 'value', 'count')
        for col in df.columns:
            try:
                if 'date' in col or 'time' in col:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                elif any(kw in col for kw in numeric_keywords):
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            except (ValueError, TypeError):
                # Best-effort: despite errors='coerce', pandas can still
                # raise for unsupported dtypes; skip just that column
                # instead of swallowing every exception with a bare except.
                pass
        return df
|
plugins/processors/date_normalizer.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Date Normalizer Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
class DateNormalizer:
    """Normalizes date columns to a standard format."""

    def normalize(self, df: pd.DataFrame, date_column: str = 'date', format: Optional[str] = None) -> pd.DataFrame:
        """Return a copy of *df* with *date_column* parsed to datetimes.

        Unparseable values become NaT. A missing column, or an outright
        parsing failure (reported to stdout), leaves the data unchanged —
        a copy of the frame is returned in every case.
        """
        result = df.copy()
        if date_column in result.columns:
            try:
                parsed = pd.to_datetime(result[date_column], format=format, errors='coerce')
            except Exception as e:
                print(f"Date normalization failed for column {date_column}: {e}")
            else:
                result[date_column] = parsed
        return result
|
plugins/processors/schema_detector.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Schema Detector Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
class SchemaDetector:
    """Detects and reports data schema."""

    def get_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Summarize every column of *df*.

        Returns ``{column: {"inferred_type", "pandas_dtype",
        "non_null_count", "unique_values"}}``. A non-numeric,
        non-datetime column is labeled "Categorical" when its distinct
        count is below min(10, len(df) / 5), otherwise "Text/Object".
        """
        schema: Dict[str, Any] = {}
        for col in df.columns:
            series = df[col]
            # nunique() is an O(n) scan; compute it once instead of twice
            # (the original recomputed it for the report dict).
            unique_count = series.nunique()
            if pd.api.types.is_numeric_dtype(series):
                base_type = "Numeric"
            elif pd.api.types.is_datetime64_any_dtype(series):
                base_type = "Datetime"
            elif unique_count < min(10, len(df) / 5):
                base_type = "Categorical"
            else:
                base_type = "Text/Object"
            schema[col] = {
                "inferred_type": base_type,
                "pandas_dtype": str(series.dtype),
                "non_null_count": int(series.count()),
                "unique_values": int(unique_count),
            }
        return schema
|
plugins/processors/text_processor.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Text Processing Plugin"""
|
| 3 |
+
import re
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
class TextProcessor:
    """Clean and process text data."""

    def clean_text(self, text: str) -> str:
        """Collapse runs of whitespace to single spaces and trim the ends."""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    def extract_emails(self, text: str) -> List[str]:
        """Return every email-address-shaped substring found in *text*."""
        matcher = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        return matcher.findall(text)

    def extract_urls(self, text: str) -> List[str]:
        """Return every http/https URL found in *text*."""
        matcher = re.compile(r'https?://[^\s]+')
        return matcher.findall(text)

    def tokenize(self, text: str) -> List[str]:
        """Split lowercased *text* on whitespace into word tokens."""
        lowered = text.lower()
        return lowered.split()
|