JatinAutonomousLabs committed on
Commit
9d08bab
·
verified ·
1 Parent(s): 5afb91c

Upload 4 files

Browse files
plugins/processors/data_cleaner.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Data Cleaning Plugin"""
3
+ import pandas as pd
4
+ from typing import Dict, Any
5
+
6
class DataCleaner:
    """Clean and standardize messy data for analytics."""

    # Placeholder strings treated as missing once surrounding whitespace is stripped.
    _NULL_VALUES = ['', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A', '-', '?', 'unknown']
    # Column-name substrings that trigger a datetime / numeric conversion attempt.
    _DATETIME_KEYWORDS = ('date', 'time')
    _NUMERIC_KEYWORDS = ('amount', 'price', 'cost', 'value', 'count')

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Steps, in order:
          1. Column names are stringified, stripped, lower-cased, spaces become
             underscores and any character outside ``[a-z0-9_]`` is removed.
          2. Object/string columns are stripped of surrounding whitespace and
             placeholder strings (see ``_NULL_VALUES``) are replaced by ``pd.NA``.
          3. Rows and columns that are entirely missing are dropped.
          4. Duplicate rows are dropped.
        """
        df = df.copy()
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )

        for col in df.select_dtypes(include=['object', 'string']).columns:
            # Remember genuinely missing cells first: astype(str) would turn
            # them into the literal strings 'nan' / '<NA>'.
            missing = df[col].isna()
            cleaned = df[col].astype(str).str.strip().replace(self._NULL_VALUES, pd.NA)
            df[col] = cleaned.mask(missing, pd.NA)

        # Drop empties only after placeholders are normalised, so rows/columns
        # made solely of '', 'N/A', ... are removed as well.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
        return df.drop_duplicates()

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with columns coerced based on their names.

        Columns whose name contains ``date`` or ``time`` are parsed with
        ``pd.to_datetime``; columns whose name contains ``amount``, ``price``,
        ``cost``, ``value`` or ``count`` are parsed with ``pd.to_numeric``.
        Unparseable cells become NaT/NaN (``errors='coerce'``).

        The conversion is best-effort: a column is left untouched when the
        conversion raises, or when it would turn every existing value into a
        missing one (e.g. a ``country`` column matching the ``count`` keyword).
        """
        import warnings

        df = df.copy()
        for col in df.columns:
            name = str(col)  # column labels are not guaranteed to be strings
            if any(kw in name for kw in self._DATETIME_KEYWORDS):
                converter = pd.to_datetime
            elif any(kw in name for kw in self._NUMERIC_KEYWORDS):
                converter = pd.to_numeric
            else:
                continue

            try:
                converted = converter(df[col], errors='coerce')
            except (TypeError, ValueError, OverflowError) as exc:
                warnings.warn(f"Could not convert column {col!r}: {exc}")
                continue

            # A conversion that wipes out every value means the name matched
            # by accident; keep the original data instead of destroying it.
            if df[col].notna().any() and not converted.notna().any():
                continue
            df[col] = converted
        return df
plugins/processors/date_normalizer.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Date Normalizer Plugin"""
3
+ import pandas as pd
4
+ from typing import Optional
5
+
6
class DateNormalizer:
    """Normalizes date columns to a standard format."""

    def normalize(self, df: pd.DataFrame, date_column: str = 'date', format: Optional[str] = None) -> pd.DataFrame:
        """Return a copy of ``df`` with ``date_column`` parsed as datetimes.

        Args:
            df: Source frame; it is copied, never modified in place.
            date_column: Name of the column to convert.
            format: Optional format string forwarded to ``pd.to_datetime``.

        Returns:
            The copied frame. If ``date_column`` is absent, or the conversion
            raises, the copy is returned with that column unchanged. Values
            that cannot be parsed become NaT (``errors='coerce'``).
        """
        result = df.copy()
        if date_column in result.columns:
            try:
                parsed = pd.to_datetime(result[date_column], format=format, errors='coerce')
                result[date_column] = parsed
            except Exception as e:
                # Best-effort: report the failure and hand back the untouched copy.
                print(f"Date normalization failed for column {date_column}: {e}")
        return result
plugins/processors/schema_detector.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Schema Detector Plugin"""
3
+ import pandas as pd
4
+ from typing import Dict, Any
5
+
6
class SchemaDetector:
    """Detects and reports data schema."""

    def get_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Build a per-column summary of ``df``.

        Each column maps to a dict with:
          * ``inferred_type`` - ``"Numeric"``, ``"Datetime"``, ``"Categorical"``
            or ``"Text/Object"``;
          * ``pandas_dtype`` - the dtype rendered as a string;
          * ``non_null_count`` - number of non-missing cells;
          * ``unique_values`` - number of distinct non-missing values.

        A non-numeric, non-datetime column counts as categorical when its
        distinct-value count is below ``min(10, len(df) / 5)``.
        """
        report = {}
        categorical_limit = min(10, len(df) / 5)
        for name in df.columns:
            series = df[name]
            distinct = series.nunique()

            if pd.api.types.is_numeric_dtype(series):
                kind = "Numeric"
            elif pd.api.types.is_datetime64_any_dtype(series):
                kind = "Datetime"
            elif distinct < categorical_limit:
                kind = "Categorical"
            else:
                kind = "Text/Object"

            report[name] = {
                "inferred_type": kind,
                "pandas_dtype": str(series.dtype),
                "non_null_count": int(series.count()),
                "unique_values": int(distinct),
            }
        return report
plugins/processors/text_processor.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Text Processing Plugin"""
3
+ import re
4
+ from typing import List
5
+
6
class TextProcessor:
    """Clean and process text data."""

    # Compiled once at class creation instead of on every call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _URL_RE = re.compile(r'https?://[^\s]+')

    # Characters that normally end a sentence or quote rather than a URL.
    _URL_TRAILING_PUNCTUATION = '.,;:!?\'"'
    # Closing bracket -> matching opener, used to detect unbalanced closers.
    _URL_CLOSERS = {')': '(', ']': '[', '}': '{', '>': '<'}

    def clean_text(self, text: str) -> str:
        """Collapse each run of whitespace to one space and trim both ends.

        Only whitespace is touched; other characters are left as they are.
        """
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def extract_emails(self, text: str) -> List[str]:
        """Extract email addresses from text, in order of appearance."""
        return self._EMAIL_RE.findall(text)

    def extract_urls(self, text: str) -> List[str]:
        """Extract ``http://`` / ``https://`` URLs from text, in order.

        Trailing sentence punctuation and unbalanced closing brackets are
        trimmed, so ``"(see http://x.org/y)."`` yields ``http://x.org/y``
        while a balanced ``.../A_(b)`` keeps its closing parenthesis.
        """
        return [self._trim_url(match) for match in self._URL_RE.findall(text)]

    def tokenize(self, text: str) -> List[str]:
        """Simple word tokenization: lower-case, then split on whitespace."""
        return text.lower().split()

    @classmethod
    def _trim_url(cls, url: str) -> str:
        """Strip trailing punctuation and unmatched closing brackets from ``url``."""
        while url:
            last = url[-1]
            if last in cls._URL_TRAILING_PUNCTUATION:
                url = url[:-1]
            elif last in cls._URL_CLOSERS and url.count(last) > url.count(cls._URL_CLOSERS[last]):
                # More closers than openers: the bracket belongs to the prose.
                url = url[:-1]
            else:
                break
        return url