Upload 4 files
Browse files
plugins/processors/data_cleaner.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Data Cleaning Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
class DataCleaner:
    """Clean and standardize messy data for analytics."""

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of *df*.

        Steps: snake_case the column names, drop all-empty rows/columns,
        map common null-marker strings to ``pd.NA`` in text columns, and
        drop duplicate rows. The input frame is never mutated.
        """
        df = df.copy()
        # Normalize headers: "Order ID " -> "order_id"; strip any char
        # that is not [a-z0-9_] so names are safe as identifiers.
        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_')
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )
        # Remove rows and columns that carry no data at all.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
        # Common textual stand-ins for "missing". 'nan'/'NaN'/'<NA>' are
        # included because astype(str) below stringifies real NaN/NA cells;
        # without them, genuine missing values would survive as junk strings.
        null_values = ['', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A',
                       '-', '?', 'unknown', 'nan', 'NaN', '<NA>']
        for col in df.select_dtypes(include=['object', 'string']).columns:
            df[col] = df[col].astype(str).str.strip().replace(null_values, pd.NA)
        df = df.drop_duplicates()
        return df

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Coerce column dtypes from name heuristics (best effort).

        Columns whose name contains 'date'/'time' become datetimes; names
        containing amount/price/cost/value/count become numeric. Cells that
        cannot be parsed become NaT/NaN; a column that cannot be coerced at
        all is left unchanged. Returns a new frame; *df* is not mutated.
        """
        df = df.copy()
        numeric_keywords = ('amount', 'price', 'cost', 'value', 'count')
        for col in df.columns:
            try:
                if 'date' in col or 'time' in col:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                elif any(kw in col for kw in numeric_keywords):
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            except (ValueError, TypeError):
                # Best-effort: despite errors='coerce', pandas can still
                # raise for unsupported dtypes; skip just that column
                # instead of swallowing every exception with a bare except.
                pass
        return df
|
plugins/processors/date_normalizer.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Date Normalizer Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
class DateNormalizer:
    """Normalizes date columns to a standard format."""

    def normalize(self, df: pd.DataFrame, date_column: str = 'date', format: Optional[str] = None) -> pd.DataFrame:
        """Return a copy of *df* with *date_column* parsed to datetimes.

        Unparseable values become NaT. A missing column, or an outright
        parsing failure (reported to stdout), leaves the data unchanged —
        a copy of the frame is returned in every case.
        """
        result = df.copy()
        if date_column in result.columns:
            try:
                parsed = pd.to_datetime(result[date_column], format=format, errors='coerce')
            except Exception as e:
                print(f"Date normalization failed for column {date_column}: {e}")
            else:
                result[date_column] = parsed
        return result
|
plugins/processors/schema_detector.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Schema Detector Plugin"""
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
class SchemaDetector:
    """Detects and reports data schema."""

    def get_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Summarize every column of *df*.

        Returns ``{column: {"inferred_type", "pandas_dtype",
        "non_null_count", "unique_values"}}``. A non-numeric,
        non-datetime column is labeled "Categorical" when its distinct
        count is below min(10, len(df) / 5), otherwise "Text/Object".
        """
        schema: Dict[str, Any] = {}
        for col in df.columns:
            series = df[col]
            # nunique() is an O(n) scan; compute it once instead of twice
            # (the original recomputed it for the report dict).
            unique_count = series.nunique()
            if pd.api.types.is_numeric_dtype(series):
                base_type = "Numeric"
            elif pd.api.types.is_datetime64_any_dtype(series):
                base_type = "Datetime"
            elif unique_count < min(10, len(df) / 5):
                base_type = "Categorical"
            else:
                base_type = "Text/Object"
            schema[col] = {
                "inferred_type": base_type,
                "pandas_dtype": str(series.dtype),
                "non_null_count": int(series.count()),
                "unique_values": int(unique_count),
            }
        return schema
|
plugins/processors/text_processor.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Text Processing Plugin"""
|
| 3 |
+
import re
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
class TextProcessor:
    """Clean and process text data."""

    def clean_text(self, text: str) -> str:
        """Collapse runs of whitespace to single spaces and trim the ends."""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    def extract_emails(self, text: str) -> List[str]:
        """Return every email-address-shaped substring found in *text*."""
        matcher = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        return matcher.findall(text)

    def extract_urls(self, text: str) -> List[str]:
        """Return every http/https URL found in *text*."""
        matcher = re.compile(r'https?://[^\s]+')
        return matcher.findall(text)

    def tokenize(self, text: str) -> List[str]:
        """Split lowercased *text* on whitespace into word tokens."""
        lowered = text.lower()
        return lowered.split()
|