""" Preprocess air quality data for ANN training. - Handle missing values via interpolation - Create temporal features - Normalize features - Create sequences for time-series prediction """ import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler import pickle import os def load_data(path="data/india_air_quality_synthetic.csv"): df = pd.read_csv(path, parse_dates=['datetime']) return df def preprocess(df): """Full preprocessing pipeline.""" df = df.copy() # Sort by city and datetime df = df.sort_values(['city', 'datetime']).reset_index(drop=True) # Interpolate missing values per city numeric_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'temperature', 'humidity', 'wind_speed', 'aqi'] for col in numeric_cols: df[col] = df.groupby('city')[col].transform(lambda x: x.interpolate(method='linear', limit_direction='both')) # Drop any remaining NaNs df = df.dropna(subset=numeric_cols).reset_index(drop=True) # Feature engineering df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24) df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24) df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7) df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7) df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365) df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365) # City encoding (one-hot) city_dummies = pd.get_dummies(df['city'], prefix='city') df = pd.concat([df, city_dummies], axis=1) # Lag features (past 6 hours) lag_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'temperature', 'humidity', 'wind_speed', 'aqi'] for city in df['city'].unique(): city_mask = df['city'] == city for col in lag_cols: for lag in [1, 3, 6, 12, 24]: df.loc[city_mask, f'{col}_lag{lag}'] = df.loc[city_mask, col].shift(lag) # Rolling statistics (past 24 hours) for city in df['city'].unique(): city_mask = df['city'] == city for col in lag_cols: df.loc[city_mask, f'{col}_rolling_mean_24h'] = df.loc[city_mask, col].rolling(window=24, min_periods=1).mean() df.loc[city_mask, f'{col}_rolling_std_24h'] = df.loc[city_mask, col].rolling(window=24, min_periods=1).std() df = df.ffill().bfill() return df def create_sequences(df, feature_cols, target_cols, seq_length=24, pred_horizon=6): """Create sequences for time-series prediction.""" X, y = [], [] for city in df['city'].unique(): city_data = df[df['city'] == city].reset_index(drop=True) features = city_data[feature_cols].values targets = city_data[target_cols].values for i in range(len(city_data) - seq_length - pred_horizon + 1): X.append(features[i:i+seq_length]) y.append(targets[i+seq_length:i+seq_length+pred_horizon]) return np.array(X), np.array(y) def main(): print("Loading data...") df = load_data() print("Preprocessing...") df = preprocess(df) # Define feature and target columns base_features = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'temperature', 'humidity', 'wind_speed', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos'] # Add lag and rolling features lag_features = [c for c in df.columns if '_lag' in c or '_rolling_' in c] city_features = [c for c in df.columns if c.startswith('city_')] feature_cols = base_features + lag_features + city_features target_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'aqi'] print(f"Features: {len(feature_cols)}, Targets: {len(target_cols)}") # Scale features print("Scaling features...") scaler_X = StandardScaler() scaler_y = StandardScaler() 
    # Fit the target scaler on raw values before the in-place feature scaling:
    # the pollutant targets also appear in feature_cols, so scaling the features
    # first would leave scaler_y fitted on already-standardized columns and
    # break inverse-transforming predictions back to original units.
    y_scaled = scaler_y.fit_transform(df[target_cols])
    df[feature_cols] = scaler_X.fit_transform(df[feature_cols])
    df[target_cols] = y_scaled

    # Create sequences
    print("Creating sequences...")
    X, y = create_sequences(df, feature_cols, target_cols,
                            seq_length=24, pred_horizon=6)
    print(f"X shape: {X.shape}, y shape: {y.shape}")

    # 80/10/10 split by sequence index (sequences are ordered by city, then time)
    n = len(X)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

    # Save arrays
    os.makedirs("processed", exist_ok=True)
    np.savez("processed/data.npz",
             X_train=X_train, y_train=y_train,
             X_val=X_val, y_val=y_val,
             X_test=X_test, y_test=y_test)

    # Save metadata (column lists, window sizes, fitted scalers, city list)
    metadata = {
        'feature_cols': feature_cols,
        'target_cols': target_cols,
        'seq_length': 24,
        'pred_horizon': 6,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'cities': df['city'].unique().tolist()
    }
    with open("processed/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("Saved processed data to processed/")

    # Summary stats
    print("\n=== Dataset Summary ===")
    print(f"Total sequences: {n}")
    print("Sequence length: 24 hours")
    print("Prediction horizon: 6 hours")
    print(f"Features: {len(feature_cols)}")
    print(f"Targets: {len(target_cols)}")
    print(f"Cities: {metadata['cities']}")


if __name__ == "__main__":
    main()
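

# Hedged usage sketch, not invoked anywhere in this script: a rough illustration
# of how downstream training/evaluation code might load the artifacts that
# main() writes and map scaled predictions back to original units. The
# "predictions" below are just a slice of y_val, since no model is defined here,
# and the function name is purely illustrative.
def example_inverse_transform(processed_dir="processed"):
    data = np.load(os.path.join(processed_dir, "data.npz"))
    with open(os.path.join(processed_dir, "metadata.pkl"), "rb") as f:
        meta = pickle.load(f)

    # y arrays have shape (n_sequences, pred_horizon, n_targets)
    preds_scaled = data["y_val"][:5]  # stand-in for model output
    flat = preds_scaled.reshape(-1, len(meta["target_cols"]))
    preds = meta["scaler_y"].inverse_transform(flat)
    return preds.reshape(preds_scaled.shape)  # back in original measurement units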