| """ |
| Preprocess air quality data for ANN training. |
| - Handle missing values via interpolation |
| - Create temporal features |
| - Normalize features |
| - Create sequences for time-series prediction |
| """ |
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import StandardScaler |
| import pickle |
| import os |
|
|
def load_data(path="data/india_air_quality_synthetic.csv"):
    """Read the raw air-quality CSV and parse its 'datetime' column.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV; anything ``pd.read_csv`` accepts.

    Returns
    -------
    pd.DataFrame
        Raw readings with 'datetime' as a datetime64 column.
    """
    return pd.read_csv(path, parse_dates=['datetime'])
|
|
def preprocess(df):
    """Full preprocessing pipeline.

    Steps:
      1. Sort per city in chronological order.
      2. Interpolate missing pollutant/weather readings within each city.
      3. Add cyclical (sin/cos) encodings of hour / day-of-week / month /
         day-of-year.
      4. One-hot encode the city.
      5. Add per-city lag and 24 h rolling-window features.
      6. Fill the NaNs those features introduce, per city.

    Parameters
    ----------
    df : pd.DataFrame
        Raw readings with 'city', 'datetime', the numeric columns listed
        below, plus precomputed 'hour', 'day_of_week', 'month',
        'day_of_year' integer columns.

    Returns
    -------
    pd.DataFrame
        Enriched frame, sorted by (city, datetime), with no NaNs provided
        each numeric column has at least one observation per city.
    """
    df = df.copy()

    # Lags, interpolation and rolling windows below all assume per-city
    # chronological order.
    df = df.sort_values(['city', 'datetime']).reset_index(drop=True)

    numeric_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co',
                    'temperature', 'humidity', 'wind_speed', 'aqi']

    # Interpolate gaps within each city only, so one city's readings never
    # bleed into another's; limit_direction='both' also extends the
    # leading/trailing NaNs of each city.
    for col in numeric_cols:
        df[col] = df.groupby('city')[col].transform(
            lambda s: s.interpolate(method='linear', limit_direction='both'))

    # A column that is entirely NaN for a city cannot be interpolated —
    # drop whatever is still missing.
    df = df.dropna(subset=numeric_cols).reset_index(drop=True)

    # Cyclical encodings: map periodic integers onto the unit circle so
    # e.g. hour 23 and hour 0 end up adjacent in feature space.
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

    # One-hot city indicators; the original 'city' column is kept because
    # downstream code groups by it.
    df = pd.concat([df, pd.get_dummies(df['city'], prefix='city')], axis=1)

    # Per-city lag and 24 h rolling-window features. groupby replaces the
    # original per-city boolean-mask loops: identical values, far fewer
    # full-column mask scans.
    lag_cols = numeric_cols
    for col in lag_cols:
        by_city = df.groupby('city')[col]
        for lag in (1, 3, 6, 12, 24):
            df[f'{col}_lag{lag}'] = by_city.shift(lag)
        df[f'{col}_rolling_mean_24h'] = by_city.transform(
            lambda s: s.rolling(window=24, min_periods=1).mean())
        df[f'{col}_rolling_std_24h'] = by_city.transform(
            lambda s: s.rolling(window=24, min_periods=1).std())

    # BUG FIX: the lag/rolling NaNs used to be filled with a global
    # df.ffill().bfill(), which carried the previous city's last values
    # across the city boundary into the next city's leading rows
    # (cross-city data leakage). Fill within each city first; only columns
    # a city cannot fill at all (e.g. lag24 when the city has < 25 rows)
    # fall through to the global fill below.
    for col in df.columns[df.isna().any()]:
        df[col] = df.groupby('city')[col].transform(lambda s: s.ffill().bfill())
    df = df.ffill().bfill()

    return df
|
|
def create_sequences(df, feature_cols, target_cols, seq_length=24, pred_horizon=6):
    """Build sliding-window (X, y) pairs for sequence-to-sequence training.

    Windows never cross a city boundary: each input window covers
    ``seq_length`` consecutive rows of one city and its target covers the
    ``pred_horizon`` rows immediately after it.

    Returns
    -------
    (np.ndarray, np.ndarray)
        X of shape (n, seq_length, len(feature_cols)) and
        y of shape (n, pred_horizon, len(target_cols)).
    """
    xs, ys = [], []

    # sort=False keeps cities in order of first appearance, matching the
    # row order of the input frame.
    for _, city_frame in df.groupby('city', sort=False):
        feats = city_frame[feature_cols].values
        targs = city_frame[target_cols].values

        n_windows = len(city_frame) - seq_length - pred_horizon + 1
        for start in range(n_windows):
            stop = start + seq_length
            xs.append(feats[start:stop])
            ys.append(targs[stop:stop + pred_horizon])

    return np.array(xs), np.array(ys)
|
|
def main():
    """End-to-end preprocessing: load, engineer features, scale, window, save.

    Writes:
      processed/data.npz      train/val/test sequence arrays
      processed/metadata.pkl  feature/target names, fitted scalers, config
    """
    print("Loading data...")
    df = load_data()

    print("Preprocessing...")
    df = preprocess(df)

    # Current readings plus cyclical time encodings.
    base_features = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co',
                     'temperature', 'humidity', 'wind_speed',
                     'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
                     'month_sin', 'month_cos', 'doy_sin', 'doy_cos']

    # History features and one-hot city indicators created by preprocess().
    lag_features = [c for c in df.columns if '_lag' in c or '_rolling_' in c]
    city_features = [c for c in df.columns if c.startswith('city_')]

    feature_cols = base_features + lag_features + city_features
    target_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'aqi']

    print(f"Features: {len(feature_cols)}, Targets: {len(target_cols)}")

    print("Scaling features...")
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    # BUG FIX: the pollutant columns appear in BOTH feature_cols and
    # target_cols. Previously the feature scaling ran first and scaler_y
    # was then fit on already-standardized values, so
    # scaler_y.inverse_transform() could never map model predictions back
    # to original units. Snapshot the raw targets before touching the
    # frame and fit scaler_y on those. (The overwritten overlap columns
    # are numerically unchanged: both scalers learn the same mean/std
    # from the same raw data.)
    raw_targets = df[target_cols].copy()
    df[feature_cols] = scaler_X.fit_transform(df[feature_cols])
    df[target_cols] = scaler_y.fit_transform(raw_targets)

    # Single source of truth for the windowing config (also saved below).
    seq_length, pred_horizon = 24, 6

    print("Creating sequences...")
    X, y = create_sequences(df, feature_cols, target_cols,
                            seq_length=seq_length, pred_horizon=pred_horizon)

    print(f"X shape: {X.shape}, y shape: {y.shape}")

    # 80/10/10 split on the sequence axis. NOTE(review): sequences are
    # grouped city-by-city, so this split is not strictly chronological
    # across cities — consider a per-city temporal split to rule out
    # train/test leakage.
    n = len(X)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

    os.makedirs("processed", exist_ok=True)
    np.savez("processed/data.npz",
             X_train=X_train, y_train=y_train,
             X_val=X_val, y_val=y_val,
             X_test=X_test, y_test=y_test)

    # Everything a training/inference script needs to reproduce the
    # feature layout and undo the scaling.
    metadata = {
        'feature_cols': feature_cols,
        'target_cols': target_cols,
        'seq_length': seq_length,
        'pred_horizon': pred_horizon,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'cities': df['city'].unique().tolist()
    }

    with open("processed/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("Saved processed data to processed/")

    print("\n=== Dataset Summary ===")
    print(f"Total sequences: {n}")
    print(f"Sequence length: {seq_length} hours")
    print(f"Prediction horizon: {pred_horizon} hours")
    print(f"Features: {len(feature_cols)}")
    print(f"Targets: {len(target_cols)}")
    print(f"Cities: {metadata['cities']}")
|
# Script entry point: run the full preprocessing pipeline when executed
# directly (no side effects on import).
if __name__ == "__main__":
    main()
|
|