"""
Preprocess air quality data for ANN training.
- Handle missing values via interpolation
- Create temporal features
- Normalize features
- Create sequences for time-series prediction
"""
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


def load_data(path="data/india_air_quality_synthetic.csv"):
    """Read the synthetic air quality CSV with parsed timestamps."""
    df = pd.read_csv(path, parse_dates=['datetime'])
    return df


def preprocess(df):
    """Full preprocessing pipeline."""
    df = df.copy()
    # Sort by city and datetime so per-city operations see time order
    df = df.sort_values(['city', 'datetime']).reset_index(drop=True)
    # Interpolate missing values per city
    numeric_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co',
                    'temperature', 'humidity', 'wind_speed', 'aqi']
    for col in numeric_cols:
        df[col] = df.groupby('city')[col].transform(
            lambda x: x.interpolate(method='linear', limit_direction='both'))
    # Drop any rows still missing values after interpolation
    df = df.dropna(subset=numeric_cols).reset_index(drop=True)
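    # The cyclical encodings below assume hour / day_of_week / month /
    # day_of_year columns. Hedged guard: if the synthetic CSV does not
    # already ship them, derive them from the parsed datetime column.
    for name, attr in [('hour', 'hour'), ('day_of_week', 'dayofweek'),
                       ('month', 'month'), ('day_of_year', 'dayofyear')]:
        if name not in df.columns:
            df[name] = getattr(df['datetime'].dt, attr)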
    # Feature engineering: encode cyclical time variables as sin/cos pairs
    # so adjacent values wrap correctly (e.g. hour 23 sits next to hour 0)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    # City encoding (one-hot)
    city_dummies = pd.get_dummies(df['city'], prefix='city')
    df = pd.concat([df, city_dummies], axis=1)
    # Lag features (1 to 24 hours back)
    lag_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co',
                'temperature', 'humidity', 'wind_speed', 'aqi']
    for col in lag_cols:
        for lag in [1, 3, 6, 12, 24]:
            # groupby keeps each shift inside its own city's series
            df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
    # Rolling statistics over the past 24 hours, again per city
    for col in lag_cols:
        grouped = df.groupby('city')[col]
        df[f'{col}_rolling_mean_24h'] = grouped.transform(
            lambda x: x.rolling(window=24, min_periods=1).mean())
        df[f'{col}_rolling_std_24h'] = grouped.transform(
            lambda x: x.rolling(window=24, min_periods=1).std())
    # Fill the NaNs left by lags and rolling std within each city; a global
    # ffill would bleed one city's last values into the next city's first rows
    engineered = [c for c in df.columns if '_lag' in c or '_rolling_' in c]
    df[engineered] = df.groupby('city')[engineered].transform(
        lambda x: x.ffill().bfill())
    return df


def create_sequences(df, feature_cols, target_cols, seq_length=24, pred_horizon=6):
    """Slide a window over each city's rows: X gets seq_length hours of
    features, y the following pred_horizon hours of targets."""
    X, y = [], []
    for city in df['city'].unique():
        city_data = df[df['city'] == city].reset_index(drop=True)
        features = city_data[feature_cols].values
        targets = city_data[target_cols].values
        for i in range(len(city_data) - seq_length - pred_horizon + 1):
            X.append(features[i:i+seq_length])
            y.append(targets[i+seq_length:i+seq_length+pred_horizon])
    return np.array(X), np.array(y)
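
# Shape check with hypothetical numbers: 3 cities x 1,000 hourly rows each,
# seq_length=24 and pred_horizon=6 give 3 * (1000 - 24 - 6 + 1) = 2913
# windows, so X is (2913, 24, n_features) and y is (2913, 6, n_targets).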


def main():
    print("Loading data...")
    df = load_data()
    print("Preprocessing...")
    df = preprocess(df)
    # Define feature and target columns
    base_features = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co',
                     'temperature', 'humidity', 'wind_speed',
                     'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
                     'month_sin', 'month_cos', 'doy_sin', 'doy_cos']
    # Add lag and rolling features
    lag_features = [c for c in df.columns if '_lag' in c or '_rolling_' in c]
    city_features = [c for c in df.columns if c.startswith('city_')]
    feature_cols = base_features + lag_features + city_features
    target_cols = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co', 'aqi']
    print(f"Features: {len(feature_cols)}, Targets: {len(target_cols)}")
    # Scale the data. The pollutant columns appear in both feature_cols and
    # target_cols, so transform the targets first; otherwise scaler_y would
    # be fit on already-standardized values and inverse_transform could never
    # recover physical units. (Caveat: fitting scalers on the full series
    # leaks test-period statistics; a stricter pipeline would fit on the
    # training split only.)
    print("Scaling features...")
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    df[target_cols] = scaler_y.fit_transform(df[target_cols])
    df[feature_cols] = scaler_X.fit_transform(df[feature_cols])
    # Create sequences
    print("Creating sequences...")
    seq_length, pred_horizon = 24, 6
    X, y = create_sequences(df, feature_cols, target_cols,
                            seq_length=seq_length, pred_horizon=pred_horizon)
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    # Temporal split (80/10/10). Sequences are concatenated city by city, so
    # this index split is chronological within each city's block rather than
    # a single global time cut.
    n = len(X)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
    # Save
    os.makedirs("processed", exist_ok=True)
    np.savez("processed/data.npz",
             X_train=X_train, y_train=y_train,
             X_val=X_val, y_val=y_val,
             X_test=X_test, y_test=y_test)
    # Save metadata
    metadata = {
        'feature_cols': feature_cols,
        'target_cols': target_cols,
        'seq_length': seq_length,
        'pred_horizon': pred_horizon,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'cities': df['city'].unique().tolist()
    }
    with open("processed/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)
    print("Saved processed data to processed/")
    # Summary stats
    print("\n=== Dataset Summary ===")
    print(f"Total sequences: {n}")
    print(f"Sequence length: {metadata['seq_length']} hours")
    print(f"Prediction horizon: {metadata['pred_horizon']} hours")
    print(f"Features: {len(feature_cols)}")
    print(f"Targets: {len(target_cols)}")
    print(f"Cities: {metadata['cities']}")


if __name__ == "__main__":
    main()