""" Step 1B / Synthetic Dataset Generator (offline fallback) ========================================================== When the real Open-Meteo / Open-Topo-Data APIs are unreachable (e.g. behind a restrictive corporate proxy or in an offline classroom), this script generates a physically-plausible synthetic dataset with the *exact same schema* as scripts/1_download_dataset.py. This lets the end-to-end pipeline (preprocess + train + serve) be validated without network access. To switch back to real data later, delete data/raw_*.csv and run scripts/1_download_dataset.py. The synthetic generator encodes: * Standard atmosphere lapse rate (≈ -6.5 °C / km) * Hydrostatic pressure decay with altitude (~ -12 hPa / 100 m) * Tropical diurnal temperature cycle (cooler at night, warmer mid-afternoon) * Malaysia's bimodal monsoon precipitation seasonality (Apr-May, Oct-Nov peaks) * Humidity inversely correlated with temperature, plus monsoon boost * Heavy-tailed precipitation distribution (most hours dry, rare extremes) * CAPE rising with humid afternoon convection * Dew-point depression that shrinks toward saturation as humidity rises This is *NOT* a substitute for real ERA5 reanalysis data in the final thesis — its purpose is purely to exercise the ML pipeline end-to-end. Run: python scripts/1b_synth_dataset.py """ from __future__ import annotations from pathlib import Path import numpy as np import pandas as pd ROOT = Path(__file__).resolve().parent.parent DATA_DIR = ROOT / "data" DATA_DIR.mkdir(exist_ok=True) # Site (name, lat, lon, approx elevation_m) — same as scripts/1_download_dataset.py SITES = [ ("genting_highlands", 3.4225, 101.7935, 1742.0), ("cameron_highlands", 4.4694, 101.3776, 1500.0), ("frasers_hill", 3.7256, 101.7378, 1300.0), ("klang_valley", 3.0738, 101.5183, 120.0), ("mt_kinabalu_base", 6.0535, 116.5586, 1800.0), ] START = pd.Timestamp("2020-01-01 00:00:00") END = pd.Timestamp("2023-12-31 23:00:00") def generate_site(name: str, lat: float, lon: float, elev: float, rng: np.random.Generator) -> pd.DataFrame: """Generate hourly synthetic weather time-series for a single site.""" timestamps = pd.date_range(START, END, freq="h") n = len(timestamps) hour = timestamps.hour.to_numpy() doy = timestamps.dayofyear.to_numpy() # Temperature: tropical baseline 27 °C at sea level, lapse rate to altitude, # plus diurnal swing (±4 °C) and seasonal (±1.5 °C). sea_level_temp = 27.0 lapse = -6.5 * (elev / 1000.0) diurnal = -4.0 * np.cos(2 * np.pi * (hour - 3) / 24.0) seasonal = 1.5 * np.cos(2 * np.pi * (doy - 60) / 365.25) noise_T = rng.normal(0.0, 1.2, n) temperature = sea_level_temp + lapse + diurnal + seasonal + noise_T # Pressure: hydrostatic decay, plus 3-hourly random walk for synoptic systems. sea_level_p = 1010.0 p_alt = sea_level_p - 12.0 * (elev / 100.0) pressure = p_alt + rng.normal(0.0, 0.8, n) pressure = pd.Series(pressure).rolling(3, min_periods=1).mean().to_numpy() # Monsoon-driven rainy season: Apr-May and Oct-Nov are peak rainfall in # Peninsular Malaysia; weight precipitation probability accordingly. monsoon_weight = ( 0.5 + 0.5 * np.cos(2 * np.pi * (doy - 305) / 365.25) # NE monsoon + 0.4 * np.exp(-0.5 * ((doy - 135) / 25.0) ** 2) # SW pre-monsoon + 0.4 * np.exp(-0.5 * ((doy - 305) / 30.0) ** 2) ) # Humidity: anti-correlated with diurnal temperature; lifted by monsoon. humidity_base = 78.0 + 4.0 * monsoon_weight humidity = humidity_base - 0.9 * diurnal + rng.normal(0.0, 5.0, n) humidity = np.clip(humidity, 30.0, 100.0) # CAPE: builds with afternoon humid heat — peaks 13-16h on humid days. afternoon = np.exp(-0.5 * ((hour - 14.5) / 2.5) ** 2) cape = ( afternoon * (humidity - 60.0) * 25.0 * monsoon_weight + rng.normal(0.0, 80.0, n) ) cape = np.clip(cape, 0.0, 4500.0) # Cloud cover: tied to humidity & monsoon. cloud = np.clip( 0.55 * humidity + 25.0 * monsoon_weight + rng.normal(0.0, 8.0, n), 0.0, 100.0, ) # Dew point depression shrinks at high humidity (saturation). dew_dep = np.clip(36.0 - 0.32 * humidity + rng.normal(0.0, 1.4, n), 0.1, 30.0) dew_point = temperature - dew_dep # Wind: weak in tropics; daytime sea breeze in lowlands, slightly more wind aloft. wind_base = 5.0 + 0.0025 * elev wind_speed = np.clip( wind_base + 2.5 * afternoon + np.abs(rng.normal(0.0, 2.5, n)), 0.0, 60.0, ) # Direction: slow random walk so consecutive hours have correlated direction. dir_steps = rng.normal(0.0, 25.0, n).cumsum() wind_dir = (dir_steps % 360.0 + 180.0 * monsoon_weight) % 360.0 # Precipitation: zero-inflated; probability rises with humidity × monsoon × CAPE. rain_prob = ( 0.04 + 0.55 * monsoon_weight * (humidity > 80).astype(float) + 0.0001 * cape + 0.25 * afternoon * (humidity > 85).astype(float) ) rain_prob = np.clip(rain_prob, 0.0, 0.85) rain_event = rng.random(n) < rain_prob # When it rains, amount follows an exponential distribution (heavy-tailed). rain_amount = np.where( rain_event, rng.exponential(scale=2.8, size=n), # mm/h 0.0, ) df = pd.DataFrame({ "site": name, "latitude": lat, "longitude": lon, "elevation_m": elev, "time": timestamps, "temperature_2m": np.round(temperature, 2), "relative_humidity_2m": np.round(humidity, 1), "precipitation": np.round(rain_amount, 2), "wind_speed_10m": np.round(wind_speed, 2), "wind_direction_10m": np.round(wind_dir, 1), "surface_pressure": np.round(pressure, 1), "dew_point_2m": np.round(dew_point, 2), "cloud_cover": np.round(cloud, 1), "cape": np.round(cape, 0), }) return df def main() -> int: rng = np.random.default_rng(seed=42) print(f"Generating SYNTHETIC dataset for {len(SITES)} sites…") print(f" date range: {START.date()} → {END.date()}\n") for name, lat, lon, elev in SITES: out = DATA_DIR / f"raw_{name}.csv" df = generate_site(name, lat, lon, elev, rng) df.to_csv(out, index=False) rain_pct = (df["precipitation"] > 0.1).mean() * 100.0 print(f" [{name:<18}] {len(df):>6} rows rain-hours={rain_pct:4.1f}% → {out.name}") print("\nDone (synthetic). Next: python scripts/2_preprocess.py") return 0 if __name__ == "__main__": raise SystemExit(main())