"""
Step 1B / Synthetic Dataset Generator (offline fallback)
==========================================================

When the real Open-Meteo / Open-Topo-Data APIs are unreachable (e.g. behind
a restrictive corporate proxy or in an offline classroom), this script
generates a physically-plausible synthetic dataset with the *exact same
schema* as scripts/1_download_dataset.py.

This lets the end-to-end pipeline (preprocess + train + serve) be
validated without network access. To switch back to real data later,
delete data/raw_*.csv and run scripts/1_download_dataset.py.

The synthetic generator encodes:
  * Standard atmosphere lapse rate (≈ -6.5 °C / km)
  * Hydrostatic pressure decay with altitude (~ -12 hPa / 100 m)
  * Tropical diurnal temperature cycle (cooler at night, warmer mid-afternoon)
  * Malaysia's bimodal monsoon precipitation seasonality (Apr-May, Oct-Nov peaks)
  * Humidity inversely correlated with temperature, plus monsoon boost
  * Heavy-tailed precipitation distribution (most hours dry, rare extremes)
  * CAPE rising with humid afternoon convection
  * Dew-point depression that shrinks toward saturation as humidity rises

This is *NOT* a substitute for real ERA5 reanalysis data in the final
thesis — its purpose is purely to exercise the ML pipeline end-to-end.

Run: python scripts/1b_synth_dataset.py
"""
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd


# Repository root: this script is run as scripts/1b_synth_dataset.py, so the
# root is two levels above the resolved script path.
ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the generated raw_<site>.csv files.
DATA_DIR = ROOT / "data"
# Created at import time so main() can write its CSVs without further checks.
DATA_DIR.mkdir(exist_ok=True)


# Sites to synthesise, one tuple per site in the order main() unpacks them:
# (name, latitude, longitude, elevation in metres).
# The name is used for the "site" column and the raw_<name>.csv filename.
SITES = [
    ("genting_highlands", 3.4225, 101.7935, 1742.0),
    ("cameron_highlands", 4.4694, 101.3776, 1500.0),
    ("frasers_hill", 3.7256, 101.7378, 1300.0),
    ("klang_valley", 3.0738, 101.5183, 120.0),
    ("mt_kinabalu_base", 6.0535, 116.5586, 1800.0),
]


# First and last hourly timestamps (both inclusive) of every generated series.
START = pd.Timestamp("2020-01-01 00:00:00")
END = pd.Timestamp("2023-12-31 23:00:00")


| def generate_site(name: str, lat: float, lon: float, elev: float, |
| rng: np.random.Generator) -> pd.DataFrame: |
| """Generate hourly synthetic weather time-series for a single site.""" |
| timestamps = pd.date_range(START, END, freq="h") |
| n = len(timestamps) |
|
|
| hour = timestamps.hour.to_numpy() |
| doy = timestamps.dayofyear.to_numpy() |
|
|
| |
| |
| sea_level_temp = 27.0 |
| lapse = -6.5 * (elev / 1000.0) |
| diurnal = -4.0 * np.cos(2 * np.pi * (hour - 3) / 24.0) |
| seasonal = 1.5 * np.cos(2 * np.pi * (doy - 60) / 365.25) |
| noise_T = rng.normal(0.0, 1.2, n) |
| temperature = sea_level_temp + lapse + diurnal + seasonal + noise_T |
|
|
| |
| sea_level_p = 1010.0 |
| p_alt = sea_level_p - 12.0 * (elev / 100.0) |
| pressure = p_alt + rng.normal(0.0, 0.8, n) |
| pressure = pd.Series(pressure).rolling(3, min_periods=1).mean().to_numpy() |
|
|
| |
| |
| monsoon_weight = ( |
| 0.5 + 0.5 * np.cos(2 * np.pi * (doy - 305) / 365.25) |
| + 0.4 * np.exp(-0.5 * ((doy - 135) / 25.0) ** 2) |
| + 0.4 * np.exp(-0.5 * ((doy - 305) / 30.0) ** 2) |
| ) |
|
|
| |
| humidity_base = 78.0 + 4.0 * monsoon_weight |
| humidity = humidity_base - 0.9 * diurnal + rng.normal(0.0, 5.0, n) |
| humidity = np.clip(humidity, 30.0, 100.0) |
|
|
| |
| afternoon = np.exp(-0.5 * ((hour - 14.5) / 2.5) ** 2) |
| cape = ( |
| afternoon * (humidity - 60.0) * 25.0 * monsoon_weight |
| + rng.normal(0.0, 80.0, n) |
| ) |
| cape = np.clip(cape, 0.0, 4500.0) |
|
|
| |
| cloud = np.clip( |
| 0.55 * humidity + 25.0 * monsoon_weight + rng.normal(0.0, 8.0, n), |
| 0.0, 100.0, |
| ) |
|
|
| |
| dew_dep = np.clip(36.0 - 0.32 * humidity + rng.normal(0.0, 1.4, n), 0.1, 30.0) |
| dew_point = temperature - dew_dep |
|
|
| |
| wind_base = 5.0 + 0.0025 * elev |
| wind_speed = np.clip( |
| wind_base + 2.5 * afternoon + np.abs(rng.normal(0.0, 2.5, n)), |
| 0.0, 60.0, |
| ) |
| |
| dir_steps = rng.normal(0.0, 25.0, n).cumsum() |
| wind_dir = (dir_steps % 360.0 + 180.0 * monsoon_weight) % 360.0 |
|
|
| |
| rain_prob = ( |
| 0.04 |
| + 0.55 * monsoon_weight * (humidity > 80).astype(float) |
| + 0.0001 * cape |
| + 0.25 * afternoon * (humidity > 85).astype(float) |
| ) |
| rain_prob = np.clip(rain_prob, 0.0, 0.85) |
| rain_event = rng.random(n) < rain_prob |
| |
| rain_amount = np.where( |
| rain_event, |
| rng.exponential(scale=2.8, size=n), |
| 0.0, |
| ) |
|
|
| df = pd.DataFrame({ |
| "site": name, |
| "latitude": lat, |
| "longitude": lon, |
| "elevation_m": elev, |
| "time": timestamps, |
| "temperature_2m": np.round(temperature, 2), |
| "relative_humidity_2m": np.round(humidity, 1), |
| "precipitation": np.round(rain_amount, 2), |
| "wind_speed_10m": np.round(wind_speed, 2), |
| "wind_direction_10m": np.round(wind_dir, 1), |
| "surface_pressure": np.round(pressure, 1), |
| "dew_point_2m": np.round(dew_point, 2), |
| "cloud_cover": np.round(cloud, 1), |
| "cape": np.round(cape, 0), |
| }) |
| return df |
|
|
|
|
def main() -> int:
    """Generate one synthetic CSV per entry in ``SITES`` and report progress.

    Each site's DataFrame is written to ``DATA_DIR / raw_<name>.csv`` without
    the index, and a one-line summary (row count and share of hours with more
    than 0.1 precipitation) is printed.

    Returns:
        Always 0, which the entry-point guard uses as the process exit status.
    """
    # One seeded generator is shared by all sites: the whole run is
    # reproducible, but each site's series depends on the sites before it.
    rng = np.random.default_rng(seed=42)
    print(f"Generating SYNTHETIC dataset for {len(SITES)} sites…")
    print(f" date range: {START.date()} → {END.date()}\n")
    for name, lat, lon, elev in SITES:
        out = DATA_DIR / f"raw_{name}.csv"
        df = generate_site(name, lat, lon, elev, rng)
        df.to_csv(out, index=False)
        rain_pct = (df["precipitation"] > 0.1).mean() * 100.0
        print(f" [{name:<18}] {len(df):>6} rows rain-hours={rain_pct:4.1f}% → {out.name}")
    print("\nDone (synthetic). Next: python scripts/2_preprocess.py")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())