Spaces:
Paused
Paused
| """ | |
| Step 1B / Synthetic Dataset Generator (offline fallback) | |
| ========================================================== | |
| When the real Open-Meteo / Open-Topo-Data APIs are unreachable (e.g. behind | |
| a restrictive corporate proxy or in an offline classroom), this script | |
| generates a physically-plausible synthetic dataset with the *exact same | |
| schema* as scripts/1_download_dataset.py. | |
| This lets the end-to-end pipeline (preprocess + train + serve) be | |
| validated without network access. To switch back to real data later, | |
| delete data/raw_*.csv and run scripts/1_download_dataset.py. | |
| The synthetic generator encodes: | |
| * Standard atmosphere lapse rate (≈ -6.5 °C / km) | |
| * Hydrostatic pressure decay with altitude (~ -12 hPa / 100 m) | |
| * Tropical diurnal temperature cycle (cooler at night, warmer mid-afternoon) | |
| * Malaysia's bimodal monsoon precipitation seasonality (Apr-May, Oct-Nov peaks) | |
| * Humidity inversely correlated with temperature, plus monsoon boost | |
  * Zero-inflated, right-skewed precipitation distribution (most hours dry, rare extremes)
| * CAPE rising with humid afternoon convection | |
| * Dew-point depression that shrinks toward saturation as humidity rises | |
| This is *NOT* a substitute for real ERA5 reanalysis data in the final | |
| thesis — its purpose is purely to exercise the ML pipeline end-to-end. | |
| Run: python scripts/1b_synth_dataset.py | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Output directory for the generated raw_<site>.csv files.
DATA_DIR = ROOT / "data"
# NOTE: runs at import time, so merely importing this module creates data/.
DATA_DIR.mkdir(exist_ok=True)
# Site (name, lat, lon, approx elevation_m) — same as scripts/1_download_dataset.py
SITES = [
    ("genting_highlands", 3.4225, 101.7935, 1742.0),
    ("cameron_highlands", 4.4694, 101.3776, 1500.0),
    ("frasers_hill", 3.7256, 101.7378, 1300.0),
    ("klang_valley", 3.0738, 101.5183, 120.0),
    ("mt_kinabalu_base", 6.0535, 116.5586, 1800.0),
]
# First and last hourly timestamps of the generated series (both inclusive,
# timezone-naive).
START = pd.Timestamp("2020-01-01 00:00:00")
END = pd.Timestamp("2023-12-31 23:00:00")
| def generate_site(name: str, lat: float, lon: float, elev: float, | |
| rng: np.random.Generator) -> pd.DataFrame: | |
| """Generate hourly synthetic weather time-series for a single site.""" | |
| timestamps = pd.date_range(START, END, freq="h") | |
| n = len(timestamps) | |
| hour = timestamps.hour.to_numpy() | |
| doy = timestamps.dayofyear.to_numpy() | |
| # Temperature: tropical baseline 27 °C at sea level, lapse rate to altitude, | |
| # plus diurnal swing (±4 °C) and seasonal (±1.5 °C). | |
| sea_level_temp = 27.0 | |
| lapse = -6.5 * (elev / 1000.0) | |
| diurnal = -4.0 * np.cos(2 * np.pi * (hour - 3) / 24.0) | |
| seasonal = 1.5 * np.cos(2 * np.pi * (doy - 60) / 365.25) | |
| noise_T = rng.normal(0.0, 1.2, n) | |
| temperature = sea_level_temp + lapse + diurnal + seasonal + noise_T | |
| # Pressure: hydrostatic decay, plus 3-hourly random walk for synoptic systems. | |
| sea_level_p = 1010.0 | |
| p_alt = sea_level_p - 12.0 * (elev / 100.0) | |
| pressure = p_alt + rng.normal(0.0, 0.8, n) | |
| pressure = pd.Series(pressure).rolling(3, min_periods=1).mean().to_numpy() | |
| # Monsoon-driven rainy season: Apr-May and Oct-Nov are peak rainfall in | |
| # Peninsular Malaysia; weight precipitation probability accordingly. | |
| monsoon_weight = ( | |
| 0.5 + 0.5 * np.cos(2 * np.pi * (doy - 305) / 365.25) # NE monsoon | |
| + 0.4 * np.exp(-0.5 * ((doy - 135) / 25.0) ** 2) # SW pre-monsoon | |
| + 0.4 * np.exp(-0.5 * ((doy - 305) / 30.0) ** 2) | |
| ) | |
| # Humidity: anti-correlated with diurnal temperature; lifted by monsoon. | |
| humidity_base = 78.0 + 4.0 * monsoon_weight | |
| humidity = humidity_base - 0.9 * diurnal + rng.normal(0.0, 5.0, n) | |
| humidity = np.clip(humidity, 30.0, 100.0) | |
| # CAPE: builds with afternoon humid heat — peaks 13-16h on humid days. | |
| afternoon = np.exp(-0.5 * ((hour - 14.5) / 2.5) ** 2) | |
| cape = ( | |
| afternoon * (humidity - 60.0) * 25.0 * monsoon_weight | |
| + rng.normal(0.0, 80.0, n) | |
| ) | |
| cape = np.clip(cape, 0.0, 4500.0) | |
| # Cloud cover: tied to humidity & monsoon. | |
| cloud = np.clip( | |
| 0.55 * humidity + 25.0 * monsoon_weight + rng.normal(0.0, 8.0, n), | |
| 0.0, 100.0, | |
| ) | |
| # Dew point depression shrinks at high humidity (saturation). | |
| dew_dep = np.clip(36.0 - 0.32 * humidity + rng.normal(0.0, 1.4, n), 0.1, 30.0) | |
| dew_point = temperature - dew_dep | |
| # Wind: weak in tropics; daytime sea breeze in lowlands, slightly more wind aloft. | |
| wind_base = 5.0 + 0.0025 * elev | |
| wind_speed = np.clip( | |
| wind_base + 2.5 * afternoon + np.abs(rng.normal(0.0, 2.5, n)), | |
| 0.0, 60.0, | |
| ) | |
| # Direction: slow random walk so consecutive hours have correlated direction. | |
| dir_steps = rng.normal(0.0, 25.0, n).cumsum() | |
| wind_dir = (dir_steps % 360.0 + 180.0 * monsoon_weight) % 360.0 | |
| # Precipitation: zero-inflated; probability rises with humidity × monsoon × CAPE. | |
| rain_prob = ( | |
| 0.04 | |
| + 0.55 * monsoon_weight * (humidity > 80).astype(float) | |
| + 0.0001 * cape | |
| + 0.25 * afternoon * (humidity > 85).astype(float) | |
| ) | |
| rain_prob = np.clip(rain_prob, 0.0, 0.85) | |
| rain_event = rng.random(n) < rain_prob | |
| # When it rains, amount follows an exponential distribution (heavy-tailed). | |
| rain_amount = np.where( | |
| rain_event, | |
| rng.exponential(scale=2.8, size=n), # mm/h | |
| 0.0, | |
| ) | |
| df = pd.DataFrame({ | |
| "site": name, | |
| "latitude": lat, | |
| "longitude": lon, | |
| "elevation_m": elev, | |
| "time": timestamps, | |
| "temperature_2m": np.round(temperature, 2), | |
| "relative_humidity_2m": np.round(humidity, 1), | |
| "precipitation": np.round(rain_amount, 2), | |
| "wind_speed_10m": np.round(wind_speed, 2), | |
| "wind_direction_10m": np.round(wind_dir, 1), | |
| "surface_pressure": np.round(pressure, 1), | |
| "dew_point_2m": np.round(dew_point, 2), | |
| "cloud_cover": np.round(cloud, 1), | |
| "cape": np.round(cape, 0), | |
| }) | |
| return df | |
def main() -> int:
    """Write one synthetic raw_<site>.csv per entry in SITES and return 0.

    A single generator seeded with 42 is shared by all sites, so the files
    are reproducible as long as the order of SITES does not change.
    """
    rng = np.random.default_rng(seed=42)

    print(f"Generating SYNTHETIC dataset for {len(SITES)} sites…")
    print(f" date range: {START.date()} → {END.date()}\n")

    for site in SITES:
        # Each entry is (name, lat, lon, elevation_m), in generate_site order.
        name = site[0]
        out = DATA_DIR / f"raw_{name}.csv"

        df = generate_site(*site, rng)
        df.to_csv(out, index=False)

        # Share of hours with more than 0.1 mm of rain, as a percentage.
        is_rainy = df["precipitation"] > 0.1
        rain_pct = is_rainy.mean() * 100.0
        print(f" [{name:<18}] {len(df):>6} rows rain-hours={rain_pct:4.1f}% → {out.name}")

    print("\nDone (synthetic). Next: python scripts/2_preprocess.py")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())