Spaces:
Paused
Paused
| """ | |
| Step 1 / Dataset Download | |
| ========================== | |
| Downloads hourly historical weather data from Open-Meteo Historical Weather API | |
| (backed by ECMWF ERA5 reanalysis) for 5 Malaysian mountain locations, | |
| plus elevation data from Open-Topo-Data (SRTM DEM). | |
| Parameters as confirmed with supervisor: | |
| - Location: Malaysia (mountain regions) | |
| - Time range: 2020-01-01 to 2023-12-31 | |
| - Variables: temperature_2m, relative_humidity_2m, precipitation, | |
| wind_speed_10m, wind_direction_10m, surface_pressure | |
| Output: data/raw_<site>.csv (one file per location) | |
| Run: python scripts/1_download_dataset.py | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import httpx | |
| import pandas as pd | |
| ROOT = Path(__file__).resolve().parent.parent | |
| DATA_DIR = ROOT / "data" | |
| DATA_DIR.mkdir(exist_ok=True) | |
| # Malaysian mountain locations (lat, lon, name). | |
| # Chosen to span Peninsular Malaysia + Borneo and cover diverse terrain: | |
| # valleys, highlands, and one extreme peak for OOD reference. | |
| SITES = [ | |
| ("genting_highlands", 3.4225, 101.7935), | |
| ("cameron_highlands", 4.4694, 101.3776), | |
| ("frasers_hill", 3.7256, 101.7378), | |
| ("klang_valley", 3.0738, 101.5183), | |
| ("mt_kinabalu_base", 6.0535, 116.5586), | |
| ] | |
| START_DATE = "2020-01-01" | |
| END_DATE = "2023-12-31" | |
| HOURLY_VARS = [ | |
| "temperature_2m", | |
| "relative_humidity_2m", | |
| "precipitation", | |
| "wind_speed_10m", | |
| "wind_direction_10m", | |
| "surface_pressure", | |
| "dew_point_2m", | |
| "cloud_cover", | |
| "cape", | |
| ] | |
| OPEN_METEO_URL = "https://archive-api.open-meteo.com/v1/archive" | |
| OPEN_TOPO_URL = "https://api.opentopodata.org/v1/srtm30m" | |
| def fetch_elevation(lat: float, lon: float) -> float: | |
| """Fetch ground elevation in meters from Open-Topo-Data (SRTM 30m).""" | |
| resp = httpx.get( | |
| OPEN_TOPO_URL, | |
| params={"locations": f"{lat},{lon}"}, | |
| timeout=30.0, | |
| ) | |
| resp.raise_for_status() | |
| data = resp.json() | |
| return float(data["results"][0]["elevation"]) | |
| def fetch_hourly(lat: float, lon: float) -> pd.DataFrame: | |
| """Fetch hourly historical weather data for the configured date range.""" | |
| resp = httpx.get( | |
| OPEN_METEO_URL, | |
| params={ | |
| "latitude": lat, | |
| "longitude": lon, | |
| "start_date": START_DATE, | |
| "end_date": END_DATE, | |
| "hourly": ",".join(HOURLY_VARS), | |
| "timezone": "Asia/Kuala_Lumpur", | |
| "windspeed_unit": "kmh", | |
| }, | |
| timeout=120.0, | |
| ) | |
| resp.raise_for_status() | |
| payload = resp.json() | |
| df = pd.DataFrame(payload["hourly"]) | |
| df["time"] = pd.to_datetime(df["time"]) | |
| return df | |
| def download_site(name: str, lat: float, lon: float) -> Path: | |
| out = DATA_DIR / f"raw_{name}.csv" | |
| if out.exists(): | |
| print(f" [skip] {name}: already exists at {out}") | |
| return out | |
| print(f" [elev] fetching elevation for {name} ({lat}, {lon})…") | |
| elev = fetch_elevation(lat, lon) | |
| print(f" elevation = {elev:.1f} m") | |
| print(f" [hourly] fetching weather time-series for {name}…") | |
| df = fetch_hourly(lat, lon) | |
| df.insert(0, "site", name) | |
| df.insert(1, "latitude", lat) | |
| df.insert(2, "longitude", lon) | |
| df.insert(3, "elevation_m", elev) | |
| df.to_csv(out, index=False) | |
| print(f" [save] {len(df):>6} rows → {out}") | |
| return out | |
| def main() -> int: | |
| print(f"Downloading {len(SITES)} sites from Open-Meteo + Open-Topo-Data…") | |
| print(f" date range: {START_DATE} → {END_DATE}") | |
| print(f" variables: {', '.join(HOURLY_VARS)}\n") | |
| for name, lat, lon in SITES: | |
| print(f"[ {name} ]") | |
| try: | |
| download_site(name, lat, lon) | |
| except httpx.HTTPError as exc: | |
| print(f" [error] {exc}", file=sys.stderr) | |
| return 1 | |
| time.sleep(1.0) # be polite to the public APIs | |
| print("\nDone. Next step:") | |
| print(" python scripts/2_preprocess.py") | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |