Spaces:
Paused
Paused
File size: 4,043 Bytes
4eefabb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | """
Step 1 / Dataset Download
==========================
Downloads hourly historical weather data from Open-Meteo Historical Weather API
(backed by ECMWF ERA5 reanalysis) for 5 Malaysian mountain locations,
plus elevation data from Open-Topo-Data (SRTM DEM).
Parameters as confirmed with supervisor:
- Location: Malaysia (mountain regions)
- Time range: 2020-01-01 to 2023-12-31
- Variables: temperature_2m, relative_humidity_2m, precipitation,
wind_speed_10m, wind_direction_10m, surface_pressure
Output: data/raw_<site>.csv (one file per location)
Run: python scripts/1_download_dataset.py
"""
from __future__ import annotations
import sys
import time
from pathlib import Path
import httpx
import pandas as pd
ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)
# Malaysian mountain locations (lat, lon, name).
# Chosen to span Peninsular Malaysia + Borneo and cover diverse terrain:
# valleys, highlands, and one extreme peak for OOD reference.
SITES = [
("genting_highlands", 3.4225, 101.7935),
("cameron_highlands", 4.4694, 101.3776),
("frasers_hill", 3.7256, 101.7378),
("klang_valley", 3.0738, 101.5183),
("mt_kinabalu_base", 6.0535, 116.5586),
]
START_DATE = "2020-01-01"
END_DATE = "2023-12-31"
HOURLY_VARS = [
"temperature_2m",
"relative_humidity_2m",
"precipitation",
"wind_speed_10m",
"wind_direction_10m",
"surface_pressure",
"dew_point_2m",
"cloud_cover",
"cape",
]
OPEN_METEO_URL = "https://archive-api.open-meteo.com/v1/archive"
OPEN_TOPO_URL = "https://api.opentopodata.org/v1/srtm30m"
def fetch_elevation(lat: float, lon: float) -> float:
"""Fetch ground elevation in meters from Open-Topo-Data (SRTM 30m)."""
resp = httpx.get(
OPEN_TOPO_URL,
params={"locations": f"{lat},{lon}"},
timeout=30.0,
)
resp.raise_for_status()
data = resp.json()
return float(data["results"][0]["elevation"])
def fetch_hourly(lat: float, lon: float) -> pd.DataFrame:
"""Fetch hourly historical weather data for the configured date range."""
resp = httpx.get(
OPEN_METEO_URL,
params={
"latitude": lat,
"longitude": lon,
"start_date": START_DATE,
"end_date": END_DATE,
"hourly": ",".join(HOURLY_VARS),
"timezone": "Asia/Kuala_Lumpur",
"windspeed_unit": "kmh",
},
timeout=120.0,
)
resp.raise_for_status()
payload = resp.json()
df = pd.DataFrame(payload["hourly"])
df["time"] = pd.to_datetime(df["time"])
return df
def download_site(name: str, lat: float, lon: float) -> Path:
out = DATA_DIR / f"raw_{name}.csv"
if out.exists():
print(f" [skip] {name}: already exists at {out}")
return out
print(f" [elev] fetching elevation for {name} ({lat}, {lon})…")
elev = fetch_elevation(lat, lon)
print(f" elevation = {elev:.1f} m")
print(f" [hourly] fetching weather time-series for {name}…")
df = fetch_hourly(lat, lon)
df.insert(0, "site", name)
df.insert(1, "latitude", lat)
df.insert(2, "longitude", lon)
df.insert(3, "elevation_m", elev)
df.to_csv(out, index=False)
print(f" [save] {len(df):>6} rows → {out}")
return out
def main() -> int:
print(f"Downloading {len(SITES)} sites from Open-Meteo + Open-Topo-Data…")
print(f" date range: {START_DATE} → {END_DATE}")
print(f" variables: {', '.join(HOURLY_VARS)}\n")
for name, lat, lon in SITES:
print(f"[ {name} ]")
try:
download_site(name, lat, lon)
except httpx.HTTPError as exc:
print(f" [error] {exc}", file=sys.stderr)
return 1
time.sleep(1.0) # be polite to the public APIs
print("\nDone. Next step:")
print(" python scripts/2_preprocess.py")
return 0
if __name__ == "__main__":
raise SystemExit(main())
|