File size: 4,043 Bytes
4eefabb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Step 1 / Dataset Download
==========================
Downloads hourly historical weather data from Open-Meteo Historical Weather API
(backed by ECMWF ERA5 reanalysis) for 5 Malaysian mountain locations,
plus elevation data from Open-Topo-Data (SRTM DEM).

Parameters as confirmed with supervisor:
    - Location: Malaysia (mountain regions)
    - Time range: 2020-01-01 to 2023-12-31
    - Variables: temperature_2m, relative_humidity_2m, precipitation,
                 wind_speed_10m, wind_direction_10m, surface_pressure
      (the script additionally requests dew_point_2m, cloud_cover and cape;
      see HOURLY_VARS)

Output: data/raw_<site>.csv  (one file per location)

Run:  python scripts/1_download_dataset.py
"""
from __future__ import annotations

import sys
import time
from pathlib import Path

import httpx
import pandas as pd

# Two directory levels above this file, i.e. the parent of the directory
# that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Output directory for the per-site CSV files. Created at import time;
# only the leaf directory is created (no `parents=True`).
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# Malaysian mountain locations as (name, lat, lon) tuples.
# `name` is used verbatim in the output filename (raw_<name>.csv).
# Chosen to span Peninsular Malaysia + Borneo and cover diverse terrain:
# valleys, highlands, and one extreme peak for OOD reference.
SITES = [
    ("genting_highlands", 3.4225, 101.7935),
    ("cameron_highlands", 4.4694, 101.3776),
    ("frasers_hill",      3.7256, 101.7378),
    ("klang_valley",      3.0738, 101.5183),
    ("mt_kinabalu_base",  6.0535, 116.5586),
]

# Date range sent as `start_date` / `end_date` in the Open-Meteo request
# (YYYY-MM-DD strings).
START_DATE = "2020-01-01"
END_DATE   = "2023-12-31"

# Hourly variables requested from Open-Meteo; joined with commas into the
# `hourly` query parameter. Each one becomes a column in the output CSV.
HOURLY_VARS = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation",
    "wind_speed_10m",
    "wind_direction_10m",
    "surface_pressure",
    "dew_point_2m",
    "cloud_cover",
    "cape",
]

# API endpoints: historical weather archive and SRTM 30m elevation lookup.
OPEN_METEO_URL = "https://archive-api.open-meteo.com/v1/archive"
OPEN_TOPO_URL  = "https://api.opentopodata.org/v1/srtm30m"


def fetch_elevation(lat: float, lon: float) -> float:
    """Return the ground elevation in meters for one coordinate.

    Queries the Open-Topo-Data SRTM 30m endpoint with a single
    ``"<lat>,<lon>"`` location and reads the ``elevation`` field of the
    first entry in the response's ``results`` list.

    Raises:
        httpx.HTTPStatusError: if the service answers with a 4xx/5xx status.
    """
    query = {"locations": f"{lat},{lon}"}
    response = httpx.get(OPEN_TOPO_URL, params=query, timeout=30.0)
    response.raise_for_status()

    # Only one location is requested, so only the first result is read.
    first_result = response.json()["results"][0]
    return float(first_result["elevation"])


def fetch_hourly(lat: float, lon: float) -> pd.DataFrame:
    """Download the hourly weather series for one coordinate.

    Requests every variable in ``HOURLY_VARS`` between ``START_DATE`` and
    ``END_DATE`` from the Open-Meteo archive endpoint, with the timezone
    set to Asia/Kuala_Lumpur.

    Returns:
        A DataFrame built from the response's ``hourly`` object, with the
        ``time`` column parsed by ``pd.to_datetime``.

    Raises:
        httpx.HTTPStatusError: if the service answers with a 4xx/5xx status.
    """
    query = {
        "latitude": lat,
        "longitude": lon,
        "start_date": START_DATE,
        "end_date": END_DATE,
        "hourly": ",".join(HOURLY_VARS),
        "timezone": "Asia/Kuala_Lumpur",
        "windspeed_unit": "kmh",
    }
    # The multi-year hourly payload is large, hence the generous timeout.
    response = httpx.get(OPEN_METEO_URL, params=query, timeout=120.0)
    response.raise_for_status()

    frame = pd.DataFrame(response.json()["hourly"])
    frame["time"] = pd.to_datetime(frame["time"])
    return frame


def download_site(name: str, lat: float, lon: float) -> Path:
    """Download one site's data and save it as ``DATA_DIR / raw_<name>.csv``.

    If the target CSV already exists the download is skipped. Otherwise the
    elevation and the hourly weather series are fetched, the constant
    columns ``site``, ``latitude``, ``longitude`` and ``elevation_m`` are
    inserted in front of the weather columns, and the result is written
    to disk.

    Args:
        name: Site identifier; used in the output filename and the
            ``site`` column.
        lat: Latitude passed to both APIs.
        lon: Longitude passed to both APIs.

    Returns:
        Path of the CSV file (whether freshly written or already present).

    Raises:
        httpx.HTTPError: propagated from the underlying fetch helpers.
    """
    out = DATA_DIR / f"raw_{name}.csv"
    if out.exists():
        print(f"  [skip] {name}: already exists at {out}")
        return out

    print(f"  [elev] fetching elevation for {name} ({lat}, {lon})…")
    elev = fetch_elevation(lat, lon)
    print(f"         elevation = {elev:.1f} m")

    print(f"  [hourly] fetching weather time-series for {name}…")
    df = fetch_hourly(lat, lon)

    df.insert(0, "site",         name)
    df.insert(1, "latitude",     lat)
    df.insert(2, "longitude",    lon)
    df.insert(3, "elevation_m",  elev)

    # Write to a sibling temp file and rename it into place. The skip check
    # above treats any existing raw_<name>.csv as complete, so an interrupted
    # write must never leave a truncated file under the final name.
    tmp = out.with_name(out.name + ".part")
    df.to_csv(tmp, index=False)
    tmp.replace(out)
    print(f"  [save] {len(df):>6} rows → {out}")
    return out


def main() -> int:
    """Download every site in ``SITES`` and report progress on stdout.

    Sites are processed in order, with a one-second pause after each one.
    The run stops at the first ``httpx.HTTPError``, which is printed to
    stderr.

    Returns:
        0 if every site was downloaded (or was already present), 1 if a
        site failed with an ``httpx.HTTPError``.
    """
    print(f"Downloading {len(SITES)} sites from Open-Meteo + Open-Topo-Data…")
    # Separator added: the two dates were previously printed back-to-back.
    print(f"  date range: {START_DATE} → {END_DATE}")
    print(f"  variables:  {', '.join(HOURLY_VARS)}\n")

    for name, lat, lon in SITES:
        print(f"[ {name} ]")
        try:
            download_site(name, lat, lon)
        except httpx.HTTPError as exc:
            print(f"  [error] {exc}", file=sys.stderr)
            return 1
        time.sleep(1.0)  # be polite to the public APIs

    print("\nDone. Next step:")
    print("  python scripts/2_preprocess.py")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())