# predictive_irrigation_models/tools/historical_weather.py
# Hugging Face repository page header (paolog-fbk): "Upload folder using huggingface_hub", commit 64ab846 (verified).
import openmeteo_requests
import pandas as pd
import requests
import requests_cache
from retry_requests import retry
def get_historical_weather_data(location_ids_df, start_date="2023-01-01", end_date="2024-12-31"):
    """
    Fetch historical hourly weather data for multiple sensor locations.

    Sensors that share the same coordinates are served by a single API
    location; the resulting weather rows are then repeated once per sensor.

    Parameters:
    -----------
    location_ids_df : pd.DataFrame
        DataFrame with columns: ['datastream_name', 'datastream_id', 'x', 'y']
        where x = latitude, y = longitude. Rows with a missing 'x' or 'y'
        are ignored.
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format

    Returns:
    --------
    pd.DataFrame with one row per (sensor, timestamp) and the columns
    'datetime', one column per requested hourly variable, and
    'datastream_name'. If no sensor has usable coordinates, an empty frame
    with those columns is returned and no request is made.

    Raises:
    -------
    ValueError
        If the API returns a different number of responses than the number
        of distinct locations requested, since responses are matched to
        locations by position.
    """
    # Single source of truth: the order here defines both the request and the
    # index used to read each variable back from the response.
    hourly_variables = [
        "temperature_2m",
        "relative_humidity_2m",
        "precipitation",
        "et0_fao_evapotranspiration",
        "wind_speed_10m",
        "soil_temperature_0_to_7cm",
        "soil_moisture_0_to_7cm",
        "direct_radiation",
    ]

    # avoid failing due to sensors with no location
    located = location_ids_df.dropna(subset=["x", "y"])
    if located.empty:
        # Nothing to query: skip client/cache setup and the network call.
        return _empty_weather_frame(hourly_variables)

    # mapping: (lat, lon) -> names of the sensors at that location.
    # Dict insertion order keeps locations in order of first appearance.
    sensors_by_location = {}
    for lat, lon, name in zip(located["x"], located["y"], located["datastream_name"]):
        sensors_by_location.setdefault((lat, lon), []).append(name)
    locations = list(sensors_by_location)

    # setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    # API call: all distinct locations in one request, as comma-separated lists
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": ",".join(f"{lat}" for lat, _ in locations),
        "longitude": ",".join(f"{lon}" for _, lon in locations),
        "start_date": start_date,
        "end_date": end_date,
        "hourly": hourly_variables,
    }
    responses = openmeteo.weather_api(url, params=params)

    if len(responses) != len(locations):
        # Responses are paired with locations by position; a count mismatch
        # would silently attach weather to the wrong sensors.
        raise ValueError(
            f"Expected {len(locations)} weather responses, got {len(responses)}"
        )

    # print info from first response (keep previous prints)
    first = responses[0]
    print(f"Coordinates {first.Latitude()}°N {first.Longitude()}°E")
    print(f"Elevation {first.Elevation()} m asl")
    print(f"Timezone {first.Timezone()}{first.TimezoneAbbreviation()}")
    print(f"Timezone difference to GMT+0 {first.UtcOffsetSeconds()} s")

    # One weather frame per distinct location, keyed by the requested coordinates.
    weather_by_location = {
        location: _hourly_frame(response.Hourly(), hourly_variables)
        for response, location in zip(responses, locations)
    }

    # Explode data by sensors: each sensor at a location gets its own rows
    per_sensor_frames = []
    for location, sensor_names in sensors_by_location.items():
        location_weather = weather_by_location[location]
        for sensor_name in sensor_names:
            # assign() returns a new frame, so sensors never share data
            per_sensor_frames.append(location_weather.assign(datastream_name=sensor_name))

    return pd.concat(per_sensor_frames, ignore_index=True)


def _empty_weather_frame(variables):
    """Return a zero-row frame with the same columns as a populated result."""
    frame = pd.DataFrame(columns=["datetime", *variables, "datastream_name"])
    # Keep the timestamp column timezone-aware, matching the populated result.
    frame["datetime"] = pd.to_datetime(frame["datetime"], utc=True)
    return frame


def _hourly_frame(hourly, variables):
    """Build a DataFrame from one location's hourly payload.

    `variables` must be in the same order as they were requested, because
    the payload exposes values by position only.
    """
    columns = {
        "datetime": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }
    for position, variable in enumerate(variables):
        columns[variable] = hourly.Variables(position).ValuesAsNumpy()
    return pd.DataFrame(columns)
# Legacy location-level variants, kept commented out for reference. Original note: for now used for the forecast, since the forecast is not sensor-level.
# def get_historical_weather_data_old(latitude, longitude, start_date="2023-01-01", end_date="2024-12-31"):
# # Setup the Open-Meteo API client with cache and retry on error
# cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
# retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
# openmeteo = openmeteo_requests.Client(session=retry_session)
#
# # Make sure all required weather variables are listed here
# # The order of variables in hourly or daily is important to assign them correctly below
# url = "https://archive-api.open-meteo.com/v1/archive"
# params = {
# "latitude": latitude,
# "longitude": longitude,
# "start_date": start_date,
# "end_date": end_date,
# "hourly": ["temperature_2m", "relative_humidity_2m", "precipitation", "et0_fao_evapotranspiration",
# "wind_speed_10m", "soil_temperature_0_to_7cm", "soil_moisture_0_to_7cm", "direct_radiation"]
# }
# responses = openmeteo.weather_api(url, params=params)
#
# # keep just to keep previous prints
# response = responses[0]
# print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
# print(f"Elevation {response.Elevation()} m asl")
# print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
# print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
#
# all_dfs = []
#
# for i, (response, lat, lon) in enumerate(zip(responses, lats, lons)):
# hourly = response.Hourly()
# df = pd.DataFrame({
# "datetime": pd.date_range(
# start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
# end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
# freq=pd.Timedelta(seconds=hourly.Interval()),
# inclusive="left"
# ),
# "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
# "precipitation": hourly.Variables(2).ValuesAsNumpy(),
# "wind_speed_10m": hourly.Variables(4).ValuesAsNumpy(),
# "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
# "et0_fao_evapotranspiration": hourly.Variables(3).ValuesAsNumpy(),
# "soil_temperature_0_to_7cm": hourly.Variables(5).ValuesAsNumpy(),
# "soil_moisture_0_to_7cm": hourly.Variables(6).ValuesAsNumpy(),
# "direct_radiation": hourly.Variables(7).ValuesAsNumpy(),
# "latitude": lat,
# "longitude": lon,
# "location_id": i
# })
# all_dfs.append(df)
#
# final_df = pd.concat(all_dfs, ignore_index=True)
#
# return final_df
# def get_historical_weather_daily(latitude, longitude, start_date, end_date):
# # Setup the Open-Meteo API client with cache and retry on error
# #cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
# #retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
#
# session = requests.Session()
# retry_session = retry(session, retries=5, backoff_factor=0.2)
# openmeteo = openmeteo_requests.Client(session=retry_session)
#
# url = "https://archive-api.open-meteo.com/v1/archive"
# params = {
# "latitude": latitude,
# "longitude": longitude,
# "start_date": start_date,
# "end_date": end_date,
# "daily": ["temperature_2m_min", "temperature_2m_max", "precipitation_sum", "et0_fao_evapotranspiration"]
# }
#
# responses = openmeteo.weather_api(url, params=params)
# response = responses[0]
#
# print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
# print(f"Elevation {response.Elevation()} m asl")
# print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
# print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
#
# # Extract daily variables
# daily = response.Daily()
# daily_temperature_2m_min = daily.Variables(0).ValuesAsNumpy()
# daily_temperature_2m_max = daily.Variables(1).ValuesAsNumpy()
# daily_precipitation_sum = daily.Variables(2).ValuesAsNumpy()
# daily_et0_fao_evapotranspiration = daily.Variables(3).ValuesAsNumpy()
#
# # Build dataframe
# daily_data = {
# "Date": pd.date_range(
# start=pd.to_datetime(daily.Time(), unit="s", utc=True),
# end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
# freq=pd.Timedelta(seconds=daily.Interval()),
# inclusive="left"
# ),
# "MinTemp": daily_temperature_2m_min,
# "MaxTemp": daily_temperature_2m_max,
# "Precipitation": daily_precipitation_sum,
# "ReferenceET": daily_et0_fao_evapotranspiration
# }
#
# final_df = pd.DataFrame(data=daily_data)
# final_df["Date"] = final_df["Date"].dt.strftime("%Y-%m-%d")
#
# cols = [c for c in final_df.columns if c != "Date"] + ["Date"]
# final_df = final_df[cols]
#
# return final_df