franch committed on Feb 26

Commit

df27dfb

verified ·

1 Parent(s): 39d3c0f

Add source code and examples

Browse files

Files changed (40) hide show

.dockerignore +15 -0
.gitattributes +1 -0
.github/workflows/ci.yml +29 -0
.gitignore +17 -0
.pre-commit-config.yaml +17 -0
Dockerfile +26 -0
LICENSE +24 -0
MODEL_CARD.md +82 -0
Makefile +22 -0
README.md +220 -52
convgru_ensemble/__init__.py +8 -0
convgru_ensemble/__main__.py +5 -0
convgru_ensemble/cli.py +126 -0
convgru_ensemble/datamodule.py +378 -0
convgru_ensemble/hub.py +102 -0
convgru_ensemble/lightning_model.py +560 -0
convgru_ensemble/losses.py +458 -0
convgru_ensemble/model.py +569 -0
convgru_ensemble/py.typed +0 -0
convgru_ensemble/serve.py +139 -0
convgru_ensemble/train.py +316 -0
convgru_ensemble/utils.py +122 -0
docker-compose.yml +12 -0
examples/sample_data.nc +3 -0
importance_sampler/filter_nan.py +362 -0
importance_sampler/output/sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000.csv +0 -0
importance_sampler/output/sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000_metadata.json +21 -0
importance_sampler/sample_valid_datacubes.py +257 -0
notebooks/test_pretrained_model.ipynb +172 -0
pyproject.toml +76 -0
scripts/upload_to_hub.py +35 -0
tests/conftest.py +23 -0
tests/test_hub.py +16 -0
tests/test_inference.py +28 -0
tests/test_lightning_model.py +41 -0
tests/test_losses.py +33 -0
tests/test_model.py +34 -0
tests/test_serve.py +66 -0
tests/test_utils.py +17 -0
uv.lock +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,15 @@

+.git
+.github
+.venv
+__pycache__
+*.pyc
+*.zarr
+logs/
+data/
+notebooks/
+importance_sampler/
+tests/
+*.egg-info
+.ruff_cache
+.pytest_cache
+.claude

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/sample_data.nc filter=lfs diff=lfs merge=lfs -text

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,29 @@

+name: CI
+on:
+  push:
+  pull_request:
+jobs:
+  lint-test-build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          python-version: "3.13"
+      - name: Sync dependencies
+        run: uv sync --all-groups --extra serve
+      - name: Lint
+        run: uv run ruff check .
+      - name: Tests
+        run: uv run pytest -q
+      - name: Build package
+        run: uv build

.gitignore ADDED Viewed

	@@ -0,0 +1,17 @@

+*.nc
+!examples/*.nc
+*.zarr/
+__pycache__/
+.ipynb_checkpoints/
+checkpoints/
+*.ckpt
+*.mp4
+dist/
+build/
+*.egg-info/
+.pytest_cache/
+.ruff_cache/
+logs/
+data/
+.env
+.venv/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+        args: ['--maxkb=12000']
+      - id: check-merge-conflict
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.10
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.13-slim
+WORKDIR /app
+# Install uv for fast dependency management
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+# Copy dependency files first for layer caching
+COPY pyproject.toml uv.lock ./
+# Install dependencies (serve extras, no dev)
+RUN uv sync --extra serve --no-dev --no-install-project
+# Copy package source
+COPY convgru_ensemble/ ./convgru_ensemble/
+# Install the project itself
+RUN uv sync --extra serve --no-dev
+# Model checkpoint is mounted at runtime or downloaded from HF Hub
+ENV MODEL_CHECKPOINT=/app/model.ckpt
+ENV DEVICE=cpu
+EXPOSE 8000
+CMD ["uv", "run", "uvicorn", "convgru_ensemble.serve:app", "--host", "0.0.0.0", "--port", "8000"]

LICENSE ADDED Viewed

	@@ -0,0 +1,24 @@

+BSD 2-Clause License
+Copyright (c) 2026, Data Science for Industry and Physics @ FBK
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

MODEL_CARD.md ADDED Viewed

	@@ -0,0 +1,82 @@

+---
+license: bsd-2-clause
+language: en
+tags:
+  - weather
+  - nowcasting
+  - radar
+  - precipitation
+  - ensemble-forecasting
+  - convgru
+  - earth-observation
+library_name: pytorch
+pipeline_tag: image-to-image
+---
+# IRENE — Italian Radar Ensemble Nowcasting Experiment
+**IRENE** is a ConvGRU encoder-decoder model for short-term precipitation forecasting (nowcasting) from radar data. The model generates probabilistic ensemble forecasts, producing multiple plausible future scenarios from a single input sequence.
+## Model Description
+- **Architecture**: ConvGRU encoder-decoder with PixelShuffle/PixelUnshuffle for spatial scaling
+- **Input**: Sequence of past radar rain rate fields (T, H, W) in mm/h
+- **Output**: Ensemble of future rain rate forecasts (E, T, H, W) in mm/h
+- **Temporal resolution**: 5 minutes per timestep
+- **Training loss**: Continuous Ranked Probability Score (CRPS) with temporal consistency regularization
+The model encodes past radar observations into multi-scale hidden states using stacked ConvGRU blocks with PixelUnshuffle downsampling. The decoder generates forecasts by unrolling with different random noise inputs, producing diverse ensemble members that capture forecast uncertainty.
+## Intended Uses
+- Short-term precipitation forecasting (0-60 min ahead) from radar reflectivity data
+- Probabilistic nowcasting with uncertainty quantification via ensemble spread
+- Research on deep learning for weather prediction
+- Fine-tuning on regional radar datasets
+## How to Use
+```python
+from convgru_ensemble import RadarLightningModel
+# Load from HuggingFace Hub
+model = RadarLightningModel.from_pretrained("it4lia/irene")
+# Run inference on past radar data (rain rate in mm/h)
+import numpy as np
+past = np.random.rand(6, 256, 256).astype(np.float32)  # 6 past timesteps
+forecasts = model.predict(past, forecast_steps=12, ensemble_size=10)
+# forecasts.shape = (10, 12, 256, 256) — 10 ensemble members, 12 future steps
+```
+## Training Data
+Trained on the Italian DPC (Dipartimento della Protezione Civile) radar mosaic surface rain intensity (SRI) dataset, covering the Italian territory at ~1 km resolution with 5-minute temporal resolution.
+## Training Procedure
+- **Optimizer**: Adam (lr=1e-4)
+- **Loss**: CRPS with temporal consistency penalty (lambda=0.01)
+- **Batch size**: 16
+- **Ensemble size during training**: 2 members
+- **Input window**: 6 past timesteps (30 min)
+- **Forecast horizon**: 12 future timesteps (60 min)
+- **Data augmentation**: Random rotations and flips
+- **NaN handling**: Masked loss for missing radar data
+## Limitations
+- Trained on Italian radar data; performance may degrade on other domains without fine-tuning
+- 5-minute temporal resolution only
+- Best suited for convective and stratiform precipitation; extreme events may be underrepresented
+- Ensemble spread is generated via noisy decoder inputs, not a full Bayesian approach
+## Acknowledgements
+This model was developed as part of the **Italian AI-Factory** (IT4LIA), an EU-funded initiative supporting the adoption of AI across SMEs, academia, and public/private sectors. The AI-Factory provides free HPC compute, consultancy, and AI-ready datasets. This work showcases capabilities in the **Earth (weather and climate) vertical domain**.
+Developed at **Fondazione Bruno Kessler (FBK)**, Trento, Italy.
+## License
+BSD 2-Clause License

Makefile ADDED Viewed

	@@ -0,0 +1,22 @@

+# Every target below names a command rather than a file, so all of them
+# (including `format`) are declared phony.
+.PHONY: install lint format test serve docker-build docker-run
+# Install all dependency groups plus the "serve" extra.
+install:
+	uv sync --all-groups --extra serve
+# Static analysis with ruff.
+lint:
+	uv run ruff check .
+# Rewrite sources in place with the ruff formatter.
+format:
+	uv run ruff format .
+# Run the test suite quietly.
+test:
+	uv run pytest -q
+# Start the inference API locally on port 8000.
+serve:
+	uv run uvicorn convgru_ensemble.serve:app --host 0.0.0.0 --port 8000
+# Build the serving image.
+docker-build:
+	docker build -t convgru-ensemble .
+# Run the serving image with ./checkpoints mounted into the container.
+# NOTE(review): the Dockerfile defaults MODEL_CHECKPOINT to /app/model.ckpt while
+# this mounts ./checkpoints at /app/checkpoints; confirm whether an explicit
+# `-e MODEL_CHECKPOINT=/app/checkpoints/model.ckpt` is needed here.
+docker-run:
+	docker run -p 8000:8000 -v ./checkpoints:/app/checkpoints convgru-ensemble

README.md CHANGED Viewed

@@ -1,82 +1,250 @@
----
-license: bsd-2-clause
-language: en
-tags:
-  - weather
-  - nowcasting
-  - radar
-  - precipitation
-  - ensemble-forecasting
-  - convgru
-  - earth-observation
-library_name: pytorch
-pipeline_tag: image-to-image
----
-# IRENE — Italian Radar Ensemble Nowcasting Experiment
-**IRENE** is a ConvGRU encoder-decoder model for short-term precipitation forecasting (nowcasting) from radar data. The model generates probabilistic ensemble forecasts, producing multiple plausible future scenarios from a single input sequence.
-## Model Description
-- **Architecture**: ConvGRU encoder-decoder with PixelShuffle/PixelUnshuffle for spatial scaling
-- **Input**: Sequence of past radar rain rate fields (T, H, W) in mm/h
-- **Output**: Ensemble of future rain rate forecasts (E, T, H, W) in mm/h
-- **Temporal resolution**: 5 minutes per timestep
-- **Training loss**: Continuous Ranked Probability Score (CRPS) with temporal consistency regularization
-The model encodes past radar observations into multi-scale hidden states using stacked ConvGRU blocks with PixelUnshuffle downsampling. The decoder generates forecasts by unrolling with different random noise inputs, producing diverse ensemble members that capture forecast uncertainty.
-## Intended Uses
-- Short-term precipitation forecasting (0-60 min ahead) from radar reflectivity data
-- Probabilistic nowcasting with uncertainty quantification via ensemble spread
-- Research on deep learning for weather prediction
-- Fine-tuning on regional radar datasets
-## How to Use
 ```python
 from convgru_ensemble import RadarLightningModel
-# Load from HuggingFace Hub
 model = RadarLightningModel.from_pretrained("it4lia/irene")
-# Run inference on past radar data (rain rate in mm/h)
 import numpy as np
-past = np.random.rand(6, 256, 256).astype(np.float32)  # 6 past timesteps
 forecasts = model.predict(past, forecast_steps=12, ensemble_size=10)
-# forecasts.shape = (10, 12, 256, 256) — 10 ensemble members, 12 future steps
 ```
-## Training Data
-Trained on the Italian DPC (Dipartimento della Protezione Civile) radar mosaic surface rain intensity (SRI) dataset, covering the Italian territory at ~1 km resolution with 5-minute temporal resolution.
-## Training Procedure
-- **Optimizer**: Adam (lr=1e-4)
-- **Loss**: CRPS with temporal consistency penalty (lambda=0.01)
-- **Batch size**: 16
-- **Ensemble size during training**: 2 members
-- **Input window**: 6 past timesteps (30 min)
-- **Forecast horizon**: 12 future timesteps (60 min)
-- **Data augmentation**: Random rotations and flips
-- **NaN handling**: Masked loss for missing radar data
-## Limitations
-- Trained on Italian radar data; performance may degrade on other domains without fine-tuning
-- 5-minute temporal resolution only
-- Best suited for convective and stratiform precipitation; extreme events may be underrepresented
-- Ensemble spread is generated via noisy decoder inputs, not a full Bayesian approach
 ## Acknowledgements
-This model was developed as part of the **Italian AI-Factory** (IT4LIA), an EU-funded initiative supporting the adoption of AI across SMEs, academia, and public/private sectors. The AI-Factory provides free HPC compute, consultancy, and AI-ready datasets. This work showcases capabilities in the **Earth (weather and climate) vertical domain**.
-Developed at **Fondazione Bruno Kessler (FBK)**, Trento, Italy.
 ## License
-BSD 2-Clause License

+<div align="center">
+# ConvGRU-Ensemble
+**Ensemble precipitation nowcasting using Convolutional GRU networks**
+*Pretrained model for Italy:* ***IRENE*** — **I**talian **R**adar **E**nsemble **N**owcasting **E**xperiment
+[![CI](https://github.com/DSIP-FBK/ConvGRU-Ensemble/actions/workflows/ci.yml/badge.svg)](https://github.com/DSIP-FBK/ConvGRU-Ensemble/actions)
+[![License: BSD-2](https://img.shields.io/badge/license-BSD--2-blue.svg)](LICENSE)
+[![Python 3.13+](https://img.shields.io/badge/python-3.13%2B-blue.svg)](https://python.org)
+[![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97-Model-yellow)](https://huggingface.co/it4lia/irene)
+<br>
+<a href="https://www.fbk.eu"><img src="https://webvalley.fbk.eu/static/img/logos/fbk-logo-blue.png" height="55" alt="Fondazione Bruno Kessler"></a>
+&nbsp;&nbsp;&nbsp;&nbsp;
+<a href="https://it4lia-aifactory.eu"><img src="https://it4lia-aifactory.eu/wp-content/uploads/2025/05/logo-IT4LIA-AI-factory.svg" height="55" alt="IT4LIA AI-Factory"></a>
+&nbsp;&nbsp;&nbsp;&nbsp;
+<a href="https://www.italiameteo.eu"><img src="https://it4lia-aifactory.eu/wp-content/uploads/2025/08/logo-italiameteo.svg" height="55" alt="ItaliaMeteo"></a>
+<br><br>
+The model encodes past radar frames into multi-scale hidden states and decodes them into an **ensemble of probabilistic forecasts** by running the decoder multiple times with different noise inputs, trained with **CRPS loss**.
+</div>
+---
+## Quick Start
+<details open>
+<summary><b>Load from HuggingFace Hub</b></summary>
 ```python
 from convgru_ensemble import RadarLightningModel
 model = RadarLightningModel.from_pretrained("it4lia/irene")
 import numpy as np
+past = np.load("past_radar.npy")  # rain rate in mm/h, shape (T_past, H, W)
 forecasts = model.predict(past, forecast_steps=12, ensemble_size=10)
+# forecasts.shape = (10, 12, H, W) — 10 members, 12 future steps, mm/h
+```
+</details>
+<details>
+<summary><b>CLI Inference</b></summary>
+```bash
+convgru-ensemble predict \
+    --input examples/sample_data.nc \
+    --hub-repo it4lia/irene \
+    --forecast-steps 12 \
+    --ensemble-size 10 \
+    --output predictions.nc
+```
+</details>
+<details>
+<summary><b>Serve via API</b></summary>
+```bash
+# With Docker
+docker compose up
+# Or directly
+pip install convgru-ensemble[serve]
+convgru-ensemble serve --hub-repo it4lia/irene --port 8000
+```
+```bash
+curl -X POST http://localhost:8000/predict \
+    -F "file=@input.nc" -o predictions.nc
+```
+| Endpoint | Method | Description |
+|---|---|---|
+| `/health` | GET | Health check |
+| `/model/info` | GET | Model metadata and hyperparameters |
+| `/predict` | POST | Upload NetCDF, get ensemble forecast as NetCDF |
+</details>
+<details>
+<summary><b>Fine-tune on your data</b></summary>
+```bash
+pip install convgru-ensemble
+# See "Training" section below
 ```
+</details>
+## Setup
+Requires Python >= 3.13. Uses [uv](https://github.com/astral-sh/uv) for dependency management.
+```bash
+uv sync                    # core dependencies
+uv sync --extra serve      # + FastAPI serving
+```
+## Data Preparation
+The training pipeline expects a Zarr dataset with a rain rate variable `RR` indexed by `(time, x, y)`.
+<details>
+<summary><b>1. Filter valid datacubes</b></summary>
+Scan the Zarr and find all space-time datacubes with fewer than `n_nan` NaN values:
+```bash
+cd importance_sampler
+uv run python filter_nan.py path/to/dataset.zarr \
+    --start_date 2021-01-01 --end_date 2025-12-11 \
+    --Dt 24 --w 256 --h 256 \
+    --step_T 3 --step_X 16 --step_Y 16 \
+    --n_nan 10000 --n_workers 8
+```
+</details>
+<details>
+<summary><b>2. Importance sampling</b></summary>
+Sample valid datacubes with higher probability for rainier events:
+```bash
+uv run python sample_valid_datacubes.py path/to/dataset.zarr valid_datacubes_*.csv \
+    --q_min 1e-4 --m 0.1 --n_workers 8
+```
+A pre-sampled CSV is provided in [`importance_sampler/output/`](importance_sampler/output/).
+</details>
+## Training
+Training is configured via [Fiddle](https://github.com/google/fiddle). Run with defaults:
+```bash
+uv run python -m convgru_ensemble.train
+```
+Override parameters from the command line:
+```bash
+uv run python -m convgru_ensemble.train \
+    --config config:experiment \
+    --config set:model.num_blocks=5 \
+    --config set:model.forecast_steps=12 \
+    --config set:model.loss_class=crps \
+    --config set:model.ensemble_size=2 \
+    --config set:datamodule.batch_size=16 \
+    --config set:trainer.max_epochs=100
+```
+Monitor with TensorBoard: `uv run tensorboard --logdir logs/`
+| Parameter | Description | Default |
+|---|---|---|
+| `model.num_blocks` | Encoder/decoder depth | `5` |
+| `model.forecast_steps` | Future steps to predict | `12` |
+| `model.ensemble_size` | Ensemble members during training | `2` |
+| `model.loss_class` | Loss function (`mse`, `mae`, `crps`, `afcrps`) | `crps` |
+| `model.masked_loss` | Mask NaN regions in loss | `True` |
+| `datamodule.steps` | Total timesteps per sample (past + future) | `18` |
+| `datamodule.batch_size` | Batch size | `16` |
+## Architecture
+```
+Input (B, T_past, 1, H, W)
+    |
+    v
++--------------------------+
+|        Encoder           |  ConvGRU + PixelUnshuffle (x num_blocks)
+|  Spatial dims halve at   |  Channels: 1 -> 4 -> 16 -> 64 -> 256 -> 1024
+|  each block              |
++----------+---------------+
+           | hidden states
+           v
++--------------------------+
+|        Decoder           |  ConvGRU + PixelShuffle (x num_blocks)
+|  Noise input (x M runs)  |  Each run produces one ensemble member
+|  for ensemble generation |
++----------+---------------+
+           |
+           v
+Output (B, T_future, M, H, W)
+```
+## Docker
+```bash
+docker build -t convgru-ensemble .
+# Run with local checkpoint
+docker run -p 8000:8000 -v ./checkpoints:/app/checkpoints \
+    -e MODEL_CHECKPOINT=/app/checkpoints/model.ckpt convgru-ensemble
+# Run with HuggingFace Hub
+docker run -p 8000:8000 -e HF_REPO_ID=it4lia/irene convgru-ensemble
+```
+## Project Structure
+```
+ConvGRU-Ensemble/
++-- convgru_ensemble/          # Python package
+|   +-- model.py               # ConvGRU encoder-decoder architecture
+|   +-- losses.py              # CRPS, afCRPS, masked loss wrappers
+|   +-- lightning_model.py     # PyTorch Lightning training module
+|   +-- datamodule.py          # Dataset and data loading
+|   +-- train.py               # Training entry point (Fiddle config)
+|   +-- utils.py               # Rain rate <-> reflectivity conversions
+|   +-- hub.py                 # HuggingFace Hub upload/download
+|   +-- cli.py                 # CLI for inference and serving
+|   +-- serve.py               # FastAPI inference server
++-- examples/                  # Sample data for testing
++-- importance_sampler/        # Data preparation scripts
++-- notebooks/                 # Example notebooks
++-- scripts/                   # Utility scripts (e.g., upload to Hub)
++-- tests/                     # Test suite
++-- Dockerfile                 # Container for serving API
++-- MODEL_CARD.md              # HuggingFace model card template
+```
 ## Acknowledgements
+<div align="center">
+Developed at **Fondazione Bruno Kessler (FBK)**, Trento, Italy, as part of the **Italian AI-Factory (IT4LIA)**, an EU-funded initiative supporting AI adoption across SMEs, academia, and public/private sectors. This work showcases capabilities in the **Earth (weather and climate) vertical domain**.
+<br>
+<a href="https://www.fbk.eu"><img src="https://webvalley.fbk.eu/static/img/logos/fbk-logo-blue.png" height="45" alt="Fondazione Bruno Kessler"></a>
+&nbsp;&nbsp;&nbsp;&nbsp;
+<a href="https://it4lia-aifactory.eu"><img src="https://it4lia-aifactory.eu/wp-content/uploads/2025/05/logo-IT4LIA-AI-factory.svg" height="45" alt="IT4LIA AI-Factory"></a>
+&nbsp;&nbsp;&nbsp;&nbsp;
+<a href="https://www.italiameteo.eu"><img src="https://it4lia-aifactory.eu/wp-content/uploads/2025/08/logo-italiameteo.svg" height="45" alt="ItaliaMeteo"></a>
+</div>
 ## License
+BSD 2-Clause — see [LICENSE](LICENSE).

convgru_ensemble/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""ConvGRU-Ensemble: Ensemble precipitation nowcasting using Convolutional GRU networks."""
+__version__ = "0.1.0"
+from convgru_ensemble.lightning_model import RadarLightningModel
+from convgru_ensemble.model import EncoderDecoder
+__all__ = ["EncoderDecoder", "RadarLightningModel"]

convgru_ensemble/__main__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Allow running the package as ``python -m convgru_ensemble``."""
+from convgru_ensemble.cli import main
+main()

convgru_ensemble/cli.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""Command-line interface for ConvGRU-Ensemble inference and serving."""
+import time
+import fire
+import numpy as np
+import xarray as xr
+def _load_model(checkpoint: str | None = None, hub_repo: str | None = None, device: str = "cpu"):
+    """Load model from local checkpoint or HuggingFace Hub."""
+    from .lightning_model import RadarLightningModel
+    if hub_repo is not None:
+        print(f"Loading model from HuggingFace Hub: {hub_repo}")
+        return RadarLightningModel.from_pretrained(hub_repo, device=device)
+    elif checkpoint is not None:
+        print(f"Loading model from checkpoint: {checkpoint}")
+        return RadarLightningModel.from_checkpoint(checkpoint, device=device)
+    else:
+        raise ValueError("Either --checkpoint or --hub-repo must be provided.")
+def predict(
+    input: str,
+    checkpoint: str | None = None,
+    hub_repo: str | None = None,
+    variable: str = "RR",
+    forecast_steps: int = 12,
+    ensemble_size: int = 10,
+    device: str = "cpu",
+    output: str = "predictions.nc",
+):
+    """
+    Run inference on a NetCDF input file and save predictions as NetCDF.
+    Args:
+        input: Path to input NetCDF file with rain rate data (T, H, W) or (T, Y, X).
+        checkpoint: Path to local .ckpt checkpoint file.
+        hub_repo: HuggingFace Hub repo ID (e.g., 'it4lia/irene'). Alternative to --checkpoint.
+        variable: Name of the rain rate variable in the NetCDF file.
+        forecast_steps: Number of future timesteps to forecast.
+        ensemble_size: Number of ensemble members to generate.
+        device: Device for inference ('cpu' or 'cuda').
+        output: Path for the output NetCDF file.
+    """
+    model = _load_model(checkpoint, hub_repo, device)
+    # Load input data
+    print(f"Loading input: {input}")
+    ds = xr.open_dataset(input)
+    if variable not in ds:
+        available = list(ds.data_vars)
+        raise ValueError(f"Variable '{variable}' not found. Available: {available}")
+    data = ds[variable].values  # (T, H, W) or similar
+    if data.ndim != 3:
+        raise ValueError(f"Expected 3D data (T, H, W), got shape {data.shape}")
+    print(f"Input shape: {data.shape}")
+    past = data.astype(np.float32)
+    # Run inference
+    t0 = time.perf_counter()
+    preds = model.predict(past, forecast_steps=forecast_steps, ensemble_size=ensemble_size)
+    elapsed = time.perf_counter() - t0
+    print(f"Output shape: {preds.shape} (ensemble, time, H, W)")
+    print(f"Elapsed: {elapsed:.2f}s")
+    # Build output dataset
+    ds_out = xr.Dataset(
+        {
+            "precipitation_forecast": xr.DataArray(
+                data=preds,
+                dims=["ensemble_member", "forecast_step", "y", "x"],
+                attrs={"units": "mm/h", "long_name": "Ensemble precipitation forecast"},
+            ),
+        },
+        attrs={
+            "model": "ConvGRU-Ensemble",
+            "forecast_steps": forecast_steps,
+            "ensemble_size": ensemble_size,
+            "source_file": str(input),
+        },
+    )
+    ds_out.to_netcdf(output)
+    print(f"Predictions saved to: {output}")
+def serve(
+    checkpoint: str | None = None,
+    hub_repo: str | None = None,
+    host: str = "0.0.0.0",
+    port: int = 8000,
+    device: str = "cpu",
+):
+    """
+    Start the FastAPI inference server.
+    Args:
+        checkpoint: Path to local .ckpt checkpoint file.
+        hub_repo: HuggingFace Hub repo ID (e.g., 'it4lia/irene'). Alternative to --checkpoint.
+        host: Host to bind to.
+        port: Port to listen on.
+        device: Device for inference ('cpu' or 'cuda').
+    """
+    import os
+    if checkpoint is not None:
+        os.environ["MODEL_CHECKPOINT"] = checkpoint
+    if hub_repo is not None:
+        os.environ["HF_REPO_ID"] = hub_repo
+    os.environ.setdefault("DEVICE", device)
+    import uvicorn
+    uvicorn.run("convgru_ensemble.serve:app", host=host, port=port)
+def main():
+    fire.Fire({"predict": predict, "serve": serve})
+if __name__ == "__main__":
+    main()

convgru_ensemble/datamodule.py ADDED Viewed

	@@ -0,0 +1,378 @@

+import time
+import numpy as np
+import pandas as pd
+import pytorch_lightning as pl
+import torch
+import xarray as xr
+from torch.utils.data import DataLoader, Dataset
+from .utils import rainrate_to_normalized
+class SampledRadarDataset(Dataset):
+    """
+    PyTorch dataset that loads radar datacubes from a Zarr store using
+    pre-sampled spatial-temporal coordinates from a CSV file.
+    Each sample is a spatio-temporal datacube of shape ``(T, 1, H, W)``
+    converted from rain rate to normalized reflectivity.
+    Parameters
+    ----------
+    zarr_path : str
+        Path to the Zarr dataset containing the ``'RR'`` rain rate variable.
+    csv_path : str
+        Path to the CSV file with columns ``(t, x, y)`` specifying the
+        top-left corner of each datacube.
+    steps : int
+        Number of timesteps to extract per sample.
+    return_mask : bool, optional
+        If ``True``, also return a spatial NaN mask. Default is ``False``.
+    deterministic : bool, optional
+        If ``True``, use a fixed random seed (42) for reproducibility.
+        Default is ``False``.
+    augment : bool, optional
+        If ``True``, apply random spatial augmentations (rotation, flips).
+        Default is ``False``.
+    indices : sequence of int or None, optional
+        Subset of row indices to use from the CSV. If ``None``, use all rows.
+        Default is ``None``.
+    """
+    def __init__(
+        self,
+        zarr_path: str,
+        csv_path: str,
+        steps: int,
+        return_mask: bool = False,
+        deterministic: bool = False,
+        augment: bool = False,
+        indices=None,
+    ):
+        """
+        Initialize SampledRadarDataset.
+        Parameters
+        ----------
+        zarr_path : str
+            Path to the Zarr dataset containing the ``'RR'`` rain rate
+            variable.
+        csv_path : str
+            Path to the CSV file with columns ``(t, x, y)``.
+        steps : int
+            Number of timesteps to extract per sample.
+        return_mask : bool, optional
+            If ``True``, also return a spatial NaN mask. Default is ``False``.
+        deterministic : bool, optional
+            If ``True``, use a fixed random seed (42). Default is ``False``.
+        augment : bool, optional
+            If ``True``, apply random spatial augmentations. Default is
+            ``False``.
+        indices : sequence of int or None, optional
+            Subset of row indices from the CSV. Default is ``None``.
+        """
+        self.coords = pd.read_csv(csv_path).sort_values("t")
+        if indices is not None:
+            self.coords = self.coords.iloc[list(indices)].reset_index(drop=True)
+        self.zg = xr.open_zarr(zarr_path)
+        self.RR = self.zg["RR"]
+        self.rng = np.random.default_rng(seed=42) if deterministic else np.random.default_rng(int(time.time()))
+        self.return_mask = return_mask
+        self.augment = augment
+        if augment:
+            print("Data augmentation is enabled.")
+        # default valid grid size and time step
+        self.w = 256
+        self.h = 256
+        self.dt = 24
+        self.steps = steps
+        # raise warning if steps > dt
+        if self.steps > self.dt:
+            print(f"Warning: requested steps ({self.steps}) > sampled time window ({self.dt})")
+    def __len__(self):
+        """
+        Return the number of samples in the dataset.
+        Returns
+        -------
+        length : int
+            Number of datacube samples.
+        """
+        return len(self.coords)
+    def shape(self):
+        """
+        Return the nominal shape of the full dataset.
+        Returns
+        -------
+        shape : tuple of int
+            ``(num_samples, steps, 1, width, height)``.
+        """
+        return (len(self.coords), self.steps, 1, self.w, self.h)
+    def _apply_augmentations(
+        self, *tensors, rotate_prob: float = 0.5, hflip_prob: float = 0.5, vflip_prob: float = 0.5
+    ):
+        """
+        Apply random spatial augmentations consistently to all input tensors.
+        All tensors receive the same random transformation so that spatial
+        alignment is preserved (e.g. between data and mask).
+        Parameters
+        ----------
+        *tensors : torch.Tensor
+            One or more tensors of shape ``(T, C, H, W)``.
+        rotate_prob : float, optional
+            Probability of applying a random 90-degree rotation. Default is
+            ``0.5``.
+        hflip_prob : float, optional
+            Probability of applying a horizontal flip. Default is ``0.5``.
+        vflip_prob : float, optional
+            Probability of applying a vertical flip. Default is ``0.5``.
+        Returns
+        -------
+        augmented : torch.Tensor or tuple of torch.Tensor
+            Single tensor if one input was given, otherwise a tuple of
+            augmented tensors.
+        """
+        # Random 90-degree rotation (0, 90, 180, or 270 degrees)
+        if self.rng.random() < rotate_prob:
+            k = self.rng.integers(1, 4)  # 1=90, 2=180, 3=270 degrees
+            tensors = [torch.rot90(t, k, dims=[-2, -1]) for t in tensors]
+        # Random horizontal flip
+        if self.rng.random() < hflip_prob:
+            tensors = [torch.flip(t, dims=[-1]) for t in tensors]
+        # Random vertical flip
+        if self.rng.random() < vflip_prob:
+            tensors = [torch.flip(t, dims=[-2]) for t in tensors]
+        tensors = [t.contiguous() for t in tensors]
+        return tensors[0] if len(tensors) == 1 else tuple(tensors)
+    def __getitem__(self, idx: int):
+        """
+        Load and return a single datacube sample.
+        The ``idx``-th row of ``self.coords`` supplies the ``(t0, x0, y0)``
+        origin of the datacube. The cube is read from ``self.RR``, converted
+        with ``rainrate_to_normalized``, and its NaN values are replaced by
+        ``-1``. When ``self.augment`` is set, ``_apply_augmentations`` is
+        applied to the data (and to the mask, with the same random draws).
+        Parameters
+        ----------
+        idx : int
+            Index of the sample in the dataset.
+        Returns
+        -------
+        sample : dict of str to torch.Tensor
+            Dictionary with key ``'data'`` containing a tensor of shape
+            ``(T, 1, H, W)``. If ``return_mask`` is ``True``, also contains
+            ``'mask'`` of shape ``(1, 1, H, W)``.
+        """
+        t0, x0, y0 = self.coords.iloc[idx]
+        # NOTE(review): the first spatial axis is cut with ``self.w`` and the
+        # second with ``self.h`` -- confirm the axis order for non-square crops.
+        x_slice = slice(x0, x0 + self.w)
+        y_slice = slice(y0, y0 + self.h)
+        if self.steps < self.dt:
+            # random sampling within the available time window: the stored cube
+            # spans ``dt`` steps, so any start in [t0, t0 + dt - steps] still fits
+            # (presumably ``self.rng`` is a NumPy Generator, whose upper bound is
+            # exclusive -- hence the ``+ 1``)
+            t_start = self.rng.integers(t0, t0 + self.dt - self.steps + 1)
+        else:
+            t_start = t0
+        t_slice = slice(t_start, t_start + self.steps)
+        data = rainrate_to_normalized(self.RR[t_slice, x_slice, y_slice])
+        # create a mask for all nan values over time dimension
+        # shape: (1, H, W) - NOT repeated over time, broadcasting handles it
+        # (1.0 where the pixel is valid at every timestep, 0.0 otherwise)
+        if self.return_mask:
+            mask = (~(np.isnan(data).any(axis=0, keepdims=True))).astype(np.float32)
+        # replace nan values with -1 (must happen after the mask is computed)
+        data = np.nan_to_num(data, nan=-1.0)
+        # convert to tensors, inserting a singleton channel axis
+        data = torch.from_numpy(data[:, np.newaxis, :, :])
+        if self.return_mask:
+            # NOTE(review): ``.values`` implies the mask is an array wrapper
+            # (e.g. xarray) rather than a plain ndarray -- confirm against self.RR
+            mask = torch.from_numpy(mask.values[:, np.newaxis, :, :])
+        # apply augmentations (training only)
+        if self.augment:
+            if self.return_mask:
+                # data and mask go through one call so they share the same transform
+                data, mask = self._apply_augmentations(data, mask)
+            else:
+                data = self._apply_augmentations(data)
+        if self.return_mask:
+            return {"data": data, "mask": mask}
+        else:
+            return {"data": data}
+class RadarDataModule(pl.LightningDataModule):
+    """
+    Lightning data module serving radar datacubes for train/val/test.
+    One Zarr store and one CSV of datacube coordinates back all three
+    splits; the module partitions the sample indices into contiguous ranges
+    and wraps each resulting dataset in a ``DataLoader``.
+    Parameters
+    ----------
+    zarr_path : str
+        Location of the Zarr dataset.
+    csv_path : str
+        Location of the CSV file listing datacube coordinates.
+    steps : int
+        Number of timesteps in every sample.
+    train_ratio : float, optional
+        Share of the samples assigned to training. Default is ``0.7``.
+    val_ratio : float, optional
+        Share of the samples assigned to validation. Default is ``0.15``.
+    return_mask : bool, optional
+        If ``True``, samples also carry a NaN mask. Default is ``False``.
+    deterministic : bool, optional
+        If ``True``, fixed random seeds are used. Default is ``False``.
+    augment : bool, optional
+        Enables data augmentation, for the training split only. Default is
+        ``True``.
+    **dataloader_kwargs
+        Extra keyword arguments passed unchanged to every ``DataLoader``
+        (e.g. ``batch_size``, ``num_workers``, ``pin_memory``).
+    """
+    def __init__(
+        self,
+        zarr_path,
+        csv_path,
+        steps,
+        train_ratio=0.7,
+        val_ratio=0.15,
+        return_mask=False,
+        deterministic=False,
+        augment=True,
+        **dataloader_kwargs,
+    ):
+        """
+        Store the configuration; no data is touched until :meth:`setup`.
+        Parameters
+        ----------
+        zarr_path : str
+            Location of the Zarr dataset.
+        csv_path : str
+            Location of the CSV file listing datacube coordinates.
+        steps : int
+            Number of timesteps in every sample.
+        train_ratio : float, optional
+            Share of the samples for training. Default is ``0.7``.
+        val_ratio : float, optional
+            Share of the samples for validation. Default is ``0.15``.
+        return_mask : bool, optional
+            If ``True``, samples also carry a NaN mask. Default is ``False``.
+        deterministic : bool, optional
+            If ``True``, fixed random seeds are used. Default is ``False``.
+        augment : bool, optional
+            Enables data augmentation. Default is ``True``.
+        **dataloader_kwargs
+            Passed unchanged to ``DataLoader``.
+        """
+        super().__init__()
+        # Data sources and sample length
+        self.zarr_path, self.csv_path, self.steps = zarr_path, csv_path, steps
+        # Split fractions (the test share is whatever remains)
+        self.train_ratio, self.val_ratio = train_ratio, val_ratio
+        # Dataset behaviour switches
+        self.return_mask = return_mask
+        self.deterministic = deterministic
+        self.augment = augment
+        # Everything else goes straight to the DataLoader constructors
+        self.dataloader_kwargs = dataloader_kwargs
+    def setup(self, stage=None):
+        """
+        Build the train, validation, and test datasets.
+        The sample indices are cut into three contiguous ranges: the first
+        ``train_ratio`` fraction for training, the following ``val_ratio``
+        fraction for validation, and the remainder for testing. The split is
+        meant to be chronological; that relies on ``SampledRadarDataset``
+        ordering the CSV rows by time, which is not visible from here.
+        Only the training dataset may be augmented.
+        Parameters
+        ----------
+        stage : str or None, optional
+            Lightning stage (``'fit'``, ``'test'``, etc.). Ignored; all three
+            datasets are always created. Default is ``None``.
+        """
+        # The CSV is read here only to learn how many samples there are
+        n_samples = len(pd.read_csv(self.csv_path).sort_values("t"))
+        # Boundaries between the three index ranges
+        train_stop = int(n_samples * self.train_ratio)
+        val_stop = int(n_samples * (self.train_ratio + self.val_ratio))
+        def make_split(start, stop, augment):
+            """Create the dataset restricted to sample indices ``[start, stop)``."""
+            return SampledRadarDataset(
+                self.zarr_path,
+                self.csv_path,
+                self.steps,
+                self.return_mask,
+                self.deterministic,
+                augment=augment,
+                indices=range(start, stop),
+            )
+        # Augmentation is user-controlled for training and always off otherwise
+        self.train_dataset = make_split(0, train_stop, self.augment)
+        self.val_dataset = make_split(train_stop, val_stop, False)
+        self.test_dataset = make_split(val_stop, n_samples, False)
+    def train_dataloader(self):
+        """
+        Build the DataLoader for the training split.
+        Returns
+        -------
+        loader : DataLoader
+            Shuffled DataLoader over ``self.train_dataset``.
+        """
+        dataset = self.train_dataset
+        return DataLoader(dataset, shuffle=True, **self.dataloader_kwargs)
+    def val_dataloader(self):
+        """
+        Build the DataLoader for the validation split.
+        Returns
+        -------
+        loader : DataLoader
+            Unshuffled DataLoader over ``self.val_dataset``.
+        """
+        dataset = self.val_dataset
+        return DataLoader(dataset, shuffle=False, **self.dataloader_kwargs)
+    def test_dataloader(self):
+        """
+        Build the DataLoader for the test split.
+        Returns
+        -------
+        loader : DataLoader
+            Unshuffled DataLoader over ``self.test_dataset``.
+        """
+        dataset = self.test_dataset
+        return DataLoader(dataset, shuffle=False, **self.dataloader_kwargs)

convgru_ensemble/hub.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""HuggingFace Hub integration for uploading and downloading ConvGRU-Ensemble models."""
+import json
+import shutil
+import tempfile
+from pathlib import Path
+from huggingface_hub import HfApi, hf_hub_download
+def push_to_hub(
+    checkpoint_path: str,
+    repo_id: str,
+    model_card_path: str | None = None,
+    private: bool = False,
+) -> str:
+    """
+    Publish a trained checkpoint to a HuggingFace Hub repository.
+    The repository is created if needed. A temporary folder is filled with
+    the checkpoint (as ``model.ckpt``), a ``config.json`` derived from the
+    checkpoint's ``hyper_parameters`` entry (when present), and optionally
+    the model card (as ``README.md``); the folder is then uploaded in one
+    commit.
+    Parameters
+    ----------
+    checkpoint_path : str
+        Path to the ``.ckpt`` checkpoint file.
+    repo_id : str
+        HuggingFace Hub repository ID (e.g., ``'it4lia/irene'``).
+    model_card_path : str or None, optional
+        Markdown model card to publish as ``README.md``. Skipped when
+        ``None``. Default is ``None``.
+    private : bool, optional
+        Create the repository as private. Default is ``False``.
+    Returns
+    -------
+    url : str
+        Value returned by ``HfApi.upload_folder`` for the upload commit.
+    """
+    import torch
+    def _json_safe(value):
+        """Return ``value`` if JSON can encode it, otherwise its ``str()`` form."""
+        try:
+            json.dumps(value)
+        except (TypeError, ValueError):
+            return str(value)
+        return value
+    hub = HfApi()
+    hub.create_repo(repo_id=repo_id, exist_ok=True, private=private)
+    with tempfile.TemporaryDirectory() as staging:
+        staging_dir = Path(staging)
+        # 1. The checkpoint itself, under its canonical name
+        shutil.copy2(checkpoint_path, staging_dir / "model.ckpt")
+        # 2. Hyperparameters as a human-readable config
+        #    (weights_only=False unpickles arbitrary objects: only use on trusted checkpoints)
+        checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
+        if "hyper_parameters" in checkpoint:
+            config = {name: _json_safe(value) for name, value in checkpoint["hyper_parameters"].items()}
+            with open(staging_dir / "config.json", "w") as config_file:
+                json.dump(config, config_file, indent=2)
+        # 3. Optional model card, published as the repository README
+        if model_card_path is not None:
+            shutil.copy2(model_card_path, staging_dir / "README.md")
+        url = hub.upload_folder(
+            folder_path=str(staging_dir),
+            repo_id=repo_id,
+            commit_message="Upload ConvGRU-Ensemble model",
+        )
+    return url
+def from_pretrained(
+    repo_id: str,
+    filename: str = "model.ckpt",
+    device: str = "cpu",
+) -> "RadarLightningModel":  # noqa: F821
+    """
+    Fetch a checkpoint from HuggingFace Hub and load it as a model.
+    Parameters
+    ----------
+    repo_id : str
+        HuggingFace Hub repository ID (e.g., ``'it4lia/irene'``).
+    filename : str, optional
+        Checkpoint file name inside the repository. Default is
+        ``'model.ckpt'``.
+    device : str, optional
+        Device the weights are mapped to. Default is ``'cpu'``.
+    Returns
+    -------
+    model : RadarLightningModel
+        Model restored from the downloaded checkpoint.
+    """
+    # Imported here rather than at module level (presumably to avoid a circular import)
+    from .lightning_model import RadarLightningModel
+    local_checkpoint = hf_hub_download(repo_id=repo_id, filename=filename)
+    model = RadarLightningModel.from_checkpoint(local_checkpoint, device=device)
+    return model

convgru_ensemble/lightning_model.py ADDED Viewed

	@@ -0,0 +1,560 @@

+from typing import Any
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torchvision
+from .losses import build_loss
+from .model import EncoderDecoder
+from .utils import normalized_to_rainrate, rainrate_to_normalized
+def apply_radar_colormap(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Convert grayscale radar values to RGB using the STEPS-BE colorscale.
+    Normalized values in [0, 1] represent 0-60 dBZ. The range from 10 dBZ
+    (``10 / 60``) to 60 dBZ (``1.0``) is divided into 14 equal-width bins,
+    one per color. Pixels below 10 dBZ (and NaN pixels) are rendered white;
+    values at or above the last bin edge get the last color.
+    Only channel 0 of the input is used.
+    Parameters
+    ----------
+    tensor : torch.Tensor
+        Grayscale tensor with values in [0, 1], of shape ``(N, 1, H, W)``.
+    Returns
+    -------
+    rgb : torch.Tensor
+        RGB tensor of shape ``(N, 3, H, W)`` with values in [0, 1] and dtype
+        ``float32``.
+    """
+    # STEPS-BE colors (RGB values normalized to 0-1)
+    colors = (
+        torch.tensor(
+            [
+                [0, 255, 255],  # cyan
+                [0, 191, 255],  # deepskyblue
+                [30, 144, 255],  # dodgerblue
+                [0, 0, 255],  # blue
+                [127, 255, 0],  # chartreuse
+                [50, 205, 50],  # limegreen
+                [0, 128, 0],  # green
+                [0, 100, 0],  # darkgreen
+                [255, 255, 0],  # yellow
+                [255, 215, 0],  # gold
+                [255, 165, 0],  # orange
+                [255, 0, 0],  # red
+                [255, 0, 255],  # magenta
+                [139, 0, 139],  # darkmagenta
+            ],
+            dtype=torch.float32,
+            device=tensor.device,
+        )
+        / 255.0
+    )
+    # Bin edges: num_colors + 1 evenly spaced thresholds between 10 dBZ and
+    # 60 dBZ in normalized units, i.e. 14 bins of (50 / 14) dBZ each.
+    num_colors = len(colors)
+    min_dbz_norm = 10 / 60  # ~0.167, below this is background
+    max_dbz_norm = 1.0
+    thresholds = torch.linspace(min_dbz_norm, max_dbz_norm, num_colors + 1, device=tensor.device)
+    # Work in float32 so the lookup also accepts half-precision inputs
+    values = tensor[:, 0].to(torch.float32)
+    # Bin index per pixel using the interior edges only: with right=True a value
+    # v in [thresholds[i], thresholds[i + 1]) maps to i, anything at or above
+    # the second-to-last threshold maps to the last color, and anything below
+    # thresholds[1] maps to 0 (background pixels are masked out just below).
+    bin_index = torch.bucketize(values, thresholds[1:-1], right=True)
+    # (N, H, W) indices -> (N, H, W, 3) colors -> (N, 3, H, W)
+    rgb = colors[bin_index].permute(0, 3, 1, 2)
+    # Pixels below 10 dBZ stay white; NaN compares False and is therefore white too
+    foreground = (values >= thresholds[0]).unsqueeze(1)
+    output = torch.where(foreground, rgb, torch.ones_like(rgb))
+    return output.contiguous()
+class RadarLightningModel(pl.LightningModule):
+    """
+    PyTorch Lightning module for radar precipitation nowcasting.
+    Wraps an :class:`EncoderDecoder` model and handles training, validation,
+    and test steps including loss computation, ensemble generation, and
+    TensorBoard image logging.
+    Parameters
+    ----------
+    input_channels : int
+        Number of input channels per grid point.
+    num_blocks : int
+        Number of encoder/decoder blocks in the model.
+    ensemble_size : int, optional
+        Number of ensemble members to generate. Default is ``1``.
+    noisy_decoder : bool, optional
+        Whether to use random noise as decoder input. Default is ``False``.
+    forecast_steps : int or None, optional
+        Number of future timesteps to forecast. Default is ``None``.
+    loss_class : type, str, or None, optional
+        Loss function class or its string name (see ``PIXEL_LOSSES``).
+        Default is ``None`` (MSELoss).
+    loss_params : dict or None, optional
+        Keyword arguments for the loss constructor. Default is ``None``.
+    masked_loss : bool, optional
+        Whether to wrap the loss with :class:`MaskedLoss`. Default is
+        ``False``.
+    optimizer_class : type or None, optional
+        Optimizer class. Default is ``None`` (Adam).
+    optimizer_params : dict or None, optional
+        Keyword arguments for the optimizer. Default is ``None``.
+    lr_scheduler_class : type or None, optional
+        Learning rate scheduler class. Default is ``None``.
+    lr_scheduler_params : dict or None, optional
+        Keyword arguments for the LR scheduler. Default is ``None``.
+    """
+    def __init__(
+        self,
+        input_channels: int,
+        num_blocks: int,
+        ensemble_size: int = 1,
+        noisy_decoder: bool = False,
+        forecast_steps: type | int | None = None,
+        loss_class: type | str | None = None,
+        loss_params: dict[str, Any] | None = None,
+        masked_loss: bool = False,
+        optimizer_class: type | None = None,
+        optimizer_params: dict[str, Any] | None = None,
+        lr_scheduler_class: type | None = None,
+        lr_scheduler_params: dict[str, Any] | None = None,
+    ) -> None:
+        """
+        Initialize RadarLightningModel.
+        All constructor arguments are stored in ``self.hparams`` via
+        ``save_hyperparameters()``; the network and the loss are then built
+        from those stored values.
+        Parameters
+        ----------
+        input_channels : int
+            Number of input channels per grid point.
+        num_blocks : int
+            Number of encoder/decoder blocks.
+        ensemble_size : int, optional
+            Number of ensemble members. Default is ``1``.
+        noisy_decoder : bool, optional
+            Use random noise as decoder input. Default is ``False``.
+        forecast_steps : int or None, optional
+            Number of future timesteps to forecast. Default is ``None``.
+            ``shared_step`` negates this value to slice the batch, so it must
+            be set to an integer before training, validation, or testing.
+        loss_class : type, str, or None, optional
+            Loss function class or name. Default is ``None``.
+        loss_params : dict or None, optional
+            Loss constructor kwargs. Default is ``None``.
+        masked_loss : bool, optional
+            Wrap loss with masking. Default is ``False``.
+        optimizer_class : type or None, optional
+            Optimizer class. Default is ``None``.
+        optimizer_params : dict or None, optional
+            Optimizer kwargs. Default is ``None``.
+        lr_scheduler_class : type or None, optional
+            LR scheduler class. Default is ``None``.
+        lr_scheduler_params : dict or None, optional
+            LR scheduler kwargs. Default is ``None``.
+        """
+        # NOTE(review): ``forecast_steps`` is annotated ``type | int | None`` but is
+        # documented and used as ``int | None``; the ``type`` member looks like a
+        # leftover -- confirm nothing (e.g. a CLI parser) depends on it before removing.
+        super().__init__()
+        # Captures every __init__ argument into self.hparams (and into checkpoints)
+        self.save_hyperparameters()
+        # Initialize model
+        self.model = EncoderDecoder(self.hparams.input_channels, self.hparams.num_blocks)
+        self.criterion = build_loss(
+            loss_class=self.hparams.loss_class,
+            loss_params=self.hparams.loss_params,
+            masked_loss=self.hparams.masked_loss,
+        )
+        # Global steps at which shared_step logs training images; after the last
+        # entry, images are logged every ``log_images_iterations[-1]`` steps.
+        self.log_images_iterations = [50, 100, 200, 500, 750, 1000, 2000, 5000]
+        if self.hparams.ensemble_size > 1:
+            print(f"Using ensemble mode: {self.hparams.ensemble_size} independent ensemble members will be generated.")
+    def forward(self, x: torch.Tensor, forecast_steps: int, ensemble_size: int | None = None) -> torch.Tensor:
+        """
+        Forecast future frames with the wrapped encoder-decoder.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        forecast_steps : int
+            Number of future timesteps to produce.
+        ensemble_size : int or None, optional
+            Number of ensemble members; ``None`` falls back to
+            ``hparams.ensemble_size``. Default is ``None``.
+        Returns
+        -------
+        preds : torch.Tensor
+            Predictions of shape ``(B, forecast_steps, C, H, W)`` or
+            ``(B, forecast_steps, ensemble_size, H, W)`` for ensembles.
+        """
+        if ensemble_size is None:
+            ensemble_size = self.hparams.ensemble_size
+        return self.model(
+            x,
+            steps=forecast_steps,
+            noisy_decoder=self.hparams.noisy_decoder,
+            ensemble_size=ensemble_size,
+        )
+    def shared_step(
+        self, batch: dict[str, torch.Tensor], split: str = "train", ensemble_size: int | None = None
+    ) -> torch.Tensor:
+        """
+        Run one forward/loss pass; common to training, validation, and testing.
+        The last ``hparams.forecast_steps`` frames of the batch are the
+        target and the preceding frames are the model input. Predictions are
+        clamped to ``[-1, 1]`` before the loss is computed. The loss, the
+        optional ensemble spread, and (for training, at selected global
+        steps) example images are logged.
+        Parameters
+        ----------
+        batch : dict of str to torch.Tensor
+            Batch dictionary with key ``'data'`` of shape
+            ``(B, T_total, C, H, W)`` and, when ``hparams.masked_loss`` is
+            set, ``'mask'``.
+        split : str, optional
+            One of ``'train'``, ``'val'``, or ``'test'``; used as log prefix
+            and to decide per-step logging. Default is ``'train'``.
+        ensemble_size : int or None, optional
+            Override for the number of ensemble members. Default is ``None``.
+        Returns
+        -------
+        loss : torch.Tensor
+            Scalar loss value.
+        """
+        horizon = self.hparams.forecast_steps
+        is_train = split == "train"
+        frames = batch["data"]
+        past, future = frames[:, :-horizon], frames[:, -horizon:]
+        preds = self(past, forecast_steps=horizon, ensemble_size=ensemble_size)
+        # Keep predictions inside the normalized range [-1, 1]
+        preds = preds.clamp(min=-1, max=1)
+        if self.hparams.masked_loss:
+            target_mask = batch["mask"][:, -horizon:]
+            loss = self.criterion(preds, future, target_mask)
+        else:
+            loss = self.criterion(preds, future)
+        # Composite losses return (total, components); the component dict
+        # already carries split-prefixed keys such as 'val/pixel_loss'
+        if isinstance(loss, tuple):
+            loss, components = loss
+            self.log_dict(components, prog_bar=False, logger=True, on_step=is_train, on_epoch=True, sync_dist=True)
+        self.log(f"{split}_loss", loss, prog_bar=True, on_epoch=True, on_step=is_train, sync_dist=True)
+        # Ensemble diversity: mean standard deviation across the member axis
+        if self.hparams.ensemble_size > 1:
+            spread = preds.std(dim=2).mean()
+            self.log(f"{split}_ensemble_std", spread, on_epoch=True, sync_dist=True)
+        # Images are logged for training only, at the configured milestones and
+        # then at every multiple of the last milestone
+        if is_train:
+            milestones = self.log_images_iterations
+            if self.global_step in milestones or self.global_step % milestones[-1] == 0:
+                self.log_images(past, future, preds, split=split)
+        return loss
+    def log_images(self, past: torch.Tensor, future: torch.Tensor, preds: torch.Tensor, split: str = "val") -> None:
+        """
+        Write radar image grids for one sample to TensorBoard.
+        Only the first sample of the batch is drawn. Two images are logged:
+        ``{split}/past`` with the input frames in a single row, and
+        ``{split}/preds`` with one row per series (ground truth first, then
+        the predictions). In ensemble mode the prediction rows are the
+        ensemble mean followed by up to three individual members. All frames
+        are mapped from ``[-1, 1]`` to ``[0, 1]`` and colored with the
+        STEPS-BE radar colormap.
+        Parameters
+        ----------
+        past : torch.Tensor
+            Past input frames of shape ``(B, T_past, C, H, W)``.
+        future : torch.Tensor
+            Ground truth future frames of shape ``(B, T_future, C, H, W)``.
+        preds : torch.Tensor
+            Predicted frames of shape ``(B, T_future, C_or_E, H, W)``.
+        split : str, optional
+            Split name used as TensorBoard tag prefix. Default is ``'val'``.
+        """
+        first = 0  # index of the sample that gets visualized
+        is_ensemble = self.hparams.ensemble_size > 1
+        writer = self.logger.experiment
+        # --- past frames: a single row ---
+        past_sample = past[first]
+        if is_ensemble:
+            past_sample = past_sample.mean(dim=1, keepdim=True)
+        past_rgb = apply_radar_colormap((past_sample + 1) / 2)
+        past_grid = torchvision.utils.make_grid(past_rgb, nrow=past_sample.shape[0])
+        writer.add_image(f"{split}/past", past_grid, self.global_step)
+        # --- future vs. predictions: one row per series, one column per timestep ---
+        future_sample = future[first]  # (T, C, H, W)
+        preds_sample = preds[first]  # (T, E, H, W) or (T, C, H, W)
+        if is_ensemble:
+            # rows = [future, ensemble mean, member 0, member 1, ...]
+            shown_members = min(3, preds_sample.shape[1])
+            rows = [future_sample, preds_sample.mean(dim=1, keepdim=True)]
+            rows.extend(preds_sample[:, m : m + 1, :, :] for m in range(shown_members))
+        else:
+            # rows = [future, prediction]
+            rows = [future_sample, preds_sample]
+        stacked = torch.cat(rows, dim=0)  # (num_rows * T, C, H, W)
+        stacked_rgb = apply_radar_colormap((stacked + 1) / 2)
+        # nrow = T places each series on its own grid row
+        preds_grid = torchvision.utils.make_grid(stacked_rgb, nrow=future_sample.shape[0])
+        writer.add_image(f"{split}/preds", preds_grid, self.global_step)
+    def training_step(self, batch: dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        """
+        Compute the loss for one training batch.
+        Parameters
+        ----------
+        batch : dict of str to torch.Tensor
+            Training batch.
+        batch_idx : int
+            Position of the batch within the epoch (unused).
+        Returns
+        -------
+        loss : torch.Tensor
+            Training loss.
+        """
+        return self.shared_step(batch, split="train")
+    def validation_step(
+        self,
+        batch: dict[str, torch.Tensor],
+        batch_idx: int,
+    ) -> torch.Tensor:
+        """
+        Compute the loss for one validation batch.
+        Evaluation always runs with 10 ensemble members.
+        Parameters
+        ----------
+        batch : dict of str to torch.Tensor
+            Validation batch.
+        batch_idx : int
+            Position of the batch within the epoch (unused).
+        Returns
+        -------
+        loss : torch.Tensor
+            Validation loss.
+        """
+        return self.shared_step(batch, split="val", ensemble_size=10)
+    def test_step(self, batch: dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        """
+        Compute the loss for one test batch.
+        Evaluation always runs with 10 ensemble members.
+        Parameters
+        ----------
+        batch : dict of str to torch.Tensor
+            Test batch.
+        batch_idx : int
+            Position of the batch within the epoch (unused).
+        Returns
+        -------
+        loss : torch.Tensor
+            Test loss.
+        """
+        return self.shared_step(batch, split="test", ensemble_size=10)
+    def configure_optimizers(self) -> dict[str, Any]:
+        """
+        Build the optimizer and, if requested, the learning rate scheduler.
+        Without ``hparams.optimizer_class`` a default-configured Adam is
+        used. When ``hparams.lr_scheduler_class`` is given, the scheduler is
+        returned with ``val_loss`` as its monitored metric.
+        Returns
+        -------
+        config : dict
+            Dictionary with ``'optimizer'`` and optionally ``'lr_scheduler'``
+            keys, as expected by PyTorch Lightning.
+        """
+        optimizer_cls = self.hparams.optimizer_class
+        if optimizer_cls is None:
+            optimizer = torch.optim.Adam(self.parameters())
+            print("Using default Adam optimizer with default parameters.")
+        else:
+            if self.hparams.optimizer_params is None:
+                optimizer = optimizer_cls(self.parameters())
+            else:
+                optimizer = optimizer_cls(self.parameters(), **self.hparams.optimizer_params)
+            print(
+                f"Using optimizer: {self.hparams.optimizer_class.__name__} with params {self.hparams.optimizer_params}"
+            )
+        scheduler_cls = self.hparams.lr_scheduler_class
+        if scheduler_cls is None:
+            return {"optimizer": optimizer}
+        if self.hparams.lr_scheduler_params is None:
+            lr_scheduler = scheduler_cls(optimizer)
+        else:
+            lr_scheduler = scheduler_cls(optimizer, **self.hparams.lr_scheduler_params)
+        print(
+            f"Using LR scheduler: {self.hparams.lr_scheduler_class.__name__} with params {self.hparams.lr_scheduler_params}"
+        )
+        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": lr_scheduler, "monitor": "val_loss"}}
+    @classmethod
+    def from_checkpoint(cls, checkpoint_path: str, device: str = "cpu") -> "RadarLightningModel":
+        """
+        Restore a model from a Lightning checkpoint on disk.
+        The state dict is loaded strictly and the checkpoint is read with
+        ``weights_only=False``.
+        Parameters
+        ----------
+        checkpoint_path : str
+            Path to the ``.ckpt`` checkpoint file.
+        device : str, optional
+            Device the checkpoint weights are mapped to. Default is
+            ``'cpu'``.
+        Returns
+        -------
+        model : RadarLightningModel
+            Model with loaded weights.
+        """
+        target_device = torch.device(device)
+        model = cls.load_from_checkpoint(
+            checkpoint_path,
+            map_location=target_device,
+            strict=True,
+            weights_only=False,
+        )
+        return model
+    @classmethod
+    def from_pretrained(cls, repo_id: str, filename: str = "model.ckpt", device: str = "cpu") -> "RadarLightningModel":
+        """
+        Download a pretrained model from HuggingFace Hub and load it.
+        Thin wrapper around ``from_pretrained`` in the ``hub`` module.
+        Parameters
+        ----------
+        repo_id : str
+            HuggingFace Hub repository ID (e.g., ``'it4lia/irene'``).
+        filename : str, optional
+            Checkpoint file name inside the repository. Default is
+            ``'model.ckpt'``.
+        device : str, optional
+            Device the weights are mapped to. Default is ``'cpu'``.
+        Returns
+        -------
+        model : RadarLightningModel
+            Model with loaded pretrained weights.
+        """
+        # Imported lazily, under an alias so it is not confused with this method
+        from .hub import from_pretrained as _hub_from_pretrained
+        return _hub_from_pretrained(repo_id, filename, device)
+    def predict(
+        self, past: torch.Tensor | np.ndarray, forecast_steps: int = 1, ensemble_size: int | None = 1
+    ) -> np.ndarray:
+        """
+        Generate precipitation forecasts from past radar observations.
+        Handles padding, NaN removal, unit conversion, and reshaping
+        automatically. Input should be raw rain rate values. The module is
+        switched to evaluation mode (``self.eval()``) and is left in that
+        mode afterwards.
+        Parameters
+        ----------
+        past : np.ndarray or torch.Tensor
+            Past radar frames as rain rate in mm/h, of shape ``(T, H, W)``.
+            Tensors are detached and moved to CPU before preprocessing.
+        forecast_steps : int, optional
+            Number of future timesteps to forecast. Default is ``1``.
+        ensemble_size : int or None, optional
+            Number of ensemble members to generate. If ``None``, uses the
+            value from ``hparams``. Default is ``1``.
+        Returns
+        -------
+        preds : np.ndarray
+            Forecasted rain rate in mm/h, of shape
+            ``(ensemble_size, forecast_steps, H, W)``.
+        Raises
+        ------
+        ValueError
+            If ``past`` does not have exactly 3 dimensions.
+        """
+        if len(past.shape) != 3:
+            raise ValueError("Input must be of shape (T, H, W)")
+        # The preprocessing below is NumPy-based; convert tensors explicitly so
+        # GPU tensors and tensors that require grad are handled as well
+        if isinstance(past, torch.Tensor):
+            past = past.detach().cpu().numpy()
+        _, H, W = past.shape
+        ensemble_size = self.hparams.ensemble_size if ensemble_size is None else ensemble_size
+        # Each block of the model decreases the resolution by a factor of 2.
+        # H and W are padded up to the next multiple of 2**num_blocks (which
+        # also satisfies the weaker 2**(num_blocks - 1) divisibility).
+        divisor = 2 ** (self.hparams.num_blocks)
+        padH = (divisor - (H % divisor)) % divisor
+        padW = (divisor - (W % divisor)) % divisor
+        padded_past = past
+        if padH != 0 or padW != 0:
+            # Pad on the bottom/right only, so cropping [:H, :W] later undoes it
+            padded_past = np.pad(past, ((0, 0), (0, padH), (0, padW)), mode="constant", constant_values=0)
+        # Remove NaN (replaced by 0 mm/h)
+        past_clean = np.nan_to_num(padded_past)
+        # Reshape the input to (B, T, C, H, W)
+        past_clean = past_clean[np.newaxis, :, np.newaxis, ...]
+        # Rainrate to normalized reflectivity
+        norm_past = rainrate_to_normalized(past_clean)
+        # Numpy to torch tensor, on the module's device and in its dtype
+        # (e.g. float64 input would otherwise mismatch float32 weights)
+        x = torch.from_numpy(norm_past).to(device=self.device, dtype=self.dtype)
+        # Forward pass
+        self.eval()
+        with torch.no_grad():
+            preds = self.model(x, forecast_steps, self.hparams.noisy_decoder, ensemble_size)
+        # NumPy cannot represent bfloat16; upcast reduced-precision outputs
+        if preds.dtype in (torch.float16, torch.bfloat16):
+            preds = preds.float()
+        # Move to CPU and convert to a numpy array
+        preds = preds.cpu().numpy()
+        # Rescale back to rain rate
+        preds = normalized_to_rainrate(preds)
+        # Remove the batch (T, E, H, W)
+        preds = preds.squeeze(0)
+        # Swap the Time and Ensemble dimensions (E, T, H, W)
+        preds = np.swapaxes(preds, 0, 1)
+        # Remove the padding
+        preds = preds[..., :H, :W]
+        return preds

convgru_ensemble/losses.py ADDED Viewed

	@@ -0,0 +1,458 @@

+from typing import Any
+import torch
+from torch import nn
+class LossWithReduction(nn.Module):
+    """
+    Common base for loss modules that support a configurable reduction.
+    Parameters
+    ----------
+    reduction : str, optional
+        How the element-wise loss is collapsed: ``'mean'``, ``'sum'``, or
+        ``'none'`` (no reduction). Default is ``'mean'``.
+    """
+    def __init__(self, reduction: str = "mean"):
+        """
+        Validate and store the reduction mode.
+        Parameters
+        ----------
+        reduction : str, optional
+            How the element-wise loss is collapsed: ``'mean'``, ``'sum'``,
+            or ``'none'``. Default is ``'mean'``.
+        """
+        super().__init__()
+        assert reduction in ("mean", "sum", "none"), "reduction must be 'mean', 'sum', or 'none'"
+        self.reduction = reduction
+    def apply_reduction(self, loss: torch.Tensor) -> torch.Tensor:
+        """
+        Collapse a loss tensor according to ``self.reduction``.
+        Parameters
+        ----------
+        loss : torch.Tensor
+            Element-wise loss values.
+        Returns
+        -------
+        reduced_loss : torch.Tensor
+            The mean or sum of ``loss``, or ``loss`` itself for ``'none'``.
+        """
+        if self.reduction == "mean":
+            return loss.mean()
+        if self.reduction == "sum":
+            return loss.sum()
+        # 'none': hand the tensor back untouched
+        return loss
+class MaskedLoss(LossWithReduction):
+    """
+    Wrapper that applies a validity mask to an element-wise loss.
+    The wrapped loss is evaluated without reduction, multiplied by the mask,
+    and then reduced so that only valid entries contribute to the result.
+    Parameters
+    ----------
+    elementwise_loss : nn.Module
+        Base loss function to be masked. Must accept ``(preds, target)`` and
+        return element-wise (unreduced) loss. Should be instantiated with
+        ``reduction='none'``.
+    reduction : str, optional
+        Reduction mode applied after masking. Must be one of ``'mean'``,
+        ``'sum'``, or ``'none'``. Default is ``'mean'``.
+    """
+    def __init__(self, elementwise_loss: nn.Module, reduction: str = "mean"):
+        """
+        Initialize MaskedLoss.
+        Parameters
+        ----------
+        elementwise_loss : nn.Module
+            Base loss function to be masked. Must accept ``(preds, target)``
+            and return element-wise (unreduced) loss. Should be instantiated
+            with ``reduction='none'``.
+        reduction : str, optional
+            Reduction mode applied after masking. Must be one of ``'mean'``,
+            ``'sum'``, or ``'none'``. Default is ``'mean'``.
+        """
+        super().__init__(reduction=reduction)
+        self.elementwise_loss = elementwise_loss
+    def forward(self, preds: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        Compute masked loss.
+        Parameters
+        ----------
+        preds : torch.Tensor
+            Predictions of shape (B, T, C, *D).
+        target : torch.Tensor
+            Target of shape (B, T, C, *D).
+        mask : torch.Tensor
+            Mask of shape (B, T, C, *D)
+                       or (B, T, 1, *D)
+                       or (B, 1, 1, *D), with 1 for valid and 0 for invalid pixels.
+            The mask and the element-wise loss are broadcast against each
+            other, in either direction.
+        Returns
+        -------
+        loss : torch.Tensor
+            For ``'mean'``: sum of the masked loss divided by the number of
+            valid entries. For ``'sum'``: sum of the masked loss. For
+            ``'none'``: the masked element-wise loss tensor. If the mask
+            contains no valid entry, a scalar ``0.0`` is returned for every
+            reduction mode.
+        """
+        # Element-wise loss from the wrapped criterion.
+        elementwise_loss = self.elementwise_loss(preds, target)
+        # Multiplying broadcasts loss and mask to their common shape.
+        masked_loss = elementwise_loss * mask
+        # Count valid entries on the broadcast shape. Deriving the count from
+        # ``elementwise_loss.numel() // mask.numel()`` floors to 0 whenever the
+        # loss has fewer elements than the mask (e.g. a (B, 1, 1, *D) loss with
+        # a (B, T, 1, *D) mask), which made the loss silently collapse to 0.
+        valid_pixels = mask.expand_as(masked_loss).sum()
+        if valid_pixels > 0:
+            if self.reduction == "mean":
+                return masked_loss.sum() / valid_pixels
+            if self.reduction == "sum":
+                return masked_loss.sum()
+            # 'none': return the masked element-wise loss.
+            return masked_loss
+        # No valid entry at all. NOTE: this constant is a fresh tensor that is
+        # not connected to ``preds`` in the autograd graph.
+        return torch.tensor(0.0, device=preds.device)
+class CRPS(LossWithReduction):
+    r"""
+    Continuous Ranked Probability Score (CRPS) loss with optional temporal
+    consistency regularization.
+    CRPS = E[|X - y|] - 0.5 * E[|X - X'|], where X, X' are independent
+    samples from the forecast distribution and y is the observation.
+    Parameters
+    ----------
+    temporal_lambda : float, optional
+        Weight for the temporal consistency penalty. If ``0.0`` (default),
+        the penalty is disabled. When enabled, adds a penalty for large
+        differences between consecutive timesteps within each ensemble
+        member, preventing pulsing artifacts.
+    reduction : str, optional
+        Reduction mode. Must be one of ``'mean'``, ``'sum'``, or ``'none'``.
+        ``'mean'`` averages over batch and all non-ensemble dimensions.
+        Default is ``'mean'``.
+    Expected shapes
+    ---------------
+    preds : (B, T, M, \*D)
+        Ensemble predictions with time T on dim=1, ensemble size M on dim=2.
+    target : (B, T, C, \*D)
+        Deterministic target / analysis with channel C on dim=2 (should be 1).
+    """
+    def __init__(self, temporal_lambda: float = 0.0, reduction: str = "mean"):
+        """
+        Initialize CRPS loss.
+        Parameters
+        ----------
+        temporal_lambda : float, optional
+            Weight for the temporal consistency penalty. Default is ``0.0``
+            (disabled).
+        reduction : str, optional
+            Reduction mode. Must be one of ``'mean'``, ``'sum'``, or
+            ``'none'``. Default is ``'mean'``.
+        """
+        super().__init__(reduction=reduction)
+        self.temporal_lambda = temporal_lambda
+    def forward(self, preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """
+        Compute CRPS loss.
+        CRPS = E[|X - y|] - 0.5 * E[|X - X'|], where X, X' are independent
+        samples from the forecast distribution and y is the observation.
+        Parameters
+        ----------
+        preds : torch.Tensor
+            Ensemble forecasts of shape ``(B, T, M, *D)``, where B is batch
+            size, T is the number of timesteps, M is ensemble size, and
+            ``*D`` are spatial dimensions.
+        target : torch.Tensor
+            Verifying observation / analysis of shape ``(B, T, C, *D)``,
+            where C should be 1. Broadcasts against ``preds`` on dim=2.
+        Returns
+        -------
+        loss : torch.Tensor
+            Scalar if ``reduction='mean'`` or ``'sum'``, otherwise a tensor
+            of shape ``(B, T, 1, *D)`` (or ``(B, 1, 1, *D)`` when
+            ``temporal_lambda > 0``).
+        """
+        # First term, E[|X - y|]: each member against the target, which
+        # broadcasts over the ensemble dim, then averaged over members.
+        diff_to_target = torch.abs(preds - target)  # (B, T, M, *D)
+        term1 = diff_to_target.mean(dim=2)  # (B, T, *D)
+        # Second term, 0.5 * E[|X - X'|]: all M*M member pairs (the i == j
+        # pairs contribute zero but are part of the M*M normalisation).
+        preds_i = preds.unsqueeze(3)  # (B, T, M, 1, *D)
+        preds_j = preds.unsqueeze(2)  # (B, T, 1, M, *D)
+        pairwise_diff = torch.abs(preds_i - preds_j)  # (B, T, M, M, *D)
+        term2 = 0.5 * pairwise_diff.mean(dim=(2, 3))  # (B, T, *D)
+        crps = term1 - term2  # (B, T, *D)
+        if self.temporal_lambda > 0:
+            # The penalty is defined per pixel, so CRPS is averaged over time first.
+            crps = crps.mean(dim=1)  # (B, *D)
+            # A single timestep has no consecutive pair: the mean over the
+            # empty (T-1 == 0) slice would be NaN, so the penalty is skipped.
+            if preds.shape[1] > 1:
+                # Step-to-step change of every ensemble member.
+                temporal_diff = preds[:, 1:, :, ...] - preds[:, :-1, :, ...]  # (B, T-1, M, *D)
+                temporal_penalty = torch.abs(temporal_diff).mean(dim=(1, 2))  # (B, *D)
+                crps = crps + self.temporal_lambda * temporal_penalty
+            # Restore singleton time and channel dims: (B, 1, 1, *D).
+            crps = crps[:, None, None, ...]
+        else:
+            # Keep singleton channel dim for MaskedLoss compatibility: (B, T, 1, *D)
+            crps = crps.unsqueeze(2)
+        return self.apply_reduction(crps)
+class afCRPS(LossWithReduction):
+    r"""
+    Almost fair CRPS (afCRPS) loss as in eq. (4) of Lang et al. (2024).
+    Interpolates between the standard (energy-score style) CRPS and the
+    fair CRPS via the fairness parameter ``alpha``.
+    Parameters
+    ----------
+    alpha : float, optional
+        Fairness parameter in ``(0, 1]``. ``alpha=1`` recovers the fair
+        CRPS. Lang et al. (2024) recommend ``alpha=0.95``. Default is
+        ``0.95``.
+    temporal_lambda : float, optional
+        Weight for the temporal consistency penalty. If ``0.0`` (default),
+        the penalty is disabled. When enabled, adds a penalty for large
+        differences between consecutive timesteps within each ensemble
+        member.
+    reduction : str, optional
+        Reduction mode. Must be one of ``'mean'``, ``'sum'``, or ``'none'``.
+        ``'mean'`` averages over batch and all non-ensemble dimensions.
+        Default is ``'mean'``.
+    Expected shapes
+    ---------------
+    preds : (B, T, M, \*D)
+        Ensemble predictions with time T on dim=1, ensemble size M on dim=2.
+    target : (B, T, C, \*D)
+        Deterministic target / analysis with channel C on dim=2 (should be 1).
+    """
+    def __init__(self, alpha: float = 0.95, temporal_lambda: float = 0.0, reduction: str = "mean"):
+        """
+        Initialize afCRPS loss.
+        Parameters
+        ----------
+        alpha : float, optional
+            Fairness parameter in ``(0, 1]``. ``alpha=1`` recovers the fair
+            CRPS. Default is ``0.95``.
+        temporal_lambda : float, optional
+            Weight for the temporal consistency penalty. Default is ``0.0``
+            (disabled).
+        reduction : str, optional
+            Reduction mode. Must be one of ``'mean'``, ``'sum'``, or
+            ``'none'``. Default is ``'mean'``.
+        Raises
+        ------
+        ValueError
+            If ``alpha`` is not in ``(0, 1]``.
+        """
+        super().__init__(reduction=reduction)
+        if not (0.0 < alpha <= 1.0):
+            raise ValueError("alpha must be in (0, 1].")
+        self.alpha = alpha
+        self.temporal_lambda = temporal_lambda
+    def forward(self, preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """
+        Compute afCRPS over an ensemble.
+        Parameters
+        ----------
+        preds : torch.Tensor
+            (B, T, M, *D) ensemble forecasts.
+        target : torch.Tensor
+            (B, T, C, *D) verifying observation / analysis (C=1).
+        Returns
+        -------
+        loss : torch.Tensor
+            Scalar if ``reduction`` is ``'mean'`` or ``'sum'``, otherwise a
+            tensor of shape ``(B, T, 1, *D)`` (or ``(B, 1, 1, *D)`` when
+            ``temporal_lambda > 0``).
+        Raises
+        ------
+        ValueError
+            If ``preds`` has fewer than 3 dimensions, if the batch or time
+            dimensions of ``preds`` and ``target`` differ, or if the
+            ensemble size is smaller than 2.
+        """
+        if preds.dim() < 3:
+            raise ValueError("preds must have at least 3 dimensions.")
+        if target.shape[0] != preds.shape[0]:
+            raise ValueError("batch dimension of preds and target must match.")
+        if target.shape[1] != preds.shape[1]:
+            raise ValueError("time dimension of preds and target must match.")
+        # The 1 / (M - 1) normalisation below needs at least two members.
+        M = preds.shape[2]
+        if M < 2:
+            raise ValueError("Ensemble size M must be >= 2 for afCRPS.")
+        eps = (1.0 - self.alpha) / float(M)
+        # |x_j - y| : (B, T, M, *D) — target broadcasts over the ensemble dim.
+        abs_x_minus_y = (preds - target).abs()
+        # Pairwise layout over the ensemble dim: j on dim=2, k on dim=3.
+        x_j = preds.unsqueeze(3)  # (B, T, M, 1, *D)
+        x_k = preds.unsqueeze(2)  # (B, T, 1, M, *D)
+        abs_xj_minus_y = abs_x_minus_y.unsqueeze(3)
+        abs_xk_minus_y = abs_x_minus_y.unsqueeze(2)
+        abs_xj_minus_xk = (x_j - x_k).abs()  # (B, T, M, M, *D)
+        # Per (j, k) term: |x_j - y| + |x_k - y| - (1 - eps)|x_j - x_k|
+        term = abs_xj_minus_y + abs_xk_minus_y - (1.0 - eps) * abs_xj_minus_xk
+        # Zero the diagonal: the sum runs over k != j, and the j == k entries
+        # would otherwise contribute 2|x_j - y|.
+        idx = torch.arange(M, device=preds.device)
+        mask = idx[:, None] != idx[None, :]  # (M, M)
+        term = term * mask.view(1, 1, M, M, *([1] * (term.dim() - 4)))
+        # Sum over both ensemble dims and normalise by 2 M (M - 1).
+        summed = term.sum(dim=(2, 3))
+        afcrps = summed / (2.0 * M * (M - 1))  # (B, T, *D)
+        if self.temporal_lambda > 0:
+            # The penalty is defined per pixel, so the score is averaged over time first.
+            afcrps = afcrps.mean(dim=1)  # (B, *D)
+            # A single timestep has no consecutive pair: the mean over the
+            # empty (T-1 == 0) slice would be NaN, so the penalty is skipped.
+            if preds.shape[1] > 1:
+                # Step-to-step change of every ensemble member.
+                temporal_diff = preds[:, 1:, :, ...] - preds[:, :-1, :, ...]  # (B, T-1, M, *D)
+                temporal_penalty = torch.abs(temporal_diff).mean(dim=(1, 2))  # (B, *D)
+                afcrps = afcrps + self.temporal_lambda * temporal_penalty
+            # Restore singleton time and channel dims: (B, 1, 1, *D).
+            afcrps = afcrps[:, None, None, ...]
+        else:
+            # Keep singleton channel dim for MaskedLoss compatibility: (B, T, 1, *D)
+            afcrps = afcrps.unsqueeze(2)
+        return self.apply_reduction(afcrps)
+# Registry of loss classes keyed by the lowercase string names that ``build_loss`` accepts.
+PIXEL_LOSSES = {"mse": nn.MSELoss, "mae": nn.L1Loss, "crps": CRPS, "afcrps": afCRPS}
+def build_loss(
+    loss_class: type | str,
+    loss_params: dict[str, Any] | None = None,
+    masked_loss: bool = False,
+) -> nn.Module:
+    """
+    Instantiate a loss criterion, optionally wrapped for masking.
+    ``loss_class`` may be a class or a case-insensitive key of
+    ``PIXEL_LOSSES``. The class is instantiated with ``loss_params`` and,
+    when ``masked_loss`` is set, wrapped in :class:`MaskedLoss`.
+    Parameters
+    ----------
+    loss_class : type or str
+        Loss class, or one of the string names ``'mse'``, ``'mae'``,
+        ``'crps'``, ``'afcrps'``. ``None`` selects ``nn.MSELoss``.
+    loss_params : dict of str to any, optional
+        Keyword arguments for the loss constructor. The dict is copied and
+        never mutated. With ``masked_loss=True`` its ``'reduction'`` entry
+        (default ``'mean'``) is given to :class:`MaskedLoss` instead of the
+        inner loss. Default is ``None``.
+    masked_loss : bool, optional
+        If ``True``, build the inner loss with ``reduction='none'`` and wrap
+        it in :class:`MaskedLoss`. Default is ``False``.
+    Returns
+    -------
+    criterion : nn.Module
+        The instantiated (and possibly wrapped) loss module.
+    Raises
+    ------
+    ValueError
+        If ``loss_class`` is a string that is not a key of ``PIXEL_LOSSES``.
+    """
+    # Resolve the class to instantiate.
+    if loss_class is None:
+        loss_class = nn.MSELoss  # default
+        print("No loss_class provided, using default MSELoss.")
+    elif isinstance(loss_class, str):
+        key = loss_class.lower()
+        if key not in PIXEL_LOSSES:
+            raise ValueError(f"Unknown loss class '{loss_class}'. Available: {list(PIXEL_LOSSES.keys())}")
+        loss_class = PIXEL_LOSSES[key]
+    # Work on a copy so the caller's dict is left untouched by the pop below.
+    params = None if loss_params is None else loss_params.copy()
+    # Plain (unmasked) criterion.
+    if not masked_loss:
+        if params is None:
+            criterion = loss_class()
+            print(f"Using loss: {loss_class.__name__} with default params")
+        else:
+            criterion = loss_class(**params)
+            print(f"Using custom loss: {loss_class.__name__} with params {params}")
+        return criterion
+    # Masked criterion without user parameters.
+    if params is None:
+        criterion = MaskedLoss(loss_class(reduction="none"), reduction="mean")
+        print(f"Using masked loss: {loss_class.__name__} with default params and reduction 'mean'")
+        return criterion
+    # Masked criterion with user parameters: the reduction belongs to the
+    # wrapper, while the inner loss must stay element-wise.
+    reduction = params.pop("reduction", "mean")
+    criterion = MaskedLoss(loss_class(reduction="none", **params), reduction=reduction)
+    print(f"Using masked loss: {loss_class.__name__} with params {params} and reduction {reduction}")
+    return criterion

convgru_ensemble/model.py ADDED Viewed

	@@ -0,0 +1,569 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class ResidualConvBlock(nn.Module):
+    """
+    Two-convolution block with an additive skip connection.
+    The input passes through ``conv1``, a ReLU and ``conv2``; the skip path
+    is added and a final ReLU is applied. The skip path is the identity
+    unless the channel count changes, in which case a 1x1 convolution
+    projects the input to ``out_channels``.
+    Parameters
+    ----------
+    in_channels : int
+        Number of input channels.
+    out_channels : int
+        Number of output channels.
+    kernel_size : int, optional
+        Kernel size for both convolutions. Default is ``3``.
+    padding : int, optional
+        Padding for both convolutions. Default is ``1``.
+    """
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3, padding: int = 1):
+        """
+        Build the two convolutions and, if needed, the skip projection.
+        Parameters
+        ----------
+        in_channels : int
+            Number of input channels.
+        out_channels : int
+            Number of output channels.
+        kernel_size : int, optional
+            Kernel size for both convolutions. Default is ``3``.
+        padding : int, optional
+            Padding for both convolutions. Default is ``1``.
+        """
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, padding=padding)
+        # A projection is only required when the skip path changes channel count.
+        channels_differ = in_channels != out_channels
+        self.proj = nn.Conv2d(in_channels, out_channels, kernel_size=1) if channels_differ else None
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the residual block.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, C_in, H, W)``.
+        Returns
+        -------
+        out : torch.Tensor
+            Output tensor of shape ``(B, C_out, H, W)``.
+        """
+        # Main path: conv -> ReLU -> conv.
+        features = self.conv2(F.relu(self.conv1(x)))
+        # Skip path: identity, or the 1x1 projection when channels differ.
+        shortcut = x if self.proj is None else self.proj(x)
+        features += shortcut
+        return F.relu(features)
+class ConvGRUCell(nn.Module):
+    """
+    Single-step GRU cell whose gates are 2D convolutions.
+    The update and reset gates are produced by one shared convolution over
+    the concatenated input and hidden state; a second convolution produces
+    the candidate state.
+    Parameters
+    ----------
+    input_size : int
+        Number of channels in the input tensor.
+    hidden_size : int
+        Number of channels in the hidden state.
+    kernel_size : int, optional
+        Kernel size for the convolutional gates. Default is ``3``.
+    conv_layer : nn.Module, optional
+        Convolutional layer class to use. Default is ``nn.Conv2d``.
+    """
+    def __init__(self, input_size: int, hidden_size: int, kernel_size: int = 3, conv_layer: nn.Module = nn.Conv2d):
+        """
+        Create the gate convolutions.
+        Parameters
+        ----------
+        input_size : int
+            Number of channels in the input tensor.
+        hidden_size : int
+            Number of channels in the hidden state.
+        kernel_size : int, optional
+            Kernel size for the convolutional gates. Default is ``3``.
+        conv_layer : nn.Module, optional
+            Convolutional layer class to use. Default is ``nn.Conv2d``.
+        """
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        joint_channels = input_size + hidden_size
+        half_kernel = kernel_size // 2
+        # One convolution yields both update and reset gates (2 * hidden_size channels).
+        self.combined_gates = conv_layer(joint_channels, 2 * hidden_size, kernel_size, padding=half_kernel)
+        self.out_gate = conv_layer(joint_channels, hidden_size, kernel_size, padding=half_kernel)
+    def forward(self, inpt: torch.Tensor | None = None, h_s: torch.Tensor | None = None) -> torch.Tensor:
+        """
+        Advance the cell by one timestep.
+        A missing input or hidden state is replaced by zeros whose batch and
+        spatial sizes, dtype and device are taken from the other argument.
+        Parameters
+        ----------
+        inpt : torch.Tensor or None, optional
+            Input tensor of shape ``(B, input_size, H, W)``. Default is
+            ``None``.
+        h_s : torch.Tensor or None, optional
+            Hidden state tensor of shape ``(B, hidden_size, H, W)``. Default
+            is ``None``.
+        Returns
+        -------
+        new_state : torch.Tensor
+            Updated hidden state of shape ``(B, hidden_size, H, W)``.
+        Raises
+        ------
+        ValueError
+            If both ``inpt`` and ``h_s`` are ``None``.
+        """
+        if inpt is None and h_s is None:
+            raise ValueError("Both input and state can't be None")
+        if h_s is None:
+            # Zero hidden state matching the input's batch/spatial sizes, dtype and device.
+            h_s = inpt.new_zeros(inpt.size(0), self.hidden_size, inpt.size(2), inpt.size(3))
+        elif inpt is None:
+            # Zero input matching the hidden state's batch/spatial sizes, dtype and device.
+            inpt = h_s.new_zeros(h_s.size(0), self.input_size, h_s.size(2), h_s.size(3))
+        # Both gates come from one convolution; split its output channel-wise.
+        gate_logits = self.combined_gates(torch.cat([inpt, h_s], dim=1))
+        update_logits, reset_logits = torch.chunk(gate_logits, 2, dim=1)
+        update = torch.sigmoid(update_logits)
+        reset = torch.sigmoid(reset_logits)
+        # Candidate state sees the hidden state scaled by the reset gate.
+        candidate = torch.tanh(self.out_gate(torch.cat([inpt, h_s * reset], dim=1)))
+        # Blend previous state and candidate with the update gate.
+        return h_s * (1 - update) + candidate * update
+class ConvGRU(nn.Module):
+    """
+    Convolutional GRU that unrolls a :class:`ConvGRUCell` over a sequence.
+    Parameters
+    ----------
+    input_size : int
+        Number of channels in the input tensor.
+    hidden_size : int
+        Number of channels in the hidden state.
+    kernel_size : int, optional
+        Kernel size for the convolutional gates. Default is ``3``.
+    conv_layer : nn.Module, optional
+        Convolutional layer class to use. Default is ``nn.Conv2d``.
+    """
+    def __init__(self, input_size: int, hidden_size: int, kernel_size: int = 3, conv_layer: nn.Module = nn.Conv2d):
+        """
+        Initialize ConvGRU.
+        Parameters
+        ----------
+        input_size : int
+            Number of channels in the input tensor.
+        hidden_size : int
+            Number of channels in the hidden state.
+        kernel_size : int, optional
+            Kernel size for the convolutional gates. Default is ``3``.
+        conv_layer : nn.Module, optional
+            Convolutional layer class to use. Default is ``nn.Conv2d``.
+        """
+        super().__init__()
+        self.cell = ConvGRUCell(input_size, hidden_size, kernel_size, conv_layer)
+    def forward(self, x: torch.Tensor | None = None, h: torch.Tensor | None = None) -> torch.Tensor:
+        """
+        Unroll the ConvGRU cell over the sequence (time) dimension.
+        .. code-block:: text
+               x[:, 0]              x[:, 1]
+                  |                    |
+                  v                    v
+               *------*             *------*
+        h -->  | Cell | --> h_0 --> | Cell | --> h_1 ...
+               *------*             *------*
+        The number of steps is the size of ``x`` along dim=1, so ``x`` is
+        required even though it defaults to ``None``. If ``h`` is ``None``
+        it is passed as ``None`` to the cell for the first step.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, input_size, H, W)``.
+        h : torch.Tensor or None, optional
+            Initial hidden state of shape ``(B, hidden_size, H, W)``. Default
+            is ``None``.
+        Returns
+        -------
+        hidden_states : torch.Tensor
+            Stacked hidden states of shape ``(B, T, hidden_size, H, W)``,
+            i.e. ``[h_0, h_1, h_2, ...]``.
+        Raises
+        ------
+        ValueError
+            If ``x`` is ``None``.
+        """
+        # Without x there is no time dimension to unroll over; fail with a
+        # clear message instead of an AttributeError on ``None.size``.
+        if x is None:
+            raise ValueError("x can't be None: its time dimension defines the number of steps")
+        h_s = []
+        # Feed one timestep at a time, threading the hidden state through.
+        for x_t in x.unbind(dim=1):
+            h = self.cell(x_t, h)
+            h_s.append(h)
+        return torch.stack(h_s, dim=1)
+class EncoderBlock(nn.Module):
+    """
+    Encoder stage: a :class:`ConvGRU` followed by 2x pixel-unshuffle.
+    The ConvGRU keeps the channel count (hidden size equals input size);
+    ``nn.PixelUnshuffle(2)`` then halves both spatial dimensions and
+    multiplies the channel count by four.
+    Parameters
+    ----------
+    input_size : int
+        Number of input channels.
+    kernel_size : int, optional
+        Kernel size for the ConvGRU. Default is ``3``.
+    conv_layer : nn.Module, optional
+        Convolutional layer class to use. Default is ``nn.Conv2d``.
+    """
+    def __init__(self, input_size: int, kernel_size: int = 3, conv_layer: nn.Module = nn.Conv2d):
+        """
+        Create the recurrent layer and the downsampling layer.
+        Parameters
+        ----------
+        input_size : int
+            Number of input channels.
+        kernel_size : int, optional
+            Kernel size for the ConvGRU. Default is ``3``.
+        conv_layer : nn.Module, optional
+            Convolutional layer class to use. Default is ``nn.Conv2d``.
+        """
+        super().__init__()
+        # Hidden size equals input size, so only the unshuffle changes channels.
+        self.convgru = ConvGRU(input_size, input_size, kernel_size, conv_layer)
+        self.down = nn.PixelUnshuffle(2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Run the recurrent layer, then downsample.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        Returns
+        -------
+        out : torch.Tensor
+            Downsampled tensor of shape ``(B, T, C*4, H/2, W/2)``.
+        """
+        recurrent_out = self.convgru(x)
+        return self.down(recurrent_out)
+class Encoder(nn.Module):
+    r"""
+    Stack of :class:`EncoderBlock` stages forming the ConvGRU encoder.
+    Each stage halves the spatial resolution via pixel-unshuffle, so the
+    channel count grows by a factor of four per stage.
+    .. code-block:: text
+         ///    Encoder Block 1    \\\                ///    Encoder Block 2    \\\
+     /--------------------------------------------\ /---------------------------------------\
+    |                                              |                                         |
+    *        *---------*      *-----------------*  *   *---------*      *-----------------*  *
+        X -> | ConvGRU | ---> | Pixel Unshuffle | ---> | ConvGRU | ---> | Pixel Unshuffle | ---> ...
+        |    *---------*  |   *-----------------*  |   *---------*  |   *-----------------*  |
+        v                 v                        v                v                        v
+      (b,t,c,h,w)      (b,t,c,h,w)          (b,t,c*4,h/2,w/2) (b,t,c*4,h/2,w/2)    (b,t,c*16,h/4,w/4)
+    Parameters
+    ----------
+    input_channels : int, optional
+        Number of input channels. Default is ``1``.
+    num_blocks : int, optional
+        Number of encoder blocks to stack. Default is ``4``.
+    **kwargs
+        Additional keyword arguments forwarded to each :class:`EncoderBlock`.
+    """
+    def __init__(self, input_channels: int = 1, num_blocks: int = 4, **kwargs):
+        """
+        Build one :class:`EncoderBlock` per stage.
+        Parameters
+        ----------
+        input_channels : int, optional
+            Number of input channels. Default is ``1``.
+        num_blocks : int, optional
+            Number of encoder blocks to stack. Default is ``4``.
+        **kwargs
+            Additional keyword arguments forwarded to each
+            :class:`EncoderBlock`.
+        """
+        super().__init__()
+        # Input channels of each stage; with the defaults: [1, 4, 16, 64].
+        self.channel_sizes = [input_channels * 4**stage for stage in range(num_blocks)]
+        stages = [EncoderBlock(channels, **kwargs) for channels in self.channel_sizes]
+        self.blocks = nn.ModuleList(stages)
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
+        """
+        Pass the input through every stage and collect each stage's output.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        Returns
+        -------
+        hidden_states : list of torch.Tensor
+            Output of each block, in block order, with progressively reduced
+            spatial dimensions:
+            ``[(B, T, C*4, H/2, W/2), (B, T, C*16, H/4, W/4), ...]``.
+        """
+        collected = []
+        current = x
+        for stage in self.blocks:
+            current = stage(current)
+            collected.append(current)
+        return collected
+class DecoderBlock(nn.Module):
+    """
+    Decoder stage: a :class:`ConvGRU` followed by 2x pixel-shuffle.
+    ``nn.PixelShuffle(2)`` doubles both spatial dimensions and divides the
+    channel count by four.
+    Parameters
+    ----------
+    input_size : int
+        Number of input channels.
+    hidden_size : int
+        Number of hidden channels for the ConvGRU.
+    kernel_size : int, optional
+        Kernel size for the ConvGRU. Default is ``3``.
+    conv_layer : nn.Module, optional
+        Convolutional layer class to use. Default is ``nn.Conv2d``.
+    """
+    def __init__(self, input_size: int, hidden_size: int, kernel_size: int = 3, conv_layer: nn.Module = nn.Conv2d):
+        """
+        Create the recurrent layer and the upsampling layer.
+        Parameters
+        ----------
+        input_size : int
+            Number of input channels.
+        hidden_size : int
+            Number of hidden channels for the ConvGRU.
+        kernel_size : int, optional
+            Kernel size for the ConvGRU. Default is ``3``.
+        conv_layer : nn.Module, optional
+            Convolutional layer class to use. Default is ``nn.Conv2d``.
+        """
+        super().__init__()
+        self.convgru = ConvGRU(input_size, hidden_size, kernel_size, conv_layer)
+        self.up = nn.PixelShuffle(2)
+    def forward(self, x: torch.Tensor, hidden_state: torch.Tensor) -> torch.Tensor:
+        """
+        Run the recurrent layer from the given initial state, then upsample.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        hidden_state : torch.Tensor
+            Initial hidden state for the ConvGRU (taken from the
+            corresponding encoder block), of shape
+            ``(B, hidden_size, H, W)``.
+        Returns
+        -------
+        out : torch.Tensor
+            Upsampled tensor of shape ``(B, T, hidden_size // 4, H*2, W*2)``.
+        """
+        recurrent_out = self.convgru(x, hidden_state)
+        return self.up(recurrent_out)
+class Decoder(nn.Module):
+    r"""
+    Stack of :class:`DecoderBlock` stages forming the ConvGRU decoder.
+    Each stage doubles the spatial resolution via pixel-shuffle. The
+    per-stage channel counts are derived from ``output_channels`` so that
+    the last stage emits exactly that many channels.
+    Parameters
+    ----------
+    output_channels : int, optional
+        Number of output channels. Default is ``1``.
+    num_blocks : int, optional
+        Number of decoder blocks to stack. Default is ``4``.
+    **kwargs
+        Additional keyword arguments forwarded to each :class:`DecoderBlock`.
+    """
+    def __init__(self, output_channels: int = 1, num_blocks: int = 4, **kwargs):
+        """
+        Build one :class:`DecoderBlock` per stage.
+        Parameters
+        ----------
+        output_channels : int, optional
+            Number of output channels. Default is ``1``.
+        num_blocks : int, optional
+            Number of decoder blocks to stack. Default is ``4``.
+        **kwargs
+            Additional keyword arguments forwarded to each
+            :class:`DecoderBlock`.
+        """
+        super().__init__()
+        # Channels per stage, largest first; with the defaults: [256, 64, 16, 4].
+        self.channel_sizes = [output_channels * 4**power for power in range(num_blocks, 0, -1)]
+        # Each stage uses the same size for its input and hidden channels.
+        stages = [DecoderBlock(channels, channels, **kwargs) for channels in self.channel_sizes]
+        self.blocks = nn.ModuleList(stages)
+    def forward(self, x: torch.Tensor, hidden_states: list[torch.Tensor]) -> torch.Tensor:
+        """
+        Pass the input through every stage, one hidden state per stage.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        hidden_states : list of torch.Tensor
+            Hidden states from the encoder (in reverse order), one per block.
+            The list length must equal the number of blocks.
+        Returns
+        -------
+        out : torch.Tensor
+            Output tensor of shape
+            ``(B, T, output_channels, H * 2^num_blocks, W * 2^num_blocks)``.
+        """
+        out = x
+        # strict=True makes a block/state count mismatch an error instead of a silent truncation.
+        for stage, initial_state in zip(self.blocks, hidden_states, strict=True):
+            out = stage(out, initial_state)
+        return out
+class EncoderDecoder(nn.Module):
+    """
+    Full encoder-decoder model for spatio-temporal forecasting.
+    Encodes an input sequence into multi-scale hidden states and decodes
+    them into a forecast sequence, optionally generating multiple ensemble
+    members via noisy decoder inputs.
+    Parameters
+    ----------
+    channels : int, optional
+        Number of input/output channels. Default is ``1``.
+    num_blocks : int, optional
+        Number of encoder and decoder blocks. Default is ``4``.
+    **kwargs
+        Additional keyword arguments forwarded to :class:`Encoder` and
+        :class:`Decoder`.
+    """
+    def __init__(self, channels: int = 1, num_blocks: int = 4, **kwargs):
+        """
+        Initialize EncoderDecoder.
+        Parameters
+        ----------
+        channels : int, optional
+            Number of input/output channels. Default is ``1``.
+        num_blocks : int, optional
+            Number of encoder and decoder blocks. Default is ``4``.
+        **kwargs
+            Additional keyword arguments forwarded to :class:`Encoder` and
+            :class:`Decoder`.
+        """
+        super().__init__()
+        self.encoder = Encoder(channels, num_blocks, **kwargs)
+        self.decoder = Decoder(channels, num_blocks, **kwargs)
+    def forward(self, x: torch.Tensor, steps: int, noisy_decoder: bool = False, ensemble_size: int = 1) -> torch.Tensor:
+        """
+        Forward the encoder-decoder model.
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape ``(B, T, C, H, W)``.
+        steps : int
+            Number of future timesteps to forecast.
+        noisy_decoder : bool, optional
+            If ``True``, feed random noise (instead of zeros) as input to the
+            decoder. Default is ``False``.
+        ensemble_size : int, optional
+            Number of ensemble members to generate. When ``> 1``, the decoder
+            is always run with noisy inputs. Default is ``1``.
+        Returns
+        -------
+        preds : torch.Tensor
+            Forecast tensor. Shape is ``(B, steps, C, H, W)`` when
+            ``ensemble_size == 1``, or
+            ``(B, steps, ensemble_size * C, H, W)`` when ``ensemble_size > 1``
+            (for C=1, this is ``(B, steps, ensemble_size, H, W)``).
+        """
+        # encode the input tensor into a sequence of hidden states
+        encoded = self.encoder(x)
+        # create a tensor with the same shape as the last hidden state of the encoder to use as a input for the decoder
+        x_dec_shape = list(encoded[-1].shape)
+        # set the desired number of timestep for the output
+        x_dec_shape[1] = steps
+        # collect all the last hidden states of the encoder blocks in reverse order
+        last_hidden_per_block = [e[:, -1] for e in reversed(encoded)]
+        if ensemble_size > 1:
+            # Generate M ensemble members by running decoder M times with different noise
+            preds = []
+            for _ in range(ensemble_size):
+                # the input will be random noise for each ensemble member
+                x_dec = torch.randn(x_dec_shape, dtype=encoded[-1].dtype, device=encoded[-1].device)
+                # decode (unroll) the input hidden states into a forecast sequence of N timesteps
+                decoded = self.decoder(x_dec, last_hidden_per_block)
+                preds.append(decoded)
+            # stack along channel/ensemble dimension: (B, T, M, H, W)
+            return torch.cat(preds, dim=2)
+        else:
+            # the input will be of random values if noisy_decoder is True, otherwise with zeros
+            x_dec_func = torch.randn if noisy_decoder else torch.zeros
+            # create the input tensor for the decoder
+            x_dec = x_dec_func(x_dec_shape, dtype=encoded[-1].dtype, device=encoded[-1].device)
+            # decode (unroll) the input hidden states into a forecast sequence of N timesteps
+            decoded = self.decoder(x_dec, last_hidden_per_block)
+            return decoded

convgru_ensemble/py.typed ADDED Viewed

File without changes

convgru_ensemble/serve.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""FastAPI inference server for ConvGRU-Ensemble nowcasting model."""
+import io
+import os
+import time
+from contextlib import asynccontextmanager
+import numpy as np
+import xarray as xr
+from fastapi import FastAPI, File, Query, UploadFile
+from fastapi.responses import Response
+# Module-level model handle: assigned by ``lifespan`` at startup and reset to None at shutdown.
+_model = None
+def _load_model():
+    from .lightning_model import RadarLightningModel
+    device = os.environ.get("DEVICE", "cpu")
+    checkpoint = os.environ.get("MODEL_CHECKPOINT")
+    hub_repo = os.environ.get("HF_REPO_ID")
+    if hub_repo:
+        return RadarLightningModel.from_pretrained(hub_repo, device=device)
+    elif checkpoint:
+        return RadarLightningModel.from_checkpoint(checkpoint, device=device)
+    else:
+        raise RuntimeError("Set MODEL_CHECKPOINT or HF_REPO_ID environment variable.")
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Application lifespan handler: load the model before serving, drop it afterwards.
+    Parameters
+    ----------
+    app : FastAPI
+        The application instance (not used inside the handler).
+    """
+    global _model
+    # Startup: a failure in _load_model propagates from here.
+    _model = _load_model()
+    yield
+    # Shutdown: drop the module-level reference to the model.
+    _model = None
+# ASGI application; ``lifespan`` loads the model at startup and clears it at shutdown.
+app = FastAPI(
+    title="ConvGRU-Ensemble Nowcasting API",
+    version="0.1.0",
+    description="Ensemble precipitation nowcasting from radar data",
+    lifespan=lifespan,
+)
+@app.get("/health")
+async def health():
+    """Health check endpoint."""
+    return {"status": "ok", "model_loaded": _model is not None}
+@app.get("/model/info")
+async def model_info():
+    """Return model metadata."""
+    if _model is None:
+        return {"error": "Model not loaded"}
+    hp = _model.hparams
+    return {
+        "architecture": "ConvGRU-Ensemble EncoderDecoder",
+        "input_channels": hp.input_channels,
+        "num_blocks": hp.num_blocks,
+        "forecast_steps": hp.forecast_steps,
+        "ensemble_size": hp.ensemble_size,
+        "noisy_decoder": hp.noisy_decoder,
+        "loss_class": str(hp.loss_class),
+        "device": str(_model.device),
+    }
+@app.post("/predict")
+async def predict(
+    file: UploadFile = File(..., description="NetCDF file with rain rate data (T, H, W)"),  # noqa: B008
+    variable: str = Query("RR", description="Name of the rain rate variable"),  # noqa: B008
+    forecast_steps: int = Query(12, ge=1, le=48, description="Number of future timesteps"),  # noqa: B008
+    ensemble_size: int = Query(10, ge=1, le=50, description="Number of ensemble members"),  # noqa: B008
+):
+    """
+    Run ensemble nowcasting inference on uploaded NetCDF data.
+    Accepts a NetCDF file containing past radar rain rate observations and
+    returns NetCDF predictions with ensemble forecasts.
+    """
+    t0 = time.perf_counter()
+    # Read uploaded NetCDF
+    content = await file.read()
+    ds = xr.open_dataset(io.BytesIO(content))
+    if variable not in ds:
+        available = list(ds.data_vars)
+        return Response(
+            content=f"Variable '{variable}' not found. Available: {available}",
+            status_code=422,
+        )
+    data = ds[variable].values
+    if data.ndim != 3:
+        return Response(
+            content=f"Expected 3D data (T, H, W), got shape {data.shape}",
+            status_code=422,
+        )
+    past = data.astype(np.float32)
+    # Run inference
+    preds = _model.predict(past, forecast_steps=forecast_steps, ensemble_size=ensemble_size)
+    elapsed = time.perf_counter() - t0
+    # Build output NetCDF
+    ds_out = xr.Dataset(
+        {
+            "precipitation_forecast": xr.DataArray(
+                data=preds,
+                dims=["ensemble_member", "forecast_step", "y", "x"],
+                attrs={"units": "mm/h", "long_name": "Ensemble precipitation forecast"},
+            ),
+        },
+        attrs={
+            "model": "ConvGRU-Ensemble",
+            "forecast_steps": forecast_steps,
+            "ensemble_size": ensemble_size,
+            "elapsed_seconds": f"{elapsed:.3f}",
+        },
+    )
+    buf = io.BytesIO()
+    ds_out.to_netcdf(buf)
+    buf.seek(0)
+    return Response(
+        content=buf.getvalue(),
+        media_type="application/x-netcdf",
+        headers={
+            "Content-Disposition": "attachment; filename=predictions.nc",
+            "X-Elapsed-Seconds": f"{elapsed:.3f}",
+        },
+    )

convgru_ensemble/train.py ADDED Viewed

	@@ -0,0 +1,316 @@

+# train.py
+import os
+import sys
+from datetime import datetime
+import fiddle as fdl
+import torch
+import yaml
+from absl import app, flags
+from fiddle import absl_flags, printing
+from pytorch_lightning import Trainer, seed_everything
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
+from .datamodule import RadarDataModule
+from .lightning_model import RadarLightningModel
+from .losses import PIXEL_LOSSES
+# Fix the global random seed as soon as this module is imported, for reproducibility.
+seed_everything(42, workers=True)
+# Command-line flags handled in ``main`` before any training starts.
+FLAGS = flags.FLAGS
+flags.DEFINE_bool("print_config", False, "Print configuration and exit.")
+flags.DEFINE_string("export_yaml", None, "Export configuration to YAML file and exit.")
+def experiment() -> fdl.Config:
+    """
+    Define the default experiment configuration.
+    Returns a Fiddle config that can be overridden from the command line
+    with ``--config config:experiment --config set:path.to.value=X``.
+    Fields left as ``None`` for checkpoint ``dirpath``/``filename`` and the
+    TensorBoard ``name``/``version`` are filled in by :func:`train`.
+    Returns
+    -------
+    cfg : fdl.Config
+        Nested Fiddle configuration containing datamodule, model, trainer,
+        callbacks, and logger settings.
+    """
+    cfg = fdl.Config(dict)
+    # path of a checkpoint to load the model from (None starts from scratch)
+    cfg.checkpoint_path = None
+    # value passed to torch.set_float32_matmul_precision (None leaves the PyTorch default)
+    cfg.float32_matmul_precision = None
+    # compile model with torch.compile if desired
+    cfg.compile_model = False
+    # DataModule
+    cfg.datamodule = fdl.Config(
+        RadarDataModule,
+        zarr_path="./data/italian-radar-dpc-sri.zarr",
+        csv_path="./importance_sampler/output/sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000.csv",
+        # total timesteps per sample; train() derives past steps as steps - forecast_steps
+        steps=18,
+        train_ratio=0.90,
+        val_ratio=0.05,
+        return_mask=True,
+        deterministic=False,
+        augment=True,
+        # DataLoader params
+        batch_size=16,
+        num_workers=8,
+        pin_memory=True,
+        multiprocessing_context="fork",
+    )
+    # Lightning Model
+    cfg.model = fdl.Config(
+        RadarLightningModel,
+        input_channels=1,
+        forecast_steps=12,
+        num_blocks=5,
+        ensemble_size=2,
+        noisy_decoder=False,
+        loss_class="crps",
+        loss_params={"temporal_lambda": 0.01},
+        masked_loss=True,
+        optimizer_class=torch.optim.Adam,
+        optimizer_params={"lr": 1e-4, "fused": True},
+        lr_scheduler_class=torch.optim.lr_scheduler.ReduceLROnPlateau,
+        lr_scheduler_params={"mode": "min", "factor": 0.5, "patience": 10},
+    )
+    # Trainer
+    cfg.trainer = fdl.Config(
+        Trainer,
+        accelerator="auto",
+        # gradient_clip_val=1.0,
+        max_epochs=1,
+    )
+    # Callbacks (keyed by name; train() looks up checkpoint_val / checkpoint_train by key)
+    cfg.callbacks = fdl.Config(dict)
+    cfg.callbacks.checkpoint_val = fdl.Config(
+        ModelCheckpoint,
+        monitor="val_loss",
+        save_top_k=1,
+        mode="min",
+        dirpath=None,
+        filename=None,  # Set dynamically: best-val-{ckpt_name}
+        save_last=False,
+    )
+    cfg.callbacks.checkpoint_train = fdl.Config(
+        ModelCheckpoint,
+        monitor="train_loss_epoch",
+        save_top_k=1,
+        mode="min",
+        dirpath=None,
+        filename=None,  # Set dynamically: best-train-{ckpt_name}
+        save_last=False,
+    )
+    cfg.callbacks.early_stopping = fdl.Config(
+        EarlyStopping,
+        monitor="val_loss",
+        patience=100,
+        mode="min",
+    )
+    cfg.callbacks.lr_monitor = fdl.Config(
+        LearningRateMonitor,
+        logging_interval="step",
+        log_momentum=False,
+        log_weight_decay=False,
+    )
+    # Loggers
+    cfg.loggers = fdl.Config(dict)
+    cfg.loggers.tensorboard = fdl.Config(
+        TensorBoardLogger,
+        save_dir="logs",
+        name=None,  # Set dynamically in train()
+        version=None,  # Set dynamically in train()
+    )
+    return cfg
+# ``--config`` flag; config functions such as ``experiment`` are resolved in this module by default.
+_CONFIG = absl_flags.DEFINE_fiddle_config(
+    "config",
+    default_module=sys.modules[__name__],
+    help_string="Experiment configuration.",
+)
+def train(cfg: fdl.Config) -> None:
+    """
+    Run training with the given Fiddle configuration.
+    Builds all components (model, datamodule, trainer, callbacks, loggers),
+    sets up dynamic naming for checkpoints and TensorBoard logs, saves the
+    config as YAML, and runs ``trainer.fit`` followed by ``trainer.test``.
+    Note that ``cfg`` is modified in place: logger name/version, checkpoint
+    paths and the trainer's ``logger``/``callbacks`` are written into it.
+    Parameters
+    ----------
+    cfg : fdl.Config
+        Fiddle configuration as returned by :func:`experiment`.
+    """
+    # apply the requested float32 matmul precision, if one was configured
+    if cfg.float32_matmul_precision is not None:
+        torch.set_float32_matmul_precision(cfg.float32_matmul_precision)
+    # Compute dynamic values for naming
+    future_steps = cfg.model.forecast_steps
+    past_steps = cfg.datamodule.steps - future_steps
+    # Resolve a readable loss name: None -> "MSELoss", a class -> its __name__,
+    # a string -> the registered PIXEL_LOSSES class name, or the string itself.
+    if cfg.model.loss_class is None:
+        loss_name = "MSELoss"
+    elif isinstance(cfg.model.loss_class, type):
+        loss_name = cfg.model.loss_class.__name__
+    else:
+        loss_name = (
+            PIXEL_LOSSES[cfg.model.loss_class.lower()].__name__
+            if cfg.model.loss_class.lower() in PIXEL_LOSSES
+            else str(cfg.model.loss_class)
+        )
+    lr = (
+        cfg.model.optimizer_params["lr"]
+        if cfg.model.optimizer_params is not None and "lr" in cfg.model.optimizer_params
+        else "default"
+    )
+    noise_str: str = "_noise" if cfg.model.noisy_decoder else ""
+    ckpt_base_name: str = f"{past_steps}past-{future_steps}fut{noise_str}_bs{cfg.datamodule.batch_size}_lr{lr}"
+    # Set dynamic logger name and version first (checkpoint folder depends on it)
+    if cfg.loggers.tensorboard.name is None:
+        cfg.loggers.tensorboard.name = f"{loss_name}_{past_steps}past-{future_steps}fut{noise_str}"
+    jobid = os.getenv("SLURM_JOB_ID", None)
+    # A user-supplied version is kept as a suffix of the generated one.
+    tb_version = f"_{cfg.loggers.tensorboard.version}" if cfg.loggers.tensorboard.version is not None else ""
+    # Prefix with the SLURM job id when available, otherwise with a timestamp.
+    if jobid is not None:
+        cfg.loggers.tensorboard.version = f"job{jobid}_{ckpt_base_name}{tb_version}"
+    else:
+        cfg.loggers.tensorboard.version = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{ckpt_base_name}{tb_version}"
+    # Set checkpoint paths inside tensorboard experiment folder
+    tb_log_dir = f"{cfg.loggers.tensorboard.save_dir}/{cfg.loggers.tensorboard.name}/{cfg.loggers.tensorboard.version}"
+    ckpt_dir = f"{tb_log_dir}/checkpoints"
+    # Val checkpoint
+    if cfg.callbacks.checkpoint_val.dirpath is None:
+        cfg.callbacks.checkpoint_val.dirpath = ckpt_dir
+    if cfg.callbacks.checkpoint_val.filename is None:
+        cfg.callbacks.checkpoint_val.filename = "best-val-" + ckpt_base_name + "_ep{epoch:03d}_loss{val_loss:.4f}"
+    # Train checkpoint
+    if cfg.callbacks.checkpoint_train.dirpath is None:
+        cfg.callbacks.checkpoint_train.dirpath = ckpt_dir
+    if cfg.callbacks.checkpoint_train.filename is None:
+        cfg.callbacks.checkpoint_train.filename = (
+            "best-train-" + ckpt_base_name + "_ep{epoch:03d}_loss{train_loss_epoch:.4f}"
+        )
+    # Build callbacks and loggers up front so these exact instances can be
+    # handed to the trainer and queried again after training.
+    callbacks_dict = fdl.build(cfg.callbacks)
+    loggers_dict = fdl.build(cfg.loggers)
+    callbacks = list(callbacks_dict.values())
+    loggers = list(loggers_dict.values())
+    # Add loggers and callbacks to trainer config
+    cfg.trainer.logger = loggers
+    cfg.trainer.callbacks = callbacks
+    print(printing.as_str_flattened(cfg))
+    # Save config to tensorboard folder
+    os.makedirs(tb_log_dir, exist_ok=True)
+    config_path = f"{tb_log_dir}/config.yaml"
+    with open(config_path, "w") as f:
+        yaml.dump(config_to_dict(cfg), f, default_flow_style=False, sort_keys=False)
+    print(f"Config saved to {config_path}")
+    # Build all components
+    built = fdl.build(cfg)
+    datamodule: RadarDataModule = built["datamodule"]
+    # NOTE(review): only the model is restored from the checkpoint; trainer.fit below
+    # is called without ckpt_path, so trainer state is not resumed here - confirm intended.
+    if cfg.checkpoint_path is not None:
+        print(f"Resuming training from checkpoint: {cfg.checkpoint_path}")
+        model = RadarLightningModel.load_from_checkpoint(cfg.checkpoint_path, strict=True, weights_only=False)
+    else:
+        model = built["model"]
+    trainer: Trainer = built["trainer"]
+    # Explicit setup so the dataset sizes can be reported before fitting.
+    datamodule.setup()
+    print(
+        f"Train: {len(datamodule.train_dataset)}, Val: {len(datamodule.val_dataset)}, Test: {len(datamodule.test_dataset)}"
+    )
+    if cfg.compile_model:
+        print("Compiling model with torch.compile()...")
+        model = torch.compile(model, dynamic=True)
+    trainer.fit(model, datamodule=datamodule)
+    # The test run uses the in-memory model as it is after fit.
+    trainer.test(model, datamodule=datamodule)
+    print(f"Best val: {callbacks_dict['checkpoint_val'].best_model_path}")
+    print(f"Best train: {callbacks_dict['checkpoint_train'].best_model_path}")
+def config_to_dict(cfg: fdl.Config) -> dict:
+    """
+    Recursively convert a Fiddle config to a nested dictionary.
+    Parameters
+    ----------
+    cfg : fdl.Config
+        Fiddle configuration object.
+    Returns
+    -------
+    result : dict
+        Plain dictionary suitable for YAML serialization.
+    """
+    result = {}
+    for key, value in fdl.ordered_arguments(cfg).items():
+        result[key] = config_to_dict(value) if isinstance(value, fdl.Config) else value
+    return result
+def main(argv: list[str]) -> None:
+    """
+    Entry point for the training script.
+    Handles ``--print_config`` and ``--export_yaml`` flags, then delegates
+    to :func:`train`.
+    Parameters
+    ----------
+    argv : list of str
+        Command-line arguments (unused, consumed by ``absl``).
+    """
+    del argv
+    cfg = _CONFIG.value
+    if FLAGS.print_config:
+        print(printing.as_str_flattened(cfg))
+        return
+    if FLAGS.export_yaml:
+        cfg_dict = config_to_dict(cfg)
+        with open(FLAGS.export_yaml, "w") as f:
+            yaml.dump(cfg_dict, f, default_flow_style=False, sort_keys=False)
+        print(f"Config exported to {FLAGS.export_yaml}")
+        return
+    train(cfg)
+# Example command to run training with custom configuration overrides.
+# The module uses package-relative imports, so run it with ``-m`` from the repository root.
+# uv run python -m convgru_ensemble.train \
+#     --config config:experiment \
+#     --config set:callbacks.checkpoint_val.save_top_k=3 \
+#     --config set:model.num_blocks=5 \
+#     --config set:model.forecast_steps=12 \
+#     --config set:datamodule.steps=18 \
+#     --config set:datamodule.num_workers=32 \
+#     --config set:datamodule.batch_size=32
+# absl parses the command-line flags (including --config) before calling main.
+if __name__ == "__main__":
+    app.run(main)

convgru_ensemble/utils.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import numpy as np
+def rainrate_to_reflectivity(rainrate: np.ndarray) -> np.ndarray:
+    """
+    Convert rain rate to reflectivity using the Marshall-Palmer relationship.
+    Applies Z = 200 * R^1.6 and converts to dBZ. Values below ~0.037 mm/h
+    are clipped to 0 dBZ; values above 60 dBZ are clipped to 60.
+    Parameters
+    ----------
+    rainrate : np.ndarray
+        Rain rate in mm/h. Can be any shape.
+    Returns
+    -------
+    reflectivity : np.ndarray
+        Reflectivity in dBZ, clipped to [0, 60]. Same shape as input.
+    """
+    epsilon = 1e-16
+    # We return 0 for any rain lighter than ~0.037mm/h
+    return (10 * np.log10(200 * rainrate**1.6 + epsilon)).clip(0, 60)
+def normalize_reflectivity(reflectivity: np.ndarray) -> np.ndarray:
+    """
+    Normalize reflectivity from [0, 60] dBZ to [-1, 1].
+    Parameters
+    ----------
+    reflectivity : np.ndarray
+        Reflectivity in dBZ, expected in [0, 60]. Can be any shape.
+    Returns
+    -------
+    normalized : np.ndarray
+        Normalized reflectivity in [-1, 1]. Same shape as input.
+    """
+    return (reflectivity / 30.0) - 1.0
+def denormalize_reflectivity(normalized: np.ndarray) -> np.ndarray:
+    """
+    Denormalize from [-1, 1] back to [0, 60] dBZ reflectivity.
+    Parameters
+    ----------
+    normalized : np.ndarray
+        Normalized reflectivity in [-1, 1]. Can be any shape.
+    Returns
+    -------
+    reflectivity : np.ndarray
+        Reflectivity in dBZ, in [0, 60]. Same shape as input.
+    """
+    return (normalized + 1.0) * 30.0
+def reflectivity_to_rainrate(reflectivity: np.ndarray) -> np.ndarray:
+    """
+    Convert reflectivity back to rain rate using the inverse Marshall-Palmer
+    relationship.
+    Applies R = (Z_linear / 200)^(1/1.6) where Z_linear = 10^(dBZ/10).
+    Parameters
+    ----------
+    reflectivity : np.ndarray
+        Reflectivity in dBZ. Can be any shape.
+    Returns
+    -------
+    rainrate : np.ndarray
+        Rain rate in mm/h. Same shape as input.
+    """
+    # Z = 200 * R^1.6
+    # R = (Z / 200)^(1/1.6)
+    z_linear = 10 ** (reflectivity / 10.0)
+    return (z_linear / 200.0) ** (1.0 / 1.6)
+def rainrate_to_normalized(rainrate: np.ndarray) -> np.ndarray:
+    """
+    Convert rain rate directly to normalized reflectivity.
+    Composes :func:`rainrate_to_reflectivity` and
+    :func:`normalize_reflectivity`.
+    Parameters
+    ----------
+    rainrate : np.ndarray
+        Rain rate in mm/h. Can be any shape.
+    Returns
+    -------
+    normalized : np.ndarray
+        Normalized reflectivity in [-1, 1]. Same shape as input.
+    """
+    reflectivity = rainrate_to_reflectivity(rainrate)
+    return normalize_reflectivity(reflectivity)
+def normalized_to_rainrate(normalized: np.ndarray) -> np.ndarray:
+    """
+    Convert normalized reflectivity back to rain rate.
+    Composes :func:`denormalize_reflectivity` and
+    :func:`reflectivity_to_rainrate`.
+    Parameters
+    ----------
+    normalized : np.ndarray
+        Normalized reflectivity in [-1, 1]. Can be any shape.
+    Returns
+    -------
+    rainrate : np.ndarray
+        Rain rate in mm/h. Same shape as input.
+    """
+    reflectivity = denormalize_reflectivity(normalized)
+    return reflectivity_to_rainrate(reflectivity)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,12 @@

+services:
+  api:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./checkpoints:/app/checkpoints
+    environment:
+      - MODEL_CHECKPOINT=/app/checkpoints/model.ckpt
+      - DEVICE=cpu
+      # Alternative: download from HuggingFace Hub
+      # - HF_REPO_ID=it4lia/irene

examples/sample_data.nc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a94b2f0c576f8e80cac470b1994bf8faed3eae11fb6fee6feba21e1a52dc0e7
+size 11232217

importance_sampler/filter_nan.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import argparse
+import os
+import sys
+import time
+from functools import partial
+from multiprocessing import Pool
+from queue import Queue
+from threading import Thread
+import numpy as np
+import pandas as pd
+import xarray as xr
+from tqdm import tqdm
+# Wall-clock timestamp taken when the script starts; presumably used to report total runtime - confirm at the end of the script.
+START = time.time()
+# === Functions ===
+def dim_nan_count(mask, dim, delta, dim_len):
+    """
+    Count NaN values in a rolling window along a single dimension.
+    Uses a cumulative-sum trick to efficiently compute the number of NaN
+    values within each window of size ``delta`` along the specified axis.
+    Parameters
+    ----------
+    mask : np.ndarray
+        3-D binary array where 1 indicates NaN and 0 indicates valid.
+    dim : int
+        Axis along which to compute the rolling NaN count (0, 1, or 2).
+    delta : int
+        Window size (number of pixels) along ``dim``.
+    dim_len : int
+        Length of ``dim`` in the input array.
+    Returns
+    -------
+    nan_counts : np.ndarray
+        Integer array with the NaN count for each window position.
+    """
+    cumsum = np.cumsum(mask, axis=dim, dtype=np.int32)
+    # Pad with zeros at the start along 'dim'
+    pad_width = [(1, 0) if i == dim else (0, 0) for i in range(3)]
+    padded_cumsum = np.pad(cumsum, pad_width=pad_width, mode="constant", constant_values=0)
+    # Rolling window: padded[start+delta:start+delta+dim_len] - padded[start:start+dim_len-delta]
+    slices_start = [slice(dim_len - delta) if i == dim else slice(None) for i in range(3)]
+    slices_end = [slice(delta, dim_len) if i == dim else slice(None) for i in range(3)]
+    # number of nans in each delta
+    return padded_cumsum[tuple(slices_end)] - padded_cumsum[tuple(slices_start)]
+def dc_nan_count(chunk, deltas, dim_lenghts):
+    """
+    Count the number of NaN values in each 3-D datacube within a chunk.
+    Applies :func:`dim_nan_count` sequentially along time, X, and Y to
+    compute the total NaN count for every possible datacube position.
+    Parameters
+    ----------
+    chunk : np.ndarray
+        3-D array of shape ``(T, X, Y)`` — a time chunk of the full Zarr
+        dataset.
+    deltas : tuple of int
+        Datacube dimensions ``(Dt, w, h)`` along time, X, and Y.
+    dim_lenghts : tuple of int
+        Shape of ``chunk``, i.e. ``(T, X, Y)``.
+    Returns
+    -------
+    nans_cube_chunk : np.ndarray
+        Integer array where ``nans_cube_chunk[it, ix, iy]`` is the number of
+        NaN values in the datacube ``chunk[it:it+Dt, ix:ix+w, iy:iy+h]``.
+    """
+    # Compute NaN mask and cumsum along time axis
+    nan_mask = np.isnan(chunk).astype(np.int16)
+    # Number of NaN along time
+    nans_t = dim_nan_count(nan_mask, dim=0, delta=deltas[0], dim_len=dim_lenghts[0])
+    # Number of NaN along X x T
+    nans_xt = dim_nan_count(nans_t, dim=1, delta=deltas[1], dim_len=dim_lenghts[1])
+    # Number of NaN in the datacube (Y x X x T)
+    nans_cube_chunk = dim_nan_count(nans_xt, dim=2, delta=deltas[2], dim_len=dim_lenghts[2])
+    return nans_cube_chunk
+def process_chunk(time_range, t_start_idx, data, N_nan, deltas, steps, valid_starts_gap, dc_nan_count):
+    """
+    Process a single time chunk and return valid datacube indices.
+    Loads the chunk from the Zarr array, counts NaN values per datacube,
+    filters by the maximum allowed NaN threshold and time-continuity
+    constraints, and returns the valid ``(t, x, y)`` indices.
+    Any exception is reported on stderr and terminates the process with
+    exit status 1.
+    Parameters
+    ----------
+    time_range : array-like of int
+        Two-element sequence ``(start_t, end_t)`` defining the chunk
+        boundaries (relative to ``t_start_idx``).
+    t_start_idx : int
+        Absolute time index corresponding to the dataset start date.
+    data : xr.DataArray
+        Zarr-backed data array with the ``'RR'`` variable.
+    N_nan : int
+        Maximum number of NaN values allowed per datacube.
+    deltas : tuple of int
+        Datacube dimensions ``(Dt, w, h)``.
+    steps : tuple of int
+        Stride ``(step_T, step_X, step_Y)`` for subsampling valid indices.
+    valid_starts_gap : np.ndarray
+        Array of valid time-start indices that have no temporal gaps.
+    dc_nan_count : callable
+        Function to count NaN values per datacube (see :func:`dc_nan_count`).
+        The parameter shadows the module-level function of the same name.
+    Returns
+    -------
+    idx_t : np.ndarray
+        Absolute time indices of valid datacubes.
+    idx_x : np.ndarray
+        X indices of valid datacubes.
+    idx_y : np.ndarray
+        Y indices of valid datacubes.
+    """
+    try:
+        # start_t: start index of the chunk
+        start_t, end_t = time_range
+        # Chunk from Zarr (T, X, Y)
+        chunk = data[start_t + t_start_idx : end_t + t_start_idx, :, :]
+        dim_lenghts = chunk.shape  # shape: (T, X, Y)
+        # Compute the number of NaNs in each datacube in chunk
+        nans_cube_chunk = dc_nan_count(chunk, deltas, dim_lenghts)
+        # Large intermediates are dropped as soon as they are no longer needed.
+        del chunk
+        # Keep datacubes with at most N_nan NaN values
+        valid_mask = nans_cube_chunk <= N_nan
+        del nans_cube_chunk
+        # These indices are relative to the chunk
+        idx_t_rel, idx_x, idx_y = np.where(valid_mask)
+        del valid_mask
+        # Cast to int32
+        idx_t_rel = idx_t_rel.astype(np.int32)
+        idx_x = idx_x.astype(np.int32)
+        idx_y = idx_y.astype(np.int32)
+        # Shift chunk-relative time indices by the chunk start (still relative to t_start_idx)
+        idx_t = idx_t_rel + start_t
+        # Keep only time indices in valid_starts_gap
+        time_mask = np.isin(idx_t, valid_starts_gap)
+        idx_t = idx_t[time_mask] + t_start_idx  # also convert to absolute index
+        idx_x = idx_x[time_mask]
+        idx_y = idx_y[time_mask]
+        # Filter datacube indices according to steps (the time stride is applied to the absolute index)
+        stride_mask = (idx_t % steps[0] == 0) & (idx_x % steps[1] == 0) & (idx_y % steps[2] == 0)
+        idx_x = idx_x[stride_mask]
+        idx_y = idx_y[stride_mask]
+        idx_t = idx_t[stride_mask]
+        return idx_t, idx_x, idx_y
+    except Exception as e:
+        # NOTE(review): start_t is unbound here if unpacking time_range itself failed, and
+        # sys.exit raises SystemExit inside whatever process runs this function - confirm
+        # this is the intended failure mode for pool workers.
+        print(f"Error processing chunk starting at t={start_t}: {e}", file=sys.stderr)
+        sys.exit(1)
+def file_writer(output_queue, filename, batch_size=1000):
+    """
+    Dedicated writer thread that flushes results to a CSV file in batches.
+    Reads ``(idx_t, idx_x, idx_y)`` tuples from the queue and writes them
+    as rows to the output file. Stops when a ``None`` sentinel is received.
+    Parameters
+    ----------
+    output_queue : queue.Queue
+        Thread-safe queue providing ``(idx_t, idx_x, idx_y)`` tuples.
+    filename : str
+        Path to the output CSV file.
+    batch_size : int, optional
+        Number of rows to buffer before flushing to disk. Default is
+        ``1000``.
+    """
+    with open(filename, "w") as f:
+        f.write("t,x,y\n")
+        batch = []
+        while True:
+            item = output_queue.get()
+            if item is None:  # Sentinel value to stop
+                # Write remaining batch
+                for t, x, y in batch:
+                    f.write(f"{t},{x},{y}\n")
+                break
+            batch.extend(zip(*item, strict=True))
+            if len(batch) >= batch_size:
+                for t, x, y in batch:
+                    f.write(f"{t},{x},{y}\n")
+                f.flush()
+                batch = []
+        print(f"Results saved to {filename}")
# === Parse Arguments ===
parser = argparse.ArgumentParser(description="Process valid datacubes from Zarr dataset")
parser.add_argument("zarr_path", help="Path to the Zarr dataset")
parser.add_argument("--start_date", default=None, type=str, help="Start date (YYYY-MM-DD)")
parser.add_argument("--end_date", default=None, type=str, help="End date (YYYY-MM-DD)")
parser.add_argument("--Dt", type=int, default=24, help="Time depth")
parser.add_argument("--w", type=int, default=256, help="Spatial width")
parser.add_argument("--h", type=int, default=256, help="Spatial height")
parser.add_argument("--step_T", type=int, default=3, help="Time step")
parser.add_argument("--step_X", type=int, default=16, help="X step")
parser.add_argument("--step_Y", type=int, default=16, help="Y step")
parser.add_argument("--n_workers", type=int, default=8, help="Number of parallel workers")
parser.add_argument("--n_nan", type=int, default=10000, help="Maximum NaNs per datacube")
args = parser.parse_args()
# Reject sizes/steps that could only fail later, deep inside the run: a zero
# step is used as a modulo divisor when the datacube indices are strided, and
# a worker count below one cannot create a process pool.
for _option in ("Dt", "w", "h", "step_T", "step_X", "step_Y", "n_workers"):
    if getattr(args, _option) < 1:
        parser.error(f"--{_option} must be a positive integer (got {getattr(args, _option)})")
# === PARAMETERS ===
Dt = args.Dt  # time depth
w = args.w  # x width
h = args.h  # y height
step_T = args.step_T
step_X = args.step_X
step_Y = args.step_Y
N_nan = args.n_nan  # maximum number of nans in each datacube
n_workers = args.n_workers
time_chunk_size = 3 * Dt  # number of time steps handed to a worker at once
# === Dataset Loading ===
print(f"Opening Zarr dataset: {args.zarr_path}")
try:
    zg = xr.open_zarr(args.zarr_path, decode_times=True)
    RR_full = zg["RR"]
    time_array_full = pd.to_datetime(zg["time"][:])
    # NOTE(review): the axes of "RR" are read positionally as (T, X, Y) — confirm against the store.
    print(f"Full dataset shape: T={RR_full.shape[0]}, X={RR_full.shape[1]}, Y={RR_full.shape[2]}")
    print(f"Full dataset time range: {time_array_full[0]} to {time_array_full[-1]}")
except Exception as e:
    # Report on stderr, like the other failure paths of this script.
    print(f"Error loading Zarr dataset: {e}", file=sys.stderr)
    sys.exit(1)
# Filter the dates (a missing bound defaults to the corresponding end of the dataset)
# NOTE(review): a bare YYYY-MM-DD end date parses to midnight, so later
# timestamps on that day are excluded by the `<=` below — confirm this is intended.
start_date = pd.to_datetime(args.start_date) if args.start_date else time_array_full[0]
end_date = pd.to_datetime(args.end_date) if args.end_date else time_array_full[-1]
# Find indices corresponding to date range
mask = (time_array_full >= start_date) & (time_array_full <= end_date)
valid_indices = np.where(mask)[0]
if len(valid_indices) == 0:
    print(f"No data found between {start_date} and {end_date}", file=sys.stderr)
    sys.exit(1)
# Half-open index range [t_start_idx, t_end_idx) of the selected time steps
t_start_idx = valid_indices[0]
t_end_idx = valid_indices[-1] + 1
# Slice the data
size_T = t_end_idx - t_start_idx
size_X = RR_full.shape[1]
size_Y = RR_full.shape[2]
time_array = time_array_full[t_start_idx:t_end_idx]
print(f"Filtered dataset shape: T={size_T}, X={size_X}, Y={size_Y}")
print(f"Filtered dataset time range: {time_array[0]} to {time_array[-1]}")
# Number of admissible datacube origins along each axis
# (the largest valid origin index is one less)
max_x = size_X - w + 1
max_y = size_Y - h + 1
max_t = size_T - Dt + 1
# === Time Continuity ===
print("Checking time continuity...")
try:
    expected_step = pd.Timedelta("00:05:00")
    time_diffs = time_array[1:] - time_array[:-1]
    # gaps[i] is 1 when the step from time i to time i + 1 is not the expected one
    gaps = (time_diffs != expected_step).astype(int)
    if size_T < Dt:
        # Not a single full window fits. np.convolve(mode="valid") would
        # silently swap its operands here and return meaningless sums.
        valid_starts_gap = np.array([], dtype=int)
    elif Dt > 1:
        # A window of Dt frames spans Dt - 1 consecutive steps; it is
        # continuous when none of those steps is a gap.
        window_sum = np.convolve(gaps, np.ones(Dt - 1, dtype=int), mode="valid")
        valid_starts_gap = np.where(window_sum == 0)[0]
    else:
        # A one-frame window contains no step, so every start is continuous
        # (np.convolve rejects the empty kernel this case would need).
        valid_starts_gap = np.arange(size_T)
    print(f"Found {len(valid_starts_gap)} valid time starts without gaps")
except Exception as e:
    print(f"Error in time continuity check: {e}", file=sys.stderr)
    sys.exit(1)
# === Chunked NaN Processing ===
# Memory per chunk (x 4: assumes 4-byte float32 values)
estimated_chunk_memory_gb = (time_chunk_size * size_X * size_Y * 4) / (1024**3)
print(f"Estimated memory per chunk: {estimated_chunk_memory_gb:.2f} GB")
print(f"Estimated total memory: {(estimated_chunk_memory_gb * n_workers):.2f} GB")
# Process time in chunks; each chunk is extended by Dt - 1 steps so that the
# datacubes starting near its end still fit entirely inside it
t_starts = np.arange(0, max_t, time_chunk_size)
t_ends = np.minimum(t_starts + time_chunk_size + Dt - 1, size_T)
t_pairs = np.stack((t_starts, t_ends), axis=1)
# Create partial function with fixed parameters
process_chunk_partial = partial(
    process_chunk,
    t_start_idx=t_start_idx,
    data=RR_full,
    N_nan=N_nan,
    deltas=(Dt, w, h),
    steps=(step_T, step_X, step_Y),
    valid_starts_gap=valid_starts_gap,
    dc_nan_count=dc_nan_count,
)
# Check if file exists
# When a date bound was not given on the command line, label the file with the
# date actually used instead of the literal "None", so the name keeps the
# <start>-<end> layout of YYYY-MM-DD dates.
start_label = args.start_date or start_date.strftime("%Y-%m-%d")
end_label = args.end_date or end_date.strftime("%Y-%m-%d")
output_file = f"valid_datacubes_{start_label}-{end_label}_{Dt}x{w}x{h}_{step_T}x{step_X}x{step_Y}_{N_nan}.csv"
if os.path.exists(output_file):
    response = input(f"File {output_file} already exists. Overwrite? (y/n): ")
    if response.lower() != "y":
        print("Exiting without overwriting.")
        sys.exit(0)
    else:
        print(f"Overwriting {output_file}...")
# Start writer thread (non-daemon, so it must always receive its stop sentinel)
output_queue = Queue(maxsize=100)
writer_thread = Thread(target=file_writer, args=(output_queue, output_file, 1000))
writer_thread.start()
try:
    # Process chunks in parallel; imap keeps the results in chunk order
    with Pool(n_workers) as pool:
        for hits in tqdm(
            pool.imap(process_chunk_partial, t_pairs, chunksize=1),
            total=len(t_starts),
            desc="Processing time chunks",
        ):
            output_queue.put(hits)
finally:
    # Signal writer thread to stop, even if processing failed; otherwise the
    # thread would wait on the queue forever and keep the process alive
    output_queue.put(None)
    writer_thread.join()
print(f"Done in {time.time() - START}s.")
sys.exit(0)

importance_sampler/output/sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

importance_sampler/output/sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000_metadata.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "csv": "valid_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000.csv",
+  "zarr": "/leonardo_scratch/fast/AI4W_forecast_2/italian-radar-dpc-sri.zarr",
+  "file": "sampled_datacubes_2021-01-01-2025-12-11_24x256x256_3x16x16_10000.csv",
+  "start_date": "2021-01-01",
+  "end_date": "2025-12-11",
+  "Dt": 24,
+  "w": 256,
+  "h": 256,
+  "step_T": 3,
+  "step_X": 16,
+  "step_Y": 16,
+  "N_nan": 10000,
+  "N_rand": 1,
+  "n_workers": 112,
+  "qmin": 0.0001,
+  "m": 0.1,
+  "s": 1,
+  "seed": null,
+  "timestamp": "2025-12-13 21:52:41"
+}

importance_sampler/sample_valid_datacubes.py ADDED Viewed

	@@ -0,0 +1,257 @@

+import argparse
+import json
+import os
+import sys
+import time
+from functools import partial
+from multiprocessing import Pool
+from queue import Queue
+from threading import Thread
+import numpy as np
+import pandas as pd
+import xarray as xr
+from tqdm import tqdm
START = time.time()  # wall-clock start, used for the final "Done in ..." report
SEED = None  # set to a non-negative int for reproducible sampling; None = non-deterministic
# === Parse Arguments ===
parser = argparse.ArgumentParser(description="Importance sampler of the valid datacubes (after the nan filtering)")
parser.add_argument("zarr_path", help="Path to the Zarr dataset")
parser.add_argument(
    "csv_path", help="Path to the CSV with the valid datacube coordinates (created by the nan filtering)"
)
parser.add_argument("--q_min", type=float, default=1e-4, help="Minimum selection probability (default 1e-4)")
parser.add_argument("--s", type=float, default=1, help="Denominator in the exponential")
parser.add_argument("--m", type=float, default=0.1, help="Factor weighting the mean rescaled rain rate (default 0.1)")
parser.add_argument("--n_workers", type=int, default=8, help="Number of parallel workers (default 8)")
parser.add_argument("--n_rand", type=int, default=1, help="Number of random sampling of each datacube (default 1)")
args = parser.parse_args()
# === PARAMETERS ===
s = args.s
qmin = args.q_min
m = args.m
n_workers = args.n_workers  # number of parallel workers
N_rand = args.n_rand  # number of random numbers per region
chunksize = 16000  # CSV rows read per pandas chunk (shared out among the workers)
# Parameters from CSV filename, expected to look like
#   valid_datacubes_<start>-<end>_<Dt>x<w>x<h>_<step_T>x<step_X>x<step_Y>_<N_nan>.csv
# Only the basename (without extension) is parsed, so underscores in a
# directory name cannot shift the fields.
name_arr = os.path.splitext(os.path.basename(args.csv_path))[0].split("_")
try:
    dates = name_arr[2]
    # Both dates are YYYY-MM-DD, so the first three "-" fields are the start date
    start_date = "-".join(dates.split("-")[0:3])
    end_date = "-".join(dates.split("-")[3:])
    Dt, w, h = (int(v) for v in name_arr[3].split("x"))
    step_T, step_X, step_Y = (int(v) for v in name_arr[4].split("x"))
    N_nan = int(name_arr[5])
except (IndexError, ValueError) as e:
    parser.error(f"Cannot read the datacube parameters from the CSV filename '{args.csv_path}': {e}")
+# === FUNCTIONS ===
def acceptance_probability(data):
    """
    Calculate the acceptance probability for importance sampling.

    The probability is ``min(1, q_min + m * mean(data))``, where ``q_min``
    and ``m`` are the module-level parameters ``qmin`` and ``m``. NaN
    values are ignored by the mean.

    Parameters
    ----------
    data : array-like
        Rescaled rain rate data for a single datacube.

    Returns
    -------
    q : float
        Acceptance probability, capped at ``1``. A datacube without any
        finite value is treated as having zero mean, i.e. it gets
        ``min(1, q_min)``.
    """
    mean_rate = float(np.nanmean(data))
    if np.isnan(mean_rate):
        # All-NaN (or empty) input: without this guard ``min(1.0, nan)``
        # evaluates to 1.0 and the datacube would always be accepted.
        mean_rate = 0.0
    return float(min(1.0, qmin + m * mean_rate))
def process_datacube(coord, RR, N_rand, seed, acceptance_probability):
    """
    Process a single space-time region for importance sampling.

    Loads the datacube, rescales rain rate as ``1 - exp(-RR / s)``, computes
    an acceptance probability, and performs ``N_rand`` random acceptance
    trials. The datacube extent ``(Dt, w, h)`` and the scale ``s`` are read
    from module-level parameters.

    Parameters
    ----------
    coord : array-like of int
        Three-element sequence ``(it, ix, iy)`` specifying the datacube
        origin.
    RR : xr.DataArray
        Rain rate data array from the Zarr dataset, indexed positionally
        as ``[t, x, y]``.
    N_rand : int
        Number of random acceptance trials per datacube.
    seed : int or None
        Random seed for reproducibility. If ``None``, non-deterministic.
        Otherwise it must be non-negative; it is combined with the
        datacube origin so that each datacube gets its own reproducible
        stream of random numbers.
    acceptance_probability : callable
        Function that takes a data array and returns a probability in
        ``[0, 1]``.

    Returns
    -------
    hits : list of tuple of int
        List of accepted ``(it, ix, iy)`` tuples (may contain duplicates
        if accepted multiple times). Empty if processing failed; the error
        is reported on stderr.
    """
    try:
        it, ix, iy = (int(c) for c in coord)
        # Load data from Zarr
        data = RR[it : it + Dt, ix : ix + w, iy : iy + h]
        data = 1 - np.exp(-data / s)
        # Calculate acceptance probability
        q = acceptance_probability(data)
        # Seeding with ``seed`` alone would hand every datacube the very same
        # random numbers; mixing in the origin keeps the draws reproducible
        # but independent between datacubes.
        rng = np.random.default_rng(None if seed is None else [seed, it, ix, iy])
        accepted_count = int(np.sum(rng.random(N_rand) <= q))
        # Return accepted hits
        return [(it, ix, iy)] * accepted_count
    except Exception as e:
        # Report ``coord`` itself: it/ix/iy are unbound if unpacking failed.
        print(f"Error processing region {coord}: {e}", file=sys.stderr)
        return []
def file_writer(output_queue, filename, batch_size=1000):
    """
    Dedicated writer thread that flushes results to a CSV file in batches.

    Reads lists of ``(t, x, y)`` tuples from the queue and writes them as
    CSV rows. Stops when a ``None`` sentinel is received.

    Parameters
    ----------
    output_queue : queue.Queue
        Thread-safe queue providing lists of ``(t, x, y)`` tuples.
    filename : str
        Path to the output CSV file. The file is truncated and a ``t,x,y``
        header line is written first.
    batch_size : int, optional
        Number of rows to buffer before flushing to disk. Default is
        ``1000``.
    """

    def _flush(handle, rows):
        # One writelines call per batch instead of one write call per row.
        handle.writelines(f"{t},{x},{y}\n" for t, x, y in rows)
        handle.flush()

    with open(filename, "w") as f:
        f.write("t,x,y\n")
        batch = []
        while True:
            item = output_queue.get()
            if item is None:  # Sentinel value to stop
                break
            batch.extend(item)
            if len(batch) >= batch_size:
                _flush(f, batch)
                batch = []
        # Write whatever is left after the sentinel.
        _flush(f, batch)
    # Reported only once the file has been closed.
    print(f"Results saved to {filename}")
# === Dataset Loading ===
print(f"Opening Zarr dataset: {args.zarr_path}")
try:
    # xr.open_zarr has no ``mode`` parameter, so none is passed here.
    zg = xr.open_zarr(args.zarr_path)
    RR = zg["RR"]
except Exception as e:
    print(f"Error loading Zarr dataset: {e}", file=sys.stderr)
    sys.exit(1)
# Check if file exists
output_file = f"sampled_datacubes_{start_date}-{end_date}_{Dt}x{w}x{h}_{step_T}x{step_X}x{step_Y}_{N_nan}.csv"
if os.path.exists(output_file):
    response = input(f"File {output_file} already exists. Overwrite? (y/n): ")
    if response.lower() != "y":
        print("Exiting without overwriting.")
        sys.exit(0)
    else:
        print(f"Overwriting {output_file}...")
# Save metadata. Done before the writer thread is started, so that a failure
# here cannot leave a non-daemon thread waiting on the queue.
metadata = {
    "csv": args.csv_path,
    "zarr": args.zarr_path,
    "file": output_file,
    "start_date": start_date,
    "end_date": end_date,
    "Dt": Dt,
    "w": w,
    "h": h,
    "step_T": step_T,
    "step_X": step_X,
    "step_Y": step_Y,
    "N_nan": N_nan,
    "N_rand": N_rand,
    "n_workers": n_workers,
    "qmin": qmin,
    "m": m,
    "s": s,
    "seed": SEED,
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
metadata_filename = output_file.replace(".csv", "_metadata.json")
with open(metadata_filename, "w") as f:
    json.dump(metadata, f, indent=2)
print(f"Saved run metadata to {metadata_filename}")
# Start writer thread (non-daemon, so it must always receive its stop sentinel)
output_queue = Queue(maxsize=100)
writer_thread = Thread(target=file_writer, args=(output_queue, output_file, 1000))
writer_thread.start()
# === IMPORTANCE SAMPLING ===
# Create partial function with fixed parameters
process_datacube_partial = partial(
    process_datacube, RR=RR, N_rand=N_rand, seed=SEED, acceptance_probability=acceptance_probability
)
pool_chunksize = max(1, chunksize // n_workers)
try:
    # The progress bar advances once per datacube (CSV row)
    with Pool(n_workers) as pool, tqdm(desc="Processing CSV chunks") as pbar:
        # Loading the CSV by chunks
        for chunk in pd.read_csv(
            args.csv_path,
            usecols=["t", "x", "y"],
            dtype={"t": "int32", "x": "int32", "y": "int32"},
            engine="c",
            chunksize=chunksize,
        ):
            for hits in pool.imap(process_datacube_partial, chunk.values, chunksize=pool_chunksize):
                if hits:
                    output_queue.put(hits)
                pbar.update(1)
finally:
    # Signal writer thread to stop, even if sampling failed; otherwise the
    # thread would wait on the queue forever and keep the process alive
    output_queue.put(None)
    writer_thread.join()
print(f"Done in {time.time() - START}s.")
sys.exit(0)

notebooks/test_pretrained_model.ipynb ADDED Viewed

	@@ -0,0 +1,172 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a5d812be",
+   "metadata": {},
+   "source": [
+    "# Test pretrained model\n",
+    "This notebook tests the pretrained model on a single datacube taken from the radar dataset (https://arcodatahub.com/datasets/datasets/italian-radar-dpc-sri.zarr)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19c9a668",
+   "metadata": {},
+   "outputs": [],
+   "source": "import matplotlib.animation as animation\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pysteps.visualization.precipfields as pysteps_plot\nimport xarray as xr\nfrom IPython.display import HTML\n\nfrom convgru_ensemble import RadarLightningModel"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3bd5ff98",
+   "metadata": {},
+   "source": "# Load radar data\nWe first load a sample of the Italian radar dataset provided in the `examples/` folder."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "008fb6ae",
+   "metadata": {},
+   "outputs": [],
+   "source": "radar = xr.open_dataarray('../examples/sample_data.nc')\nradar"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "92f738bc",
+   "metadata": {},
+   "source": [
+    "This contains 18 sequences of radar images covering the whole of Italy, from the 28th to the 29th of October 2024. This was one of the most intense precipitation events over Italy during 2024."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fcb5529a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create figure\n",
+    "fig, ax = plt.subplots(figsize=(4, 4.5))\n",
+    "\n",
+    "def update(frame):\n",
+    "    ax.clear()\n",
+    "    data = radar.isel(time=frame)\n",
+    "    pysteps_plot.plot_precip_field(data.values, ax=ax, colorbar=False)\n",
+    "    ax.set_title(f'Precipitation - {data.time.values}')\n",
+    "    return ax,\n",
+    "\n",
+    "# Create animation\n",
+    "ani = animation.FuncAnimation(fig, update, frames=len(radar.time),\n",
+    "                             interval=500, blit=False, repeat=True)\n",
+    "\n",
+    "# Display in notebook\n",
+    "display(HTML(ani.to_jshtml()))\n",
+    "plt.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e806b20a",
+   "metadata": {},
+   "source": [
+    "# Initialize the model \n",
+    "Initialize the model and load the weights from the checkpoint. You can change the number of future steps (forecast steps) and the ensemble size (ensemble_size). The other hyperparameters are fixed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8ce3ff4",
+   "metadata": {},
+   "outputs": [],
+   "source": "# Set model's parameters\nforecast_steps   = 12\nensemble_size    = 10\n\n# Initialize the model and load the checkpoint\n# Option A: Load from local checkpoint\nmodel = RadarLightningModel.from_checkpoint(checkpoint_path=\"../checkpoints/ConvGRU-CRPS_6past_12fut.ckpt\")\n\n# Option B: Load from HuggingFace Hub\n# model = RadarLightningModel.from_pretrained(\"it4lia/irene\")"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f6fc1f9",
+   "metadata": {},
+   "source": [
+    "# Run the inference\n",
+    "We can run the inference and plot the forecast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6d340b2",
+   "metadata": {},
+   "outputs": [],
+   "source": "# Past and future steps\npast_steps = 7\nforecast_steps = 12\npast, future = radar[:past_steps], radar[past_steps:past_steps+forecast_steps]\n\n# Predict the future rainrate\npred = model.predict(past, forecast_steps, ensemble_size=2)"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0705829",
+   "metadata": {},
+   "source": [
+    "### Plot the forecast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b0646f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensemble mean\n",
+    "ensemble_mean = np.nanmean(pred, axis=0)\n",
+    "\n",
+    "# Initialize plots\n",
+    "fig, axs = plt.subplots(1, 4, figsize=(16,4.5))\n",
+    "row_labels = ['Ground Truth', 'Ensemble Mean', 'Member 1', 'Member 2']\n",
+    "data_sources = [future, ensemble_mean, pred[0], pred[1]]\n",
+    "\n",
+    "# Plot initial frame\n",
+    "# (the plotting helper is reached through the `pysteps_plot` alias imported in the first cell)\n",
+    "for i, (ax, label, data) in enumerate(zip(axs, row_labels, data_sources)):\n",
+    "    pysteps_plot.plot_precip_field(data[0], ax=ax, units='mm/h', colorscale='pysteps')\n",
+    "    ax.set_title(label, fontsize=14)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "\n",
+    "# Animation function\n",
+    "def update(frame):\n",
+    "    for i, (ax, data) in enumerate(zip(axs, data_sources)):\n",
+    "        ax.clear()\n",
+    "        pysteps_plot.plot_precip_field(data[frame], ax=ax, units='mm/h', colorscale='pysteps', colorbar=False)\n",
+    "        ax.set_title(f'{row_labels[i]} - Step {frame}', fontsize=14)\n",
+    "    return axs\n",
+    "\n",
+    "# Create animation (`animation` is the matplotlib.animation module imported in the first cell)\n",
+    "anim = animation.FuncAnimation(fig, update, frames=forecast_steps, interval=500, blit=False)\n",
+    "\n",
+    "# Display\n",
+    "display(HTML(anim.to_jshtml()))\n",
+    "plt.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,76 @@

+[build-system]
+requires = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "ConvGRU-Ensemble"
+version = "0.1.0"
+description = "Ensemble precipitation nowcasting using Convolutional GRU networks"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "absl-py>=2.3.1",
+    "einops>=0.8.1",
+    "etils>=1.13.0",
+    "fiddle>=0.3.0",
+    "fire>=0.7.1",
+    "importlib-resources>=6.5.2",
+    "jupyterlab>=4.5.5",
+    "matplotlib>=3.10.8",
+    "numpy>=2.3.5",
+    "pandas>=2.3.3",
+    "pysteps>=1.19.0",
+    "pytorch-lightning>=2.6.0",
+    "pyyaml>=6.0.3",
+    "tqdm>=4.67.1",
+    "tensorboard>=2.20.0",
+    "torch>=2.9.1",
+    "torchvision>=0.24.1",
+    "xarray>=2025.12.0",
+    "zarr>=3.1.5",
+    "huggingface_hub>=0.27.0",
+]
+[project.optional-dependencies]
+serve = [
+    "fastapi>=0.115.0",
+    "uvicorn>=0.34.0",
+    "python-multipart>=0.0.18",
+]
+[project.scripts]
+convgru-ensemble = "convgru_ensemble.cli:main"
+[dependency-groups]
+dev = [
+    "pytest>=8.3.5",
+    "pre-commit>=4.0.0",
+    "ruff>=0.9.10",
+]
+[tool.setuptools]
+packages = ["convgru_ensemble"]
+[tool.ruff]
+target-version = "py313"
+line-length = 120
+extend-exclude = ["notebooks/", "importance_sampler/"]
+[tool.ruff.lint]
+select = [
+    "E",    # pycodestyle errors
+    "W",    # pycodestyle warnings
+    "F",    # pyflakes
+    "I",    # isort
+    "UP",   # pyupgrade
+    "B",    # flake8-bugbear
+    "SIM",  # flake8-simplify
+    "NPY",  # numpy-specific rules
+]
+ignore = [
+    "E501",   # line too long (handled by formatter)
+    "SIM108", # ternary operator (readability preference)
+]
+[tool.ruff.lint.isort]
+known-first-party = ["convgru_ensemble"]

scripts/upload_to_hub.py ADDED Viewed

	@@ -0,0 +1,35 @@

+#!/usr/bin/env python
+"""Upload a trained ConvGRU-Ensemble model to HuggingFace Hub."""
+import argparse
+from pathlib import Path
def main():
    """Parse the command line and push a checkpoint to the HuggingFace Hub.

    When ``--model-card`` is not given, the ``MODEL_CARD.md`` located one
    directory above this script is used if it exists.
    """
    cli = argparse.ArgumentParser(description="Upload model to HuggingFace Hub")
    cli.add_argument("--checkpoint", required=True, help="Path to .ckpt checkpoint file")
    cli.add_argument("--repo-id", required=True, help="HuggingFace repo ID (e.g., it4lia/irene)")
    cli.add_argument("--model-card", default=None, help="Path to model card markdown file")
    cli.add_argument("--private", action="store_true", help="Create a private repository")
    options = cli.parse_args()

    # Fall back to the default model card when none was requested
    card_path = options.model_card
    if card_path is None:
        fallback_card = Path(__file__).parent.parent / "MODEL_CARD.md"
        card_path = str(fallback_card) if fallback_card.exists() else None

    # Imported here rather than at module level, as in the original script
    from convgru_ensemble.hub import push_to_hub

    upload_kwargs = {
        "checkpoint_path": options.checkpoint,
        "repo_id": options.repo_id,
        "model_card_path": card_path,
        "private": options.private,
    }
    url = push_to_hub(**upload_kwargs)
    print(f"Model uploaded: {url}")
+if __name__ == "__main__":
+    main()

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import numpy as np
+import pytest
+from convgru_ensemble.lightning_model import RadarLightningModel
@pytest.fixture
def model_small():
    """Return a two-block ``RadarLightningModel`` that is cheap enough for unit tests."""
    settings = {
        "input_channels": 1,
        "num_blocks": 2,
        "forecast_steps": 2,
        "ensemble_size": 2,
        "noisy_decoder": False,
    }
    return RadarLightningModel(**settings)
@pytest.fixture
def sample_rain_rate():
    """Return a reproducible synthetic ``(T, H, W)`` rain-rate field."""
    generator = np.random.default_rng(42)
    unit_field = generator.random((4, 16, 16), dtype=np.float32)
    # Scale the [0, 1) samples to 0-10 mm/h
    return unit_field * 10.0

tests/test_hub.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from unittest.mock import MagicMock, patch
+from convgru_ensemble.hub import from_pretrained
@patch("convgru_ensemble.hub.hf_hub_download")
@patch("convgru_ensemble.hub.RadarLightningModel", create=True)
def test_from_pretrained_calls_hf_hub_download(mock_model_cls, mock_download):
    """``from_pretrained`` must request the checkpoint from the Hub exactly once."""
    mock_download.return_value = "/tmp/cached/model.ckpt"
    # Also patch the class where it is defined — presumably ``from_pretrained``
    # imports it inside the function body (verify against hub.py).
    lightning_patch = patch("convgru_ensemble.lightning_model.RadarLightningModel")
    with lightning_patch as lightning_cls:
        lightning_cls.from_checkpoint.return_value = MagicMock()
        from_pretrained("it4lia/irene", filename="model.ckpt", device="cpu")
    mock_download.assert_called_once_with(repo_id="it4lia/irene", filename="model.ckpt")

tests/test_inference.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import numpy as np
+import xarray as xr
+from convgru_ensemble.lightning_model import RadarLightningModel
def test_inference_on_sample_data():
    """End-to-end inference on the sample NetCDF with a freshly initialized model."""
    from pathlib import Path

    # Locate the sample relative to this file so the test does not depend on
    # the directory pytest is started from.
    sample_path = Path(__file__).resolve().parent.parent / "examples" / "sample_data.nc"
    # Use 6 past frames, full spatial extent. Only those frames are read, and
    # the dataset is closed as soon as they are in memory.
    with xr.open_dataset(sample_path) as ds:
        past = ds["RR"][:6].values.astype(np.float32)
    _, H, W = past.shape
    model = RadarLightningModel(
        input_channels=1,
        num_blocks=3,
        forecast_steps=4,
        ensemble_size=1,
        noisy_decoder=False,
    )
    preds = model.predict(past, forecast_steps=4, ensemble_size=1)
    # Expected layout: (ensemble member, forecast step, H, W)
    assert preds.shape == (1, 4, H, W)
    assert np.isfinite(preds).all()
    assert preds.dtype in (np.float32, np.float64)

tests/test_lightning_model.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import numpy as np
+import torch
+from convgru_ensemble.lightning_model import RadarLightningModel
def test_predict_handles_unpadded_inputs():
    """``predict`` should return finite forecasts of the input's spatial size for a small 8x8 field."""
    steps, members, side = 2, 1, 8
    net = RadarLightningModel(
        input_channels=1,
        num_blocks=1,
        forecast_steps=steps,
        ensemble_size=members,
        noisy_decoder=False,
    )
    history = np.zeros((4, side, side), dtype=np.float32)
    forecast = net.predict(history, forecast_steps=steps, ensemble_size=members)
    assert forecast.shape == (members, steps, side, side)
    assert np.isfinite(forecast).all()
def test_from_checkpoint_delegates_to_lightning_loader(monkeypatch):
    """``from_checkpoint`` must forward its arguments to ``load_from_checkpoint``."""
    seen = {}

    def fake_loader(cls, checkpoint_path, map_location=None, strict=None, weights_only=None):
        # Record every argument the wrapper passes down
        seen.update(
            checkpoint_path=checkpoint_path,
            map_location=map_location,
            strict=strict,
            weights_only=weights_only,
        )
        return "loaded-model"

    monkeypatch.setattr(RadarLightningModel, "load_from_checkpoint", classmethod(fake_loader))
    result = RadarLightningModel.from_checkpoint("/tmp/model.ckpt", device="cpu")

    assert result == "loaded-model"
    assert seen["checkpoint_path"] == "/tmp/model.ckpt"
    target_device = seen["map_location"]
    assert isinstance(target_device, torch.device)
    assert target_device.type == "cpu"
    assert seen["strict"] is True

tests/test_losses.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import torch
+from convgru_ensemble.losses import CRPS, MaskedLoss, build_loss
def test_crps_reduces_to_scalar():
    """CRPS with ``reduction="mean"`` yields a non-negative 0-d tensor."""
    criterion = CRPS(reduction="mean")
    ensemble = torch.randn(2, 4, 5, 8, 8)  # (B, T, M, H, W)
    truth = torch.randn(2, 4, 1, 8, 8)  # (B, T, 1, H, W)
    value = criterion(ensemble, truth)
    assert value.dim() == 0  # scalar
    assert value.item() >= 0
def test_masked_loss_ignores_masked_pixels():
    """With an all-zero mask the masked loss must be exactly zero."""
    shape = (1, 2, 1, 4, 4)
    criterion = MaskedLoss(torch.nn.MSELoss(reduction="none"), reduction="mean")
    prediction = torch.ones(*shape)
    truth = torch.zeros(*shape)
    # Mask out everything — loss should be 0
    nothing_valid = torch.zeros(1, 1, 1, 4, 4)
    assert criterion(prediction, truth, nothing_valid).item() == 0.0
def test_build_loss_by_name():
    """The name ``"crps"`` resolves to an unmasked ``CRPS`` instance."""
    built = build_loss("crps", loss_params=None, masked_loss=False)
    assert isinstance(built, CRPS)
def test_build_loss_masked():
    """Requesting ``masked_loss=True`` wraps the loss in ``MaskedLoss``."""
    built = build_loss("mse", loss_params=None, masked_loss=True)
    assert isinstance(built, MaskedLoss)

tests/test_model.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+from convgru_ensemble.model import EncoderDecoder
def test_forward_single_output_shape():
    """A single-member forecast has size 1 on the third output axis."""
    batch, steps, side = 2, 3, 16
    net = EncoderDecoder(channels=1, num_blocks=2)
    frames = torch.randn(batch, 4, 1, side, side)
    forecast = net(frames, steps=steps, noisy_decoder=False, ensemble_size=1)
    assert forecast.shape == (batch, steps, 1, side, side)
def test_forward_ensemble_output_shape():
    """The third output axis matches the requested ensemble size."""
    batch, steps, members, side = 2, 3, 5, 16
    net = EncoderDecoder(channels=1, num_blocks=2)
    frames = torch.randn(batch, 4, 1, side, side)
    forecast = net(frames, steps=steps, noisy_decoder=False, ensemble_size=members)
    assert forecast.shape == (batch, steps, members, side, side)
def test_forward_different_num_blocks():
    """A three-block model keeps the 32x32 spatial size of its input."""
    steps, side = 2, 32
    net = EncoderDecoder(channels=1, num_blocks=3)
    frames = torch.randn(1, 4, 1, side, side)
    forecast = net(frames, steps=steps, ensemble_size=1)
    assert forecast.shape == (1, steps, 1, side, side)
def test_noisy_decoder_produces_different_outputs():
    """Two noisy-decoder passes over the same input should not coincide."""
    net = EncoderDecoder(channels=1, num_blocks=2)
    net.eval()
    frames = torch.randn(1, 4, 1, 16, 16)
    first_pass = net(frames, steps=2, noisy_decoder=True, ensemble_size=1)
    second_pass = net(frames, steps=2, noisy_decoder=True, ensemble_size=1)
    # Noisy decoder should produce different outputs (with very high probability)
    identical = torch.allclose(first_pass, second_pass)
    assert not identical

tests/test_serve.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import io
+from unittest.mock import MagicMock, patch
+import numpy as np
+import pytest
+import xarray as xr
+from convgru_ensemble.lightning_model import RadarLightningModel
@pytest.fixture
def mock_model():
    """Return a ``RadarLightningModel``-shaped mock with fixed hparams and an all-zero prediction."""
    stub = MagicMock(spec=RadarLightningModel)
    stub.hparams = MagicMock()
    hyperparameters = {
        "input_channels": 1,
        "num_blocks": 2,
        "forecast_steps": 12,
        "ensemble_size": 2,
        "noisy_decoder": False,
        "loss_class": "crps",
    }
    for key, value in hyperparameters.items():
        setattr(stub.hparams, key, value)
    stub.device = "cpu"
    # Canned forecast returned by every predict() call
    stub.predict.return_value = np.zeros((10, 12, 8, 8), dtype=np.float32)
    return stub
@pytest.fixture
def client(mock_model):
    """Yield a FastAPI test client while ``_load_model`` is patched to return the mock model."""
    from fastapi.testclient import TestClient

    loader_patch = patch("convgru_ensemble.serve._load_model", return_value=mock_model)
    with loader_patch:
        # Import the app only while the loader is patched
        from convgru_ensemble.serve import app

        with TestClient(app) as test_client:
            yield test_client
def test_health(client):
    """``GET /health`` reports an OK status and a loaded model."""
    response = client.get("/health")
    assert response.status_code == 200
    payload = response.json()
    assert payload["status"] == "ok"
    assert payload["model_loaded"] is True
def test_model_info(client):
    """``GET /model/info`` exposes the architecture name and the block count."""
    response = client.get("/model/info")
    assert response.status_code == 200
    payload = response.json()
    assert payload["architecture"] == "ConvGRU-Ensemble EncoderDecoder"
    assert payload["num_blocks"] == 2
def test_predict_returns_netcdf(client):
    """``POST /predict`` answers an uploaded NetCDF with a NetCDF response and a timing header."""
    # Create a small NetCDF file in memory
    rain = xr.DataArray(np.zeros((4, 8, 8), dtype=np.float32), dims=["time", "y", "x"])
    payload = io.BytesIO()
    xr.Dataset({"RR": rain}).to_netcdf(payload)
    payload.seek(0)

    upload = {"file": ("input.nc", payload, "application/x-netcdf")}
    response = client.post("/predict?forecast_steps=12&ensemble_size=10", files=upload)

    assert response.status_code == 200
    assert response.headers["content-type"] == "application/x-netcdf"
    assert "X-Elapsed-Seconds" in response.headers

tests/test_utils.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import numpy as np
+from convgru_ensemble.utils import normalized_to_rainrate, rainrate_to_normalized
def test_roundtrip_conversion():
    """Rain rate -> normalized -> rain rate reproduces the input within tolerance."""
    original = np.array([0.0, 1.0, 5.0, 20.0, 50.0], dtype=np.float32)
    roundtrip = normalized_to_rainrate(rainrate_to_normalized(original))
    np.testing.assert_allclose(roundtrip, original, rtol=0.01, atol=0.05)
def test_zero_rain_maps_to_low_normalized():
    """Zero rain rate must land near the bottom of the normalized range."""
    dry = np.array([0.0], dtype=np.float32)
    dry_normalized = rainrate_to_normalized(dry)
    # Zero rain should map to approximately -1 (0 dBZ → normalized -1)
    assert dry_normalized[0] < -0.9

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff