Jdice27 committed
Commit e8142ba · verified · 1 Parent(s): 48b8bfe

Add data_pipeline.py

Files changed (1)
data_pipeline.py +960 -0
data_pipeline.py ADDED
"""
AirTrackLM - Data Pipeline
==========================
Converts raw ADS-B (lat, lon, alt, timestamp) to model-ready tensors.

Pipeline:
1. Load trajectories from the `traffic` library or raw CSV
2. Resample to a fixed time interval (default 5 s)
3. Convert lat/lon/alt to ENU (East-North-Up) coordinates using the first point as origin
4. Compute velocity via 3-point central derivative on ENU positions
5. Derive COG, SOG from x-y ground velocity; ROT from COG; altitude rate from z velocity
6. Binary geohash encoding (40 bits per axis, following LLM4STP)
7. Discretize features into bins
8. Compute uncertainty scores
9. Build a sliding-window PyTorch Dataset
"""

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Optional, Tuple, List, Dict
import pyproj
from dataclasses import dataclass, field


# ============================================================
# 1. ENU Coordinate Conversion
# ============================================================

class ENUConverter:
    """
    Convert WGS84 (lat, lon, alt) to local East-North-Up (ENU) coordinates.
    Origin is set to the first point of each trajectory.

    Uses pyproj for geodetically correct transformations.
    """

    def __init__(self, origin_lat: float, origin_lon: float, origin_alt: float = 0.0):
        self.origin_lat = origin_lat
        self.origin_lon = origin_lon
        self.origin_alt = origin_alt

        # ECEF transformer
        self.ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
        self.lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
        self.transformer_to_ecef = pyproj.Transformer.from_proj(self.lla, self.ecef, always_xy=True)
        self.transformer_to_lla = pyproj.Transformer.from_proj(self.ecef, self.lla, always_xy=True)

        # Origin in ECEF
        self.x0, self.y0, self.z0 = self.transformer_to_ecef.transform(
            origin_lon, origin_lat, origin_alt
        )

        # Rotation matrix (ECEF -> ENU)
        lat_r = np.radians(origin_lat)
        lon_r = np.radians(origin_lon)
        self.R = np.array([
            [-np.sin(lon_r),                np.cos(lon_r),               0.0          ],
            [-np.sin(lat_r)*np.cos(lon_r), -np.sin(lat_r)*np.sin(lon_r), np.cos(lat_r)],
            [ np.cos(lat_r)*np.cos(lon_r),  np.cos(lat_r)*np.sin(lon_r), np.sin(lat_r)]
        ])

    def to_enu(self, lats: np.ndarray, lons: np.ndarray, alts: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Convert arrays of lat/lon/alt to ENU (meters)."""
        # To ECEF
        x, y, z = self.transformer_to_ecef.transform(lons, lats, alts)

        # Offset from origin
        dx = x - self.x0
        dy = y - self.y0
        dz = z - self.z0

        # Rotate to ENU
        ecef_delta = np.stack([dx, dy, dz], axis=0)  # (3, N)
        enu = self.R @ ecef_delta                    # (3, N)

        east = enu[0]   # meters
        north = enu[1]  # meters
        up = enu[2]     # meters

        return east, north, up

    def from_enu(self, east: np.ndarray, north: np.ndarray, up: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Convert ENU back to lat/lon/alt."""
        enu = np.stack([east, north, up], axis=0)
        ecef_delta = self.R.T @ enu

        x = ecef_delta[0] + self.x0
        y = ecef_delta[1] + self.y0
        z = ecef_delta[2] + self.z0

        lons, lats, alts = self.transformer_to_lla.transform(x, y, z)
        return lats, lons, alts

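
# Hedged usage sketch (not part of the pipeline API): round-trip a few
# synthetic points through ENUConverter to check that to_enu/from_enu invert
# each other. The coordinates below are illustrative assumptions only.
def _demo_enu_roundtrip():
    lats = np.array([40.0, 40.001, 40.002])
    lons = np.array([-105.0, -105.001, -105.002])
    alts = np.array([1600.0, 1650.0, 1700.0])
    conv = ENUConverter(lats[0], lons[0], alts[0])  # origin = first point
    east, north, up = conv.to_enu(lats, lons, alts)
    lats2, lons2, alts2 = conv.from_enu(east, north, up)
    assert np.allclose(lats, lats2) and np.allclose(lons, lons2)
    assert np.allclose(alts, alts2, atol=1e-6)
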

# ============================================================
# 2. Three-Point Central Derivative
# ============================================================

def three_point_derivative(values: np.ndarray, dt: np.ndarray) -> np.ndarray:
    """
    Compute derivative using 3-point central difference.

    For interior points (i=1..N-2):
        f'(i) = (f(i+1) - f(i-1)) / (t(i+1) - t(i-1))

    For endpoints:
        f'(0)   = (f(1) - f(0)) / (t(1) - t(0))          # forward difference
        f'(N-1) = (f(N-1) - f(N-2)) / (t(N-1) - t(N-2))  # backward difference

    Args:
        values: shape (N,) — the signal to differentiate
        dt: shape (N,) — cumulative time from start (seconds)

    Returns:
        derivative: shape (N,) — rate of change per second
    """
    N = len(values)
    deriv = np.zeros(N)

    if N < 2:
        return deriv

    # Forward difference for first point
    dt_fwd = dt[1] - dt[0]
    if dt_fwd > 0:
        deriv[0] = (values[1] - values[0]) / dt_fwd

    # Central difference for interior points
    for i in range(1, N - 1):
        dt_span = dt[i + 1] - dt[i - 1]
        if dt_span > 0:
            deriv[i] = (values[i + 1] - values[i - 1]) / dt_span

    # Backward difference for last point
    dt_bwd = dt[-1] - dt[-2]
    if dt_bwd > 0:
        deriv[-1] = (values[-1] - values[-2]) / dt_bwd

    return deriv


def three_point_derivative_vectorized(values: np.ndarray, dt: np.ndarray) -> np.ndarray:
    """Vectorized version of the 3-point central derivative."""
    N = len(values)
    deriv = np.zeros(N)

    if N < 2:
        return deriv

    # Forward difference for first point
    dt_fwd = dt[1] - dt[0]
    if dt_fwd > 0:
        deriv[0] = (values[1] - values[0]) / dt_fwd

    # Central difference for interior points (vectorized)
    if N > 2:
        dt_span = dt[2:] - dt[:-2]           # (N-2,)
        mask = dt_span > 0
        val_diff = values[2:] - values[:-2]  # (N-2,)
        deriv[1:-1] = np.where(mask, val_diff / np.maximum(dt_span, 1e-10), 0.0)

    # Backward difference for last point
    dt_bwd = dt[-1] - dt[-2]
    if dt_bwd > 0:
        deriv[-1] = (values[-1] - values[-2]) / dt_bwd

    return deriv

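
# Hedged sketch: check both derivative implementations on a quadratic sampled
# every 5 s. Central differences are exact for quadratics at interior points;
# the signal and interval are illustrative assumptions only.
def _demo_three_point_derivative():
    t = np.arange(0.0, 50.0, 5.0)  # cumulative seconds
    x = 3.0 * t**2                 # f(t) = 3t², so f'(t) = 6t
    d_loop = three_point_derivative(x, t)
    d_vec = three_point_derivative_vectorized(x, t)
    assert np.allclose(d_loop, d_vec)
    assert np.allclose(d_loop[1:-1], 6.0 * t[1:-1])  # interior points exact
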

# ============================================================
# 3. Feature Derivation from ENU Positions
# ============================================================

def derive_features_enu(
    east: np.ndarray,
    north: np.ndarray,
    up: np.ndarray,
    timestamps: np.ndarray
) -> Dict[str, np.ndarray]:
    """
    Derive COG, SOG, ROT, and altitude rate from ENU positions
    using 3-point central derivatives.

    Args:
        east, north, up: ENU coordinates in meters, shape (N,)
        timestamps: Unix timestamps in seconds, shape (N,)

    Returns:
        dict with keys: 'vx', 'vy', 'vz', 'COG', 'SOG', 'ROT', 'alt_rate'
        Each is shape (N,)
    """
    # Cumulative time from start
    t = timestamps - timestamps[0]

    # 3-point derivative on ENU positions → velocities (m/s)
    vx = three_point_derivative_vectorized(east, t)   # East velocity (m/s)
    vy = three_point_derivative_vectorized(north, t)  # North velocity (m/s)
    vz = three_point_derivative_vectorized(up, t)     # Up velocity (m/s)

    # SOG from ground-plane velocity: sqrt(vx² + vy²), converted m/s → knots
    sog_ms = np.sqrt(vx**2 + vy**2)
    sog_knots = sog_ms * 1.94384  # m/s to knots

    # COG from ground velocity components: atan2(East, North) gives the
    # bearing from North, clockwise, in degrees
    cog_deg = np.degrees(np.arctan2(vx, vy)) % 360

    # ROT: derivative of COG (degrees/second).
    # Handle circular wraparound by unwrapping COG first.
    cog_unwrapped = np.unwrap(np.radians(cog_deg))
    rot_rad_s = three_point_derivative_vectorized(cog_unwrapped, t)
    rot_deg_s = np.degrees(rot_rad_s)

    # Altitude rate: vz converted to ft/min (1 m/s = 196.85 ft/min)
    alt_rate_ftmin = vz * 196.85

    return {
        'vx': vx,
        'vy': vy,
        'vz': vz,
        'COG': cog_deg,
        'SOG': sog_knots,
        'ROT': rot_deg_s,
        'alt_rate': alt_rate_ftmin,
    }

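
# Hedged worked example: an aircraft moving due east at 100 m/s should come
# out as COG ≈ 90° (bearing from North, clockwise) and SOG ≈ 194.4 kt.
# Synthetic straight-line inputs, for illustration only.
def _demo_derive_features():
    ts = np.arange(0.0, 60.0, 5.0)
    east = 100.0 * ts  # due-east motion at 100 m/s
    north = np.zeros_like(ts)
    up = np.zeros_like(ts)
    feats = derive_features_enu(east, north, up, ts)
    assert np.allclose(feats['COG'], 90.0)
    assert np.allclose(feats['SOG'], 100.0 * 1.94384)
    assert np.allclose(feats['ROT'], 0.0)
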

# ============================================================
# 4. Binary Geohash Encoding (following LLM4STP, 40-bit precision)
# ============================================================

def binary_geohash_encode(
    values: np.ndarray,
    precision: int = 40,
    v_min: float = 0.0,
    v_max: float = 1.0
) -> np.ndarray:
    """
    Encode normalized values as binary geohash via successive bisection.
    Matches LLM4STP's num2bits() implementation.

    Args:
        values: shape (N,) — normalized to [v_min, v_max]
        precision: number of bits
        v_min, v_max: range bounds

    Returns:
        bits: shape (N, precision) — binary encoding (0/1)
    """
    N = len(values)
    bits = np.zeros((N, precision), dtype=np.int64)

    _min = np.full(N, v_min)
    _max = np.full(N, v_max)

    for p in range(precision):
        mid = (_min + _max) / 2
        mask = values > mid
        bits[:, p] = mask.astype(np.int64)
        _min = np.where(mask, mid, _min)
        _max = np.where(mask, _max, mid)

    return bits


def binary_geohash_decode(
    bits: np.ndarray,
    precision: int = 40,
    v_min: float = 0.0,
    v_max: float = 1.0
) -> np.ndarray:
    """Decode binary geohash back to values (interval midpoint after bisection)."""
    N = bits.shape[0]
    _min = np.full(N, v_min)
    _max = np.full(N, v_max)

    for p in range(precision):
        mid = (_min + _max) / 2
        mask = bits[:, p].astype(bool)
        _min = np.where(mask, mid, _min)
        _max = np.where(mask, _max, mid)

    return (_min + _max) / 2


class GeohashEncoder:
    """
    3D geohash encoder for aviation.
    Encodes (east, north, up) ENU coordinates as binary geohash.

    Following LLM4STP: 40 bits per axis, coordinates normalized to [0, 1].
    Extended to 3 axes (E, N, U) for aviation 3D trajectories.

    Total encoding: 40 * 3 = 120 bits per timestep.
    """

    def __init__(self, precision: int = 40):
        self.precision = precision
        # Normalization bounds — set from training data
        self.e_min = None
        self.e_max = None
        self.n_min = None
        self.n_max = None
        self.u_min = None
        self.u_max = None

    def fit(self, east: np.ndarray, north: np.ndarray, up: np.ndarray, margin: float = 0.05):
        """Set normalization bounds from training data, padded by a margin."""
        e_range = east.max() - east.min()
        n_range = north.max() - north.min()
        u_range = up.max() - up.min()

        self.e_min = east.min() - margin * max(e_range, 1.0)
        self.e_max = east.max() + margin * max(e_range, 1.0)
        self.n_min = north.min() - margin * max(n_range, 1.0)
        self.n_max = north.max() + margin * max(n_range, 1.0)
        self.u_min = up.min() - margin * max(u_range, 1.0)
        self.u_max = up.max() + margin * max(u_range, 1.0)

    def normalize(self, values: np.ndarray, v_min: float, v_max: float) -> np.ndarray:
        """Normalize to [0, 1]."""
        return np.clip((values - v_min) / max(v_max - v_min, 1e-10), 0.0, 1.0)

    def encode(self, east: np.ndarray, north: np.ndarray, up: np.ndarray) -> np.ndarray:
        """
        Encode ENU positions as 3D binary geohash.

        Returns:
            bits: shape (N, precision*3) — concatenated [E_bits | N_bits | U_bits]
        """
        e_norm = self.normalize(east, self.e_min, self.e_max)
        n_norm = self.normalize(north, self.n_min, self.n_max)
        u_norm = self.normalize(up, self.u_min, self.u_max)

        e_bits = binary_geohash_encode(e_norm, self.precision)
        n_bits = binary_geohash_encode(n_norm, self.precision)
        u_bits = binary_geohash_encode(u_norm, self.precision)

        return np.concatenate([e_bits, n_bits, u_bits], axis=1)  # (N, 120)

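
# Hedged sketch: with 40 bits per axis the decoded midpoint lies within
# range / 2**41 of the input, so an encode→decode round trip is near-lossless.
# The sample values are illustrative assumptions.
def _demo_geohash_roundtrip():
    vals = np.array([0.123456789, 0.5, 0.999])
    bits = binary_geohash_encode(vals, precision=40)
    recon = binary_geohash_decode(bits, precision=40)
    assert np.max(np.abs(recon - vals)) < 1.0 / 2**40
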

# ============================================================
# 5. Feature Discretization
# ============================================================

@dataclass
class FeatureBins:
    """Configuration for discretizing continuous features into bins."""

    # COG: [0, 360) degrees, 2° bins for high resolution
    cog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 360, 181))  # 180 bins

    # SOG: [0, 600] knots, 2-knot bins
    sog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 600, 301))  # 300 bins

    # ROT: [-6, 6] deg/s, 0.1 deg/s bins
    rot_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6, 6, 121))  # 120 bins

    # Altitude rate: [-6000, 6000] ft/min, 100 ft/min bins
    alt_rate_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6000, 6000, 121))  # 120 bins

    @property
    def n_cog_bins(self): return len(self.cog_edges) - 1

    @property
    def n_sog_bins(self): return len(self.sog_edges) - 1

    @property
    def n_rot_bins(self): return len(self.rot_edges) - 1

    @property
    def n_alt_rate_bins(self): return len(self.alt_rate_edges) - 1

    def digitize(self, values: np.ndarray, edges: np.ndarray) -> np.ndarray:
        """Bin values. Returns indices in [0, n_bins-1], clipped."""
        indices = np.digitize(values, edges) - 1
        return np.clip(indices, 0, len(edges) - 2)

    def encode_cog(self, cog: np.ndarray) -> np.ndarray:
        return self.digitize(cog, self.cog_edges)

    def encode_sog(self, sog: np.ndarray) -> np.ndarray:
        return self.digitize(sog, self.sog_edges)

    def encode_rot(self, rot: np.ndarray) -> np.ndarray:
        rot_clipped = np.clip(rot, -6, 6)
        return self.digitize(rot_clipped, self.rot_edges)

    def encode_alt_rate(self, alt_rate: np.ndarray) -> np.ndarray:
        ar_clipped = np.clip(alt_rate, -6000, 6000)
        return self.digitize(ar_clipped, self.alt_rate_edges)

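
# Hedged sketch: the last 2° COG bin catches 359.9°, and out-of-range ROT
# values are clipped into the edge bin. Values are illustrative only.
def _demo_feature_bins():
    bins = FeatureBins()
    assert bins.encode_cog(np.array([359.9]))[0] == bins.n_cog_bins - 1
    assert bins.encode_rot(np.array([99.0]))[0] == bins.n_rot_bins - 1
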

# ============================================================
# 6. Uncertainty Score
# ============================================================

def compute_uncertainty(
    cog: np.ndarray,
    sog: np.ndarray,
    rot: np.ndarray,
    alt_rate: np.ndarray,
    window: int = 5
) -> np.ndarray:
    """
    Compute trajectory uncertainty score from recent state variance.
    High variance = high uncertainty (maneuvering aircraft).

    Returns:
        scores: shape (N,) — uncertainty scores (higher = more uncertain)
    """
    N = len(cog)
    scores = np.zeros(N)

    # Global variances (computed once) used to normalize the window variances
    sog_norm = np.var(sog) + 1e-10
    rot_norm = np.var(rot) + 1e-10
    alt_norm = np.var(alt_rate) + 1e-10

    for i in range(N):
        start = max(0, i - window + 1)
        w = slice(start, i + 1)

        # Circular variance for COG
        cog_rad = np.radians(cog[w])
        R_len = np.sqrt(np.mean(np.cos(cog_rad))**2 + np.mean(np.sin(cog_rad))**2)
        cog_var = 1 - R_len  # circular variance in [0, 1]

        # Regular variance for the others
        sog_var = np.var(sog[w]) if len(sog[w]) > 1 else 0
        rot_var = np.var(rot[w]) if len(rot[w]) > 1 else 0
        alt_var = np.var(alt_rate[w]) if len(alt_rate[w]) > 1 else 0

        # Normalize and combine (equal weights)
        scores[i] = cog_var + sog_var / sog_norm + rot_var / rot_norm + alt_var / alt_norm

    return scores


def discretize_uncertainty(scores: np.ndarray, n_bins: int = 16) -> np.ndarray:
    """Discretize uncertainty scores into quantile bins."""
    if len(np.unique(scores)) < n_bins:
        # Not enough unique values for quantile binning — fall back to uniform bins
        edges = np.linspace(scores.min(), scores.max() + 1e-10, n_bins + 1)
    else:
        edges = np.quantile(scores, np.linspace(0, 1, n_bins + 1))
        edges[-1] += 1e-10  # ensure the max value is included

    return np.clip(np.digitize(scores, edges) - 1, 0, n_bins - 1)

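
# Hedged sketch: a steady heading followed by a turn should score higher
# uncertainty during the maneuver (driven by the circular COG variance).
# Synthetic values, for illustration only.
def _demo_uncertainty():
    cog = np.concatenate([np.full(20, 90.0), np.linspace(90.0, 180.0, 20)])
    sog = np.full(40, 250.0)
    rot = np.concatenate([np.zeros(20), np.full(20, 1.0)])
    alt_rate = np.zeros(40)
    scores = compute_uncertainty(cog, sog, rot, alt_rate, window=5)
    assert scores[25:].mean() > scores[:15].mean()
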

# ============================================================
# 7. Temporal Features
# ============================================================

def extract_temporal_features(timestamps: np.ndarray) -> Dict[str, np.ndarray]:
    """
    Extract temporal features from Unix timestamps (interpreted as UTC).

    Returns dict with:
        'second_of_day': float seconds within the day [0, 86400)
        'hour': int hour of day [0, 23]
        'dow': int day of week [0, 6]
        'month': int month [0, 11]
        'dt': float seconds since previous point (0 for first)
        'fractional_second': float sub-second component [0, 1)
    """
    import datetime

    # Convert to datetime objects for calendar features
    # (timezone-aware replacement for the deprecated utcfromtimestamp)
    dts = [datetime.datetime.fromtimestamp(t, tz=datetime.timezone.utc) for t in timestamps]

    hours = np.array([d.hour for d in dts], dtype=np.int64)
    dows = np.array([d.weekday() for d in dts], dtype=np.int64)
    months = np.array([d.month - 1 for d in dts], dtype=np.int64)  # 0-indexed

    # Second of day (with fractional seconds)
    second_of_day = np.array([
        d.hour * 3600 + d.minute * 60 + d.second + d.microsecond / 1e6
        for d in dts
    ])

    # Delta-t between consecutive points
    dt = np.zeros(len(timestamps))
    dt[1:] = np.diff(timestamps)

    # Fractional second component
    fractional_second = timestamps - np.floor(timestamps)

    return {
        'second_of_day': second_of_day,
        'hour': hours,
        'dow': dows,
        'month': months,
        'dt': dt,
        'fractional_second': fractional_second,
    }


# ============================================================
# 8. Full Trajectory Processor
# ============================================================

class TrajectoryProcessor:
    """
    Complete pipeline: raw ADS-B → model-ready features.
    """

    def __init__(
        self,
        resample_dt: float = 5.0,       # resample interval in seconds
        geohash_precision: int = 40,    # bits per axis
        n_uncertainty_bins: int = 16,
        feature_bins: Optional[FeatureBins] = None,
        min_trajectory_len: int = 20,   # minimum points after processing
    ):
        self.resample_dt = resample_dt
        self.geohash_precision = geohash_precision
        self.n_uncertainty_bins = n_uncertainty_bins
        self.feature_bins = feature_bins or FeatureBins()
        self.min_trajectory_len = min_trajectory_len
        self.geohash_encoder = GeohashEncoder(precision=geohash_precision)

        # Fit state
        self._fitted = False

    def resample_trajectory(
        self, timestamps: np.ndarray, lats: np.ndarray, lons: np.ndarray, alts: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Resample trajectory to fixed time intervals via linear interpolation."""
        t_start = timestamps[0]
        t_end = timestamps[-1]

        n_points = int((t_end - t_start) / self.resample_dt) + 1
        t_new = np.linspace(t_start, t_start + (n_points - 1) * self.resample_dt, n_points)

        lats_new = np.interp(t_new, timestamps, lats)
        lons_new = np.interp(t_new, timestamps, lons)
        alts_new = np.interp(t_new, timestamps, alts)

        return t_new, lats_new, lons_new, alts_new

    def process_trajectory(
        self,
        timestamps: np.ndarray,
        lats: np.ndarray,
        lons: np.ndarray,
        alts: np.ndarray,
        metadata: Optional[Dict] = None
    ) -> Optional[Dict[str, np.ndarray]]:
        """
        Process a single trajectory from raw ADS-B to model features.

        Returns None if the trajectory is too short or invalid,
        otherwise a dict with all features needed by the model.
        """
        # Sort by time
        sort_idx = np.argsort(timestamps)
        timestamps = timestamps[sort_idx]
        lats = lats[sort_idx]
        lons = lons[sort_idx]
        alts = alts[sort_idx]

        # Resample to fixed interval
        timestamps, lats, lons, alts = self.resample_trajectory(timestamps, lats, lons, alts)

        if len(timestamps) < self.min_trajectory_len:
            return None

        # Convert to ENU (origin = first point)
        converter = ENUConverter(lats[0], lons[0], alts[0])
        east, north, up = converter.to_enu(lats, lons, alts)

        # Derive features via 3-point derivative on ENU
        features = derive_features_enu(east, north, up, timestamps)

        # Binary geohash encoding on ENU positions.
        # If the encoder is not yet fitted, store a placeholder
        # (re-encoded after fitting — see build_dataset).
        if self.geohash_encoder.e_min is not None:
            geohash_bits = self.geohash_encoder.encode(east, north, up)  # (N, 120)
        else:
            geohash_bits = np.zeros((len(east), self.geohash_precision * 3), dtype=np.int64)

        # Discretize kinematic features
        cog_bins = self.feature_bins.encode_cog(features['COG'])
        sog_bins = self.feature_bins.encode_sog(features['SOG'])
        rot_bins = self.feature_bins.encode_rot(features['ROT'])
        alt_rate_bins = self.feature_bins.encode_alt_rate(features['alt_rate'])

        # Uncertainty — multiple methods (project-local `uncertainty` module)
        from uncertainty import (
            compute_all_uncertainties, discretize_scores, UncertaintyConfig
        )
        uncert_config = UncertaintyConfig(
            use_kinematic_variance=True,
            use_prediction_residual=True,
            use_spatial_density=True,
            use_flight_phase_entropy=True,
            use_temporal_irregularity=False,
            n_bins=self.n_uncertainty_bins,
            window=5,
        )
        raw_uncert = compute_all_uncertainties(
            east, north, up, timestamps,
            features['COG'], features['SOG'], features['ROT'], features['alt_rate'],
            config=uncert_config,
        )
        # Discretize each method into bins → stack into an (N, n_methods) array
        uncert_methods = sorted(raw_uncert.keys())
        uncert_bins_multi = np.stack([
            discretize_scores(raw_uncert[m], self.n_uncertainty_bins)
            for m in uncert_methods
        ], axis=1)  # (N, n_methods)

        # Also keep the legacy single method for backwards compatibility
        if 'kinematic_var' in raw_uncert:
            uncert_bins = discretize_scores(raw_uncert['kinematic_var'], self.n_uncertainty_bins)
        else:
            uncert_bins = uncert_bins_multi[:, 0]

        # Temporal features
        temporal = extract_temporal_features(timestamps)

        return {
            # Raw (for evaluation/debugging)
            'timestamps': timestamps,
            'lats': lats,
            'lons': lons,
            'alts': alts,
            'east': east,
            'north': north,
            'up': up,

            # Continuous features
            'COG': features['COG'],
            'SOG': features['SOG'],
            'ROT': features['ROT'],
            'alt_rate': features['alt_rate'],
            'vx': features['vx'],
            'vy': features['vy'],
            'vz': features['vz'],

            # Geohash (binary, 120 bits per timestep)
            'geohash_bits': geohash_bits,

            # Discretized features (bin indices)
            'cog_bins': cog_bins,
            'sog_bins': sog_bins,
            'rot_bins': rot_bins,
            'alt_rate_bins': alt_rate_bins,

            # Uncertainty (bin indices)
            'uncert_bins': uncert_bins,              # (N,) legacy single method
            'uncert_bins_multi': uncert_bins_multi,  # (N, n_methods) multi-method
            'uncert_method_names': uncert_methods,   # list of method names

            # Temporal
            'hour': temporal['hour'],
            'dow': temporal['dow'],
            'month': temporal['month'],
            'second_of_day': temporal['second_of_day'],
            'dt': temporal['dt'],

            # ENU origin (for decoding predictions back to lat/lon)
            'enu_origin': (converter.origin_lat, converter.origin_lon, converter.origin_alt),

            # Metadata
            'metadata': metadata or {},
        }

    def fit_geohash(self, all_east: np.ndarray, all_north: np.ndarray, all_up: np.ndarray):
        """Fit geohash normalization bounds from all training trajectories."""
        self.geohash_encoder.fit(all_east, all_north, all_up)
        self._fitted = True

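
# Hedged smoke test: run the full processor on a synthetic straight-line
# flight. Assumes the project-local `uncertainty` module imported inside
# process_trajectory is available; all values are illustrative.
def _demo_processor():
    n = 150
    ts = 1_700_000_000.0 + 4.0 * np.arange(n)  # one point every 4 s
    lats = 40.0 + 0.0005 * np.arange(n)
    lons = np.full(n, -105.0)
    alts = 10000.0 + 2.0 * np.arange(n)
    proc = TrajectoryProcessor(resample_dt=5.0)
    result = proc.process_trajectory(ts, lats, lons, alts)
    assert result is not None
    assert result['geohash_bits'].shape == (len(result['east']), 120)
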

# ============================================================
# 9. PyTorch Dataset with Sliding Window
# ============================================================

@dataclass
class PromptTokens:
    """Prompt token IDs for metadata encoding."""
    # Special tokens
    BOS: int = 0
    EOS: int = 1
    PAD: int = 2

    # Task tokens
    PREDICT: int = 3
    CLASSIFY: int = 4
    DETECT_ANOMALY: int = 5

    # Aircraft category
    HEAVY: int = 6
    LARGE: int = 7
    SMALL: int = 8
    ROTORCRAFT: int = 9
    GLIDER: int = 10
    UAV: int = 11
    AIRCRAFT_UNKNOWN: int = 12

    # Flight phase
    CLIMB: int = 13
    CRUISE: int = 14
    DESCENT: int = 15
    APPROACH: int = 16
    GROUND: int = 17
    PHASE_UNKNOWN: int = 18

    # Region
    CONUS: int = 19
    EUROPE: int = 20
    ASIA: int = 21
    REGION_OTHER: int = 22

    VOCAB_SIZE: int = 23


class AirTrackDataset(Dataset):
    """
    Sliding-window dataset for next-state prediction.

    Each sample is a window of `seq_len` consecutive states.
    The model predicts state t+1 from states 1..t for all t.
    """

    def __init__(
        self,
        trajectories: List[Dict[str, np.ndarray]],
        seq_len: int = 128,
        stride: int = 64,
        task: str = 'predict',  # 'predict' or 'classify'
    ):
        self.seq_len = seq_len
        self.stride = stride
        self.task = task

        # Build an index of (trajectory_idx, start, end) for all valid windows
        self.windows = []
        self.trajectories = trajectories

        for traj_idx, traj in enumerate(trajectories):
            n_points = len(traj['timestamps'])
            # Need seq_len + 1 points (seq_len inputs + 1 target for the last position)
            if n_points < seq_len + 1:
                # Use the entire trajectory if it is at least the minimum length.
                # Note: this yields windows shorter than seq_len, which the
                # default collate function cannot batch with full-length windows.
                if n_points >= 20:
                    self.windows.append((traj_idx, 0, n_points))
                continue

            for start in range(0, n_points - seq_len, stride):
                end = start + seq_len + 1  # +1 for the next-state target
                if end <= n_points:
                    self.windows.append((traj_idx, start, end))

        # Prompt tokens
        self.prompt_tokens = PromptTokens()

    def __len__(self):
        return len(self.windows)

    def __getitem__(self, idx):
        traj_idx, start, end = self.windows[idx]
        traj = self.trajectories[traj_idx]

        # Slice the window
        sl = slice(start, end)

        # Geohash bits: (window_len, 120)
        geohash_bits = torch.from_numpy(traj['geohash_bits'][sl]).float()

        # Discretized features
        cog_bins = torch.from_numpy(traj['cog_bins'][sl]).long()
        sog_bins = torch.from_numpy(traj['sog_bins'][sl]).long()
        rot_bins = torch.from_numpy(traj['rot_bins'][sl]).long()
        alt_rate_bins = torch.from_numpy(traj['alt_rate_bins'][sl]).long()

        # Uncertainty bins (single + multi)
        uncert_bins = torch.from_numpy(traj['uncert_bins'][sl]).long()
        if 'uncert_bins_multi' in traj:
            uncert_bins_multi = torch.from_numpy(traj['uncert_bins_multi'][sl]).long()
        else:
            uncert_bins_multi = uncert_bins.unsqueeze(-1)

        # Temporal features
        hour = torch.from_numpy(traj['hour'][sl]).long()
        dow = torch.from_numpy(traj['dow'][sl]).long()
        month = torch.from_numpy(traj['month'][sl]).long()

        # Second-of-day as a continuous feature (for sinusoidal encoding)
        second_of_day = torch.from_numpy(traj['second_of_day'][sl]).float()

        # Delta-t between points
        dt = torch.from_numpy(traj['dt'][sl]).float()

        # Prompt tokens (fixed for the prediction task)
        task_token = self.prompt_tokens.PREDICT if self.task == 'predict' else self.prompt_tokens.CLASSIFY
        prompt = torch.tensor([
            self.prompt_tokens.BOS,
            task_token,
            self.prompt_tokens.AIRCRAFT_UNKNOWN,  # default; override with metadata
            self.prompt_tokens.PHASE_UNKNOWN,
            self.prompt_tokens.REGION_OTHER,
        ], dtype=torch.long)

        # Continuous ENU positions (for evaluation / regression head)
        east = torch.from_numpy(traj['east'][sl]).float()
        north = torch.from_numpy(traj['north'][sl]).float()
        up = torch.from_numpy(traj['up'][sl]).float()

        return {
            'geohash_bits': geohash_bits,
            'cog_bins': cog_bins,
            'sog_bins': sog_bins,
            'rot_bins': rot_bins,
            'alt_rate_bins': alt_rate_bins,
            'uncert_bins': uncert_bins,
            'uncert_bins_multi': uncert_bins_multi,
            'hour': hour,
            'dow': dow,
            'month': month,
            'second_of_day': second_of_day,
            'dt': dt,
            'prompt': prompt,
            'east': east,
            'north': north,
            'up': up,
        }

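
# Hedged usage sketch: batching windows with a DataLoader. This assumes every
# trajectory has at least seq_len + 1 points; shorter trajectories produce
# variable-length windows (see __init__ above) that the default collate
# function cannot stack.
def _demo_dataloader(dataset: 'AirTrackDataset'):
    loader = DataLoader(dataset, batch_size=8, shuffle=True)
    batch = next(iter(loader))
    # geohash_bits: (batch, seq_len + 1, 120); bin tensors: (batch, seq_len + 1)
    print(batch['geohash_bits'].shape, batch['cog_bins'].shape)
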

# ============================================================
# 10. Data Loading Utilities
# ============================================================

def load_traffic_sample(name: str = 'quickstart') -> List[Dict]:
    """
    Load sample data from the `traffic` library.

    Available collections: 'quickstart' (238 flights), 'switzerland', 'savan'.
    Individual flights: 'landing_denver', calibration flights, etc.
    """
    import pandas as pd
    import traffic.data.samples as samples

    data = getattr(samples, name)
    trajectories = []

    # Handle both Traffic (collection) and Flight (single) objects
    flights = data if hasattr(data, '__iter__') else [data]

    for flight in flights:
        df = flight.data

        if df is None or len(df) < 20:
            continue

        # Extract required columns — handle tz-aware and PyArrow timestamps
        ts_series = pd.to_datetime(df['timestamp'])
        if ts_series.dt.tz is not None:
            ts_series = ts_series.dt.tz_convert('UTC').dt.tz_localize(None)
        timestamps = ts_series.values.astype('int64').astype(np.float64) / 1e9
        lats = df['latitude'].values.astype(np.float64)
        lons = df['longitude'].values.astype(np.float64)

        # Altitude: prefer the barometric 'altitude' column, fall back to 'baro_altitude'
        if 'altitude' in df.columns:
            alts = df['altitude'].values.astype(np.float64)
        elif 'baro_altitude' in df.columns:
            alts = df['baro_altitude'].values.astype(np.float64)
        else:
            alts = np.zeros(len(df))

        # Drop rows with NaNs in any required column
        valid = ~(np.isnan(lats) | np.isnan(lons) | np.isnan(alts) | np.isnan(timestamps))
        if valid.sum() < 20:
            continue

        trajectories.append({
            'timestamps': timestamps[valid],
            'lats': lats[valid],
            'lons': lons[valid],
            'alts': alts[valid],
            'callsign': flight.callsign if hasattr(flight, 'callsign') else 'UNKNOWN',
            'icao24': flight.icao24 if hasattr(flight, 'icao24') else 'UNKNOWN',
        })

    return trajectories


def build_dataset(
    raw_trajectories: List[Dict],
    processor: TrajectoryProcessor,
    seq_len: int = 128,
    stride: int = 64,
    fit_geohash: bool = True,
) -> AirTrackDataset:
    """
    Process raw trajectories and build the PyTorch dataset.

    Args:
        raw_trajectories: list of dicts with 'timestamps', 'lats', 'lons', 'alts'
        processor: TrajectoryProcessor instance
        seq_len: sliding window size
        stride: sliding window stride
        fit_geohash: if True, fit geohash bounds from this data
    """
    # First pass: convert to ENU and collect bounds for geohash fitting
    processed = []
    all_east, all_north, all_up = [], [], []

    for raw in raw_trajectories:
        result = processor.process_trajectory(
            raw['timestamps'], raw['lats'], raw['lons'], raw['alts'],
            metadata={k: v for k, v in raw.items() if k not in ['timestamps', 'lats', 'lons', 'alts']}
        )
        if result is not None:
            processed.append(result)
            all_east.append(result['east'])
            all_north.append(result['north'])
            all_up.append(result['up'])

    if fit_geohash and processed:
        # Fit geohash bounds from all trajectories
        all_e = np.concatenate(all_east)
        all_n = np.concatenate(all_north)
        all_u = np.concatenate(all_up)
        processor.fit_geohash(all_e, all_n, all_u)

        # Second pass: re-encode geohash with the fitted bounds
        for traj in processed:
            traj['geohash_bits'] = processor.geohash_encoder.encode(
                traj['east'], traj['north'], traj['up']
            )

    print(f"Processed {len(processed)}/{len(raw_trajectories)} trajectories")

    dataset = AirTrackDataset(processed, seq_len=seq_len, stride=stride)
    print(f"Created dataset with {len(dataset)} windows")

    return dataset


if __name__ == '__main__':
    # Quick test with traffic sample data
    print("Loading traffic sample data...")
    raw_trajs = load_traffic_sample()
    print(f"Loaded {len(raw_trajs)} raw trajectories")

    print("\nProcessing trajectories...")
    processor = TrajectoryProcessor(resample_dt=5.0)
    dataset = build_dataset(raw_trajs, processor, seq_len=64, stride=32)

    print(f"\nDataset size: {len(dataset)}")
    if len(dataset) > 0:
        sample = dataset[0]
        print("\nSample keys and shapes:")
        for k, v in sample.items():
            if isinstance(v, torch.Tensor):
                print(f"  {k}: {v.shape} ({v.dtype})")
            else:
                print(f"  {k}: {type(v)}")