Jdice27 committed
Commit 744a6a7 · verified · 1 Parent(s): 1773f5e

Upload data_pipeline.py with huggingface_hub

Files changed (1)
  1. data_pipeline.py +171 -582
data_pipeline.py CHANGED
@@ -6,13 +6,21 @@ Converts raw ADS-B (lat, lon, alt, timestamp) to model-ready tensors.
6
  Pipeline:
7
  1. Load trajectories from `traffic` library or raw CSV
8
  2. Resample to fixed time interval (default 5s)
9
- 3. Convert lat/lon/alt to ENU (East-North-Up) coordinates using first point as origin
10
  4. Compute velocity via 3-point central derivative on ENU positions
11
  5. Derive COG, SOG from x-y ground velocity; ROT from COG; altitude rate from z velocity
12
- 6. Binary geohash encoding (40-bit per axis, following LLM4STP)
13
  7. Discretize features into bins
14
  8. Compute uncertainty scores
15
  9. Build sliding-window PyTorch Dataset
 
 
16
  """
17
 
18
  import numpy as np
@@ -29,10 +37,8 @@ from dataclasses import dataclass, field
29
 
30
  class ENUConverter:
31
  """
32
- Convert WGS84 (lat, lon, alt) to local East-North-Up (ENU) coordinates.
33
- Origin is set to the first point of each trajectory.
34
-
35
- Uses pyproj for geodetically correct transformations.
36
  """
37
 
38
  def __init__(self, origin_lat: float, origin_lon: float, origin_alt: float = 0.0):
@@ -40,18 +46,15 @@ class ENUConverter:
40
  self.origin_lon = origin_lon
41
  self.origin_alt = origin_alt
42
 
43
- # ECEF transformer
44
  self.ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
45
  self.lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
46
  self.transformer_to_ecef = pyproj.Transformer.from_proj(self.lla, self.ecef, always_xy=True)
47
  self.transformer_to_lla = pyproj.Transformer.from_proj(self.ecef, self.lla, always_xy=True)
48
 
49
- # Origin in ECEF
50
  self.x0, self.y0, self.z0 = self.transformer_to_ecef.transform(
51
  origin_lon, origin_lat, origin_alt
52
  )
53
 
54
- # Rotation matrix (ECEF -> ENU)
55
  lat_r = np.radians(origin_lat)
56
  lon_r = np.radians(origin_lon)
57
  self.R = np.array([
@@ -60,108 +63,50 @@ class ENUConverter:
60
  [ np.cos(lat_r)*np.cos(lon_r), np.cos(lat_r)*np.sin(lon_r), np.sin(lat_r)]
61
  ])
62
 
63
- def to_enu(self, lats: np.ndarray, lons: np.ndarray, alts: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
64
- """Convert arrays of lat/lon/alt to ENU (meters)."""
65
- # To ECEF
66
  x, y, z = self.transformer_to_ecef.transform(lons, lats, alts)
67
-
68
- # Offset from origin
69
  dx = x - self.x0
70
  dy = y - self.y0
71
  dz = z - self.z0
72
-
73
- # Rotate to ENU
74
- ecef_delta = np.stack([dx, dy, dz], axis=0) # (3, N)
75
- enu = self.R @ ecef_delta # (3, N)
76
-
77
- east = enu[0] # meters
78
- north = enu[1] # meters
79
- up = enu[2] # meters
80
-
81
- return east, north, up
82
 
83
- def from_enu(self, east: np.ndarray, north: np.ndarray, up: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
84
- """Convert ENU back to lat/lon/alt."""
85
  enu = np.stack([east, north, up], axis=0)
86
  ecef_delta = self.R.T @ enu
87
-
88
  x = ecef_delta[0] + self.x0
89
  y = ecef_delta[1] + self.y0
90
  z = ecef_delta[2] + self.z0
91
-
92
  lons, lats, alts = self.transformer_to_lla.transform(x, y, z)
93
  return lats, lons, alts
94
 
95
 
96
  # ============================================================
97
- # 2. Three-Point Central Derivative
98
  # ============================================================
99
 
100
- def three_point_derivative(values: np.ndarray, dt: np.ndarray) -> np.ndarray:
101
  """
102
- Compute derivative using 3-point central difference.
103
-
104
- For interior points (i=1..N-2):
105
- f'(i) = (f(i+1) - f(i-1)) / (t(i+1) - t(i-1))
106
-
107
- For endpoints:
108
- f'(0) = (f(1) - f(0)) / (t(1) - t(0)) # forward difference
109
- f'(N-1) = (f(N-1) - f(N-2)) / (t(N-1) - t(N-2)) # backward difference
110
-
111
- Args:
112
- values: shape (N,) — the signal to differentiate
113
- dt: shape (N,) — cumulative time from start (seconds)
114
-
115
- Returns:
116
- derivative: shape (N,) — rate of change per second
117
  """
118
  N = len(values)
119
  deriv = np.zeros(N)
120
-
121
  if N < 2:
122
  return deriv
123
 
124
- # Forward difference for first point
125
- dt_fwd = dt[1] - dt[0]
126
  if dt_fwd > 0:
127
  deriv[0] = (values[1] - values[0]) / dt_fwd
128
 
129
- # Central difference for interior points
130
- for i in range(1, N - 1):
131
- dt_span = dt[i + 1] - dt[i - 1]
132
- if dt_span > 0:
133
- deriv[i] = (values[i + 1] - values[i - 1]) / dt_span
134
-
135
- # Backward difference for last point
136
- dt_bwd = dt[-1] - dt[-2]
137
- if dt_bwd > 0:
138
- deriv[-1] = (values[-1] - values[-2]) / dt_bwd
139
-
140
- return deriv
141
-
142
-
143
- def three_point_derivative_vectorized(values: np.ndarray, dt: np.ndarray) -> np.ndarray:
144
- """Vectorized version of 3-point central derivative."""
145
- N = len(values)
146
- deriv = np.zeros(N)
147
-
148
- if N < 2:
149
- return deriv
150
-
151
- # Forward difference for first point
152
- dt_fwd = dt[1] - dt[0]
153
- if dt_fwd > 0:
154
- deriv[0] = (values[1] - values[0]) / dt_fwd
155
-
156
- # Central difference for interior points (vectorized)
157
  if N > 2:
158
- dt_span = dt[2:] - dt[:-2] # (N-2,)
159
  mask = dt_span > 0
160
- val_diff = values[2:] - values[:-2] # (N-2,)
161
  deriv[1:-1] = np.where(mask, val_diff / np.maximum(dt_span, 1e-10), 0.0)
162
 
163
- # Backward difference for last point
164
- dt_bwd = dt[-1] - dt[-2]
165
  if dt_bwd > 0:
166
  deriv[-1] = (values[-1] - values[-2]) / dt_bwd
167
 
@@ -172,172 +117,115 @@ def three_point_derivative_vectorized(values: np.ndarray, dt: np.ndarray) -> np.
172
  # 3. Feature Derivation from ENU positions
173
  # ============================================================
174
 
175
- def derive_features_enu(
176
- east: np.ndarray,
177
- north: np.ndarray,
178
- up: np.ndarray,
179
- timestamps: np.ndarray
180
- ) -> Dict[str, np.ndarray]:
181
  """
182
- Derive COG, SOG, ROT, and altitude rate from ENU positions
183
- using 3-point central derivatives.
184
-
185
- Args:
186
- east, north, up: ENU coordinates in meters, shape (N,)
187
- timestamps: Unix timestamps in seconds, shape (N,)
188
 
189
- Returns:
190
- dict with keys: 'vx', 'vy', 'vz', 'COG', 'SOG', 'ROT', 'alt_rate'
191
- Each is shape (N,)
 
192
  """
193
- # Cumulative time from start
194
  t = timestamps - timestamps[0]
195
 
196
- # 3-point derivative on ENU positions → velocities (m/s)
197
- vx = three_point_derivative_vectorized(east, t) # East velocity (m/s)
198
- vy = three_point_derivative_vectorized(north, t) # North velocity (m/s)
199
- vz = three_point_derivative_vectorized(up, t) # Up velocity (m/s)
200
 
201
- # SOG from ground plane velocity: sqrt(vx² + vy²), convert m/s → knots
202
  sog_ms = np.sqrt(vx**2 + vy**2)
203
- sog_knots = sog_ms * 1.94384 # m/s to knots
204
 
205
- # COG from ground velocity components: atan2(vx, vy) degrees from North
206
- # atan2(East, North) gives bearing from North, clockwise
207
  cog_deg = np.degrees(np.arctan2(vx, vy)) % 360
208
 
209
- # ROT: derivative of COG (degrees/second)
210
- # Need to handle circular wraparound — unwrap COG first
211
  cog_unwrapped = np.unwrap(np.radians(cog_deg))
212
- rot_rad_s = three_point_derivative_vectorized(cog_unwrapped, t)
213
  rot_deg_s = np.degrees(rot_rad_s)
214
 
215
- # Altitude rate: vz converted to ft/min
216
- alt_rate_ftmin = vz * 196.85 # m/s → ft/min (1 m/s = 196.85 ft/min)
217
 
218
  return {
219
- 'vx': vx,
220
- 'vy': vy,
221
- 'vz': vz,
222
- 'COG': cog_deg,
223
- 'SOG': sog_knots,
224
- 'ROT': rot_deg_s,
225
- 'alt_rate': alt_rate_ftmin,
226
  }
227
 
228
 
229
  # ============================================================
230
- # 4. Binary Geohash Encoding (following LLM4STP, 40-bit precision)
231
  # ============================================================
232
 
233
- def binary_geohash_encode(
234
- values: np.ndarray,
235
- precision: int = 40,
236
- v_min: float = 0.0,
237
- v_max: float = 1.0
238
- ) -> np.ndarray:
239
- """
240
- Encode normalized values as binary geohash via successive bisection.
241
- Matches LLM4STP's num2bits() implementation.
242
-
243
- Args:
244
- values: shape (N,) — normalized to [v_min, v_max]
245
- precision: number of bits
246
- v_min, v_max: range bounds
247
-
248
- Returns:
249
- bits: shape (N, precision) — binary encoding (0/1)
250
- """
251
  N = len(values)
252
  bits = np.zeros((N, precision), dtype=np.int64)
253
-
254
  _min = np.full(N, v_min)
255
  _max = np.full(N, v_max)
256
-
257
  for p in range(precision):
258
  mid = (_min + _max) / 2
259
  mask = values > mid
260
  bits[:, p] = mask.astype(np.int64)
261
  _min = np.where(mask, mid, _min)
262
  _max = np.where(mask, _max, mid)
263
-
264
  return bits
265
 
266
 
267
- def binary_geohash_decode(
268
- bits: np.ndarray,
269
- precision: int = 40,
270
- v_min: float = 0.0,
271
- v_max: float = 1.0
272
- ) -> np.ndarray:
273
- """Decode binary geohash back to values."""
274
  N = bits.shape[0]
275
  _min = np.full(N, v_min)
276
  _max = np.full(N, v_max)
277
-
278
  for p in range(precision):
279
  mid = (_min + _max) / 2
280
  mask = bits[:, p].astype(bool)
281
  _min = np.where(mask, mid, _min)
282
  _max = np.where(mask, _max, mid)
283
-
284
  return (_min + _max) / 2
285
 
286
 
287
  class GeohashEncoder:
288
  """
289
- 3D geohash encoder for aviation.
290
- Encodes (east, north, up) ENU coordinates as binary geohash.
291
 
292
- Following LLM4STP: 40 bits per axis, coordinates normalized to [0,1].
293
- Extended to 3 axes (E, N, U) for aviation 3D trajectory.
294
-
295
- Total encoding: 40*3 = 120 bits per timestep.
296
  """
297
 
298
- def __init__(self, precision: int = 40):
299
  self.precision = precision
300
- # Normalization bounds set from training data
301
- self.e_min = None
302
- self.e_max = None
303
- self.n_min = None
304
- self.n_max = None
305
- self.u_min = None
306
- self.u_max = None
307
-
308
- def fit(self, east: np.ndarray, north: np.ndarray, up: np.ndarray, margin: float = 0.05):
309
- """Set normalization bounds from training data with margin."""
310
- e_range = east.max() - east.min()
311
- n_range = north.max() - north.min()
312
- u_range = up.max() - up.min()
313
-
314
- self.e_min = east.min() - margin * max(e_range, 1.0)
315
- self.e_max = east.max() + margin * max(e_range, 1.0)
316
- self.n_min = north.min() - margin * max(n_range, 1.0)
317
- self.n_max = north.max() + margin * max(n_range, 1.0)
318
- self.u_min = up.min() - margin * max(u_range, 1.0)
319
- self.u_max = up.max() + margin * max(u_range, 1.0)
320
-
321
- def normalize(self, values: np.ndarray, v_min: float, v_max: float) -> np.ndarray:
322
- """Normalize to [0, 1]."""
323
  return np.clip((values - v_min) / max(v_max - v_min, 1e-10), 0.0, 1.0)
324
 
325
- def encode(self, east: np.ndarray, north: np.ndarray, up: np.ndarray) -> np.ndarray:
326
- """
327
- Encode ENU positions as 3D binary geohash.
328
-
329
- Returns:
330
- bits: shape (N, precision*3) — concatenated [E_bits | N_bits | U_bits]
331
- """
332
- e_norm = self.normalize(east, self.e_min, self.e_max)
333
- n_norm = self.normalize(north, self.n_min, self.n_max)
334
- u_norm = self.normalize(up, self.u_min, self.u_max)
335
-
336
  e_bits = binary_geohash_encode(e_norm, self.precision)
337
  n_bits = binary_geohash_encode(n_norm, self.precision)
338
  u_bits = binary_geohash_encode(u_norm, self.precision)
339
-
340
  return np.concatenate([e_bits, n_bits, u_bits], axis=1) # (N, 120)
 
 
 
 
 
 
 
341
 
342
 
343
  # ============================================================
@@ -346,413 +234,187 @@ class GeohashEncoder:
346
 
347
  @dataclass
348
  class FeatureBins:
349
- """Configuration for discretizing continuous features into bins."""
350
-
351
- # COG: [0, 360) degrees, 2° bins for high resolution
352
- cog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 360, 181)) # 180 bins
353
-
354
- # SOG: [0, 600] knots, 2-knot bins
355
- sog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 600, 301)) # 300 bins
356
-
357
- # ROT: [-6, 6] deg/s, 0.1 deg/s bins
358
- rot_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6, 6, 121)) # 120 bins
359
-
360
- # Altitude rate: [-6000, 6000] ft/min, 100 ft/min bins
361
  alt_rate_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6000, 6000, 121)) # 120 bins
362
 
363
  @property
364
  def n_cog_bins(self): return len(self.cog_edges) - 1
365
-
366
  @property
367
  def n_sog_bins(self): return len(self.sog_edges) - 1
368
-
369
  @property
370
  def n_rot_bins(self): return len(self.rot_edges) - 1
371
-
372
  @property
373
  def n_alt_rate_bins(self): return len(self.alt_rate_edges) - 1
374
 
375
- def digitize(self, values: np.ndarray, edges: np.ndarray) -> np.ndarray:
376
- """Bin values. Returns indices in [0, n_bins-1], clipped."""
377
- indices = np.digitize(values, edges) - 1
378
- return np.clip(indices, 0, len(edges) - 2)
379
 
380
- def encode_cog(self, cog: np.ndarray) -> np.ndarray:
381
- return self.digitize(cog, self.cog_edges)
382
-
383
- def encode_sog(self, sog: np.ndarray) -> np.ndarray:
384
- return self.digitize(sog, self.sog_edges)
385
-
386
- def encode_rot(self, rot: np.ndarray) -> np.ndarray:
387
- rot_clipped = np.clip(rot, -6, 6)
388
- return self.digitize(rot_clipped, self.rot_edges)
389
-
390
- def encode_alt_rate(self, alt_rate: np.ndarray) -> np.ndarray:
391
- ar_clipped = np.clip(alt_rate, -6000, 6000)
392
- return self.digitize(ar_clipped, self.alt_rate_edges)
393
 
394
 
395
  # ============================================================
396
- # 6. Uncertainty Score
397
  # ============================================================
398
 
399
- def compute_uncertainty(
400
- cog: np.ndarray,
401
- sog: np.ndarray,
402
- rot: np.ndarray,
403
- alt_rate: np.ndarray,
404
- window: int = 5
405
- ) -> np.ndarray:
406
- """
407
- Compute trajectory uncertainty score from recent state variance.
408
- High variance = high uncertainty (maneuvering aircraft).
409
-
410
- Returns:
411
- scores: shape (N,) — uncertainty scores (higher = more uncertain)
412
- """
413
- N = len(cog)
414
- scores = np.zeros(N)
415
-
416
- for i in range(N):
417
- start = max(0, i - window + 1)
418
- w = slice(start, i + 1)
419
-
420
- # Circular variance for COG
421
- cog_rad = np.radians(cog[w])
422
- R_len = np.sqrt(np.mean(np.cos(cog_rad))**2 + np.mean(np.sin(cog_rad))**2)
423
- cog_var = 1 - R_len # circular variance [0, 1]
424
-
425
- # Regular variance for others
426
- sog_var = np.var(sog[w]) if len(sog[w]) > 1 else 0
427
- rot_var = np.var(rot[w]) if len(rot[w]) > 1 else 0
428
- alt_var = np.var(alt_rate[w]) if len(alt_rate[w]) > 1 else 0
429
-
430
- # Normalize and combine (equal weights)
431
- scores[i] = cog_var + sog_var / max(np.var(sog) + 1e-10, 1e-10) + \
432
- rot_var / max(np.var(rot) + 1e-10, 1e-10) + \
433
- alt_var / max(np.var(alt_rate) + 1e-10, 1e-10)
434
-
435
- return scores
436
-
437
-
438
- def discretize_uncertainty(scores: np.ndarray, n_bins: int = 16) -> np.ndarray:
439
- """Discretize uncertainty scores into quantile bins."""
440
- if len(np.unique(scores)) < n_bins:
441
- # Not enough unique values for quantile binning
442
- edges = np.linspace(scores.min(), scores.max() + 1e-10, n_bins + 1)
443
- else:
444
- edges = np.quantile(scores, np.linspace(0, 1, n_bins + 1))
445
- edges[-1] += 1e-10 # ensure max value is included
446
-
447
- return np.clip(np.digitize(scores, edges) - 1, 0, n_bins - 1)
448
-
449
-
450
- # ============================================================
451
- # 7. Temporal Features
452
- # ============================================================
453
-
454
- def extract_temporal_features(timestamps: np.ndarray) -> Dict[str, np.ndarray]:
455
- """
456
- Extract temporal features from Unix timestamps.
457
-
458
- Returns dict with:
459
- 'second_of_day': float seconds within the day [0, 86400)
460
- 'hour': int hour of day [0, 23]
461
- 'dow': int day of week [0, 6]
462
- 'month': int month [0, 11]
463
- 'dt': float seconds since previous point (0 for first)
464
- 'fractional_second': float sub-second component [0, 1)
465
- """
466
  import datetime
467
 
468
- # Convert to datetime objects for calendar features
469
  dts = [datetime.datetime.utcfromtimestamp(t) for t in timestamps]
470
-
471
  hours = np.array([d.hour for d in dts], dtype=np.int64)
472
  dows = np.array([d.weekday() for d in dts], dtype=np.int64)
473
- months = np.array([d.month - 1 for d in dts], dtype=np.int64) # 0-indexed
474
 
475
- # Second of day (with fractional seconds)
476
  second_of_day = np.array([
477
  d.hour * 3600 + d.minute * 60 + d.second + d.microsecond / 1e6
478
  for d in dts
479
- ])
480
 
481
- # Delta-t between consecutive points
482
- dt = np.zeros(len(timestamps))
483
  dt[1:] = np.diff(timestamps)
484
 
485
- # Fractional second component
486
- fractional_second = timestamps - np.floor(timestamps)
487
-
488
  return {
489
  'second_of_day': second_of_day,
490
- 'hour': hours,
491
- 'dow': dows,
492
- 'month': months,
493
  'dt': dt,
494
- 'fractional_second': fractional_second,
495
  }
496
 
497
 
498
  # ============================================================
499
- # 8. Full Trajectory Processor
500
  # ============================================================
501
 
502
  class TrajectoryProcessor:
503
- """
504
- Complete pipeline: raw ADS-B → model-ready features.
505
- """
506
-
507
- def __init__(
508
- self,
509
- resample_dt: float = 5.0, # resample interval in seconds
510
- geohash_precision: int = 40, # bits per axis
511
- n_uncertainty_bins: int = 16,
512
- feature_bins: Optional[FeatureBins] = None,
513
- min_trajectory_len: int = 20, # minimum points after processing
514
- ):
515
  self.resample_dt = resample_dt
516
  self.geohash_precision = geohash_precision
517
  self.n_uncertainty_bins = n_uncertainty_bins
518
  self.feature_bins = feature_bins or FeatureBins()
519
  self.min_trajectory_len = min_trajectory_len
520
  self.geohash_encoder = GeohashEncoder(precision=geohash_precision)
521
-
522
- # Fit state
523
  self._fitted = False
524
 
525
- def resample_trajectory(
526
- self, timestamps: np.ndarray, lats: np.ndarray, lons: np.ndarray, alts: np.ndarray
527
- ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
528
- """Resample trajectory to fixed time intervals via linear interpolation."""
529
- t_start = timestamps[0]
530
- t_end = timestamps[-1]
531
-
532
  n_points = int((t_end - t_start) / self.resample_dt) + 1
 
 
533
  t_new = np.linspace(t_start, t_start + (n_points - 1) * self.resample_dt, n_points)
534
-
535
- lats_new = np.interp(t_new, timestamps, lats)
536
- lons_new = np.interp(t_new, timestamps, lons)
537
- alts_new = np.interp(t_new, timestamps, alts)
538
-
539
- return t_new, lats_new, lons_new, alts_new
540
-
541
- def process_trajectory(
542
- self,
543
- timestamps: np.ndarray,
544
- lats: np.ndarray,
545
- lons: np.ndarray,
546
- alts: np.ndarray,
547
- metadata: Optional[Dict] = None
548
- ) -> Optional[Dict[str, np.ndarray]]:
549
- """
550
- Process a single trajectory from raw ADS-B to model features.
551
-
552
- Returns None if trajectory is too short or invalid.
553
- Returns dict with all features needed for the model.
554
- """
555
- # Sort by time
556
  sort_idx = np.argsort(timestamps)
557
- timestamps = timestamps[sort_idx]
558
- lats = lats[sort_idx]
559
- lons = lons[sort_idx]
560
- alts = alts[sort_idx]
561
 
562
- # Resample to fixed interval
563
  timestamps, lats, lons, alts = self.resample_trajectory(timestamps, lats, lons, alts)
564
-
565
  if len(timestamps) < self.min_trajectory_len:
566
  return None
567
 
568
- # Convert to ENU (origin = first point)
569
  converter = ENUConverter(lats[0], lons[0], alts[0])
570
  east, north, up = converter.to_enu(lats, lons, alts)
571
 
572
- # Derive features via 3-point derivative on ENU
573
  features = derive_features_enu(east, north, up, timestamps)
574
 
575
- # Binary geohash encoding on ENU positions
576
- # If encoder not yet fitted, store placeholder (will be re-encoded after fitting)
577
  if self.geohash_encoder.e_min is not None:
578
- geohash_bits = self.geohash_encoder.encode(east, north, up) # (N, 120)
579
  else:
580
  geohash_bits = np.zeros((len(east), self.geohash_precision * 3), dtype=np.int64)
581
 
582
- # Discretize kinematic features
583
  cog_bins = self.feature_bins.encode_cog(features['COG'])
584
  sog_bins = self.feature_bins.encode_sog(features['SOG'])
585
  rot_bins = self.feature_bins.encode_rot(features['ROT'])
586
  alt_rate_bins = self.feature_bins.encode_alt_rate(features['alt_rate'])
587
 
588
- # Uncertainty — multiple methods
589
- from uncertainty import (
590
- compute_all_uncertainties, discretize_scores, UncertaintyConfig
591
- )
592
- uncert_config = UncertaintyConfig(
593
- use_kinematic_variance=True,
594
- use_prediction_residual=True,
595
- use_spatial_density=True,
596
- use_flight_phase_entropy=True,
597
- use_temporal_irregularity=False,
598
- n_bins=self.n_uncertainty_bins,
599
- window=5,
600
- )
601
  raw_uncert = compute_all_uncertainties(
602
  east, north, up, timestamps,
603
  features['COG'], features['SOG'], features['ROT'], features['alt_rate'],
604
  config=uncert_config,
605
  )
606
- # Discretize each method into bins → stack into (N, n_methods) array
607
  uncert_methods = sorted(raw_uncert.keys())
608
  uncert_bins_multi = np.stack([
609
  discretize_scores(raw_uncert[m], self.n_uncertainty_bins)
610
  for m in uncert_methods
611
- ], axis=1) # (N, n_methods)
612
-
613
- # Also keep legacy single-method for backwards compat
614
- if 'kinematic_var' in raw_uncert:
615
- uncert_bins = discretize_scores(raw_uncert['kinematic_var'], self.n_uncertainty_bins)
616
- else:
617
- uncert_bins = uncert_bins_multi[:, 0]
618
 
619
- # Temporal features
620
  temporal = extract_temporal_features(timestamps)
621
 
622
  return {
623
- # Raw (for evaluation/debugging)
624
- 'timestamps': timestamps,
625
- 'lats': lats,
626
- 'lons': lons,
627
- 'alts': alts,
628
- 'east': east,
629
- 'north': north,
630
- 'up': up,
631
-
632
- # Continuous features
633
- 'COG': features['COG'],
634
- 'SOG': features['SOG'],
635
- 'ROT': features['ROT'],
636
- 'alt_rate': features['alt_rate'],
637
- 'vx': features['vx'],
638
- 'vy': features['vy'],
639
- 'vz': features['vz'],
640
-
641
- # Geohash (binary, 120 bits per timestep)
642
  'geohash_bits': geohash_bits,
643
-
644
- # Discretized features (bin indices)
645
- 'cog_bins': cog_bins,
646
- 'sog_bins': sog_bins,
647
- 'rot_bins': rot_bins,
648
- 'alt_rate_bins': alt_rate_bins,
649
-
650
- # Uncertainty (bin indices)
651
- 'uncert_bins': uncert_bins, # (N,) legacy single method
652
- 'uncert_bins_multi': uncert_bins_multi, # (N, n_methods) multi-method
653
- 'uncert_method_names': uncert_methods, # list of method names
654
-
655
- # Temporal
656
- 'hour': temporal['hour'],
657
- 'dow': temporal['dow'],
658
- 'month': temporal['month'],
659
- 'second_of_day': temporal['second_of_day'],
660
- 'dt': temporal['dt'],
661
-
662
- # ENU converter (for decoding predictions back to lat/lon)
663
  'enu_origin': (converter.origin_lat, converter.origin_lon, converter.origin_alt),
664
-
665
- # Metadata
666
  'metadata': metadata or {},
667
  }
668
 
669
- def fit_geohash(self, all_east: np.ndarray, all_north: np.ndarray, all_up: np.ndarray):
670
- """Fit geohash normalization bounds from all training trajectories."""
671
  self.geohash_encoder.fit(all_east, all_north, all_up)
672
  self._fitted = True
673
 
674
 
675
  # ============================================================
676
- # 9. PyTorch Dataset with Sliding Window
677
  # ============================================================
678
 
679
  @dataclass
680
  class PromptTokens:
681
- """Prompt token IDs for metadata encoding."""
682
- # Special tokens
683
- BOS: int = 0
684
- EOS: int = 1
685
- PAD: int = 2
686
-
687
- # Task tokens
688
- PREDICT: int = 3
689
- CLASSIFY: int = 4
690
- DETECT_ANOMALY: int = 5
691
-
692
- # Aircraft category
693
- HEAVY: int = 6
694
- LARGE: int = 7
695
- SMALL: int = 8
696
- ROTORCRAFT: int = 9
697
- GLIDER: int = 10
698
- UAV: int = 11
699
- AIRCRAFT_UNKNOWN: int = 12
700
-
701
- # Flight phase
702
- CLIMB: int = 13
703
- CRUISE: int = 14
704
- DESCENT: int = 15
705
- APPROACH: int = 16
706
- GROUND: int = 17
707
- PHASE_UNKNOWN: int = 18
708
-
709
- # Region
710
- CONUS: int = 19
711
- EUROPE: int = 20
712
- ASIA: int = 21
713
- REGION_OTHER: int = 22
714
-
715
  VOCAB_SIZE: int = 23
716
 
717
 
 
 
 
 
718
  class AirTrackDataset(Dataset):
719
- """
720
- Sliding-window dataset for next-state prediction.
721
-
722
- Each sample is a window of `seq_len` consecutive states.
723
- The model predicts state[t+1] from state[1:t] for all t.
724
- """
725
-
726
- def __init__(
727
- self,
728
- trajectories: List[Dict[str, np.ndarray]],
729
- seq_len: int = 128,
730
- stride: int = 64,
731
- task: str = 'predict', # 'predict' or 'classify'
732
- ):
733
  self.seq_len = seq_len
734
  self.stride = stride
735
  self.task = task
736
-
737
- # Build index of (trajectory_idx, start_pos) for all valid windows
738
  self.windows = []
739
  self.trajectories = trajectories
740
 
741
  for traj_idx, traj in enumerate(trajectories):
742
  n_points = len(traj['timestamps'])
743
- # Need seq_len + 1 points (seq_len inputs + 1 target for last position)
744
  if n_points < seq_len + 1:
745
- # Use entire trajectory if it's at least min length
746
  if n_points >= 20:
747
  self.windows.append((traj_idx, 0, n_points))
748
  continue
749
-
750
  for start in range(0, n_points - seq_len, stride):
751
- end = start + seq_len + 1 # +1 for next-state target
752
  if end <= n_points:
753
  self.windows.append((traj_idx, start, end))
754
 
755
- # Prompt tokens
756
  self.prompt_tokens = PromptTokens()
757
 
758
  def __len__(self):
@@ -761,99 +423,56 @@ class AirTrackDataset(Dataset):
761
  def __getitem__(self, idx):
762
  traj_idx, start, end = self.windows[idx]
763
  traj = self.trajectories[traj_idx]
764
-
765
- # Slice the window
766
  sl = slice(start, end)
767
 
768
- # Geohash bits: (window_len, 120)
769
- geohash_bits = torch.from_numpy(traj['geohash_bits'][sl]).float()
770
-
771
- # Discretized features
772
- cog_bins = torch.from_numpy(traj['cog_bins'][sl]).long()
773
- sog_bins = torch.from_numpy(traj['sog_bins'][sl]).long()
774
- rot_bins = torch.from_numpy(traj['rot_bins'][sl]).long()
775
- alt_rate_bins = torch.from_numpy(traj['alt_rate_bins'][sl]).long()
776
-
777
- # Uncertainty bins (single + multi)
778
- uncert_bins = torch.from_numpy(traj['uncert_bins'][sl]).long()
779
- if 'uncert_bins_multi' in traj:
780
- uncert_bins_multi = torch.from_numpy(traj['uncert_bins_multi'][sl]).long()
781
- else:
782
- uncert_bins_multi = uncert_bins.unsqueeze(-1)
783
-
784
- # Temporal features
785
- hour = torch.from_numpy(traj['hour'][sl]).long()
786
- dow = torch.from_numpy(traj['dow'][sl]).long()
787
- month = torch.from_numpy(traj['month'][sl]).long()
788
-
789
- # Second-of-day as continuous feature (for sinusoidal encoding)
790
- second_of_day = torch.from_numpy(traj['second_of_day'][sl]).float()
791
-
792
- # Delta-t between points
793
- dt = torch.from_numpy(traj['dt'][sl]).float()
794
-
795
- # Prompt tokens (fixed for prediction task)
796
  task_token = self.prompt_tokens.PREDICT if self.task == 'predict' else self.prompt_tokens.CLASSIFY
797
  prompt = torch.tensor([
798
- self.prompt_tokens.BOS,
799
- task_token,
800
- self.prompt_tokens.AIRCRAFT_UNKNOWN, # default; override with metadata
801
  self.prompt_tokens.PHASE_UNKNOWN,
802
  self.prompt_tokens.REGION_OTHER,
803
  ], dtype=torch.long)
804
 
805
- # Continuous ENU positions (for evaluation / regression head)
806
- east = torch.from_numpy(traj['east'][sl]).float()
807
- north = torch.from_numpy(traj['north'][sl]).float()
808
- up = torch.from_numpy(traj['up'][sl]).float()
809
-
810
  return {
811
- 'geohash_bits': geohash_bits,
812
- 'cog_bins': cog_bins,
813
- 'sog_bins': sog_bins,
814
- 'rot_bins': rot_bins,
815
- 'alt_rate_bins': alt_rate_bins,
816
- 'uncert_bins': uncert_bins,
817
- 'uncert_bins_multi': uncert_bins_multi,
818
- 'hour': hour,
819
- 'dow': dow,
820
- 'month': month,
821
- 'second_of_day': second_of_day,
822
- 'dt': dt,
823
  'prompt': prompt,
824
- 'east': east,
825
- 'north': north,
826
- 'up': up,
827
  }
828
 
829
 
830
  # ============================================================
831
- # 10. Data Loading Utilities
832
  # ============================================================
833
 
834
- def load_traffic_sample(name: str = 'quickstart') -> List[Dict]:
835
- """
836
- Load sample data from the `traffic` library.
837
-
- """
841
  import traffic.data.samples as samples
 
842
 
843
  data = getattr(samples, name)
844
  trajectories = []
845
-
846
- # Handle both Traffic (collection) and Flight (single) objects
847
  flights = data if hasattr(data, '__iter__') else [data]
848
 
849
  for flight in flights:
850
- df = flight.data
851
-
 
 
852
  if df is None or len(df) < 20:
853
  continue
854
 
855
- # Extract required columns — handle tz-aware and PyArrow timestamps
856
- import pandas as pd
857
  ts_series = pd.to_datetime(df['timestamp'])
858
  if ts_series.dt.tz is not None:
859
  ts_series = ts_series.dt.tz_convert('UTC').dt.tz_localize(None)
@@ -861,7 +480,6 @@ def load_traffic_sample(name: str = 'quickstart') -> List[Dict]:
861
  lats = df['latitude'].values.astype(np.float64)
862
  lons = df['longitude'].values.astype(np.float64)
863
 
864
- # Altitude: try barometric first, then geometric
865
  if 'altitude' in df.columns:
866
  alts = df['altitude'].values.astype(np.float64)
867
  elif 'baro_altitude' in df.columns:
@@ -869,41 +487,21 @@ def load_traffic_sample(name: str = 'quickstart') -> List[Dict]:
869
  else:
870
  alts = np.zeros(len(df))
871
 
872
- # Handle NaNs
873
  valid = ~(np.isnan(lats) | np.isnan(lons) | np.isnan(alts) | np.isnan(timestamps))
874
  if valid.sum() < 20:
875
  continue
876
 
877
  trajectories.append({
878
  'timestamps': timestamps[valid],
879
- 'lats': lats[valid],
880
- 'lons': lons[valid],
881
- 'alts': alts[valid],
882
- 'callsign': flight.callsign if hasattr(flight, 'callsign') else 'UNKNOWN',
883
- 'icao24': flight.icao24 if hasattr(flight, 'icao24') else 'UNKNOWN',
884
  })
885
 
886
  return trajectories
887
 
888
 
889
- def build_dataset(
890
- raw_trajectories: List[Dict],
891
- processor: TrajectoryProcessor,
892
- seq_len: int = 128,
893
- stride: int = 64,
894
- fit_geohash: bool = True,
895
- ) -> AirTrackDataset:
896
- """
897
- Process raw trajectories and build PyTorch dataset.
898
-
899
- Args:
900
- raw_trajectories: list of dicts with 'timestamps', 'lats', 'lons', 'alts'
901
- processor: TrajectoryProcessor instance
902
- seq_len: sliding window size
903
- stride: sliding window stride
904
- fit_geohash: if True, fit geohash bounds from this data
905
- """
906
- # First pass: convert to ENU and collect bounds for geohash fitting
907
  processed = []
908
  all_east, all_north, all_up = [], [], []
909
 
@@ -919,28 +517,22 @@ def build_dataset(
919
  all_up.append(result['up'])
920
 
921
  if fit_geohash and processed:
922
- # Fit geohash bounds from all trajectories
923
  all_e = np.concatenate(all_east)
924
  all_n = np.concatenate(all_north)
925
  all_u = np.concatenate(all_up)
926
  processor.fit_geohash(all_e, all_n, all_u)
927
-
928
- # Re-encode geohash with fitted bounds
929
  for traj in processed:
930
  traj['geohash_bits'] = processor.geohash_encoder.encode(
931
  traj['east'], traj['north'], traj['up']
932
  )
933
 
934
  print(f"Processed {len(processed)}/{len(raw_trajectories)} trajectories")
935
-
936
  dataset = AirTrackDataset(processed, seq_len=seq_len, stride=stride)
937
  print(f"Created dataset with {len(dataset)} windows")
938
-
939
  return dataset
940
 
941
 
942
  if __name__ == '__main__':
943
- # Quick test with traffic sample data
944
  print("Loading traffic sample data...")
945
  raw_trajs = load_traffic_sample()
946
  print(f"Loaded {len(raw_trajs)} raw trajectories")
@@ -949,12 +541,9 @@ if __name__ == '__main__':
949
  processor = TrajectoryProcessor(resample_dt=5.0)
950
  dataset = build_dataset(raw_trajs, processor, seq_len=64, stride=32)
951
 
952
- print(f"\nDataset size: {len(dataset)}")
953
  if len(dataset) > 0:
954
  sample = dataset[0]
955
- print("\nSample keys and shapes:")
956
  for k, v in sample.items():
957
  if isinstance(v, torch.Tensor):
958
  print(f" {k}: {v.shape} ({v.dtype})")
959
- else:
960
- print(f" {k}: {type(v)}")
 
6
  Pipeline:
7
  1. Load trajectories from `traffic` library or raw CSV
8
  2. Resample to fixed time interval (default 5s)
9
+ 3. Convert lat/lon/alt to ENU (East-North-Up) using first lat/lon point as origin
10
  4. Compute velocity via 3-point central derivative on ENU positions
11
  5. Derive COG, SOG from x-y ground velocity; ROT from COG; altitude rate from z velocity
12
+ 6. Binary geohash encoding (40-bit per axis, following LLM4STP approach)
13
  7. Discretize features into bins
14
  8. Compute uncertainty scores
15
  9. Build sliding-window PyTorch Dataset
16
+
17
+ KEY DESIGN CHOICES:
18
+ - Time: sub-second (fractional) resolution via float64 Unix timestamps + sinusoidal encoding
19
+ - Geohash: 40-bit binary per axis with PER-TRAJECTORY normalization for max spatial resolution
20
+ 40 bits → 2^40 ≈ 10^12 levels. For a 500km trajectory range, that's ~0.5μm resolution.
21
+ Tighter than any geohash resolution level.
22
+ - COG/SOG: derived from ENU velocity components via 3-point central derivative (not from raw lat/lon)
23
+ - ENU origin: first (lat, lon) of each trajectory
24
  """
25
 
26
  import numpy as np
 
37
 
38
  class ENUConverter:
39
  """
40
+ Convert WGS84 (lat, lon, alt) to local East-North-Up (ENU).
41
+ Origin = first point of each trajectory.
 
 
42
  """
43
 
44
  def __init__(self, origin_lat: float, origin_lon: float, origin_alt: float = 0.0):
 
46
  self.origin_lon = origin_lon
47
  self.origin_alt = origin_alt
48
 
 
49
  self.ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
50
  self.lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
51
  self.transformer_to_ecef = pyproj.Transformer.from_proj(self.lla, self.ecef, always_xy=True)
52
  self.transformer_to_lla = pyproj.Transformer.from_proj(self.ecef, self.lla, always_xy=True)
53
 
 
54
  self.x0, self.y0, self.z0 = self.transformer_to_ecef.transform(
55
  origin_lon, origin_lat, origin_alt
56
  )
57
 
 
58
  lat_r = np.radians(origin_lat)
59
  lon_r = np.radians(origin_lon)
60
  self.R = np.array([
 
63
  [ np.cos(lat_r)*np.cos(lon_r), np.cos(lat_r)*np.sin(lon_r), np.sin(lat_r)]
64
  ])
65
 
66
+ def to_enu(self, lats, lons, alts):
 
 
67
  x, y, z = self.transformer_to_ecef.transform(lons, lats, alts)
 
 
68
  dx = x - self.x0
69
  dy = y - self.y0
70
  dz = z - self.z0
71
+ ecef_delta = np.stack([dx, dy, dz], axis=0)
72
+ enu = self.R @ ecef_delta
73
+ return enu[0], enu[1], enu[2]
 
 
74
 
75
+ def from_enu(self, east, north, up):
 
76
  enu = np.stack([east, north, up], axis=0)
77
  ecef_delta = self.R.T @ enu
 
78
  x = ecef_delta[0] + self.x0
79
  y = ecef_delta[1] + self.y0
80
  z = ecef_delta[2] + self.z0
 
81
  lons, lats, alts = self.transformer_to_lla.transform(x, y, z)
82
  return lats, lons, alts
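A minimal round-trip sketch of the converter above (origin and test point are made-up values):

    import numpy as np
    conv = ENUConverter(origin_lat=47.45, origin_lon=8.55, origin_alt=430.0)
    e, n, u = conv.to_enu(np.array([47.46]), np.array([8.56]), np.array([900.0]))
    lat, lon, alt = conv.from_enu(e, n, u)
    # lat, lon, alt recover the inputs to within numerical precision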
83
 
84
 
85
  # ============================================================
86
+ # 2. Three-Point Central Derivative (vectorized)
87
  # ============================================================
88
 
89
+ def three_point_derivative(values: np.ndarray, t: np.ndarray) -> np.ndarray:
90
  """
91
+ 3-point central derivative. Interior: (f(i+1) - f(i-1)) / (t(i+1) - t(i-1))
92
+ Endpoints: forward/backward difference.
 
 
 
93
  """
94
  N = len(values)
95
  deriv = np.zeros(N)
 
96
  if N < 2:
97
  return deriv
98
 
99
+ dt_fwd = t[1] - t[0]
 
100
  if dt_fwd > 0:
101
  deriv[0] = (values[1] - values[0]) / dt_fwd
102
 
 
 
103
  if N > 2:
104
+ dt_span = t[2:] - t[:-2]
105
  mask = dt_span > 0
106
+ val_diff = values[2:] - values[:-2]
107
  deriv[1:-1] = np.where(mask, val_diff / np.maximum(dt_span, 1e-10), 0.0)
108
 
109
+ dt_bwd = t[-1] - t[-2]
 
110
  if dt_bwd > 0:
111
  deriv[-1] = (values[-1] - values[-2]) / dt_bwd
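The central difference is exact for quadratics, which gives a handy sanity check (arbitrary sample values):

    import numpy as np
    t = np.array([0.0, 1.0, 2.0, 3.0])
    d = three_point_derivative(t ** 2, t)
    # interior points are exact: d[1] == 2.0, d[2] == 4.0
    # endpoints fall back to one-sided differences: d[0] == 1.0, d[3] == 5.0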
112
 
 
117
  # 3. Feature Derivation from ENU positions
118
  # ============================================================
119
 
120
+ def derive_features_enu(east, north, up, timestamps):
 
 
121
  """
122
+ Derive COG, SOG, ROT, alt_rate from ENU positions using 3-point central derivatives.
 
 
123
 
124
+ COG = atan2(vx_east, vy_north) → bearing from North, clockwise [0, 360)
125
+ SOG = sqrt(vx² + vy²) converted to knots
126
+ ROT = d(COG)/dt via 3-point derivative on unwrapped COG
127
+ alt_rate = vz converted to ft/min
128
  """
 
129
  t = timestamps - timestamps[0]
130
 
131
+ vx = three_point_derivative(east, t) # East velocity m/s
132
+ vy = three_point_derivative(north, t) # North velocity m/s
133
+ vz = three_point_derivative(up, t) # Up velocity m/s
 
134
 
 
135
  sog_ms = np.sqrt(vx**2 + vy**2)
136
+ sog_knots = sog_ms * 1.94384 # m/s → knots
137
 
138
+ # COG: atan2(East, North) gives bearing from North, clockwise
 
139
  cog_deg = np.degrees(np.arctan2(vx, vy)) % 360
140
 
141
+ # ROT: derivative of unwrapped COG
 
142
  cog_unwrapped = np.unwrap(np.radians(cog_deg))
143
+ rot_rad_s = three_point_derivative(cog_unwrapped, t)
144
  rot_deg_s = np.degrees(rot_rad_s)
145
 
146
+ # Altitude rate: m/s → ft/min
147
+ alt_rate_ftmin = vz * 196.85
148
 
149
  return {
150
+ 'vx': vx, 'vy': vy, 'vz': vz,
151
+ 'COG': cog_deg, 'SOG': sog_knots,
152
+ 'ROT': rot_deg_s, 'alt_rate': alt_rate_ftmin,
 
 
 
 
153
  }
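As an illustration of these conventions, a hypothetical track moving due east at 100 m/s with points 1 s apart:

    import numpy as np
    feats = derive_features_enu(east=np.array([0.0, 100.0, 200.0]), north=np.zeros(3),
                                up=np.zeros(3), timestamps=np.array([0.0, 1.0, 2.0]))
    # feats['COG'] ≈ 90 (due east), feats['SOG'] ≈ 194.4 knots, feats['ROT'] ≈ 0, feats['alt_rate'] ≈ 0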
154
 
155
 
156
  # ============================================================
157
+ # 4. Binary Geohash Encoding (40-bit per axis → 120 bits total)
158
  # ============================================================
159
 
160
+ def binary_geohash_encode(values, precision=40, v_min=0.0, v_max=1.0):
161
+ """Successive bisection encoding to binary. Matches LLM4STP num2bits()."""
 
 
162
  N = len(values)
163
  bits = np.zeros((N, precision), dtype=np.int64)
 
164
  _min = np.full(N, v_min)
165
  _max = np.full(N, v_max)
 
166
  for p in range(precision):
167
  mid = (_min + _max) / 2
168
  mask = values > mid
169
  bits[:, p] = mask.astype(np.int64)
170
  _min = np.where(mask, mid, _min)
171
  _max = np.where(mask, _max, mid)
 
172
  return bits
173
 
174
 
175
+ def binary_geohash_decode(bits, precision=40, v_min=0.0, v_max=1.0):
 
 
176
  N = bits.shape[0]
177
  _min = np.full(N, v_min)
178
  _max = np.full(N, v_max)
 
179
  for p in range(precision):
180
  mid = (_min + _max) / 2
181
  mask = bits[:, p].astype(bool)
182
  _min = np.where(mask, mid, _min)
183
  _max = np.where(mask, _max, mid)
 
184
  return (_min + _max) / 2
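Encode and decode are inverses up to quantization: after `precision` bisections the decoded midpoint lies within (v_max - v_min) / 2**(precision + 1) of the input. A small sketch with made-up values:

    import numpy as np
    vals = np.array([0.123456, 0.5, 0.987])
    bits = binary_geohash_encode(vals, precision=40)
    back = binary_geohash_decode(bits, precision=40)
    # np.max(np.abs(vals - back)) <= 1.0 / 2 ** 41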
185
 
186
 
187
  class GeohashEncoder:
188
  """
189
+ 3D geohash encoder for ENU coordinates.
190
+ 40 bits per axis × 3 axes = 120 bits per timestep.
191
 
192
+ Uses PER-TRAJECTORY normalization with a small margin so the full
193
+ bit range encodes just the spatial extent of each trajectory.
194
+ For a typical trajectory spanning ~200km, 40 bits gives ~0.2μm resolution.
 
195
  """
196
 
197
+ def __init__(self, precision=40):
198
  self.precision = precision
199
+ self.e_min = self.e_max = None
200
+ self.n_min = self.n_max = None
201
+ self.u_min = self.u_max = None
202
+
203
+ def fit(self, east, north, up, margin=0.05):
204
+ """Fit normalization bounds with margin."""
205
+ for attr, data in [('e', east), ('n', north), ('u', up)]:
206
+ drange = data.max() - data.min()
207
+ m = margin * max(drange, 100.0) # At least 100m range
208
+ setattr(self, f'{attr}_min', data.min() - m)
209
+ setattr(self, f'{attr}_max', data.max() + m)
210
+
211
+ def _normalize(self, values, v_min, v_max):
 
 
212
  return np.clip((values - v_min) / max(v_max - v_min, 1e-10), 0.0, 1.0)
213
 
214
+ def encode(self, east, north, up):
215
+ e_norm = self._normalize(east, self.e_min, self.e_max)
216
+ n_norm = self._normalize(north, self.n_min, self.n_max)
217
+ u_norm = self._normalize(up, self.u_min, self.u_max)
 
 
218
  e_bits = binary_geohash_encode(e_norm, self.precision)
219
  n_bits = binary_geohash_encode(n_norm, self.precision)
220
  u_bits = binary_geohash_encode(u_norm, self.precision)
 
221
  return np.concatenate([e_bits, n_bits, u_bits], axis=1) # (N, 120)
222
+
223
+ def get_bounds(self):
224
+ return {
225
+ 'e_min': self.e_min, 'e_max': self.e_max,
226
+ 'n_min': self.n_min, 'n_max': self.n_max,
227
+ 'u_min': self.u_min, 'u_max': self.u_max,
228
+ }
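Typical use of the encoder (a sketch; the arrays stand in for ENU positions in metres):

    import numpy as np
    enc = GeohashEncoder(precision=40)
    east, north, up = np.array([0.0, 1500.0, 3000.0]), np.array([0.0, 400.0, 900.0]), np.array([100.0, 300.0, 600.0])
    enc.fit(east, north, up, margin=0.05)
    bits = enc.encode(east, north, up)   # shape (3, 120): [E_bits | N_bits | U_bits]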
229
 
230
 
231
  # ============================================================
 
234
 
235
  @dataclass
236
  class FeatureBins:
237
+ """Feature discretization configuration."""
238
+ cog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 360, 181)) # 180 bins, 2°
239
+ sog_edges: np.ndarray = field(default_factory=lambda: np.linspace(0, 600, 301)) # 300 bins, 2 kts
240
+ rot_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6, 6, 121)) # 120 bins, 0.1°/s
 
 
241
  alt_rate_edges: np.ndarray = field(default_factory=lambda: np.linspace(-6000, 6000, 121)) # 120 bins
242
 
243
  @property
244
  def n_cog_bins(self): return len(self.cog_edges) - 1
 
245
  @property
246
  def n_sog_bins(self): return len(self.sog_edges) - 1
 
247
  @property
248
  def n_rot_bins(self): return len(self.rot_edges) - 1
 
249
  @property
250
  def n_alt_rate_bins(self): return len(self.alt_rate_edges) - 1
251
 
252
+ def _digitize(self, values, edges):
253
+ return np.clip(np.digitize(values, edges) - 1, 0, len(edges) - 2)
 
 
254
 
255
+ def encode_cog(self, cog): return self._digitize(cog, self.cog_edges)
256
+ def encode_sog(self, sog): return self._digitize(sog, self.sog_edges)
257
+ def encode_rot(self, rot): return self._digitize(np.clip(rot, -6, 6), self.rot_edges)
258
+ def encode_alt_rate(self, ar): return self._digitize(np.clip(ar, -6000, 6000), self.alt_rate_edges)
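For instance, with the default 2° COG bins (a small sketch):

    import numpy as np
    fb = FeatureBins()
    fb.encode_cog(np.array([0.0, 1.9, 90.0, 359.9]))   # -> array([0, 0, 45, 179]); fb.n_cog_bins == 180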
 
 
259
 
260
 
261
  # ============================================================
262
+ # 6. Temporal Features (sub-second precision)
263
  # ============================================================
264
 
265
+ def extract_temporal_features(timestamps):
266
+ """Extract temporal features preserving fractional second precision."""
 
 
 
267
  import datetime
268
 
 
269
  dts = [datetime.datetime.utcfromtimestamp(t) for t in timestamps]
 
270
  hours = np.array([d.hour for d in dts], dtype=np.int64)
271
  dows = np.array([d.weekday() for d in dts], dtype=np.int64)
272
+ months = np.array([d.month - 1 for d in dts], dtype=np.int64)
273
 
274
+ # Second of day with fractional seconds (sub-second precision)
275
  second_of_day = np.array([
276
  d.hour * 3600 + d.minute * 60 + d.second + d.microsecond / 1e6
277
  for d in dts
278
+ ], dtype=np.float64)
279
 
280
+ dt = np.zeros(len(timestamps), dtype=np.float64)
 
281
  dt[1:] = np.diff(timestamps)
282
 
 
 
 
283
  return {
284
  'second_of_day': second_of_day,
285
+ 'hour': hours, 'dow': dows, 'month': months,
 
 
286
  'dt': dt,
 
287
  }
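A small sketch of the sub-second handling (made-up timestamps 2.5 s apart):

    import numpy as np
    tf = extract_temporal_features(np.array([1700000000.0, 1700000002.5, 1700000005.0]))
    # tf['dt'] == [0.0, 2.5, 2.5], and tf['second_of_day'] keeps the 0.5 s fraction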
288
 
289
 
290
  # ============================================================
291
+ # 7. Full Trajectory Processor
292
  # ============================================================
293
 
294
  class TrajectoryProcessor:
295
+ def __init__(self, resample_dt=5.0, geohash_precision=40, n_uncertainty_bins=16,
296
+ feature_bins=None, min_trajectory_len=20):
 
 
297
  self.resample_dt = resample_dt
298
  self.geohash_precision = geohash_precision
299
  self.n_uncertainty_bins = n_uncertainty_bins
300
  self.feature_bins = feature_bins or FeatureBins()
301
  self.min_trajectory_len = min_trajectory_len
302
  self.geohash_encoder = GeohashEncoder(precision=geohash_precision)
 
 
303
  self._fitted = False
304
 
305
+ def resample_trajectory(self, timestamps, lats, lons, alts):
306
+ t_start, t_end = timestamps[0], timestamps[-1]
 
 
 
 
 
307
  n_points = int((t_end - t_start) / self.resample_dt) + 1
308
+ if n_points < 2:
309
+ return timestamps, lats, lons, alts
310
  t_new = np.linspace(t_start, t_start + (n_points - 1) * self.resample_dt, n_points)
311
+ return t_new, np.interp(t_new, timestamps, lats), np.interp(t_new, timestamps, lons), np.interp(t_new, timestamps, alts)
312
+
313
+ def process_trajectory(self, timestamps, lats, lons, alts, metadata=None):
 
 
 
314
  sort_idx = np.argsort(timestamps)
315
+ timestamps, lats, lons, alts = timestamps[sort_idx], lats[sort_idx], lons[sort_idx], alts[sort_idx]
 
 
 
316
 
 
317
  timestamps, lats, lons, alts = self.resample_trajectory(timestamps, lats, lons, alts)
 
318
  if len(timestamps) < self.min_trajectory_len:
319
  return None
320
 
321
+ # ENU conversion origin = first point
322
  converter = ENUConverter(lats[0], lons[0], alts[0])
323
  east, north, up = converter.to_enu(lats, lons, alts)
324
 
325
+ # Derive kinematics from ENU via 3-point derivative
326
  features = derive_features_enu(east, north, up, timestamps)
327
 
328
+ # Binary geohash
 
329
  if self.geohash_encoder.e_min is not None:
330
+ geohash_bits = self.geohash_encoder.encode(east, north, up)
331
  else:
332
  geohash_bits = np.zeros((len(east), self.geohash_precision * 3), dtype=np.int64)
333
 
334
+ # Discretize kinematics
335
  cog_bins = self.feature_bins.encode_cog(features['COG'])
336
  sog_bins = self.feature_bins.encode_sog(features['SOG'])
337
  rot_bins = self.feature_bins.encode_rot(features['ROT'])
338
  alt_rate_bins = self.feature_bins.encode_alt_rate(features['alt_rate'])
339
 
340
+ # Uncertainty
341
+ from uncertainty import compute_all_uncertainties, discretize_scores, UncertaintyConfig
342
+ uncert_config = UncertaintyConfig(n_bins=self.n_uncertainty_bins, window=5)
 
 
 
343
  raw_uncert = compute_all_uncertainties(
344
  east, north, up, timestamps,
345
  features['COG'], features['SOG'], features['ROT'], features['alt_rate'],
346
  config=uncert_config,
347
  )
 
348
  uncert_methods = sorted(raw_uncert.keys())
349
  uncert_bins_multi = np.stack([
350
  discretize_scores(raw_uncert[m], self.n_uncertainty_bins)
351
  for m in uncert_methods
352
+ ], axis=1)
353
+ uncert_bins = uncert_bins_multi[:, 0] if uncert_bins_multi.shape[1] > 0 else np.zeros(len(east), dtype=np.int64)
 
 
 
 
 
354
 
 
355
  temporal = extract_temporal_features(timestamps)
356
 
357
  return {
358
+ 'timestamps': timestamps, 'lats': lats, 'lons': lons, 'alts': alts,
359
+ 'east': east, 'north': north, 'up': up,
360
+ 'COG': features['COG'], 'SOG': features['SOG'],
361
+ 'ROT': features['ROT'], 'alt_rate': features['alt_rate'],
362
+ 'vx': features['vx'], 'vy': features['vy'], 'vz': features['vz'],
 
 
 
363
  'geohash_bits': geohash_bits,
364
+ 'cog_bins': cog_bins, 'sog_bins': sog_bins,
365
+ 'rot_bins': rot_bins, 'alt_rate_bins': alt_rate_bins,
366
+ 'uncert_bins': uncert_bins, 'uncert_bins_multi': uncert_bins_multi,
367
+ 'uncert_method_names': uncert_methods,
368
+ 'hour': temporal['hour'], 'dow': temporal['dow'], 'month': temporal['month'],
369
+ 'second_of_day': temporal['second_of_day'], 'dt': temporal['dt'],
 
 
 
370
  'enu_origin': (converter.origin_lat, converter.origin_lon, converter.origin_alt),
 
 
371
  'metadata': metadata or {},
372
  }
373
 
374
+ def fit_geohash(self, all_east, all_north, all_up):
 
375
  self.geohash_encoder.fit(all_east, all_north, all_up)
376
  self._fitted = True
377
 
378
 
379
  # ============================================================
380
+ # 8. Prompt Tokens
381
  # ============================================================
382
 
383
  @dataclass
384
  class PromptTokens:
385
+ BOS: int = 0; EOS: int = 1; PAD: int = 2
386
+ PREDICT: int = 3; CLASSIFY: int = 4; DETECT_ANOMALY: int = 5
387
+ HEAVY: int = 6; LARGE: int = 7; SMALL: int = 8
388
+ ROTORCRAFT: int = 9; GLIDER: int = 10; UAV: int = 11; AIRCRAFT_UNKNOWN: int = 12
389
+ CLIMB: int = 13; CRUISE: int = 14; DESCENT: int = 15
390
+ APPROACH: int = 16; GROUND: int = 17; PHASE_UNKNOWN: int = 18
391
+ CONUS: int = 19; EUROPE: int = 20; ASIA: int = 21; REGION_OTHER: int = 22
 
 
392
  VOCAB_SIZE: int = 23
393
 
394
 
395
+ # ============================================================
396
+ # 9. PyTorch Dataset
397
+ # ============================================================
398
+
399
  class AirTrackDataset(Dataset):
400
+ def __init__(self, trajectories, seq_len=128, stride=64, task='predict'):
 
 
 
401
  self.seq_len = seq_len
402
  self.stride = stride
403
  self.task = task
 
 
404
  self.windows = []
405
  self.trajectories = trajectories
406
 
407
  for traj_idx, traj in enumerate(trajectories):
408
  n_points = len(traj['timestamps'])
 
409
  if n_points < seq_len + 1:
 
410
  if n_points >= 20:
411
  self.windows.append((traj_idx, 0, n_points))
412
  continue
 
413
  for start in range(0, n_points - seq_len, stride):
414
+ end = start + seq_len + 1
415
  if end <= n_points:
416
  self.windows.append((traj_idx, start, end))
417
 
 
418
  self.prompt_tokens = PromptTokens()
419
 
420
  def __len__(self):
 
423
  def __getitem__(self, idx):
424
  traj_idx, start, end = self.windows[idx]
425
  traj = self.trajectories[traj_idx]
 
 
426
  sl = slice(start, end)
427
 
 
 
428
  task_token = self.prompt_tokens.PREDICT if self.task == 'predict' else self.prompt_tokens.CLASSIFY
429
  prompt = torch.tensor([
430
+ self.prompt_tokens.BOS, task_token,
431
+ self.prompt_tokens.AIRCRAFT_UNKNOWN,
 
432
  self.prompt_tokens.PHASE_UNKNOWN,
433
  self.prompt_tokens.REGION_OTHER,
434
  ], dtype=torch.long)
435
 
 
436
  return {
437
+ 'geohash_bits': torch.from_numpy(traj['geohash_bits'][sl]).float(),
438
+ 'cog_bins': torch.from_numpy(traj['cog_bins'][sl]).long(),
439
+ 'sog_bins': torch.from_numpy(traj['sog_bins'][sl]).long(),
440
+ 'rot_bins': torch.from_numpy(traj['rot_bins'][sl]).long(),
441
+ 'alt_rate_bins': torch.from_numpy(traj['alt_rate_bins'][sl]).long(),
442
+ 'uncert_bins': torch.from_numpy(traj['uncert_bins'][sl]).long(),
443
+ 'uncert_bins_multi': torch.from_numpy(traj['uncert_bins_multi'][sl]).long(),
444
+ 'hour': torch.from_numpy(traj['hour'][sl]).long(),
445
+ 'dow': torch.from_numpy(traj['dow'][sl]).long(),
446
+ 'month': torch.from_numpy(traj['month'][sl]).long(),
447
+ 'second_of_day': torch.from_numpy(traj['second_of_day'][sl]).float(),
448
+ 'dt': torch.from_numpy(traj['dt'][sl]).float(),
449
  'prompt': prompt,
450
+ 'east': torch.from_numpy(traj['east'][sl]).float(),
451
+ 'north': torch.from_numpy(traj['north'][sl]).float(),
452
+ 'up': torch.from_numpy(traj['up'][sl]).float(),
453
  }
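Downstream, windows batch with a standard DataLoader; a sketch assuming a dataset built by build_dataset below and that every window has the same length (trajectories shorter than seq_len + 1 are kept whole above, so mixed lengths would need padding or a custom collate_fn):

    from torch.utils.data import DataLoader
    loader = DataLoader(dataset, batch_size=8, shuffle=True)
    batch = next(iter(loader))
    # batch['geohash_bits']: (8, seq_len + 1, 120); batch['cog_bins']: (8, seq_len + 1); ...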
454
 
455
 
456
  # ============================================================
457
+ # 10. Data Loading
458
  # ============================================================
459
 
460
+ def load_traffic_sample(name='quickstart'):
 
 
461
  import traffic.data.samples as samples
462
+ import pandas as pd
463
 
464
  data = getattr(samples, name)
465
  trajectories = []
 
 
466
  flights = data if hasattr(data, '__iter__') else [data]
467
 
468
  for flight in flights:
469
+ try:
470
+ df = flight.data
471
+ except Exception:
472
+ continue
473
  if df is None or len(df) < 20:
474
  continue
475
 
 
 
476
  ts_series = pd.to_datetime(df['timestamp'])
477
  if ts_series.dt.tz is not None:
478
  ts_series = ts_series.dt.tz_convert('UTC').dt.tz_localize(None)
 
480
  lats = df['latitude'].values.astype(np.float64)
481
  lons = df['longitude'].values.astype(np.float64)
482
 
 
483
  if 'altitude' in df.columns:
484
  alts = df['altitude'].values.astype(np.float64)
485
  elif 'baro_altitude' in df.columns:
 
487
  else:
488
  alts = np.zeros(len(df))
489
 
 
490
  valid = ~(np.isnan(lats) | np.isnan(lons) | np.isnan(alts) | np.isnan(timestamps))
491
  if valid.sum() < 20:
492
  continue
493
 
494
  trajectories.append({
495
  'timestamps': timestamps[valid],
496
+ 'lats': lats[valid], 'lons': lons[valid], 'alts': alts[valid],
497
+ 'callsign': getattr(flight, 'callsign', 'UNKNOWN'),
498
+ 'icao24': getattr(flight, 'icao24', 'UNKNOWN'),
 
 
499
  })
500
 
501
  return trajectories
502
 
503
 
504
+ def build_dataset(raw_trajectories, processor, seq_len=128, stride=64, fit_geohash=True):
 
 
 
505
  processed = []
506
  all_east, all_north, all_up = [], [], []
507
 
 
517
  all_up.append(result['up'])
518
 
519
  if fit_geohash and processed:
 
520
  all_e = np.concatenate(all_east)
521
  all_n = np.concatenate(all_north)
522
  all_u = np.concatenate(all_up)
523
  processor.fit_geohash(all_e, all_n, all_u)
 
 
524
  for traj in processed:
525
  traj['geohash_bits'] = processor.geohash_encoder.encode(
526
  traj['east'], traj['north'], traj['up']
527
  )
528
 
529
  print(f"Processed {len(processed)}/{len(raw_trajectories)} trajectories")
 
530
  dataset = AirTrackDataset(processed, seq_len=seq_len, stride=stride)
531
  print(f"Created dataset with {len(dataset)} windows")
 
532
  return dataset
533
 
534
 
535
  if __name__ == '__main__':
 
536
  print("Loading traffic sample data...")
537
  raw_trajs = load_traffic_sample()
538
  print(f"Loaded {len(raw_trajs)} raw trajectories")
 
541
  processor = TrajectoryProcessor(resample_dt=5.0)
542
  dataset = build_dataset(raw_trajs, processor, seq_len=64, stride=32)
543
 
 
544
  if len(dataset) > 0:
545
  sample = dataset[0]
546
+ print("\nSample shapes:")
547
  for k, v in sample.items():
548
  if isinstance(v, torch.Tensor):
549
  print(f" {k}: {v.shape} ({v.dtype})")