Add per-batch heartbeat logging + CUDA visibility log
train_v2_prod.py (+21 −3)
train_v2_prod.py
CHANGED
|
@@ -469,10 +469,13 @@ def get_last_pos(past_features, past_length):
|
|
| 469 |
return past_features[torch.arange(B, device=past_features.device), idx, :3]
|
| 470 |
|
| 471 |
|
| 472 |
-
def train_one_epoch(model, loader, optimizer, device, grad_clip=1.0
|
|
|
|
| 473 |
model.train()
|
| 474 |
sums = {"nll": 0.0, "ade": 0.0, "jepa": 0.0, "total": 0.0, "n": 0}
|
| 475 |
-
|
|
|
|
|
|
|
| 476 |
past_f = batch["past_features"].to(device)
|
| 477 |
past_l = batch["past_length"].to(device)
|
| 478 |
target = batch["target_pos"].to(device)
|
|
@@ -492,6 +495,13 @@ def train_one_epoch(model, loader, optimizer, device, grad_clip=1.0):
|
|
| 492 |
sums["jepa"] += losses["jepa"].item() * bs
|
| 493 |
sums["total"] += losses["total"].item() * bs
|
| 494 |
sums["n"] += bs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
n = max(sums["n"], 1)
|
| 496 |
return {k: v / n for k, v in sums.items() if k != "n"} | {
|
| 497 |
"ade_train": sums["ade"] / n
|
|
@@ -617,7 +627,15 @@ def main():
|
|
| 617 |
np.random.seed(args.seed)
|
| 618 |
|
| 619 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 620 |
-
print(f"[v2] device={device} tag={args.tag} lambda_jepa={args.lambda_jepa}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
if HAS_TRACKIO and args.trackio_name:
|
| 623 |
trackio.init(project="flight-jepa-v2", name=args.trackio_name,
|
|
|
|
| 469 |
return past_features[torch.arange(B, device=past_features.device), idx, :3]
|
| 470 |
|
| 471 |
|
| 472 |
+
def train_one_epoch(model, loader, optimizer, device, grad_clip=1.0,
|
| 473 |
+
log_every: int = 50):
|
| 474 |
model.train()
|
| 475 |
sums = {"nll": 0.0, "ade": 0.0, "jepa": 0.0, "total": 0.0, "n": 0}
|
| 476 |
+
t0 = time.time()
|
| 477 |
+
n_batches = len(loader) if hasattr(loader, "__len__") else 0
|
| 478 |
+
for bi, batch in enumerate(loader):
|
| 479 |
past_f = batch["past_features"].to(device)
|
| 480 |
past_l = batch["past_length"].to(device)
|
| 481 |
target = batch["target_pos"].to(device)
|
|
|
|
| 495 |
sums["jepa"] += losses["jepa"].item() * bs
|
| 496 |
sums["total"] += losses["total"].item() * bs
|
| 497 |
sums["n"] += bs
|
| 498 |
+
|
| 499 |
+
if (bi + 1) % log_every == 0 or bi == 0:
|
| 500 |
+
dt = time.time() - t0
|
| 501 |
+
rate = (bi + 1) / max(dt, 0.001)
|
| 502 |
+
print(f" [batch {bi+1}/{n_batches}] {dt:.1f}s elapsed, "
|
| 503 |
+
f"{rate:.1f} batch/s, loss={losses['total'].item():.4f}",
|
| 504 |
+
flush=True)
|
| 505 |
n = max(sums["n"], 1)
|
| 506 |
return {k: v / n for k, v in sums.items() if k != "n"} | {
|
| 507 |
"ade_train": sums["ade"] / n
|
|
|
|
| 627 |
np.random.seed(args.seed)
|
| 628 |
|
| 629 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 630 |
+
print(f"[v2] device={device} tag={args.tag} lambda_jepa={args.lambda_jepa}",
|
| 631 |
+
flush=True)
|
| 632 |
+
if device == "cuda":
|
| 633 |
+
print(f"[v2] cuda device: {torch.cuda.get_device_name(0)} "
|
| 634 |
+
f"vram={torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB",
|
| 635 |
+
flush=True)
|
| 636 |
+
else:
|
| 637 |
+
print("[v2] WARNING: CUDA not available, training on CPU. "
|
| 638 |
+
"This will be very slow.", flush=True)
|
| 639 |
|
| 640 |
if HAS_TRACKIO and args.trackio_name:
|
| 641 |
trackio.init(project="flight-jepa-v2", name=args.trackio_name,
|