Upload folder using huggingface_hub
Browse files- flowmask_model.py +79 -0
- flownet_model.py +150 -0
- loss_history.json +233 -0
- model.pt +3 -0
- predict.py +53 -259
- train.log +63 -31
flowmask_model.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow-Warp-Mask U-Net: predicts flow, occlusion mask, and generated frame."""
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ResConvBlock(nn.Module):
    """Two 3x3 conv + GroupNorm + SiLU layers with a residual shortcut.

    The shortcut is a 1x1 projection when the channel count changes and
    the identity otherwise, so the final addition is always shape-safe.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        num_groups = min(8, out_ch)
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        self.gn1 = nn.GroupNorm(num_groups, out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        self.gn2 = nn.GroupNorm(num_groups, out_ch)
        if in_ch != out_ch:
            self.proj = nn.Conv2d(in_ch, out_ch, 1)
        else:
            self.proj = nn.Identity()

    def forward(self, x):
        shortcut = self.proj(x)
        out = F.silu(self.gn1(self.conv1(x)))
        out = F.silu(self.gn2(self.conv2(out)))
        return out + shortcut
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class FlowWarpMaskUNet(nn.Module):
    """U-Net predicting optical flow, an occlusion mask, and a generated frame.

    Input is a channel-stacked context (default 4 RGB frames = 12 channels).
    Three 1x1 output heads share the finest decoder features:

      * flow:      (B, 2, H, W) pixel-space flow (dx, dy)
      * mask:      (B, 1, H, W) occlusion mask in [0, 1] (sigmoid)
      * gen_frame: (B, 3, H, W) hallucinated frame for occluded regions

    The flow and mask heads are zero-initialized so training starts from an
    identity warp with a uniform 0.5 mask.
    """

    def __init__(self, in_channels=12, channels=(48, 96, 192)):
        # NOTE: `channels` default was a mutable list; a tuple avoids the
        # shared-mutable-default pitfall while accepting the same values.
        super().__init__()
        # Encoder: ResConvBlock followed by a 2x max-pool per level.
        self.encoders = nn.ModuleList()
        self.pools = nn.ModuleList()
        prev_ch = in_channels
        for ch in channels:
            self.encoders.append(ResConvBlock(prev_ch, ch))
            self.pools.append(nn.MaxPool2d(2))
            prev_ch = ch

        # Bottleneck doubles the deepest channel count.
        self.bottleneck = ResConvBlock(channels[-1], channels[-1] * 2)

        # Decoder: transpose-conv upsample, concat skip, ResConvBlock.
        self.upconvs = nn.ModuleList()
        self.decoders = nn.ModuleList()
        dec_channels = list(reversed(channels))
        prev_ch = channels[-1] * 2
        for ch in dec_channels:
            self.upconvs.append(nn.ConvTranspose2d(prev_ch, ch, 2, stride=2))
            self.decoders.append(ResConvBlock(ch * 2, ch))
            prev_ch = ch

        # Flow head (2 channels: dx, dy)
        self.flow_head = nn.Conv2d(dec_channels[-1], 2, 1)
        # Mask head (1 channel: occlusion mask, sigmoid applied)
        self.mask_head = nn.Conv2d(dec_channels[-1], 1, 1)
        # Generation head (3 channels: full frame for occluded areas)
        self.gen_head = nn.Conv2d(dec_channels[-1], 3, 1)

        # Initialize flow and mask heads near-zero for stable start:
        # identity warp and mask == sigmoid(0) == 0.5 at step 0.
        nn.init.zeros_(self.flow_head.weight)
        nn.init.zeros_(self.flow_head.bias)
        nn.init.zeros_(self.mask_head.weight)
        nn.init.zeros_(self.mask_head.bias)

    def forward(self, x):
        """Run the U-Net.

        Spatial size must be divisible by 2**len(channels) for the
        pool/upsample pairs to round-trip exactly.
        """
        skips = []
        for enc, pool in zip(self.encoders, self.pools):
            x = enc(x)
            skips.append(x)
            x = pool(x)

        x = self.bottleneck(x)

        for upconv, dec, skip in zip(self.upconvs, self.decoders, reversed(skips)):
            x = upconv(x)
            x = torch.cat([x, skip], dim=1)
            x = dec(x)

        flow = self.flow_head(x)
        mask = torch.sigmoid(self.mask_head(x))
        gen_frame = self.gen_head(x)

        return flow, mask, gen_frame
|
flownet_model.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow-Warp U-Net: predicts optical flow + residual, warps last frame."""
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ResConvBlock(nn.Module):
    """Residual block: (conv3x3 -> GroupNorm -> SiLU) twice, plus shortcut.

    A 1x1 convolution projects the shortcut when in_ch != out_ch;
    otherwise the input passes through unchanged before the addition.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        self.gn1 = nn.GroupNorm(min(8, out_ch), out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        self.gn2 = nn.GroupNorm(min(8, out_ch), out_ch)
        needs_projection = in_ch != out_ch
        self.proj = nn.Conv2d(in_ch, out_ch, 1) if needs_projection else nn.Identity()

    def forward(self, x):
        identity = self.proj(x)
        h = F.silu(self.gn1(self.conv1(x)))
        h = F.silu(self.gn2(self.conv2(h)))
        return h + identity
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class FlowWarpUNet(nn.Module):
    """U-Net predicting optical flow plus an additive RGB residual.

    Input is a channel-stacked context (default 4 RGB frames = 12 channels).
    Two 1x1 output heads share the finest decoder features:

      * flow:     (B, 2, H, W) pixel-space flow (dx, dy)
      * residual: (B, 3, H, W) additive RGB correction

    Both heads are zero-initialized so training starts from an identity
    warp with no correction.
    """

    def __init__(self, in_channels=12, channels=(48, 96, 192, 384)):
        # NOTE: `channels` default was a mutable list; a tuple avoids the
        # shared-mutable-default pitfall while accepting the same values.
        super().__init__()
        # Encoder: ResConvBlock followed by a 2x max-pool per level.
        self.encoders = nn.ModuleList()
        self.pools = nn.ModuleList()
        prev_ch = in_channels
        for ch in channels:
            self.encoders.append(ResConvBlock(prev_ch, ch))
            self.pools.append(nn.MaxPool2d(2))
            prev_ch = ch

        # Bottleneck doubles the deepest channel count.
        self.bottleneck = ResConvBlock(channels[-1], channels[-1] * 2)

        # Decoder: transpose-conv upsample, concat skip, ResConvBlock.
        self.upconvs = nn.ModuleList()
        self.decoders = nn.ModuleList()
        dec_channels = list(reversed(channels))
        prev_ch = channels[-1] * 2
        for ch in dec_channels:
            self.upconvs.append(nn.ConvTranspose2d(prev_ch, ch, 2, stride=2))
            self.decoders.append(ResConvBlock(ch * 2, ch))
            prev_ch = ch

        # Flow head (2 channels: dx, dy)
        self.flow_head = nn.Conv2d(dec_channels[-1], 2, 1)
        # Residual head (3 channels: RGB residual)
        self.residual_head = nn.Conv2d(dec_channels[-1], 3, 1)

        # Initialize both heads near-zero for a stable start
        # (identity warp, zero correction).
        nn.init.zeros_(self.flow_head.weight)
        nn.init.zeros_(self.flow_head.bias)
        nn.init.zeros_(self.residual_head.weight)
        nn.init.zeros_(self.residual_head.bias)

    def forward(self, x):
        """
        Args:
            x: (B, 12, 64, 64) - 4 frames stacked (any spatial size
               divisible by 2**len(channels) works)
        Returns:
            flow: (B, 2, H, W) - optical flow (dx, dy) in pixels
            residual: (B, 3, H, W) - residual correction
        """
        skips = []
        for enc, pool in zip(self.encoders, self.pools):
            x = enc(x)
            skips.append(x)
            x = pool(x)

        x = self.bottleneck(x)

        for upconv, dec, skip in zip(self.upconvs, self.decoders, reversed(skips)):
            x = upconv(x)
            x = torch.cat([x, skip], dim=1)
            x = dec(x)

        flow = self.flow_head(x)          # (B, 2, H, W)
        residual = self.residual_head(x)  # (B, 3, H, W)

        return flow, residual
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def differentiable_warp(img, flow):
    """Backward-warp `img` by `flow` with bilinear sampling.

    Each output pixel (y, x) is sampled from (y + dy, x + dx) in `img`;
    coordinates outside the image are clamped (border replication).

    Args:
        img: (B, C, H, W) - image to warp
        flow: (B, 2, H, W) - flow field (dx, dy) in pixel coordinates
    Returns:
        warped: (B, C, H, W)
    """
    B, _, H, W = img.shape

    # Identity sampling grid in pixel coordinates.
    ys, xs = torch.meshgrid(
        torch.arange(H, device=img.device, dtype=img.dtype),
        torch.arange(W, device=img.device, dtype=img.dtype),
        indexing='ij',
    )

    # Displace by the flow (broadcasts over the batch dimension).
    sample_x = xs.unsqueeze(0).expand(B, -1, -1) + flow[:, 0]
    sample_y = ys.unsqueeze(0).expand(B, -1, -1) + flow[:, 1]

    # Map pixel coordinates to the [-1, 1] range grid_sample expects.
    norm_x = 2.0 * sample_x / (W - 1) - 1.0
    norm_y = 2.0 * sample_y / (H - 1) - 1.0
    grid = torch.stack((norm_x, norm_y), dim=-1)  # (B, H, W, 2)

    return F.grid_sample(img, grid, mode='bilinear',
                         padding_mode='border', align_corners=True)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def flow_smoothness_loss(flow):
    """Total-variation smoothness penalty on a (B, 2, H, W) flow field.

    Returns the average of the mean absolute horizontal and vertical
    finite differences of the flow.
    """
    horiz = torch.diff(flow, dim=3).abs().mean()
    vert = torch.diff(flow, dim=2).abs().mean()
    return (horiz + vert) / 2
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class GlobalSSIMLoss(nn.Module):
    """Global (single-window) SSIM loss: 1 - SSIM over the whole image.

    Statistics are computed per (batch, channel) over all pixels rather
    than in local windows. C1/C2 are the standard stabilizers for inputs
    in a [0, 1] dynamic range.
    """

    def __init__(self):
        super().__init__()
        self.C1 = (0.01) ** 2
        self.C2 = (0.03) ** 2

    def forward(self, pred, target):
        B, C, H, W = pred.shape
        pred_flat = pred.view(B, C, -1)
        target_flat = target.view(B, C, -1)

        mu_pred = pred_flat.mean(dim=2)
        mu_target = target_flat.mean(dim=2)
        # Use the biased (population, /N) variance to match the biased
        # covariance below. The original mixed unbiased var (/N-1) with a
        # /N covariance, which made SSIM(x, x) < 1 — identical images did
        # not reach zero loss.
        sigma_pred_sq = pred_flat.var(dim=2, unbiased=False)
        sigma_target_sq = target_flat.var(dim=2, unbiased=False)
        sigma_cross = ((pred_flat - mu_pred.unsqueeze(2)) *
                       (target_flat - mu_target.unsqueeze(2))).mean(dim=2)

        numerator = (2 * mu_pred * mu_target + self.C1) * (2 * sigma_cross + self.C2)
        denominator = (mu_pred ** 2 + mu_target ** 2 + self.C1) * (sigma_pred_sq + sigma_target_sq + self.C2)
        ssim = numerator / denominator
        return 1 - ssim.mean()
|
loss_history.json
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"epoch": 1,
|
| 4 |
+
"phase": "P1",
|
| 5 |
+
"loss": 0.093708
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"epoch": 2,
|
| 9 |
+
"phase": "P1",
|
| 10 |
+
"loss": 0.075409
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"epoch": 3,
|
| 14 |
+
"phase": "P1",
|
| 15 |
+
"loss": 0.070398
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"epoch": 4,
|
| 19 |
+
"phase": "P1",
|
| 20 |
+
"loss": 0.066922
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"epoch": 5,
|
| 24 |
+
"phase": "P1",
|
| 25 |
+
"loss": 0.064051
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"epoch": 6,
|
| 29 |
+
"phase": "P1",
|
| 30 |
+
"loss": 0.061594
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 7,
|
| 34 |
+
"phase": "P1",
|
| 35 |
+
"loss": 0.058991
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"epoch": 8,
|
| 39 |
+
"phase": "P1",
|
| 40 |
+
"loss": 0.056665
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"epoch": 9,
|
| 44 |
+
"phase": "P1",
|
| 45 |
+
"loss": 0.054221
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 10,
|
| 49 |
+
"phase": "P1",
|
| 50 |
+
"loss": 0.052157
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 11,
|
| 54 |
+
"phase": "P1",
|
| 55 |
+
"loss": 0.050054
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"epoch": 12,
|
| 59 |
+
"phase": "P1",
|
| 60 |
+
"loss": 0.048416
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"epoch": 13,
|
| 64 |
+
"phase": "P1",
|
| 65 |
+
"loss": 0.047013
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 14,
|
| 69 |
+
"phase": "P1",
|
| 70 |
+
"loss": 0.046003
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"epoch": 15,
|
| 74 |
+
"phase": "P1",
|
| 75 |
+
"loss": 0.0454
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"epoch": 16,
|
| 79 |
+
"phase": "P2",
|
| 80 |
+
"loss": 0.071297
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 17,
|
| 84 |
+
"phase": "P2",
|
| 85 |
+
"loss": 0.069845
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"epoch": 18,
|
| 89 |
+
"phase": "P2",
|
| 90 |
+
"loss": 0.067838
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 19,
|
| 94 |
+
"phase": "P2",
|
| 95 |
+
"loss": 0.102993
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"epoch": 20,
|
| 99 |
+
"phase": "P2",
|
| 100 |
+
"loss": 0.098403,
|
| 101 |
+
"val_ssim": 0.8174
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 21,
|
| 105 |
+
"phase": "P2",
|
| 106 |
+
"loss": 0.095552
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 22,
|
| 110 |
+
"phase": "P2",
|
| 111 |
+
"loss": 0.142291
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"epoch": 23,
|
| 115 |
+
"phase": "P2",
|
| 116 |
+
"loss": 0.137962
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"epoch": 24,
|
| 120 |
+
"phase": "P2",
|
| 121 |
+
"loss": 0.133837
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 25,
|
| 125 |
+
"phase": "P2",
|
| 126 |
+
"loss": 0.129812,
|
| 127 |
+
"val_ssim": 0.854
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"epoch": 26,
|
| 131 |
+
"phase": "P2",
|
| 132 |
+
"loss": 0.126053
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"epoch": 27,
|
| 136 |
+
"phase": "P2",
|
| 137 |
+
"loss": 0.122985
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"epoch": 28,
|
| 141 |
+
"phase": "P2",
|
| 142 |
+
"loss": 0.120476
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 29,
|
| 146 |
+
"phase": "P2",
|
| 147 |
+
"loss": 0.117592
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"epoch": 30,
|
| 151 |
+
"phase": "P2",
|
| 152 |
+
"loss": 0.115456,
|
| 153 |
+
"val_ssim": 0.8644
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"epoch": 31,
|
| 157 |
+
"phase": "P2",
|
| 158 |
+
"loss": 0.113231
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"epoch": 32,
|
| 162 |
+
"phase": "P2",
|
| 163 |
+
"loss": 0.111175
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 33,
|
| 167 |
+
"phase": "P2",
|
| 168 |
+
"loss": 0.108953
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"epoch": 34,
|
| 172 |
+
"phase": "P2",
|
| 173 |
+
"loss": 0.106131
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"epoch": 35,
|
| 177 |
+
"phase": "P2",
|
| 178 |
+
"loss": 0.103505,
|
| 179 |
+
"val_ssim": 0.8744
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"epoch": 36,
|
| 183 |
+
"phase": "P2",
|
| 184 |
+
"loss": 0.100435
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 37,
|
| 188 |
+
"phase": "P2",
|
| 189 |
+
"loss": 0.097286
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"epoch": 38,
|
| 193 |
+
"phase": "P2",
|
| 194 |
+
"loss": 0.094014
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 39,
|
| 198 |
+
"phase": "P2",
|
| 199 |
+
"loss": 0.090802
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 40,
|
| 203 |
+
"phase": "P2",
|
| 204 |
+
"loss": 0.087507,
|
| 205 |
+
"val_ssim": 0.8852
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"epoch": 41,
|
| 209 |
+
"phase": "P2",
|
| 210 |
+
"loss": 0.084485
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 42,
|
| 214 |
+
"phase": "P2",
|
| 215 |
+
"loss": 0.081661
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"epoch": 43,
|
| 219 |
+
"phase": "P2",
|
| 220 |
+
"loss": 0.079401
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 44,
|
| 224 |
+
"phase": "P2",
|
| 225 |
+
"loss": 0.077772
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"epoch": 45,
|
| 229 |
+
"phase": "P2",
|
| 230 |
+
"loss": 0.076937,
|
| 231 |
+
"val_ssim": 0.885
|
| 232 |
+
}
|
| 233 |
+
]
|
model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4774304dae39b918b34dd4ededabc4a793ac7efdeb772a746587fad584ccfe83
|
| 3 |
+
size 9089268
|
predict.py
CHANGED
|
@@ -1,282 +1,76 @@
|
|
| 1 |
-
"""
|
| 2 |
import sys
|
| 3 |
import os
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
| 6 |
|
| 7 |
sys.path.insert(0, "/home/coder/code")
|
| 8 |
-
from
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def detect_game(context_frames: np.ndarray) -> str:
|
| 16 |
-
first_8 = context_frames[:CONTEXT_FRAMES]
|
| 17 |
-
mean_val = first_8.mean()
|
| 18 |
-
std_val = first_8.std()
|
| 19 |
-
b_mean = first_8[:, :, :, 2].mean()
|
| 20 |
-
r_mean = first_8[:, :, :, 0].mean()
|
| 21 |
-
if mean_val > 100 and std_val < 80 and b_mean > r_mean * 1.5:
|
| 22 |
-
return "pole_position"
|
| 23 |
-
elif mean_val < 5 and 10 < std_val < 20:
|
| 24 |
-
return "pong"
|
| 25 |
-
else:
|
| 26 |
-
return "sonic"
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def load_int8_state_dict(path, device):
|
| 30 |
-
"""Load int8 quantized state dict and dequantize to float32."""
|
| 31 |
-
quantized = torch.load(path, map_location='cpu', weights_only=False)
|
| 32 |
-
sd = {}
|
| 33 |
-
for k, v in quantized.items():
|
| 34 |
-
if 'int8' in v:
|
| 35 |
-
sd[k] = (v['int8'].float() * v['scale']).to(device)
|
| 36 |
-
else:
|
| 37 |
-
sd[k] = v['float'].to(device)
|
| 38 |
-
return sd
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class EnsembleModels:
|
| 42 |
-
def __init__(self):
|
| 43 |
-
self.models = {}
|
| 44 |
-
self.sonic_ar = None
|
| 45 |
-
self.sonic_direct = None
|
| 46 |
-
self.pong_direct = None
|
| 47 |
-
self.direct_cache = None
|
| 48 |
-
self.cache_step = 0
|
| 49 |
-
|
| 50 |
-
def reset_cache(self):
|
| 51 |
-
self.direct_cache = None
|
| 52 |
-
self.cache_step = 0
|
| 53 |
|
| 54 |
|
| 55 |
def load_model(model_dir: str):
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
upsample_mode="bilinear").to(DEVICE)
|
| 72 |
-
sd = load_int8_state_dict(os.path.join(model_dir, "model_pong_direct.pt"), DEVICE)
|
| 73 |
-
pong_direct.load_state_dict(sd)
|
| 74 |
-
pong_direct.eval()
|
| 75 |
-
ens.pong_direct = pong_direct
|
| 76 |
-
|
| 77 |
-
# Sonic AR (fp16, 3 outputs) - kept in fp16 for AR chain quality
|
| 78 |
-
sonic_ar = UNet(in_channels=24, out_channels=3,
|
| 79 |
-
enc_channels=(48, 96, 192), bottleneck_channels=256,
|
| 80 |
-
upsample_mode="bilinear").to(DEVICE)
|
| 81 |
-
sd = torch.load(os.path.join(model_dir, "model_sonic_ar.pt"),
|
| 82 |
-
map_location=DEVICE, weights_only=True)
|
| 83 |
-
sonic_ar.load_state_dict({k: v.float() for k, v in sd.items()})
|
| 84 |
-
sonic_ar.eval()
|
| 85 |
-
ens.sonic_ar = sonic_ar
|
| 86 |
-
|
| 87 |
-
# Sonic direct (int8 quantized, 24 outputs)
|
| 88 |
-
sonic_direct = UNet(in_channels=24, out_channels=24,
|
| 89 |
-
enc_channels=(48, 96, 192), bottleneck_channels=256,
|
| 90 |
-
upsample_mode="bilinear").to(DEVICE)
|
| 91 |
-
sd = load_int8_state_dict(os.path.join(model_dir, "model_sonic_direct.pt"), DEVICE)
|
| 92 |
-
sonic_direct.load_state_dict(sd)
|
| 93 |
-
sonic_direct.eval()
|
| 94 |
-
ens.sonic_direct = sonic_direct
|
| 95 |
-
|
| 96 |
-
# PP full direct (fp16, 24 outputs)
|
| 97 |
-
pp = UNet(in_channels=24, out_channels=24,
|
| 98 |
-
enc_channels=(32, 64, 128), bottleneck_channels=192,
|
| 99 |
-
upsample_mode="bilinear").to(DEVICE)
|
| 100 |
-
sd = torch.load(os.path.join(model_dir, "model_pole_position.pt"),
|
| 101 |
-
map_location=DEVICE, weights_only=True)
|
| 102 |
-
pp.load_state_dict({k: v.float() for k, v in sd.items()})
|
| 103 |
-
pp.eval()
|
| 104 |
-
ens.models["pole_position"] = pp
|
| 105 |
-
|
| 106 |
-
return ens
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def _predict_8frames_direct(model, context_tensor, last_tensor, residual_scale=1.0):
|
| 110 |
-
output = model(context_tensor)
|
| 111 |
-
residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
|
| 112 |
-
last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
|
| 113 |
-
return torch.clamp(last_expanded + residual_scale * residuals, 0, 1)
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
def _predict_ar_frame(model, context_tensor, last_tensor, residual_scale=1.0):
|
| 117 |
-
residual = model(context_tensor)
|
| 118 |
-
return torch.clamp(last_tensor + residual_scale * residual, 0, 1)
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
|
| 122 |
-
game = detect_game(context_frames)
|
| 123 |
-
n = len(context_frames)
|
| 124 |
-
|
| 125 |
-
if n < CONTEXT_FRAMES:
|
| 126 |
-
padding = np.stack([context_frames[0]] * (CONTEXT_FRAMES - n), axis=0)
|
| 127 |
-
frames = np.concatenate([padding, context_frames], axis=0)
|
| 128 |
else:
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
frames_t = np.transpose(frames_norm, (0, 3, 1, 2))
|
| 133 |
-
context = frames_t.reshape(1, -1, 64, 64)
|
| 134 |
-
|
| 135 |
-
last_frame = frames_norm[-1]
|
| 136 |
-
last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
|
| 137 |
-
|
| 138 |
-
if game == "pong":
|
| 139 |
-
# Pong: AR+direct ensemble, float32 caching, no TTA
|
| 140 |
-
if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
|
| 141 |
-
result = ens.direct_cache[ens.cache_step]
|
| 142 |
-
ens.cache_step += 1
|
| 143 |
-
if ens.cache_step >= PRED_FRAMES:
|
| 144 |
-
ens.reset_cache()
|
| 145 |
-
return result
|
| 146 |
-
|
| 147 |
-
ens.reset_cache()
|
| 148 |
-
with torch.no_grad():
|
| 149 |
-
context_tensor = torch.from_numpy(context).to(DEVICE)
|
| 150 |
-
last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
|
| 151 |
-
|
| 152 |
-
direct_pred = _predict_8frames_direct(ens.pong_direct, context_tensor, last_tensor)
|
| 153 |
-
|
| 154 |
-
ar_preds = []
|
| 155 |
-
ctx = context_tensor.clone()
|
| 156 |
-
last_t = last_tensor.clone()
|
| 157 |
-
for step in range(PRED_FRAMES):
|
| 158 |
-
predicted = _predict_ar_frame(ens.models["pong"], ctx, last_t, residual_scale=1.02)
|
| 159 |
-
ar_preds.append(predicted)
|
| 160 |
-
ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
|
| 161 |
-
ctx_frames = torch.cat([ctx_frames[:, 1:], predicted.unsqueeze(1)], dim=1)
|
| 162 |
-
ctx = ctx_frames.reshape(1, -1, 64, 64)
|
| 163 |
-
last_t = predicted
|
| 164 |
-
|
| 165 |
-
ar_pred = torch.stack(ar_preds, dim=1)
|
| 166 |
-
|
| 167 |
-
predicted = torch.zeros_like(direct_pred)
|
| 168 |
-
for step in range(PRED_FRAMES):
|
| 169 |
-
ar_weight = 0.85 - (step / (PRED_FRAMES - 1)) * 0.3
|
| 170 |
-
direct_weight = 1.0 - ar_weight
|
| 171 |
-
predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
|
| 172 |
-
|
| 173 |
-
predicted_np = predicted[0].cpu().numpy()
|
| 174 |
-
ens.direct_cache = []
|
| 175 |
-
for i in range(PRED_FRAMES):
|
| 176 |
-
frame = np.transpose(predicted_np[i], (1, 2, 0))
|
| 177 |
-
frame = np.round(frame * 255 + 0.2).clip(0, 255).astype(np.uint8)
|
| 178 |
-
ens.direct_cache.append(frame)
|
| 179 |
-
|
| 180 |
-
result = ens.direct_cache[ens.cache_step]
|
| 181 |
-
ens.cache_step += 1
|
| 182 |
-
return result
|
| 183 |
-
|
| 184 |
-
elif game == "sonic":
|
| 185 |
-
# Sonic: AR(fp16)+direct(int8) with step blending and TTA
|
| 186 |
-
if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
|
| 187 |
-
result = ens.direct_cache[ens.cache_step]
|
| 188 |
-
ens.cache_step += 1
|
| 189 |
-
if ens.cache_step >= PRED_FRAMES:
|
| 190 |
-
ens.reset_cache()
|
| 191 |
-
return result
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
|
| 197 |
|
| 198 |
-
direct_orig = _predict_8frames_direct(ens.sonic_direct, context_tensor, last_tensor)
|
| 199 |
-
context_flipped = torch.flip(context_tensor, dims=[3])
|
| 200 |
-
last_flipped = torch.flip(last_tensor, dims=[3])
|
| 201 |
-
direct_flipped = _predict_8frames_direct(ens.sonic_direct, context_flipped, last_flipped)
|
| 202 |
-
direct_flipped = torch.flip(direct_flipped, dims=[4])
|
| 203 |
-
direct_pred = (direct_orig + direct_flipped) / 2.0
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
ar_preds_run = []
|
| 210 |
-
ctx = context_tensor.clone()
|
| 211 |
-
ctx_flip = context_flipped.clone()
|
| 212 |
-
last_t = last_tensor.clone()
|
| 213 |
-
last_f = last_flipped.clone()
|
| 214 |
-
sonic_scales = [1.04, 1.04, 1.04, 1.08, 1.08, 1.08, 1.12, 1.12]
|
| 215 |
-
for step in range(PRED_FRAMES):
|
| 216 |
-
ctx_in = ctx if noise_std == 0 else torch.clamp(ctx + torch.randn_like(ctx) * noise_std, 0, 1)
|
| 217 |
-
ctx_flip_in = ctx_flip if noise_std == 0 else torch.clamp(ctx_flip + torch.randn_like(ctx_flip) * noise_std, 0, 1)
|
| 218 |
-
ar_orig = _predict_ar_frame(ens.sonic_ar, ctx_in, last_t, residual_scale=sonic_scales[step])
|
| 219 |
-
ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip_in, last_f, residual_scale=sonic_scales[step])
|
| 220 |
-
ar_flip_back = torch.flip(ar_flip, dims=[3])
|
| 221 |
-
ar_frame = (ar_orig + ar_flip_back) / 2.0
|
| 222 |
-
ar_preds_run.append(ar_frame)
|
| 223 |
-
ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
|
| 224 |
-
ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
|
| 225 |
-
ctx = ctx_frames.reshape(1, -1, 64, 64)
|
| 226 |
-
last_t = ar_orig
|
| 227 |
-
ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
|
| 228 |
-
ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
|
| 229 |
-
ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
|
| 230 |
-
last_f = ar_flip
|
| 231 |
-
all_ar_runs.append(torch.stack(ar_preds_run, dim=1))
|
| 232 |
|
| 233 |
-
|
|
|
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
|
| 241 |
-
predicted_np = predicted[0].cpu().numpy()
|
| 242 |
-
ens.direct_cache = []
|
| 243 |
-
for i in range(PRED_FRAMES):
|
| 244 |
-
frame = np.transpose(predicted_np[i], (1, 2, 0))
|
| 245 |
-
frame = np.round(frame * 255 + 0.2).clip(0, 255).astype(np.uint8)
|
| 246 |
-
ens.direct_cache.append(frame)
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
|
| 252 |
-
|
| 253 |
-
# PP: direct with TTA and caching
|
| 254 |
-
if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
|
| 255 |
-
result = ens.direct_cache[ens.cache_step]
|
| 256 |
-
ens.cache_step += 1
|
| 257 |
-
if ens.cache_step >= PRED_FRAMES:
|
| 258 |
-
ens.reset_cache()
|
| 259 |
-
return result
|
| 260 |
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
|
| 265 |
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
predicted_flipped = torch.flip(predicted_flipped, dims=[4])
|
| 271 |
-
predicted = (predicted_orig + predicted_flipped) / 2.0
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
for i in range(PRED_FRAMES):
|
| 276 |
-
frame = np.transpose(predicted_np[i], (1, 2, 0))
|
| 277 |
-
frame = np.round(frame * 255 + 0.2).clip(0, 255).astype(np.uint8)
|
| 278 |
-
ens.direct_cache.append(frame)
|
| 279 |
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
| 1 |
+
"""Prediction interface for Flow-Warp-Mask U-Net v9 with TTA."""
|
| 2 |
import sys
|
| 3 |
import os
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
| 6 |
|
| 7 |
sys.path.insert(0, "/home/coder/code")
|
| 8 |
+
from flowmask_model import FlowWarpMaskUNet
|
| 9 |
+
from flownet_model import differentiable_warp
|
| 10 |
|
| 11 |
+
CONTEXT_LEN = 4
|
| 12 |
+
CHANNELS = [48, 96, 192]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def load_model(model_dir: str):
    """Build a FlowWarpMaskUNet and load weights from `model_dir`/model.pt.

    Returns a dict holding the eval-mode model and its device, as consumed
    by `predict_next_frame`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = FlowWarpMaskUNet(in_channels=12, channels=CHANNELS)
    checkpoint = torch.load(os.path.join(model_dir, "model.pt"),
                            map_location=device, weights_only=True)
    # Weights may be stored in reduced precision; promote to float32.
    net.load_state_dict({name: tensor.float() for name, tensor in checkpoint.items()})
    net.to(device)
    net.eval()
    return {"model": net, "device": device}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _prepare_input(context_frames):
    """Convert numpy context frames into a (CONTEXT_LEN, 3, H, W) float tensor.

    Keeps the most recent CONTEXT_LEN frames; when fewer are available the
    earliest frame is repeated at the front. Pixels are rescaled from
    uint8 [0, 255] to float32 [0, 1].
    """
    available = len(context_frames)
    if available >= CONTEXT_LEN:
        frames = context_frames[-CONTEXT_LEN:]
    else:
        pad = np.stack([context_frames[0]] * (CONTEXT_LEN - available), axis=0)
        frames = np.concatenate([pad, context_frames], axis=0)

    tensor = torch.from_numpy(frames.astype(np.float32) / 255.0)
    return tensor.permute(0, 3, 1, 2)  # (CONTEXT_LEN, 3, H, W)
|
|
|
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
def _run_model(model, frames_t, device):
    """Run the flow/mask model on prepared frames.

    Args:
        model: FlowWarpMaskUNet (or compatible) returning (flow, mask, gen).
        frames_t: (ctx_len, 3, H, W) float tensor of context frames.
        device: torch device to run on.
    Returns:
        (1, 3, H, W) prediction clamped to [0, 1].
    """
    # Derive the spatial size from the input instead of hard-coding 64x64,
    # so the helper works for any resolution the model accepts.
    ctx_len, ch, height, width = frames_t.shape
    last_frame = frames_t[-1].unsqueeze(0).to(device)  # (1, 3, H, W)
    inp = frames_t.reshape(1, ctx_len * ch, height, width).to(device)

    flow, mask, gen_frame = model(inp)
    warped = differentiable_warp(last_frame, flow)
    # mask ~ 1 where the warped frame is trusted, ~ 0 where the
    # generated (in-painted) frame takes over.
    pred = mask * warped + (1 - mask) * gen_frame
    return torch.clamp(pred, 0, 1)
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
def predict_next_frame(model_dict, context_frames: np.ndarray) -> np.ndarray:
    """Predict the next frame with horizontal-flip test-time augmentation.

    Args:
        model_dict: dict holding the loaded ``"model"`` and its ``"device"``.
        context_frames: array of past frames, consumed by `_prepare_input`.

    Returns:
        uint8 HxWxC image: the average of the direct prediction and the
        un-mirrored prediction on the horizontally flipped context.
    """
    model = model_dict["model"]
    device = model_dict["device"]

    frames_t = _prepare_input(context_frames)

    with torch.no_grad():
        # Direct prediction on the original clip.
        direct = _run_model(model, frames_t, device)

        # TTA: predict on the mirrored clip, then mirror the result back.
        mirrored = frames_t.flip(-1)  # flip along width
        tta = _run_model(model, mirrored, device).flip(-1)

        # Average the two views.
        pred = (direct + tta) / 2.0

    out = pred[0].cpu().permute(1, 2, 0).numpy()
    out = (out * 255).clip(0, 255).astype(np.uint8)
    return out
|
train.log
CHANGED
|
@@ -1,31 +1,63 @@
|
|
| 1 |
-
[
|
| 2 |
-
[
|
| 3 |
-
[
|
| 4 |
-
[
|
| 5 |
-
[
|
| 6 |
-
[
|
| 7 |
-
[
|
| 8 |
-
[
|
| 9 |
-
[
|
| 10 |
-
[
|
| 11 |
-
[
|
| 12 |
-
[
|
| 13 |
-
[
|
| 14 |
-
[
|
| 15 |
-
[
|
| 16 |
-
[
|
| 17 |
-
[
|
| 18 |
-
[
|
| 19 |
-
[
|
| 20 |
-
[
|
| 21 |
-
[
|
| 22 |
-
[
|
| 23 |
-
[
|
| 24 |
-
[
|
| 25 |
-
[
|
| 26 |
-
[
|
| 27 |
-
[
|
| 28 |
-
[
|
| 29 |
-
[
|
| 30 |
-
[
|
| 31 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[23:37:08] Device: cuda
|
| 2 |
+
[23:37:08] Model parameters: 4,534,230, channels=[48, 96, 192]
|
| 3 |
+
[23:37:08] Phase 1: Single-step (15 epochs)
|
| 4 |
+
[23:37:12] 45108 sequences
|
| 5 |
+
[23:37:54] P1 Epoch 1/15 | loss=0.09371
|
| 6 |
+
[23:38:34] P1 Epoch 2/15 | loss=0.07541
|
| 7 |
+
[23:39:15] P1 Epoch 3/15 | loss=0.07040
|
| 8 |
+
[23:39:56] P1 Epoch 4/15 | loss=0.06692
|
| 9 |
+
[23:40:36] P1 Epoch 5/15 | loss=0.06405
|
| 10 |
+
[23:41:17] P1 Epoch 6/15 | loss=0.06159
|
| 11 |
+
[23:41:58] P1 Epoch 7/15 | loss=0.05899
|
| 12 |
+
[23:42:40] P1 Epoch 8/15 | loss=0.05667
|
| 13 |
+
[23:43:21] P1 Epoch 9/15 | loss=0.05422
|
| 14 |
+
[23:44:01] P1 Epoch 10/15 | loss=0.05216
|
| 15 |
+
[23:44:43] P1 Epoch 11/15 | loss=0.05005
|
| 16 |
+
[23:45:23] P1 Epoch 12/15 | loss=0.04842
|
| 17 |
+
[23:46:03] P1 Epoch 13/15 | loss=0.04701
|
| 18 |
+
[23:46:45] P1 Epoch 14/15 | loss=0.04600
|
| 19 |
+
[23:47:24] P1 Epoch 15/15 | loss=0.04540
|
| 20 |
+
[23:47:24] Phase 2: Graduated AR (30 epochs)
|
| 21 |
+
[23:49:24] P2 Epoch 1/30 (steps=2) | loss=0.07130 lr=0.000500
|
| 22 |
+
[23:51:23] P2 Epoch 2/30 (steps=2) | loss=0.06985 lr=0.000500
|
| 23 |
+
[23:53:18] P2 Epoch 3/30 (steps=2) | loss=0.06784 lr=0.000500
|
| 24 |
+
[23:58:06] P2 Epoch 4/30 (steps=4) | loss=0.10299 lr=0.000500
|
| 25 |
+
[00:02:59] P2 Epoch 5/30 (steps=4) | loss=0.09840 lr=0.000500
|
| 26 |
+
[00:04:11] Val SSIM=0.8174 | {'pong': 0.7108, 'sonic': 0.8111, 'pole_position': 0.9302}
|
| 27 |
+
[00:04:11] New best! SSIM=0.8174
|
| 28 |
+
[00:09:08] P2 Epoch 6/30 (steps=4) | loss=0.09555 lr=0.000500
|
| 29 |
+
[00:21:04] P2 Epoch 7/30 (steps=8) | loss=0.14229 lr=0.000500
|
| 30 |
+
[00:32:46] P2 Epoch 8/30 (steps=8) | loss=0.13796 lr=0.000500
|
| 31 |
+
[00:44:48] P2 Epoch 9/30 (steps=8) | loss=0.13384 lr=0.000500
|
| 32 |
+
[00:57:15] P2 Epoch 10/30 (steps=8) | loss=0.12981 lr=0.000500
|
| 33 |
+
[00:58:37] Val SSIM=0.8540 | {'pong': 0.8022, 'sonic': 0.8237, 'pole_position': 0.936}
|
| 34 |
+
[00:58:37] New best! SSIM=0.8540
|
| 35 |
+
[01:11:08] P2 Epoch 11/30 (steps=8) | loss=0.12605 lr=0.000500
|
| 36 |
+
[01:23:41] P2 Epoch 12/30 (steps=8) | loss=0.12299 lr=0.000500
|
| 37 |
+
[01:36:24] P2 Epoch 13/30 (steps=8) | loss=0.12048 lr=0.000500
|
| 38 |
+
[01:48:54] P2 Epoch 14/30 (steps=8) | loss=0.11759 lr=0.000500
|
| 39 |
+
[02:01:33] P2 Epoch 15/30 (steps=8) | loss=0.11546 lr=0.000500
|
| 40 |
+
[02:02:55] Val SSIM=0.8644 | {'pong': 0.829, 'sonic': 0.8264, 'pole_position': 0.9378}
|
| 41 |
+
[02:02:55] New best! SSIM=0.8644
|
| 42 |
+
[02:15:31] P2 Epoch 16/30 (steps=8) | loss=0.11323 lr=0.000495
|
| 43 |
+
[02:28:01] P2 Epoch 17/30 (steps=8) | loss=0.11117 lr=0.000478
|
| 44 |
+
[02:40:14] P2 Epoch 18/30 (steps=8) | loss=0.10895 lr=0.000452
|
| 45 |
+
[02:52:32] P2 Epoch 19/30 (steps=8) | loss=0.10613 lr=0.000417
|
| 46 |
+
[03:05:05] P2 Epoch 20/30 (steps=8) | loss=0.10350 lr=0.000375
|
| 47 |
+
[03:06:28] Val SSIM=0.8744 | {'pong': 0.8512, 'sonic': 0.8308, 'pole_position': 0.9413}
|
| 48 |
+
[03:06:28] New best! SSIM=0.8744
|
| 49 |
+
[03:19:19] P2 Epoch 21/30 (steps=8) | loss=0.10044 lr=0.000327
|
| 50 |
+
[03:31:46] P2 Epoch 22/30 (steps=8) | loss=0.09729 lr=0.000276
|
| 51 |
+
[03:44:25] P2 Epoch 23/30 (steps=8) | loss=0.09401 lr=0.000224
|
| 52 |
+
[03:57:08] P2 Epoch 24/30 (steps=8) | loss=0.09080 lr=0.000173
|
| 53 |
+
[04:09:49] P2 Epoch 25/30 (steps=8) | loss=0.08751 lr=0.000125
|
| 54 |
+
[04:11:04] Val SSIM=0.8852 | {'pong': 0.8764, 'sonic': 0.8329, 'pole_position': 0.9462}
|
| 55 |
+
[04:11:04] New best! SSIM=0.8852
|
| 56 |
+
[04:23:43] P2 Epoch 26/30 (steps=8) | loss=0.08449 lr=0.000083
|
| 57 |
+
[04:36:13] P2 Epoch 27/30 (steps=8) | loss=0.08166 lr=0.000048
|
| 58 |
+
[04:48:48] P2 Epoch 28/30 (steps=8) | loss=0.07940 lr=0.000022
|
| 59 |
+
[05:01:33] P2 Epoch 29/30 (steps=8) | loss=0.07777 lr=0.000010
|
| 60 |
+
[05:14:14] P2 Epoch 30/30 (steps=8) | loss=0.07694 lr=0.000010
|
| 61 |
+
[05:15:35] Val SSIM=0.8850 | {'pong': 0.8783, 'sonic': 0.8292, 'pole_position': 0.9474}
|
| 62 |
+
[05:15:35] Experiment dir: 9.1 MB
|
| 63 |
+
[05:15:35] Training complete. Best val SSIM: 0.8852
|