Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

__pycache__/predict.cpython-311.pyc +0 -0
model_sonic_ar.pt +3 -0
model_sonic_direct.pt +3 -0
predict.py +111 -39

__pycache__/predict.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ

model_sonic_ar.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:413d9fcfa15f30c74cdfda5f7d7c9dba8958fe027dfc09de563e6209c78378f5
+size 6180566

model_sonic_direct.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7e17327a6f03cb72a35bd3c48d481b4eebea5db6572ed2b3fa290b330bca304
+size 6182614

predict.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Hybrid v5: Best per-game models. AR for Pong, direct 8-frame for Sonic/PP with TTA."""
 import sys
 import os
 import numpy as np
@@ -26,9 +26,11 @@ def detect_game(context_frames: np.ndarray) -> str:
         return "sonic"
-class HybridModels:
     def __init__(self):
         self.models = {}
         self.direct_cache = None
         self.cache_step = 0
@@ -38,9 +40,9 @@ class HybridModels:
 def load_model(model_dir: str):
-    hybrid = HybridModels()
-    # Pong: AR model (3 outputs) from pergame-models
     pong = UNet(in_channels=24, out_channels=3,
                 enc_channels=(32, 64, 128), bottleneck_channels=128,
                 upsample_mode="bilinear").to(DEVICE)
@@ -48,19 +50,29 @@ def load_model(model_dir: str):
                     map_location=DEVICE, weights_only=True)
     pong.load_state_dict({k: v.float() for k, v in sd.items()})
     pong.eval()
-    hybrid.models["pong"] = pong
-    # Sonic: direct 8-frame model (24 outputs) from direct-improved
-    sonic = UNet(in_channels=24, out_channels=24,
-                 enc_channels=(48, 96, 192), bottleneck_channels=256,
-                 upsample_mode="bilinear").to(DEVICE)
-    sd = torch.load(os.path.join(model_dir, "model_sonic.pt"),
                     map_location=DEVICE, weights_only=True)
-    sonic.load_state_dict({k: v.float() for k, v in sd.items()})
-    sonic.eval()
-    hybrid.models["sonic"] = sonic
-    # PP: direct 8-frame model (24 outputs) from direct-8frame
     pp = UNet(in_channels=24, out_channels=24,
               enc_channels=(32, 64, 128), bottleneck_channels=192,
               upsample_mode="bilinear").to(DEVICE)
@@ -68,21 +80,25 @@ def load_model(model_dir: str):
                     map_location=DEVICE, weights_only=True)
     pp.load_state_dict({k: v.float() for k, v in sd.items()})
     pp.eval()
-    hybrid.models["pole_position"] = pp
-    return hybrid
-def _predict_8frames(model, context_tensor, last_tensor):
     output = model(context_tensor)  # (1, 24, 64, 64)
     residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
     last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
     return torch.clamp(last_expanded + residuals, 0, 1)
-def predict_next_frame(hybrid, context_frames: np.ndarray) -> np.ndarray:
     game = detect_game(context_frames)
-    model = hybrid.models[game]
     n = len(context_frames)
     if n < CONTEXT_FRAMES:
@@ -99,49 +115,105 @@ def predict_next_frame(hybrid, context_frames: np.ndarray) -> np.ndarray:
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
     if game == "pong":
-        # AR prediction for Pong (no TTA, no caching)
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
-            residual = model(context_tensor)
-            predicted = torch.clamp(last_tensor + residual, 0, 1)
         predicted_np = predicted[0].cpu().numpy()
         predicted_np = np.transpose(predicted_np, (1, 2, 0))
         predicted_np = (predicted_np * 255).clip(0, 255).astype(np.uint8)
         return predicted_np
-    else:
-        # Direct 8-frame for Sonic and PP with caching
-        if hybrid.direct_cache is not None and n > CONTEXT_FRAMES and hybrid.cache_step < PRED_FRAMES:
-            result = hybrid.direct_cache[hybrid.cache_step]
-            hybrid.cache_step += 1
-            if hybrid.cache_step >= PRED_FRAMES:
-                hybrid.reset_cache()
             return result
-        # New window: predict all 8 frames with TTA
-        hybrid.reset_cache()
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
-            predicted_orig = _predict_8frames(model, context_tensor, last_tensor)
-            # TTA: horizontal flip
             context_flipped = torch.flip(context_tensor, dims=[3])
             last_flipped = torch.flip(last_tensor, dims=[3])
-            predicted_flipped = _predict_8frames(model, context_flipped, last_flipped)
             predicted_flipped = torch.flip(predicted_flipped, dims=[4])
             predicted = (predicted_orig + predicted_flipped) / 2.0
-        predicted_np = predicted[0].cpu().numpy()  # (8, 3, 64, 64)
-        hybrid.direct_cache = []
         for i in range(PRED_FRAMES):
             frame = np.transpose(predicted_np[i], (1, 2, 0))
             frame = (frame * 255).clip(0, 255).astype(np.uint8)
-            hybrid.direct_cache.append(frame)
-        result = hybrid.direct_cache[hybrid.cache_step]
-        hybrid.cache_step += 1
         return result

+"""Ensemble hybrid: AR+direct ensemble for Sonic, AR for Pong, direct for PP."""
 import sys
 import os
 import numpy as np
         return "sonic"
+class EnsembleModels:
     def __init__(self):
         self.models = {}
+        self.sonic_ar = None
+        self.sonic_direct = None
         self.direct_cache = None
         self.cache_step = 0
 def load_model(model_dir: str):
+    ens = EnsembleModels()
+    # Pong: AR model (3 outputs)
     pong = UNet(in_channels=24, out_channels=3,
                 enc_channels=(32, 64, 128), bottleneck_channels=128,
                 upsample_mode="bilinear").to(DEVICE)
                     map_location=DEVICE, weights_only=True)
     pong.load_state_dict({k: v.float() for k, v in sd.items()})
     pong.eval()
+    ens.models["pong"] = pong
+    # Sonic AR model (3 outputs)
+    sonic_ar = UNet(in_channels=24, out_channels=3,
+                    enc_channels=(48, 96, 192), bottleneck_channels=256,
+                    upsample_mode="bilinear").to(DEVICE)
+    sd = torch.load(os.path.join(model_dir, "model_sonic_ar.pt"),
                     map_location=DEVICE, weights_only=True)
+    sonic_ar.load_state_dict({k: v.float() for k, v in sd.items()})
+    sonic_ar.eval()
+    ens.sonic_ar = sonic_ar
+    # Sonic direct model (24 outputs)
+    sonic_direct = UNet(in_channels=24, out_channels=24,
+                        enc_channels=(48, 96, 192), bottleneck_channels=256,
+                        upsample_mode="bilinear").to(DEVICE)
+    sd = torch.load(os.path.join(model_dir, "model_sonic_direct.pt"),
+                    map_location=DEVICE, weights_only=True)
+    sonic_direct.load_state_dict({k: v.float() for k, v in sd.items()})
+    sonic_direct.eval()
+    ens.sonic_direct = sonic_direct
+    # PP: direct 8-frame model (24 outputs)
     pp = UNet(in_channels=24, out_channels=24,
               enc_channels=(32, 64, 128), bottleneck_channels=192,
               upsample_mode="bilinear").to(DEVICE)
                     map_location=DEVICE, weights_only=True)
     pp.load_state_dict({k: v.float() for k, v in sd.items()})
     pp.eval()
+    ens.models["pole_position"] = pp
+    return ens
+def _predict_8frames_direct(model, context_tensor, last_tensor):  # one forward pass -> PRED_FRAMES frames, each = last frame + predicted residual
     output = model(context_tensor)  # (1, 24, 64, 64); presumably 24 = PRED_FRAMES * 3 channels -- confirm PRED_FRAMES == 8
     residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)  # split the channel axis into (frame, 3 channels per frame)
     last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)  # (1, 3, 64, 64) -> (1, 1, 3, 64, 64), broadcast across the frame axis
     return torch.clamp(last_expanded + residuals, 0, 1)  # add residuals to the repeated last frame; clamp result to [0, 1]
+def _predict_ar_frame(model, context_tensor, last_tensor):  # single-frame step: returns last_tensor + model residual, clamped to [0, 1]
+    residual = model(context_tensor)  # (1, 3, 64, 64); treated as a per-pixel delta relative to last_tensor
+    return torch.clamp(last_tensor + residual, 0, 1)  # result is fed back as context by the autoregressive loop in predict_next_frame
+def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
     game = detect_game(context_frames)
     n = len(context_frames)
     if n < CONTEXT_FRAMES:
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
     if game == "pong":
+        # AR prediction for Pong
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            predicted = _predict_ar_frame(ens.models["pong"], context_tensor, last_tensor)
         predicted_np = predicted[0].cpu().numpy()
         predicted_np = np.transpose(predicted_np, (1, 2, 0))
         predicted_np = (predicted_np * 255).clip(0, 255).astype(np.uint8)
         return predicted_np
+    elif game == "sonic":
+        # Ensemble: AR + direct for Sonic with caching
+        if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
+            result = ens.direct_cache[ens.cache_step]
+            ens.cache_step += 1
+            if ens.cache_step >= PRED_FRAMES:
+                ens.reset_cache()
             return result
+        ens.reset_cache()
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            # Direct prediction with TTA
+            direct_orig = _predict_8frames_direct(ens.sonic_direct, context_tensor, last_tensor)
+            context_flipped = torch.flip(context_tensor, dims=[3])
+            last_flipped = torch.flip(last_tensor, dims=[3])
+            direct_flipped = _predict_8frames_direct(ens.sonic_direct, context_flipped, last_flipped)
+            direct_flipped = torch.flip(direct_flipped, dims=[4])
+            direct_pred = (direct_orig + direct_flipped) / 2.0  # (1, 8, 3, 64, 64)
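+            # Autoregressive (AR) rollout with flip TTA: the original and flipped
+            # streams each feed back their own prediction as context; only the
+            # per-step outputs are averaged (after un-flipping the flipped one).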
+            ar_preds = []
+            ctx = context_tensor.clone()
+            ctx_flip = context_flipped.clone()
+            last_t = last_tensor.clone()
+            last_f = last_flipped.clone()
+            for step in range(PRED_FRAMES):
+                ar_orig = _predict_ar_frame(ens.sonic_ar, ctx, last_t)
+                ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip, last_f)
+                ar_flip_back = torch.flip(ar_flip, dims=[3])
+                ar_frame = (ar_orig + ar_flip_back) / 2.0
+                ar_preds.append(ar_frame)
+                # Shift context for next AR step
+                ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
+                ctx = ctx_frames.reshape(1, -1, 64, 64)
+                last_t = ar_orig
+                ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
+                ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
+                last_f = ar_flip
+            ar_pred = torch.stack(ar_preds, dim=1)  # (1, 8, 3, 64, 64)
+            # Ensemble: average AR and direct
+            predicted = (ar_pred + direct_pred) / 2.0
+        predicted_np = predicted[0].cpu().numpy()
+        ens.direct_cache = []
+        for i in range(PRED_FRAMES):
+            frame = np.transpose(predicted_np[i], (1, 2, 0))
+            frame = (frame * 255).clip(0, 255).astype(np.uint8)
+            ens.direct_cache.append(frame)
+        result = ens.direct_cache[ens.cache_step]
+        ens.cache_step += 1
+        return result
+    else:
+        # Direct 8-frame prediction for Pole Position (PP), with result caching and flip TTA
+        if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
+            result = ens.direct_cache[ens.cache_step]
+            ens.cache_step += 1
+            if ens.cache_step >= PRED_FRAMES:
+                ens.reset_cache()
+            return result
+        ens.reset_cache()
+        with torch.no_grad():
+            context_tensor = torch.from_numpy(context).to(DEVICE)
+            last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            predicted_orig = _predict_8frames_direct(ens.models["pole_position"], context_tensor, last_tensor)
             context_flipped = torch.flip(context_tensor, dims=[3])
             last_flipped = torch.flip(last_tensor, dims=[3])
+            predicted_flipped = _predict_8frames_direct(ens.models["pole_position"], context_flipped, last_flipped)
             predicted_flipped = torch.flip(predicted_flipped, dims=[4])
             predicted = (predicted_orig + predicted_flipped) / 2.0
+        predicted_np = predicted[0].cpu().numpy()
+        ens.direct_cache = []
         for i in range(PRED_FRAMES):
             frame = np.transpose(predicted_np[i], (1, 2, 0))
             frame = (frame * 255).clip(0, 255).astype(np.uint8)
+            ens.direct_cache.append(frame)
+        result = ens.direct_cache[ens.cache_step]
+        ens.cache_step += 1
         return result