Upload folder using huggingface_hub

Browse files

Files changed (4)

__pycache__/predict.cpython-311.pyc +0 -0
model_pole_position.pt +1 -1
model_pong.pt +2 -2
predict.py +81 -60

__pycache__/predict.cpython-311.pyc CHANGED

Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ

model_pole_position.pt CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e0affcef8e533a29037751e27948a3eb0f2fda2792ce2b3dfc876cadb09e281
 size 2971526

 version https://git-lfs.github.com/spec/v1
+oid sha256:27d26875071b536cc75cac27a0840b50cd6c9a8e1956c94f1cd08feacc49621f
 size 2971526

model_pong.pt CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab8070ddcde00333d7b52c89a0da9a61eece1e67c46163cd011ce4cd3c422f0c
-size 2436712

 version https://git-lfs.github.com/spec/v1
+oid sha256:d6c8b9235347bea94e7e5f5f0f225d4c1dbd13a749d5e28920c75c91902ecb11
+size 2435368

predict.py CHANGED

@@ -1,4 +1,4 @@
-"""Direct 8-frame prediction for all games with TTA."""
 import sys
 import os
 import numpy as np
@@ -11,12 +11,6 @@ CONTEXT_FRAMES = 8
 PRED_FRAMES = 8
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-GAME_CONFIGS = {
-    "pong": {"enc_channels": (32, 64, 128), "bottleneck": 128},
-    "sonic": {"enc_channels": (48, 96, 192), "bottleneck": 256},
-    "pole_position": {"enc_channels": (32, 64, 128), "bottleneck": 192},
-}
 def detect_game(context_frames: np.ndarray) -> str:
     first_8 = context_frames[:CONTEXT_FRAMES]
@@ -32,55 +26,64 @@ def detect_game(context_frames: np.ndarray) -> str:
         return "sonic"
-class ModelCache:
-    def __init__(self, models):
-        self.models = models
-        self.cached_predictions = None
         self.cache_step = 0
     def reset_cache(self):
-        self.cached_predictions = None
         self.cache_step = 0
 def load_model(model_dir: str):
-    models = {}
-    for game, cfg in GAME_CONFIGS.items():
-        model = UNet(in_channels=24, out_channels=24,
-                     enc_channels=cfg["enc_channels"],
-                     bottleneck_channels=cfg["bottleneck"],
-                     upsample_mode="bilinear").to(DEVICE)
-        state_dict = torch.load(os.path.join(model_dir, f"model_{game}.pt"),
-                                map_location=DEVICE, weights_only=True)
-        state_dict = {k: v.float() for k, v in state_dict.items()}
-        model.load_state_dict(state_dict)
-        model.eval()
-        models[game] = model
-    return ModelCache(models)
 def _predict_8frames(model, context_tensor, last_tensor):
     output = model(context_tensor)  # (1, 24, 64, 64)
     residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
     last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
-    return torch.clamp(last_expanded + residuals, 0, 1)  # (1, 8, 3, 64, 64)
-def predict_next_frame(cache, context_frames: np.ndarray) -> np.ndarray:
-    n = len(context_frames)
-    # If cache exists and context grew (AR rollout), return next cached frame
-    if cache.cached_predictions is not None and n > CONTEXT_FRAMES and cache.cache_step < PRED_FRAMES:
-        result = cache.cached_predictions[cache.cache_step]
-        cache.cache_step += 1
-        if cache.cache_step >= PRED_FRAMES:
-            cache.reset_cache()
-        return result
-    # New window: predict all 8 frames
-    cache.reset_cache()
     game = detect_game(context_frames)
-    model = cache.models[game]
     if n < CONTEXT_FRAMES:
         padding = np.stack([context_frames[0]] * (CONTEXT_FRAMES - n), axis=0)
@@ -95,32 +98,50 @@ def predict_next_frame(cache, context_frames: np.ndarray) -> np.ndarray:
     last_frame = frames_norm[-1]
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
-    with torch.no_grad():
-        context_tensor = torch.from_numpy(context).to(DEVICE)
-        last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
-        predicted_orig = _predict_8frames(model, context_tensor, last_tensor)
-        if game == "pong":
-            # Pong: no TTA (asymmetric)
-            predicted = predicted_orig
-        else:
-            # TTA: horizontal flip (dim=3 is width for (B, T, C, H, W) reshaped from (B, 24, H, W))
-            # But we work on (1, 24, H, W) context - flip along dim 3 (width)
             context_flipped = torch.flip(context_tensor, dims=[3])
             last_flipped = torch.flip(last_tensor, dims=[3])
             predicted_flipped = _predict_8frames(model, context_flipped, last_flipped)
-            # Flip back: predicted_flipped is (1, 8, 3, H, W), flip width dim=4
             predicted_flipped = torch.flip(predicted_flipped, dims=[4])
             predicted = (predicted_orig + predicted_flipped) / 2.0
-    predicted_np = predicted[0].cpu().numpy()  # (8, 3, 64, 64)
-    cache.cached_predictions = []
-    for i in range(PRED_FRAMES):
-        frame = np.transpose(predicted_np[i], (1, 2, 0))
-        frame = (frame * 255).clip(0, 255).astype(np.uint8)
-        cache.cached_predictions.append(frame)
-    result = cache.cached_predictions[cache.cache_step]
-    cache.cache_step += 1
-    return result

+"""Hybrid v5: Best per-game models. AR for Pong, direct 8-frame for Sonic/PP with TTA."""
 import sys
 import os
 import numpy as np
 PRED_FRAMES = 8
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def detect_game(context_frames: np.ndarray) -> str:
     first_8 = context_frames[:CONTEXT_FRAMES]
         return "sonic"
+class HybridModels:
+    def __init__(self):
+        self.models = {}
+        self.direct_cache = None
         self.cache_step = 0
     def reset_cache(self):
+        self.direct_cache = None
         self.cache_step = 0
 def load_model(model_dir: str):
+    hybrid = HybridModels()
+    # Pong: AR model (3 outputs) from pergame-models
+    pong = UNet(in_channels=24, out_channels=3,
+                enc_channels=(32, 64, 128), bottleneck_channels=128,
+                upsample_mode="bilinear").to(DEVICE)
+    sd = torch.load(os.path.join(model_dir, "model_pong.pt"),
+                    map_location=DEVICE, weights_only=True)
+    pong.load_state_dict({k: v.float() for k, v in sd.items()})
+    pong.eval()
+    hybrid.models["pong"] = pong
+    # Sonic: direct 8-frame model (24 outputs) from direct-improved
+    sonic = UNet(in_channels=24, out_channels=24,
+                 enc_channels=(48, 96, 192), bottleneck_channels=256,
+                 upsample_mode="bilinear").to(DEVICE)
+    sd = torch.load(os.path.join(model_dir, "model_sonic.pt"),
+                    map_location=DEVICE, weights_only=True)
+    sonic.load_state_dict({k: v.float() for k, v in sd.items()})
+    sonic.eval()
+    hybrid.models["sonic"] = sonic
+    # PP: direct 8-frame model (24 outputs) from direct-8frame
+    pp = UNet(in_channels=24, out_channels=24,
+              enc_channels=(32, 64, 128), bottleneck_channels=192,
+              upsample_mode="bilinear").to(DEVICE)
+    sd = torch.load(os.path.join(model_dir, "model_pole_position.pt"),
+                    map_location=DEVICE, weights_only=True)
+    pp.load_state_dict({k: v.float() for k, v in sd.items()})
+    pp.eval()
+    hybrid.models["pole_position"] = pp
+    return hybrid
 def _predict_8frames(model, context_tensor, last_tensor):
     output = model(context_tensor)  # (1, 24, 64, 64)
     residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
     last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
+    return torch.clamp(last_expanded + residuals, 0, 1)
+def predict_next_frame(hybrid, context_frames: np.ndarray) -> np.ndarray:
     game = detect_game(context_frames)
+    model = hybrid.models[game]
+    n = len(context_frames)
     if n < CONTEXT_FRAMES:
         padding = np.stack([context_frames[0]] * (CONTEXT_FRAMES - n), axis=0)
     last_frame = frames_norm[-1]
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
+    if game == "pong":
+        # AR prediction for Pong (no TTA, no caching)
+        with torch.no_grad():
+            context_tensor = torch.from_numpy(context).to(DEVICE)
+            last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            residual = model(context_tensor)
+            predicted = torch.clamp(last_tensor + residual, 0, 1)
+        predicted_np = predicted[0].cpu().numpy()
+        predicted_np = np.transpose(predicted_np, (1, 2, 0))
+        predicted_np = (predicted_np * 255).clip(0, 255).astype(np.uint8)
+        return predicted_np
+    else:
+        # Direct 8-frame for Sonic and PP with caching
+        if hybrid.direct_cache is not None and n > CONTEXT_FRAMES and hybrid.cache_step < PRED_FRAMES:
+            result = hybrid.direct_cache[hybrid.cache_step]
+            hybrid.cache_step += 1
+            if hybrid.cache_step >= PRED_FRAMES:
+                hybrid.reset_cache()
+            return result
+        # New window: predict all 8 frames with TTA
+        hybrid.reset_cache()
+        with torch.no_grad():
+            context_tensor = torch.from_numpy(context).to(DEVICE)
+            last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            predicted_orig = _predict_8frames(model, context_tensor, last_tensor)
+            # TTA: horizontal flip
             context_flipped = torch.flip(context_tensor, dims=[3])
             last_flipped = torch.flip(last_tensor, dims=[3])
             predicted_flipped = _predict_8frames(model, context_flipped, last_flipped)
             predicted_flipped = torch.flip(predicted_flipped, dims=[4])
             predicted = (predicted_orig + predicted_flipped) / 2.0
+        predicted_np = predicted[0].cpu().numpy()  # (8, 3, 64, 64)
+        hybrid.direct_cache = []
+        for i in range(PRED_FRAMES):
+            frame = np.transpose(predicted_np[i], (1, 2, 0))
+            frame = (frame * 255).clip(0, 255).astype(np.uint8)
+            hybrid.direct_cache.append(frame)
+        result = hybrid.direct_cache[hybrid.cache_step]
+        hybrid.cache_step += 1
+        return result