Upload folder using huggingface_hub
- __pycache__/predict.cpython-311.pyc +0 -0
- model_pong.pt +2 -2
- model_sonic.pt +2 -2
- predict.py +80 -101
__pycache__/predict.cpython-311.pyc CHANGED
Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ
model_pong.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b1a440d1801503eb7e00e8a6ce30b8f43058816440d98506b3e2c8629ca2eeff
+size 2436712
model_sonic.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9035098568ea4789c5dda58d685af07b4b5a0cdf300848f79ed6d96ad901da34
+size 6182614
predict.py CHANGED
@@ -1,4 +1,4 @@
-"""
+"""Direct 8-frame prediction for all games with TTA."""
 import sys
 import os
 import numpy as np
@@ -11,6 +11,12 @@ CONTEXT_FRAMES = 8
 PRED_FRAMES = 8
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+GAME_CONFIGS = {
+    "pong": {"enc_channels": (32, 64, 128), "bottleneck": 128},
+    "sonic": {"enc_channels": (48, 96, 192), "bottleneck": 256},
+    "pole_position": {"enc_channels": (32, 64, 128), "bottleneck": 192},
+}
+

 def detect_game(context_frames: np.ndarray) -> str:
     first_8 = context_frames[:CONTEXT_FRAMES]
@@ -26,58 +32,56 @@ def detect_game(context_frames: np.ndarray) -> str:
     return "sonic"


-class
-    def __init__(self):
-        self.models =
-        ...
+class ModelCache:
+    def __init__(self, models):
+        self.models = models
+        self.cached_predictions = None
+        self.cache_step = 0

-    def
-        ...
+    def reset_cache(self):
+        self.cached_predictions = None
+        self.cache_step = 0


 def load_model(model_dir: str):
-    ...
-    pp = UNet(in_channels=24, out_channels=24,
-              enc_channels=(32, 64, 128), bottleneck_channels=192,
-              upsample_mode="bilinear").to(DEVICE)
-    sd = torch.load(os.path.join(model_dir, "model_pole_position.pt"),
-                    map_location=DEVICE, weights_only=True)
-    pp.load_state_dict({k: v.float() for k, v in sd.items()})
-    pp.eval()
-    hybrid.models["pole_position"] = pp
-
-    return hybrid
-
-
-def predict_next_frame(hybrid, context_frames: np.ndarray) -> np.ndarray:
-    game = detect_game(context_frames)
-    model = hybrid.models[game]
+    models = {}
+    for game, cfg in GAME_CONFIGS.items():
+        model = UNet(in_channels=24, out_channels=24,
+                     enc_channels=cfg["enc_channels"],
+                     bottleneck_channels=cfg["bottleneck"],
+                     upsample_mode="bilinear").to(DEVICE)
+        state_dict = torch.load(os.path.join(model_dir, f"model_{game}.pt"),
+                                map_location=DEVICE, weights_only=True)
+        state_dict = {k: v.float() for k, v in state_dict.items()}
+        model.load_state_dict(state_dict)
+        model.eval()
+        models[game] = model
+    return ModelCache(models)
+
+
+def _predict_8frames(model, context_tensor, last_tensor):
+    output = model(context_tensor)  # (1, 24, 64, 64)
+    residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
+    last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
+    return torch.clamp(last_expanded + residuals, 0, 1)  # (1, 8, 3, 64, 64)
+
+
+def predict_next_frame(cache, context_frames: np.ndarray) -> np.ndarray:
     n = len(context_frames)

+    # If cache exists and context grew (AR rollout), return next cached frame
+    if cache.cached_predictions is not None and n > CONTEXT_FRAMES and cache.cache_step < PRED_FRAMES:
+        result = cache.cached_predictions[cache.cache_step]
+        cache.cache_step += 1
+        if cache.cache_step >= PRED_FRAMES:
+            cache.reset_cache()
+        return result
+
+    # New window: predict all 8 frames
+    cache.reset_cache()
+    game = detect_game(context_frames)
+    model = cache.models[game]
+
     if n < CONTEXT_FRAMES:
         padding = np.stack([context_frames[0]] * (CONTEXT_FRAMES - n), axis=0)
         frames = np.concatenate([padding, context_frames], axis=0)
@@ -91,57 +95,32 @@ def predict_next_frame(hybrid, context_frames: np.ndarray) -> np.ndarray:
     last_frame = frames_norm[-1]
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]

-    ...
-        predicted =
-    ...
-    else:
-        # AR prediction for Pong and Sonic
-        with torch.no_grad():
-            context_tensor = torch.from_numpy(context).to(DEVICE)
-            last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
-
-            residual_orig = model(context_tensor)
-            predicted_orig = torch.clamp(last_tensor + residual_orig, 0, 1)
-
-            if game == "pong":
-                predicted = predicted_orig
-            else:
-                # TTA for Sonic
-                context_flipped = torch.flip(context_tensor, dims=[3])
-                last_flipped = torch.flip(last_tensor, dims=[3])
-                residual_flipped = model(context_flipped)
-                predicted_flipped = torch.clamp(last_flipped + residual_flipped, 0, 1)
-                predicted_flipped = torch.flip(predicted_flipped, dims=[3])
-                predicted = (predicted_orig + predicted_flipped) / 2.0
-
-    predicted_np = predicted[0].cpu().numpy()
-    predicted_np = np.transpose(predicted_np, (1, 2, 0))
-    predicted_np = (predicted_np * 255).clip(0, 255).astype(np.uint8)
-    return predicted_np
+    with torch.no_grad():
+        context_tensor = torch.from_numpy(context).to(DEVICE)
+        last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+
+        predicted_orig = _predict_8frames(model, context_tensor, last_tensor)
+
+        if game == "pong":
+            # Pong: no TTA (asymmetric)
+            predicted = predicted_orig
+        else:
+            # TTA: horizontal flip; we work on the (1, 24, H, W) context, so flip along dim 3 (width)
+            context_flipped = torch.flip(context_tensor, dims=[3])
+            last_flipped = torch.flip(last_tensor, dims=[3])
+            predicted_flipped = _predict_8frames(model, context_flipped, last_flipped)
+            # Flip back: predicted_flipped is (1, 8, 3, H, W), so width is dim 4
+            predicted_flipped = torch.flip(predicted_flipped, dims=[4])
+            predicted = (predicted_orig + predicted_flipped) / 2.0
+
+    predicted_np = predicted[0].cpu().numpy()  # (8, 3, 64, 64)
+    cache.cached_predictions = []
+    for i in range(PRED_FRAMES):
+        frame = np.transpose(predicted_np[i], (1, 2, 0))
+        frame = (frame * 255).clip(0, 255).astype(np.uint8)
+        cache.cached_predictions.append(frame)
+
+    result = cache.cached_predictions[cache.cache_step]
+    cache.cache_step += 1
+    return result
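The new _predict_8frames helper is the core of this change: the UNet's 24 output channels are reinterpreted as eight RGB residual frames and added onto the last context frame. A minimal sketch of that reshape/broadcast, using a stand-in zero-residual callable rather than the repo's UNet (the stand-in is purely an assumption for illustration):

import torch

PRED_FRAMES = 8

def _predict_8frames(model, context_tensor, last_tensor):
    # Same logic as in predict.py: 24 channels -> 8 RGB residual frames on top of the last frame
    output = model(context_tensor)                         # (1, 24, 64, 64)
    residuals = output.reshape(1, PRED_FRAMES, 3, 64, 64)
    last_expanded = last_tensor.unsqueeze(1).expand_as(residuals)
    return torch.clamp(last_expanded + residuals, 0, 1)    # (1, 8, 3, 64, 64)

# Stand-in "model" that predicts zero residuals, so every output frame equals the last frame
zero_model = lambda x: torch.zeros(1, 24, 64, 64)
context = torch.rand(1, 24, 64, 64)   # 8 stacked RGB context frames
last = torch.rand(1, 3, 64, 64)       # most recent context frame

pred = _predict_8frames(zero_model, context, last)
assert pred.shape == (1, 8, 3, 64, 64)
assert torch.allclose(pred, last.unsqueeze(1).expand(1, 8, 3, 64, 64))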
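At inference time the caller is expected to grow the context by one frame per call, so only the first call of each 8-frame window pays for a UNet forward pass (plus the flipped TTA pass for non-Pong games); the following seven calls are served from cache.cached_predictions. A hypothetical rollout driver, assuming the repo's modules are importable and the model_*.pt files sit in the working directory (neither is part of this commit):

import numpy as np
from predict import load_model, predict_next_frame, CONTEXT_FRAMES, PRED_FRAMES

cache = load_model(".")  # assumes model_pong.pt / model_sonic.pt / model_pole_position.pt are here
context = np.zeros((CONTEXT_FRAMES, 64, 64, 3), dtype=np.uint8)  # placeholder 8-frame context

rollout = []
for _ in range(PRED_FRAMES):
    frame = predict_next_frame(cache, context)  # first call runs the model, later calls hit the cache
    rollout.append(frame)
    context = np.concatenate([context, frame[np.newaxis]], axis=0)  # growing context takes the cache path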