Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

__pycache__/predict.cpython-311.pyc +0 -0
model_sonic_ar.pt +2 -2
model_sonic_direct.pt +2 -2
predict.py +25 -20

__pycache__/predict.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ

model_sonic_ar.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:413d9fcfa15f30c74cdfda5f7d7c9dba8958fe027dfc09de563e6209c78378f5
-size 6180566

 version https://git-lfs.github.com/spec/v1
+oid sha256:0853b8b0dad0a55f126be9bfd767d2e55fcc2ea9dcb379a79f6389c997e54816
+size 3129452

model_sonic_direct.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e7e17327a6f03cb72a35bd3c48d481b4eebea5db6572ed2b3fa290b330bca304
-size 6182614

 version https://git-lfs.github.com/spec/v1
+oid sha256:c51f9fb740cc1cb8dc93f119252905a47034bb4cab73b30f347e47de20ad3d6d
+size 3131348

predict.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""FP16 Pong ensemble: AR+direct for Pong, AR+direct for Sonic, direct for PP."""
 import sys
 import os
 import numpy as np
@@ -26,6 +26,18 @@ def detect_game(context_frames: np.ndarray) -> str:
         return "sonic"
 class EnsembleModels:
     def __init__(self):
         self.models = {}
@@ -43,7 +55,7 @@ class EnsembleModels:
 def load_model(model_dir: str):
     ens = EnsembleModels()
-    # Pong AR (3 outputs)
     pong = UNet(in_channels=24, out_channels=3,
                 enc_channels=(32, 64, 128), bottleneck_channels=128,
                 upsample_mode="bilinear").to(DEVICE)
@@ -53,7 +65,7 @@ def load_model(model_dir: str):
     pong.eval()
     ens.models["pong"] = pong
-    # Pong direct (24 outputs)
     pong_direct = UNet(in_channels=24, out_channels=24,
                        enc_channels=(32, 64, 128), bottleneck_channels=128,
                        upsample_mode="bilinear").to(DEVICE)
@@ -63,27 +75,25 @@ def load_model(model_dir: str):
     pong_direct.eval()
     ens.pong_direct = pong_direct
-    # Sonic AR (3 outputs)
     sonic_ar = UNet(in_channels=24, out_channels=3,
                     enc_channels=(48, 96, 192), bottleneck_channels=256,
                     upsample_mode="bilinear").to(DEVICE)
-    sd = torch.load(os.path.join(model_dir, "model_sonic_ar.pt"),
-                    map_location=DEVICE, weights_only=True)
-    sonic_ar.load_state_dict({k: v.float() for k, v in sd.items()})
     sonic_ar.eval()
     ens.sonic_ar = sonic_ar
-    # Sonic direct (24 outputs)
     sonic_direct = UNet(in_channels=24, out_channels=24,
                         enc_channels=(48, 96, 192), bottleneck_channels=256,
                         upsample_mode="bilinear").to(DEVICE)
-    sd = torch.load(os.path.join(model_dir, "model_sonic_direct.pt"),
-                    map_location=DEVICE, weights_only=True)
-    sonic_direct.load_state_dict({k: v.float() for k, v in sd.items()})
     sonic_direct.eval()
     ens.sonic_direct = sonic_direct
-    # PP compact direct (24 outputs)
     pp = UNet(in_channels=24, out_channels=24,
               enc_channels=(24, 48, 96), bottleneck_channels=128,
               upsample_mode="bilinear").to(DEVICE)
@@ -126,7 +136,7 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
     if game == "pong":
-        # Pong: AR+direct ensemble with float32 caching, no TTA
         if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
             result = ens.direct_cache[ens.cache_step]
             ens.cache_step += 1
@@ -135,21 +145,17 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
             return result
         ens.reset_cache()
-        model_ar = ens.models["pong"]
-        model_direct = ens.pong_direct
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
-            # Direct prediction
-            direct_pred = _predict_8frames_direct(model_direct, context_tensor, last_tensor)
-            # AR prediction in float32
             ar_preds = []
             ctx = context_tensor.clone()
             last_t = last_tensor.clone()
             for step in range(PRED_FRAMES):
-                predicted = _predict_ar_frame(model_ar, ctx, last_t)
                 ar_preds.append(predicted)
                 ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
                 ctx_frames = torch.cat([ctx_frames[:, 1:], predicted.unsqueeze(1)], dim=1)
@@ -158,7 +164,6 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
             ar_pred = torch.stack(ar_preds, dim=1)
-            # Step-dependent blending: AR 0.7 -> 0.3
             predicted = torch.zeros_like(direct_pred)
             for step in range(PRED_FRAMES):
                 ar_weight = 0.7 - (step / (PRED_FRAMES - 1)) * 0.4

+"""Int8 ensemble: Sonic models quantized to int8, Pong/PP in fp16."""
 import sys
 import os
 import numpy as np
         return "sonic"
+def load_int8_state_dict(path, device):
+    """Load an int8-quantized checkpoint and return a dequantized state dict.
+
+    The file at ``path`` is read with ``torch.load`` onto the CPU and is
+    expected to hold a mapping from parameter name to a per-tensor record
+    (presumably a dict -- confirm against the script that writes the file):
+
+    * records containing an ``'int8'`` key are dequantized as
+      ``record['int8'].float() * record['scale']``;
+    * all other records are read from ``record['float']`` and passed
+      through without any dtype conversion.
+
+    Args:
+        path: Location of the quantized checkpoint file.
+        device: Target device each resulting tensor is moved to.
+
+    Returns:
+        A dict with the same keys as the checkpoint, holding tensors that
+        have been moved to ``device``.
+
+    Raises:
+        KeyError: Assuming dict records, if a record has ``'int8'`` but no
+            ``'scale'``, or has neither ``'int8'`` nor ``'float'``.
+    """
+    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
+    # objects, so this must only be used on checkpoints from a trusted
+    # source. Confirm whether the stored records would also load with
+    # weights_only=True.
+    # Everything is loaded onto the CPU first; tensors are moved to `device`
+    # one at a time in the loop below.
+    quantized = torch.load(path, map_location='cpu', weights_only=False)
+    sd = {}
+    for k, v in quantized.items():
+        if 'int8' in v:
+            # Dequantize: cast the int8 values to float32, then rescale.
+            # The result stays float32 if `scale` is a Python number or a
+            # float32 tensor -- TODO confirm the type of `scale`.
+            sd[k] = (v['int8'].float() * v['scale']).to(device)
+        else:
+            # Non-quantized entry: used as stored, only moved to `device`.
+            sd[k] = v['float'].to(device)
+    return sd
 class EnsembleModels:
     def __init__(self):
         self.models = {}
 def load_model(model_dir: str):
     ens = EnsembleModels()
+    # Pong AR (fp16, 3 outputs)
     pong = UNet(in_channels=24, out_channels=3,
                 enc_channels=(32, 64, 128), bottleneck_channels=128,
                 upsample_mode="bilinear").to(DEVICE)
     pong.eval()
     ens.models["pong"] = pong
+    # Pong direct (fp16, 24 outputs)
     pong_direct = UNet(in_channels=24, out_channels=24,
                        enc_channels=(32, 64, 128), bottleneck_channels=128,
                        upsample_mode="bilinear").to(DEVICE)
     pong_direct.eval()
     ens.pong_direct = pong_direct
+    # Sonic AR (int8 quantized, 3 outputs)
     sonic_ar = UNet(in_channels=24, out_channels=3,
                     enc_channels=(48, 96, 192), bottleneck_channels=256,
                     upsample_mode="bilinear").to(DEVICE)
+    sd = load_int8_state_dict(os.path.join(model_dir, "model_sonic_ar.pt"), DEVICE)
+    sonic_ar.load_state_dict(sd)
     sonic_ar.eval()
     ens.sonic_ar = sonic_ar
+    # Sonic direct (int8 quantized, 24 outputs)
     sonic_direct = UNet(in_channels=24, out_channels=24,
                         enc_channels=(48, 96, 192), bottleneck_channels=256,
                         upsample_mode="bilinear").to(DEVICE)
+    sd = load_int8_state_dict(os.path.join(model_dir, "model_sonic_direct.pt"), DEVICE)
+    sonic_direct.load_state_dict(sd)
     sonic_direct.eval()
     ens.sonic_direct = sonic_direct
+    # PP compact direct (fp16, 24 outputs)
     pp = UNet(in_channels=24, out_channels=24,
               enc_channels=(24, 48, 96), bottleneck_channels=128,
               upsample_mode="bilinear").to(DEVICE)
     last_frame_t = np.transpose(last_frame, (2, 0, 1))[np.newaxis]
     if game == "pong":
+        # Pong: AR+direct ensemble, float32 caching, no TTA
         if ens.direct_cache is not None and n > CONTEXT_FRAMES and ens.cache_step < PRED_FRAMES:
             result = ens.direct_cache[ens.cache_step]
             ens.cache_step += 1
             return result
         ens.reset_cache()
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             last_tensor = torch.from_numpy(last_frame_t).to(DEVICE)
+            direct_pred = _predict_8frames_direct(ens.pong_direct, context_tensor, last_tensor)
             ar_preds = []
             ctx = context_tensor.clone()
             last_t = last_tensor.clone()
             for step in range(PRED_FRAMES):
+                predicted = _predict_ar_frame(ens.models["pong"], ctx, last_t)
                 ar_preds.append(predicted)
                 ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
                 ctx_frames = torch.cat([ctx_frames[:, 1:], predicted.unsqueeze(1)], dim=1)
             ar_pred = torch.stack(ar_preds, dim=1)
             predicted = torch.zeros_like(direct_pred)
             for step in range(PRED_FRAMES):
                 ar_weight = 0.7 - (step / (PRED_FRAMES - 1)) * 0.4