Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

__pycache__/predict.cpython-311.pyc +0 -0
predict.py +40 -44

__pycache__/predict.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ

predict.py CHANGED Viewed

@@ -190,14 +190,6 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
                 ens.reset_cache()
             return result
-        # Detect extreme scene transitions (threshold 80 on 0-255 scale)
-        scene_transition = False
-        for i in range(len(frames) - 1):
-            diff = np.abs(frames[i].astype(np.float32) - frames[i + 1].astype(np.float32)).mean()
-            if diff > 80.0 / 255.0:
-                scene_transition = True
-                break
         ens.reset_cache()
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
@@ -210,51 +202,55 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
             direct_flipped = torch.flip(direct_flipped, dims=[4])
             direct_pred = (direct_orig + direct_flipped) / 2.0
-            if scene_transition:
-                # Extreme scene transition: direct-only
-                predicted = direct_pred
-            else:
-                # Normal: full AR+direct blend with noise diversity
-                all_ar_runs = []
-                for noise_std in [0.0, 1.0/255.0, 2.0/255.0]:
-                    ar_preds_run = []
-                    ctx = context_tensor.clone()
-                    ctx_flip = context_flipped.clone()
-                    last_t = last_tensor.clone()
-                    last_f = last_flipped.clone()
-                    for step in range(PRED_FRAMES):
-                        ctx_in = ctx if noise_std == 0 else torch.clamp(ctx + torch.randn_like(ctx) * noise_std, 0, 1)
-                        ctx_flip_in = ctx_flip if noise_std == 0 else torch.clamp(ctx_flip + torch.randn_like(ctx_flip) * noise_std, 0, 1)
-                        ar_orig = _predict_ar_frame(ens.sonic_ar, ctx_in, last_t)
-                        ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip_in, last_f)
-                        ar_flip_back = torch.flip(ar_flip, dims=[3])
-                        ar_frame = (ar_orig + ar_flip_back) / 2.0
-                        ar_preds_run.append(ar_frame)
-                        ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
-                        ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
-                        ctx = ctx_frames.reshape(1, -1, 64, 64)
-                        last_t = ar_orig
-                        ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
-                        ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
-                        ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
-                        last_f = ar_flip
-                    all_ar_runs.append(torch.stack(ar_preds_run, dim=1))
-                ar_pred = sum(all_ar_runs) / len(all_ar_runs)
-                predicted = torch.zeros_like(direct_pred)
                 for step in range(PRED_FRAMES):
-                    ar_weight = 0.65 - (step / (PRED_FRAMES - 1)) * 0.3
-                    direct_weight = 1.0 - ar_weight
-                    predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
         predicted_np = predicted[0].cpu().numpy()
         ens.direct_cache = []
         for i in range(PRED_FRAMES):
             frame = np.transpose(predicted_np[i], (1, 2, 0))
             frame = (frame * 255).clip(0, 255).astype(np.uint8)
             ens.direct_cache.append(frame)
         result = ens.direct_cache[ens.cache_step]
         ens.cache_step += 1
         return result

                 ens.reset_cache()
             return result
         ens.reset_cache()
         with torch.no_grad():
             context_tensor = torch.from_numpy(context).to(DEVICE)
             direct_flipped = torch.flip(direct_flipped, dims=[4])
             direct_pred = (direct_orig + direct_flipped) / 2.0
+            # Multi-run AR with noise diversity
+            all_ar_runs = []
+            for noise_std in [0.0, 1.0/255.0, 2.0/255.0]:
+                ar_preds_run = []
+                ctx = context_tensor.clone()
+                ctx_flip = context_flipped.clone()
+                last_t = last_tensor.clone()
+                last_f = last_flipped.clone()
                 for step in range(PRED_FRAMES):
+                    ctx_in = ctx if noise_std == 0 else torch.clamp(ctx + torch.randn_like(ctx) * noise_std, 0, 1)
+                    ctx_flip_in = ctx_flip if noise_std == 0 else torch.clamp(ctx_flip + torch.randn_like(ctx_flip) * noise_std, 0, 1)
+                    ar_orig = _predict_ar_frame(ens.sonic_ar, ctx_in, last_t)
+                    ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip_in, last_f)
+                    ar_flip_back = torch.flip(ar_flip, dims=[3])
+                    ar_frame = (ar_orig + ar_flip_back) / 2.0
+                    ar_preds_run.append(ar_frame)
+                    ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                    ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
+                    ctx = ctx_frames.reshape(1, -1, 64, 64)
+                    last_t = ar_orig
+                    ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                    ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
+                    ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
+                    last_f = ar_flip
+                all_ar_runs.append(torch.stack(ar_preds_run, dim=1))
+            ar_pred = sum(all_ar_runs) / len(all_ar_runs)
+            predicted = torch.zeros_like(direct_pred)
+            for step in range(PRED_FRAMES):
+                ar_weight = 0.65 - (step / (PRED_FRAMES - 1)) * 0.3
+                direct_weight = 1.0 - ar_weight
+                predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
         predicted_np = predicted[0].cpu().numpy()
         ens.direct_cache = []
+        last_ctx_uint8 = (last_frame * 255).clip(0, 255).astype(np.uint8)
+        catastrophic = False
         for i in range(PRED_FRAMES):
             frame = np.transpose(predicted_np[i], (1, 2, 0))
             frame = (frame * 255).clip(0, 255).astype(np.uint8)
+            diff = np.abs(frame.astype(np.float32) - last_ctx_uint8.astype(np.float32)).mean()
+            if diff > 100:
+                catastrophic = True
             ens.direct_cache.append(frame)
+        if catastrophic:
+            ens.direct_cache = [last_ctx_uint8.copy() for _ in range(PRED_FRAMES)]
         result = ens.direct_cache[ens.cache_step]
         ens.cache_step += 1
         return result