Spaces:

multimodalart
/

pid

Running on Zero

apolinario committed 1 day ago

Commit

9014add

1 Parent(s): b0fc9e3

Patch Gemma2Model.forward so that torch.tensor calls default to the embedding's device (ZeroGPU rejects the cross-device multiply with the CPU-resident normalizer)

Files changed (1) hide show

app.py CHANGED Viewed

@@ -74,6 +74,29 @@ def _broadcasting_vmap_for_bhqkv(mask_function, bh_indices: bool = True):
 _mu._vmap_for_bhqkv = _broadcasting_vmap_for_bhqkv
 pipeline, pipe_cfg = load_pipeline(BACKBONE, dtype=DTYPE)
 pipeline.to("cuda")

 _mu._vmap_for_bhqkv = _broadcasting_vmap_for_bhqkv
+# Gemma2's forward does `normalizer = torch.tensor(hidden_size**0.5, dtype=...)`
+# without a device kwarg, so it lands on CPU while hidden_states is on cuda.
+# Vanilla CUDA tolerates the cross-device scalar op; ZeroGPU's __torch_function__
+# hijack rejects it. Force torch.tensor calls inside Gemma2.forward onto the
+# embedding's device.
+import functools
+import transformers.models.gemma2.modeling_gemma2 as _gm
+
+# Keep a handle on the unpatched method so the wrapper can delegate to it.
+_orig_gemma2_forward = _gm.Gemma2Model.forward
+
+
+# Wrapper around Gemma2Model.forward that makes every torch.tensor(...) call
+# issued while the original forward runs default to the device of the model's
+# embedding weights instead of the global default device.
+#
+# functools.wraps keeps the original forward's name, docstring and signature
+# visible (inspect.signature follows __wrapped__), so code that introspects
+# `forward` still sees the real parameters rather than (*args, **kwargs).
+#
+# Arguments and return value are passed through to / from the original
+# forward unchanged; exceptions raised by it propagate after torch.tensor
+# has been restored.
+@functools.wraps(_orig_gemma2_forward)
+def _patched_gemma2_forward(self, *args, **kwargs):
+    real_tensor = torch.tensor
+    target_device = self.embed_tokens.weight.device
+
+    def _tensor_on_target(data, *a, **kw):
+        # A missing device and an explicit device=None both mean "default
+        # device", so both are redirected; an explicit non-None device wins.
+        if kw.get("device") is None:
+            kw["device"] = target_device
+        return real_tensor(data, *a, **kw)
+
+    # NOTE(review): this rebinds the module attribute torch.tensor, so the
+    # override is visible to any code calling torch.tensor while forward is
+    # running, not only to Gemma2 -- confirm nothing else runs concurrently.
+    torch.tensor = _tensor_on_target
+    try:
+        return _orig_gemma2_forward(self, *args, **kwargs)
+    finally:
+        # Always restore the original factory, even if forward raises.
+        torch.tensor = real_tensor
+
+
+_gm.Gemma2Model.forward = _patched_gemma2_forward
 pipeline, pipe_cfg = load_pipeline(BACKBONE, dtype=DTYPE)
 pipeline.to("cuda")