Spaces:
Running on Zero
Running on Zero
Upload 134 files
Browse files
- obliteratus/.DS_Store +0 -0
- obliteratus/abliterate.py +23 -2
obliteratus/.DS_Store
CHANGED
|
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
|
|
|
obliteratus/abliterate.py
CHANGED
|
@@ -5726,8 +5726,17 @@ class AbliterationPipeline:
|
|
| 5726 |
elapsed = time.time() - t0
|
| 5727 |
self.log(f"[timing] Quick checkpoint saved to {self.output_dir} ({elapsed:.1f}s)")
|
| 5728 |
except Exception as e:
|
| 5729 |
-
#
|
| 5730 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5731 |
self.log(f"Quick checkpoint save failed (non-fatal): {e}")
|
| 5732 |
|
| 5733 |
def _pipeline_time_remaining(self, budget_secs: float = 300.0) -> float:
|
|
@@ -6319,10 +6328,22 @@ class AbliterationPipeline:
|
|
| 6319 |
When device_map="auto" offloads weights to disk, model.state_dict()
|
| 6320 |
returns meta tensors (no data) for those parameters. We resolve them
|
| 6321 |
here so that save_pretrained gets real tensors.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6322 |
"""
|
| 6323 |
model = self.handle.model
|
| 6324 |
state_dict = model.state_dict()
|
| 6325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6326 |
# Check for meta tensors (= disk-offloaded weights)
|
| 6327 |
meta_keys = [k for k, v in state_dict.items() if v.device.type == "meta"]
|
| 6328 |
if not meta_keys:
|
|
|
|
| 5726 |
elapsed = time.time() - t0
|
| 5727 |
self.log(f"[timing] Quick checkpoint saved to {self.output_dir} ({elapsed:.1f}s)")
|
| 5728 |
except Exception as e:
|
| 5729 |
+
# In staged ZeroGPU mode, the quick checkpoint is the ONLY way
|
| 5730 |
+
# Stage 3 can recover the excised model (the GPU worker process
|
| 5731 |
+
# boundary discards in-memory state). A failure here is fatal.
|
| 5732 |
+
if getattr(self, "_staged_state_dir", None):
|
| 5733 |
+
self.log(f"Quick checkpoint save FAILED in staged mode: {e}")
|
| 5734 |
+
raise RuntimeError(
|
| 5735 |
+
f"Quick checkpoint save failed during staged ZeroGPU execution. "
|
| 5736 |
+
f"Stage 3 (VERIFY+REBIRTH) cannot proceed without this checkpoint. "
|
| 5737 |
+
f"Original error: {e}"
|
| 5738 |
+
) from e
|
| 5739 |
+
# Non-staged: model stays in memory, so this is truly non-fatal.
|
| 5740 |
self.log(f"Quick checkpoint save failed (non-fatal): {e}")
|
| 5741 |
|
| 5742 |
def _pipeline_time_remaining(self, budget_secs: float = 300.0) -> float:
|
|
|
|
| 6328 |
When device_map="auto" offloads weights to disk, model.state_dict()
|
| 6329 |
returns meta tensors (no data) for those parameters. We resolve them
|
| 6330 |
here so that save_pretrained gets real tensors.
|
| 6331 |
+
|
| 6332 |
+
All returned tensors are guaranteed to be contiguous, which is required
|
| 6333 |
+
by the safetensors serializer. After EXCISE, weight tensors may be
|
| 6334 |
+
non-contiguous or share underlying storage (e.g. from in-place
|
| 6335 |
+
projection operations), which causes ``SafetensorError`` during save.
|
| 6336 |
"""
|
| 6337 |
model = self.handle.model
|
| 6338 |
state_dict = model.state_dict()
|
| 6339 |
|
| 6340 |
+
# Ensure all tensors are contiguous — safetensors cannot serialize
|
| 6341 |
+
# non-contiguous tensors or tensors that share underlying storage.
|
| 6342 |
+
state_dict = {
|
| 6343 |
+
k: v.contiguous() if isinstance(v, torch.Tensor) and not v.is_contiguous() else v
|
| 6344 |
+
for k, v in state_dict.items()
|
| 6345 |
+
}
|
| 6346 |
+
|
| 6347 |
# Check for meta tensors (= disk-offloaded weights)
|
| 6348 |
meta_keys = [k for k, v in state_dict.items() if v.device.type == "meta"]
|
| 6349 |
if not meta_keys:
|