Spaces:
Running on Zero
Running on Zero
Upload 134 files
Browse files
- obliteratus/.DS_Store +0 -0
- obliteratus/abliterate.py +23 -2
obliteratus/.DS_Store
CHANGED
|
Binary files a/obliteratus/.DS_Store and b/obliteratus/.DS_Store differ
|
|
|
obliteratus/abliterate.py
CHANGED
|
@@ -5726,8 +5726,17 @@ class AbliterationPipeline:
|
|
| 5726 |
elapsed = time.time() - t0
|
| 5727 |
self.log(f"[timing] Quick checkpoint saved to {self.output_dir} ({elapsed:.1f}s)")
|
| 5728 |
except Exception as e:
|
| 5729 |
-
#
|
| 5730 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5731 |
self.log(f"Quick checkpoint save failed (non-fatal): {e}")
|
| 5732 |
|
| 5733 |
def _pipeline_time_remaining(self, budget_secs: float = 300.0) -> float:
|
|
@@ -6319,10 +6328,22 @@ class AbliterationPipeline:
|
|
| 6319 |
When device_map="auto" offloads weights to disk, model.state_dict()
|
| 6320 |
returns meta tensors (no data) for those parameters. We resolve them
|
| 6321 |
here so that save_pretrained gets real tensors.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6322 |
"""
|
| 6323 |
model = self.handle.model
|
| 6324 |
state_dict = model.state_dict()
|
| 6325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6326 |
# Check for meta tensors (= disk-offloaded weights)
|
| 6327 |
meta_keys = [k for k, v in state_dict.items() if v.device.type == "meta"]
|
| 6328 |
if not meta_keys:
|
|
|
|
| 5726 |
elapsed = time.time() - t0
|
| 5727 |
self.log(f"[timing] Quick checkpoint saved to {self.output_dir} ({elapsed:.1f}s)")
|
| 5728 |
except Exception as e:
|
| 5729 |
+
# In staged ZeroGPU mode, the quick checkpoint is the ONLY way
|
| 5730 |
+
# Stage 3 can recover the excised model (the GPU worker process
|
| 5731 |
+
# boundary discards in-memory state). A failure here is fatal.
|
| 5732 |
+
if getattr(self, "_staged_state_dir", None):
|
| 5733 |
+
self.log(f"Quick checkpoint save FAILED in staged mode: {e}")
|
| 5734 |
+
raise RuntimeError(
|
| 5735 |
+
f"Quick checkpoint save failed during staged ZeroGPU execution. "
|
| 5736 |
+
f"Stage 3 (VERIFY+REBIRTH) cannot proceed without this checkpoint. "
|
| 5737 |
+
f"Original error: {e}"
|
| 5738 |
+
) from e
|
| 5739 |
+
# Non-staged: model stays in memory, so this is truly non-fatal.
|
| 5740 |
self.log(f"Quick checkpoint save failed (non-fatal): {e}")
|
| 5741 |
|
| 5742 |
def _pipeline_time_remaining(self, budget_secs: float = 300.0) -> float:
|
|
|
|
| 6328 |
When device_map="auto" offloads weights to disk, model.state_dict()
|
| 6329 |
returns meta tensors (no data) for those parameters. We resolve them
|
| 6330 |
here so that save_pretrained gets real tensors.
|
| 6331 |
+
|
| 6332 |
+
All returned tensors are guaranteed to be contiguous, which is required
|
| 6333 |
+
by the safetensors serializer. After EXCISE, weight tensors may be
|
| 6334 |
+
non-contiguous or share underlying storage (e.g. from in-place
|
| 6335 |
+
projection operations), which causes ``SafetensorError`` during save.
|
| 6336 |
"""
|
| 6337 |
model = self.handle.model
|
| 6338 |
state_dict = model.state_dict()
|
| 6339 |
|
| 6340 |
+
# Ensure all tensors are contiguous — safetensors cannot serialize
|
| 6341 |
+
# non-contiguous tensors or tensors that share underlying storage.
|
| 6342 |
+
state_dict = {
|
| 6343 |
+
k: v.contiguous() if isinstance(v, torch.Tensor) and not v.is_contiguous() else v
|
| 6344 |
+
for k, v in state_dict.items()
|
| 6345 |
+
}
|
| 6346 |
+
|
| 6347 |
# Check for meta tensors (= disk-offloaded weights)
|
| 6348 |
meta_keys = [k for k, v in state_dict.items() if v.device.type == "meta"]
|
| 6349 |
if not meta_keys:
|