Spaces: Running on Zero
Upload app.py

app.py CHANGED
@@ -1947,24 +1947,20 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
     model = _state["model"]
     tokenizer = _state["tokenizer"]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        _needs_reload = True
-
-    # If model tensors are stale/meta, reload from the saved checkpoint
-    if _needs_reload and _ZEROGPU_AVAILABLE:
+    # ZeroGPU safety: detect whether we need to reload from checkpoint.
+    # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
+    # model as None (garbage-collected) or with stale/meta tensors.
+    _needs_reload = model is None or tokenizer is None
+    if not _needs_reload:
+        try:
+            dev = next(model.parameters()).device
+            if torch.cuda.is_available() and dev.type != "cuda":
+                model.to("cuda")
+        except (StopIteration, RuntimeError):
+            _needs_reload = True
+
+    # Reload from saved checkpoint if model is missing or stale
+    if _needs_reload:
         checkpoint = _state.get("output_dir")
         if checkpoint and Path(checkpoint).exists():
             try:
@@ -1985,11 +1981,12 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
                 with _lock:
                     _state["model"] = model
                     _state["tokenizer"] = tokenizer
+                    _state["status"] = "ready"
             except Exception:
                 yield "Model failed to reload from checkpoint. Try re-obliterating."
                 return
         else:
-            yield "
+            yield "No model loaded yet. Go to the **Obliterate** tab first and liberate a model."
             return

     # Sanitize inputs to prevent resource exhaustion
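Both chat_respond hunks above (and the matching ab_chat_respond hunks below) apply one pattern: treat the cached model as unusable if it is gone or its tensors can no longer be touched, then fall back to the checkpoint. A minimal standalone sketch of that check, assuming only torch; `needs_reload` is a hypothetical name, not a function in app.py:

    import torch

    def needs_reload(model, tokenizer) -> bool:
        # ZeroGPU can deallocate the GPU between requests, leaving the cached
        # model garbage-collected (None) or backed by stale/meta tensors.
        if model is None or tokenizer is None:
            return True
        try:
            dev = next(model.parameters()).device
            if torch.cuda.is_available() and dev.type != "cuda":
                model.to("cuda")  # re-attach to the freshly allocated GPU
            return False
        except (StopIteration, RuntimeError):
            # StopIteration: model has no parameters; RuntimeError: meta or
            # stale tensors that cannot be moved. Reload from checkpoint.
            return True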
@@ -2127,6 +2124,49 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
         return

     if not choice or choice not in _bench_configs:
+        # On ZeroGPU, global state may be lost between workers. If the model
+        # is already loaded (e.g. from the same obliteration session), allow
+        # chatting even though the session cache key is gone.
+        with _lock:
+            if _state["status"] == "ready" and _state["model"] is not None:
+                yield (
+                    f"**Ready!** Model already loaded — just type in the chat below.",
+                    get_chat_header(),
+                )
+                return
+            # Check if we can reload from a checkpoint on disk
+            checkpoint = _state.get("output_dir")
+            if checkpoint and Path(checkpoint).exists():
+                yield (
+                    f"**Loading model** from saved checkpoint...",
+                    "",
+                )
+        # If we have a checkpoint, attempt reload outside the lock
+        checkpoint = _state.get("output_dir")
+        if checkpoint and Path(checkpoint).exists():
+            is_preset = (_state.get("model_name") or "") in MODELS
+            try:
+                model_loaded = AutoModelForCausalLM.from_pretrained(
+                    checkpoint, device_map="auto", torch_dtype=torch.float16,
+                    trust_remote_code=is_preset,
+                )
+                tokenizer_loaded = AutoTokenizer.from_pretrained(
+                    checkpoint, trust_remote_code=is_preset,
+                )
+                if tokenizer_loaded.pad_token is None:
+                    tokenizer_loaded.pad_token = tokenizer_loaded.eos_token
+                with _lock:
+                    _state["model"] = model_loaded
+                    _state["tokenizer"] = tokenizer_loaded
+                    _state["status"] = "ready"
+                yield (
+                    f"**Loaded!** Model reloaded from checkpoint — ready to chat.",
+                    get_chat_header(),
+                )
+                return
+            except Exception as e:
+                yield f"**Error:** Could not reload model: {e}", get_chat_header()
+                return
         yield "**Error:** No benchmark result selected. Pick a model from the dropdown first.", ""
         return

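The reload block added to load_bench_into_chat is worth reading as its own unit: state is inspected under `_lock`, the slow `from_pretrained` calls run outside the lock, and the result is published back under it, so a multi-gigabyte load does not block other handlers on the lock. A minimal sketch of that path under the same assumptions as the hunk (fp16 weights, accelerate-style `device_map="auto"`); `reload_from_checkpoint` is a hypothetical helper name, not part of app.py:

    from pathlib import Path

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def reload_from_checkpoint(checkpoint: str, trust_remote_code: bool = False):
        # Mirrors the hunk above: half-precision weights, automatic placement.
        if not checkpoint or not Path(checkpoint).exists():
            raise FileNotFoundError(f"no checkpoint at {checkpoint!r}")
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=trust_remote_code,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            checkpoint, trust_remote_code=trust_remote_code,
        )
        if tokenizer.pad_token is None:
            # Many causal-LM tokenizers ship without a pad token; fall back
            # to EOS so padded generation does not fail.
            tokenizer.pad_token = tokenizer.eos_token
        return model, tokenizer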
@@ -2320,27 +2360,18 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
     tokenizer = _state["tokenizer"]
     model_name = _state["model_name"]

-
-
-
-
-
-
-
-
-
-
-    # ZeroGPU safety: ensure model is on GPU if available.
-    # If tensors are stale from a prior GPU context, reload from checkpoint.
-    _needs_reload = False
-    try:
-        dev = next(abliterated_model.parameters()).device
-        if torch.cuda.is_available() and dev.type != "cuda":
-            abliterated_model.to("cuda")
-    except (StopIteration, RuntimeError):
-        _needs_reload = True
+    # ZeroGPU safety: detect whether we need to reload from checkpoint.
+    # Model may be None (garbage-collected after GPU deallocation) or stale.
+    _needs_reload = abliterated_model is None or tokenizer is None
+    if not _needs_reload:
+        try:
+            dev = next(abliterated_model.parameters()).device
+            if torch.cuda.is_available() and dev.type != "cuda":
+                abliterated_model.to("cuda")
+        except (StopIteration, RuntimeError):
+            _needs_reload = True

-    if _needs_reload
+    if _needs_reload:
         checkpoint = _state.get("output_dir")
         if checkpoint and Path(checkpoint).exists():
             try:
@@ -2361,8 +2392,19 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
                 with _lock:
                     _state["model"] = abliterated_model
                     _state["tokenizer"] = tokenizer
+                    _state["status"] = "ready"
             except Exception:
                 pass  # Fall through — will fail at generation with a clear error
+        else:
+            _no_model_msg = "No abliterated model loaded. Obliterate a model first."
+            yield (history_left + [{"role": "user", "content": message},
+                                   {"role": "assistant", "content": _no_model_msg}],
+                   history_right + [{"role": "user", "content": message},
+                                    {"role": "assistant", "content": _no_model_msg}],
+                   "Load a model first.",
+                   "#### Original (Pre-Abliteration)",
+                   "#### Abliterated")
+            return

     # Build header strings showing model name on each side
     header_left = f"#### Original (Pre-Abliteration)\n`{model_name}`"
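The new else branch builds the same two-message update for each pane by hand. A hypothetical helper (not in app.py) would make the five-tuple yield easier to read; this sketch assumes Gradio's messages-format history, i.e. a list of {"role": ..., "content": ...} dicts:

    def append_exchange(history: list[dict], user_msg: str,
                        assistant_msg: str) -> list[dict]:
        # Return a new messages-format history with one user/assistant
        # exchange appended; used identically for both chat panes.
        return history + [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ]

With it, the branch would yield append_exchange(history_left, message, _no_model_msg) and append_exchange(history_right, message, _no_model_msg) for the two panes.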