Spaces:

Qluon
/

lbw-guard-direct-runner

Running

App Files Community

Radianis committed 1 day ago

Commit

ac62897

1 Parent(s): ff99487

Add notebook-based Easy and Ablation runners

Browse files

Files changed (4) hide show

README.md +12 -4
_demo_runtime.py +0 -1441
app.py +1222 -242
requirements.txt +5 -4

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: LBW Guard Direct Runner
 emoji: 🚀
 colorFrom: green
 colorTo: blue
@@ -8,6 +8,7 @@ python_version: "3.12"
 app_file: app.py
 suggested_hardware: t4-medium
 models:
   - Qwen/Qwen2.5-0.5B
 datasets:
   - Salesforce/wikitext
@@ -22,10 +23,17 @@ Copyright (c) Qluon Inc. All rights reserved.
 Provided for Learn-By-Wire Guard evaluation and customer testing under the applicable Qluon license terms.
-# LBW Guard Direct Runner
-This Space runs a compact AdamW vs `lbw_guard` WikiText LoRA smoke test directly on Hugging Face Spaces.
-Use GPU hardware for meaningful runtime. CPU can load the app, but model training may be slow or fail on memory.
 The app writes run artifacts to the Space working directory. Add persistent storage if you need outputs to survive Space restarts.

 ---
+title: LBW Guard Colab Tests
 emoji: 🚀
 colorFrom: green
 colorTo: blue
 app_file: app.py
 suggested_hardware: t4-medium
 models:
+  - TinyLlama/TinyLlama_v1.1
   - Qwen/Qwen2.5-0.5B
 datasets:
   - Salesforce/wikitext
 Provided for Learn-By-Wire Guard evaluation and customer testing under the applicable Qluon license terms.
+# LBW Guard Colab Tests
+This private Space runs notebook-faithful Hugging Face versions of:
+- `LBW_Guard_Easy_Test_COLAB.ipynb`
+- `LBW_Guard_Ablation_Test_COLAB.ipynb`
+It installs `lbw-guard` from PyPI and does not vendor the local `lbw/` source folder.
+Paper: https://arxiv.org/abs/2605.19008
+Use GPU hardware for meaningful runtime. CPU can load the app, but training is intentionally capped to tiny smoke settings.
 The app writes run artifacts to the Space working directory. Add persistent storage if you need outputs to survive Space restarts.

_demo_runtime.py DELETED Viewed

@@ -1,1441 +0,0 @@
-#!/usr/bin/env python3
-"""Standalone customer demo runtime decoupled from the internal benchmark harness."""
-from __future__ import annotations
-import importlib.util
-import json
-import math
-import os
-import random
-import shutil
-import statistics
-import subprocess
-import sys
-import tempfile
-import time
-import warnings
-from array import array
-from collections import Counter, deque
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence
-AUTOMATION_DIR = Path(__file__).resolve().parent
-TEST_ROOT = AUTOMATION_DIR.parent
-LBW_ROOT = TEST_ROOT.parent
-_hf_home = os.environ.setdefault("HF_HOME", str((LBW_ROOT / ".hf_cache").resolve()))
-os.environ.setdefault("HF_DATASETS_CACHE", str((Path(_hf_home) / "datasets").resolve()))
-os.environ.setdefault("TRANSFORMERS_CACHE", str((Path(_hf_home) / "transformers").resolve()))
-# Prevent background safetensors conversion threads from calling the HF
-# conversion Space during demo loads. This keeps repo-hosted .bin models
-# usable without noisy thread crashes when the service/network misbehaves.
-os.environ.setdefault("DISABLE_SAFETENSORS_CONVERSION", "1")
-_wandb_home = LBW_ROOT / ".wandb"
-os.environ.setdefault("WANDB_DIR", str(_wandb_home.resolve()))
-os.environ.setdefault("WANDB_CACHE_DIR", str((_wandb_home / "cache").resolve()))
-os.environ.setdefault("WANDB_CONFIG_DIR", str((_wandb_home / "config").resolve()))
-import torch
-from datasets import load_dataset
-from peft import LoraConfig, TaskType, get_peft_model
-from transformers import AutoModelForCausalLM, AutoTokenizer, get_cosine_schedule_with_warmup
-try:
-    import wandb
-except Exception:
-    wandb = None
-try:
-    from lbw import Guard
-except Exception:
-    Guard = None
-@dataclass
-class BenchmarkConfig:
-    model_name: str = "Qwen/Qwen2.5-3B"
-    device: str = "cuda"
-    enable_lora: bool = True
-    lora_r: int = 16
-    lora_alpha: int = 64
-    lora_dropout: float = 0.05
-    lora_target_modules: List[str] = field(
-        default_factory=lambda: [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ]
-    )
-    seq_len: int = 256
-    batch_size: int = 2
-    grad_accum: int = 2
-    max_steps: int = 100
-    warmup_steps: int = 50
-    eval_every: int = 50
-    eval_batches: int = 50
-    schedule_mode: str = "all_cosine"
-    max_chars: int = 4_000_000
-    eval_chars: int = 1_000_000
-    full_wikitext_train: bool = False
-    full_wikitext_eval: bool = False
-    full_validation_ppl: bool = False
-    lr: float = 5e-4
-    weight_decay: float = 0.01
-    betas: tuple[float, float] = (0.9, 0.999)
-    lbw_stats_freq: int = 50
-    lbw_stress_th: float = 1.1  #1.1
-    lbw_spike_th: float = 1.5   #1.5
-    lbw_rec_fast: float = 0.05  #0.01
-    lbw_ema_decay: float = 0.95
-    use_wandb: bool = False
-    use_lbwgov: bool = False
-    print_all_metrics: bool = False
-    lbwgov_experiment_name: str = "LBW-Customer-Demo"
-    output_dir: str = str((AUTOMATION_DIR / "demo_outputs").resolve())
-    enable_benchmarks: bool = False
-    use_lm_eval: bool = False
-    lm_eval_ppl: bool = False
-    lm_eval_ppl_task: str = "wikitext_103_raw"
-    lm_eval_ppl_limit: Optional[float] = None
-    lm_eval_acc: bool = False
-    lm_eval_acc_tasks: str = "mmlu,arc_challenge"
-    lm_eval_acc_limit: Optional[float] = None
-    lm_eval_mmlu_limit: Optional[float] = None
-    lm_eval_arc_challenge_limit: Optional[float] = None
-    lm_eval_mmlu_fewshot: int = 5
-    lm_eval_arc_challenge_fewshot: int = 25
-    lm_eval_batch_size: str = "1"
-@dataclass
-class ChunkedTokens:
-    input_ids: torch.Tensor
-    labels: torch.Tensor
-    split: str = ""
-    char_count: int = 0
-    cap_chars: Optional[int] = None
-def _demo_log(config: Optional["BenchmarkConfig"], message: str) -> None:
-    if config is not None and not bool(getattr(config, "print_all_metrics", False)):
-        return
-    print(f"[DemoRuntime] {message}", flush=True)
-def _safe_float(value: Any) -> Optional[float]:
-    if value is None:
-        return None
-    try:
-        out = float(value)
-    except Exception:
-        return None
-    if math.isnan(out) or math.isinf(out):
-        return None
-    return out
-@dataclass
-class GovernanceMetricConfig:
-    ema_decay: float = 0.95
-    short_window: int = 20
-    long_window: int = 100
-    intervention_eps: float = 1e-3
-    stable_ratio_low: float = 0.90
-    stable_ratio_high: float = 1.10
-    unstable_ratio_high: float = 1.35
-    stagnation_ratio_low: float = 0.70
-    oscillation_flip_high: float = 0.30
-class GovernanceMetricsTracker:
-    def __init__(self, cfg: Optional[GovernanceMetricConfig] = None):
-        self.cfg = cfg or GovernanceMetricConfig()
-        self.grad_rms_history = deque(maxlen=self.cfg.long_window)
-        self.ratio_history = deque(maxlen=self.cfg.long_window)
-        self.regime_history = deque(maxlen=self.cfg.long_window)
-        self.flip_rate_history = deque(maxlen=self.cfg.long_window)
-        self.loss_ema: Optional[float] = None
-        self.grad_norm_ema: Optional[float] = None
-        self.grad_rms_ema: Optional[float] = None
-        self.prev_loss: Optional[float] = None
-        self.prev_prev_loss: Optional[float] = None
-        self.prev_regime: Optional[str] = None
-        self.prev_grad_sign_summary: Optional[tuple[int, int, int]] = None
-        self.total_logged_steps = 0
-        self.intervention_count = 0
-        self.regime_switch_count = 0
-        self.stress_entries = 0
-        self.total_control_energy = 0.0
-        self.max_control_energy = 0.0
-        self.open_recovery_start_step: Optional[int] = None
-        self.completed_recovery_latencies: List[int] = []
-        self.best_eval_loss: Optional[float] = None
-        self.best_eval_perplexity: Optional[float] = None
-    def _safe_float(self, value: Any, default: float = 0.0) -> float:
-        try:
-            out = float(value)
-            if math.isfinite(out):
-                return out
-        except Exception:
-            pass
-        return float(default)
-    def _ema_update(self, old: Optional[float], new: Optional[float]) -> Optional[float]:
-        if new is None:
-            return old
-        if old is None:
-            return float(new)
-        d = self.cfg.ema_decay
-        return float(d * old + (1.0 - d) * float(new))
-    def _std(self, values) -> float:
-        vals = [float(v) for v in values if v is not None and math.isfinite(float(v))]
-        if len(vals) < 2:
-            return 0.0
-        return float(statistics.pstdev(vals))
-    def _mean(self, values) -> float:
-        vals = [float(v) for v in values if v is not None and math.isfinite(float(v))]
-        if not vals:
-            return 0.0
-        return float(sum(vals) / len(vals))
-    def _compute_grad_sign_flip_rate(self, params) -> float:
-        pos = 0
-        neg = 0
-        zero = 0
-        for param in params:
-            if param.grad is None:
-                continue
-            grad = param.grad.detach()
-            if grad.numel() == 0:
-                continue
-            signs = torch.sign(grad)
-            pos += int((signs > 0).sum().item())
-            neg += int((signs < 0).sum().item())
-            zero += int((signs == 0).sum().item())
-        summary = (pos, neg, zero)
-        if self.prev_grad_sign_summary is None:
-            self.prev_grad_sign_summary = summary
-            return 0.0
-        prev_pos, prev_neg, _ = self.prev_grad_sign_summary
-        prev_total = max(prev_pos + prev_neg, 1)
-        cur_total = max(pos + neg, 1)
-        flip_rate = abs((pos / cur_total) - (prev_pos / prev_total))
-        self.prev_grad_sign_summary = summary
-        return float(flip_rate)
-    def classify_regime(self, *, ratio: float, flip_rate: float, loss_velocity: float, scale: float, stress_mode: str) -> str:
-        ratio = self._safe_float(ratio, 1.0)
-        flip_rate = self._safe_float(flip_rate, 0.0)
-        loss_velocity = self._safe_float(loss_velocity, 0.0)
-        scale = self._safe_float(scale, 1.0)
-        stress_mode = str(stress_mode or "unknown").lower()
-        if "stress" in stress_mode or ratio >= self.cfg.unstable_ratio_high:
-            return "unstable"
-        if flip_rate >= self.cfg.oscillation_flip_high:
-            return "oscillatory"
-        if ratio <= self.cfg.stagnation_ratio_low and abs(loss_velocity) < 1e-4:
-            return "stagnation"
-        if (self.cfg.stable_ratio_low <= ratio <= self.cfg.stable_ratio_high) and abs(scale - 1.0) <= 0.05:
-            return "stable"
-        return "transitional"
-    def update_step(
-        self,
-        *,
-        step: int,
-        trainable_params,
-        loss_val: float,
-        grad_norm: float,
-        grad_rms: float,
-        ema_grad_rms: float,
-        ratio: float,
-        scale: float,
-        stress_mode: str,
-        current_lr: float,
-    ) -> Dict[str, float]:
-        self.total_logged_steps += 1
-        loss_val = self._safe_float(loss_val)
-        grad_norm = self._safe_float(grad_norm)
-        grad_rms = self._safe_float(grad_rms)
-        ema_grad_rms = self._safe_float(ema_grad_rms)
-        ratio = self._safe_float(ratio, 1.0)
-        scale = self._safe_float(scale, 1.0)
-        current_lr = self._safe_float(current_lr)
-        self.loss_ema = self._ema_update(self.loss_ema, loss_val)
-        self.grad_norm_ema = self._ema_update(self.grad_norm_ema, grad_norm)
-        self.grad_rms_ema = self._ema_update(self.grad_rms_ema, grad_rms)
-        loss_velocity = 0.0 if self.prev_loss is None else (loss_val - self.prev_loss)
-        loss_acceleration = 0.0 if self.prev_loss is None or self.prev_prev_loss is None else (
-            loss_val - 2.0 * self.prev_loss + self.prev_prev_loss
-        )
-        flip_rate = self._compute_grad_sign_flip_rate(trainable_params)
-        grad_deviation = 0.0
-        if ema_grad_rms > 0:
-            grad_deviation = (grad_rms - ema_grad_rms) / max(ema_grad_rms, 1e-12)
-        control_energy = abs(scale - 1.0)
-        intervention_flag = 1.0 if control_energy > self.cfg.intervention_eps else 0.0
-        if intervention_flag > 0:
-            self.intervention_count += 1
-        self.total_control_energy += control_energy
-        self.max_control_energy = max(self.max_control_energy, control_energy)
-        regime = self.classify_regime(
-            ratio=ratio,
-            flip_rate=flip_rate,
-            loss_velocity=loss_velocity,
-            scale=scale,
-            stress_mode=stress_mode,
-        )
-        if self.prev_regime is not None and regime != self.prev_regime:
-            self.regime_switch_count += 1
-        if regime in {"unstable", "oscillatory"} and self.open_recovery_start_step is None:
-            self.open_recovery_start_step = step
-            self.stress_entries += 1
-        if regime == "stable" and self.open_recovery_start_step is not None:
-            self.completed_recovery_latencies.append(step - self.open_recovery_start_step)
-            self.open_recovery_start_step = None
-        self.grad_rms_history.append(grad_rms)
-        self.ratio_history.append(ratio)
-        self.regime_history.append(regime)
-        self.flip_rate_history.append(flip_rate)
-        short_grad_std = self._std(list(self.grad_rms_history)[-self.cfg.short_window :])
-        long_grad_std = self._std(self.grad_rms_history)
-        grad_variance_reduction = 0.0
-        if long_grad_std > 1e-12:
-            grad_variance_reduction = 1.0 - (short_grad_std / long_grad_std)
-        out = {
-            "obs/grad_direction_change_rate": flip_rate,
-            "obs/loss_velocity": loss_velocity,
-            "obs/loss_acceleration": loss_acceleration,
-            "obs/update_magnitude_proxy": scale * current_lr,
-            "state/grad_ratio": ratio,
-            "state/grad_deviation_score": grad_deviation,
-            "state/regime_stable": 1.0 if regime == "stable" else 0.0,
-            "state/regime_unstable": 1.0 if regime == "unstable" else 0.0,
-            "state/regime_oscillatory": 1.0 if regime == "oscillatory" else 0.0,
-            "state/regime_stagnation": 1.0 if regime == "stagnation" else 0.0,
-            "state/regime_transitional": 1.0 if regime == "transitional" else 0.0,
-            "control/action_strength": control_energy,
-            "control/intervention_flag": intervention_flag,
-            "loop/intervention_rate": self.intervention_count / max(self.total_logged_steps, 1),
-            "loop/regime_switch_count": float(self.regime_switch_count),
-            "loop/avg_control_energy": self.total_control_energy / max(self.total_logged_steps, 1),
-            "loop/max_control_energy": self.max_control_energy,
-            "effect/grad_variance_reduction": grad_variance_reduction,
-            "effect/recovery_latency_mean_steps": self._mean(self.completed_recovery_latencies),
-            "effect/recovery_events": float(len(self.completed_recovery_latencies)),
-        }
-        self.prev_prev_loss = self.prev_loss
-        self.prev_loss = loss_val
-        self.prev_regime = regime
-        return out
-    def update_eval(
-        self,
-        *,
-        eval_loss: Optional[float] = None,
-        eval_perplexity: Optional[float] = None,
-        avg_tps_wall: Optional[float] = None,
-    ) -> Dict[str, float]:
-        out: Dict[str, float] = {}
-        if eval_loss is not None:
-            if self.best_eval_loss is None or eval_loss < self.best_eval_loss:
-                self.best_eval_loss = float(eval_loss)
-            out["effect/best_eval_loss"] = float(self.best_eval_loss)
-            out["effect/eval_loss_gap_to_best"] = float(eval_loss - self.best_eval_loss)
-        if eval_perplexity is not None:
-            if self.best_eval_perplexity is None or eval_perplexity < self.best_eval_perplexity:
-                self.best_eval_perplexity = float(eval_perplexity)
-            out["effect/best_eval_perplexity"] = float(self.best_eval_perplexity)
-            out["effect/eval_perplexity_gap_to_best"] = float(eval_perplexity - self.best_eval_perplexity)
-        if avg_tps_wall is not None:
-            out["effect/efficiency_wall_tps"] = float(avg_tps_wall)
-        return out
-    def snapshot(self) -> Dict[str, Any]:
-        return {
-            "total_logged_steps": self.total_logged_steps,
-            "intervention_count": self.intervention_count,
-            "regime_switch_count": self.regime_switch_count,
-            "stress_entries": self.stress_entries,
-            "avg_control_energy": self.total_control_energy / max(self.total_logged_steps, 1),
-            "max_control_energy": self.max_control_energy,
-            "completed_recovery_latencies": list(self.completed_recovery_latencies),
-            "recent_regimes": list(self.regime_history),
-            "best_eval_loss": self.best_eval_loss,
-            "best_eval_perplexity": self.best_eval_perplexity,
-        }
-def _wants_cuda(device: Optional[str] = None) -> bool:
-    return str(device or "").strip().lower().startswith("cuda")
-def set_seed(seed: int, device: Optional[str] = None):
-    random.seed(seed)
-    torch.manual_seed(seed)
-    if _wants_cuda(device) and torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-def normalize_optimizer_name(name: str) -> str:
-    aliases = {
-        "guard": "lbw_guard",
-        "lbw": "lbw_guard",
-        "lbw-guard": "lbw_guard",
-        "adam": "adamw",
-    }
-    key = str(name or "").strip().lower()
-    return aliases.get(key, key)
-def check_optimizer_support(name: str, device: Optional[str] = None) -> tuple[bool, str]:
-    normalized = normalize_optimizer_name(name)
-    if normalized not in {"adamw", "lbw_guard"}:
-        return False, "Standalone customer demo runtime supports only adamw and lbw_guard."
-    if normalized == "lbw_guard" and Guard is None:
-        return False, "LBW_Guard package not found. Install the standard LBW_Guard package in the active Python environment."
-    if normalized == "lbw_guard" and _wants_cuda(device) and torch.cuda.is_available() and int(torch.cuda.device_count()) > 1:
-        return False, "lbw_guard supports at most 1 visible GPU. Restrict CUDA_VISIBLE_DEVICES to one GPU."
-    return True, ""
-def _hf_offline_mode() -> bool:
-    return (
-        os.environ.get("HF_HUB_OFFLINE", "").lower() in {"1", "true", "yes"}
-        or os.environ.get("TRANSFORMERS_OFFLINE", "").lower() in {"1", "true", "yes"}
-    )
-def _hf_pretrained_kwargs() -> Dict[str, Any]:
-    kwargs: Dict[str, Any] = {"trust_remote_code": True}
-    if _hf_offline_mode():
-        kwargs["local_files_only"] = True
-    return kwargs
-def _resolve_model_dtype(device: torch.device):
-    return torch.bfloat16 if device.type == "cuda" else torch.float32
-def _resolve_model_device_map(device: torch.device):
-    if device.type != "cuda":
-        return None
-    return {"": (device.index if device.index is not None else 0)}
-def _load_tokenizer_and_model(model_name: str, device: torch.device):
-    hf_kwargs = _hf_pretrained_kwargs()
-    tokenizer = AutoTokenizer.from_pretrained(model_name, **hf_kwargs)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
-    model_kwargs: Dict[str, Any] = {
-        "torch_dtype": _resolve_model_dtype(device),
-        **hf_kwargs,
-    }
-    device_map = _resolve_model_device_map(device)
-    if device_map is not None:
-        model_kwargs["device_map"] = device_map
-    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
-    if device_map is None:
-        model.to(device)
-    return tokenizer, model
-def build_wikitext_chunks(
-    tokenizer,
-    seq_len: int,
-    max_chars: Optional[int],
-    split: str,
-    *,
-    config: Optional[BenchmarkConfig] = None,
-) -> ChunkedTokens:
-    cap = None if max_chars is None else int(max_chars)
-    _demo_log(
-        config,
-        f"Preparing WikiText split='{split}'"
-        + (f" with char cap {cap:,}" if cap is not None else " with full split"),
-    )
-    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
-    token_buf = array("I")
-    chars_used = 0
-    first_piece = True
-    rows_used = 0
-    next_report_chars = 500_000 if cap is None else max(250_000, cap // 4)
-    for row in ds:
-        text = str(row.get("text", "") or "")
-        if not text.strip():
-            continue
-        piece = text if first_piece else (" " + text)
-        if cap is not None:
-            remain = cap - chars_used
-            if remain <= 0:
-                break
-            if len(piece) > remain:
-                piece = piece[:remain]
-        chars_used += len(piece)
-        first_piece = False
-        rows_used += 1
-        ids_piece = tokenizer(piece, add_special_tokens=False)["input_ids"]
-        if ids_piece:
-            token_buf.extend(ids_piece)
-        if config is not None and bool(getattr(config, "print_all_metrics", False)) and chars_used >= next_report_chars:
-            target = f"/{cap:,}" if cap is not None else ""
-            _demo_log(config, f"Tokenizing split='{split}': {chars_used:,}{target} chars")
-            next_report_chars += 500_000 if cap is None else max(250_000, cap // 4)
-        if cap is not None and chars_used >= cap:
-            break
-    if len(token_buf) == 0:
-        raise RuntimeError(f"No tokens built for split '{split}'.")
-    ids = torch.tensor(token_buf, dtype=torch.long)
-    n = ids.numel() // seq_len
-    if n <= 0:
-        raise RuntimeError(f"Not enough tokens for seq_len {seq_len}. Increase max_chars.")
-    ids = ids[: n * seq_len].view(n, seq_len).contiguous()
-    _demo_log(
-        config,
-        f"Prepared split='{split}': {chars_used:,} chars across {rows_used:,} rows -> {ids.size(0):,} sequences of len {seq_len}",
-    )
-    return ChunkedTokens(input_ids=ids, labels=ids, split=split, char_count=int(chars_used), cap_chars=cap)
-def batch_iter(chunks: ChunkedTokens, batch_size: int, device: torch.device):
-    x, y = chunks.input_ids, chunks.labels
-    i, n = 0, x.size(0)
-    while True:
-        if i + batch_size > n:
-            i = 0
-        yield (
-            x[i : i + batch_size].to(device, non_blocking=True),
-            y[i : i + batch_size].to(device, non_blocking=True),
-        )
-        i += batch_size
-def evaluate_perplexity(model, eval_chunks: ChunkedTokens, config: BenchmarkConfig, device: torch.device, *, full_pass: bool = False):
-    model.eval()
-    total_nll = 0.0
-    total_tokens = 0
-    with torch.no_grad():
-        if full_pass:
-            x, y = eval_chunks.input_ids, eval_chunks.labels
-            n = x.size(0)
-            for i in range(0, n, config.batch_size):
-                ex = x[i : i + config.batch_size].to(device, non_blocking=True)
-                ey = y[i : i + config.batch_size].to(device, non_blocking=True)
-                with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=(device.type == "cuda")):
-                    out = model(input_ids=ex, labels=ey)
-                tok = int(ey[:, 1:].numel())
-                if tok > 0 and math.isfinite(float(out.loss.item())):
-                    total_nll += float(out.loss.item()) * float(tok)
-                    total_tokens += tok
-        else:
-            eval_iter = batch_iter(eval_chunks, config.batch_size, device)
-            for _ in range(max(1, int(config.eval_batches))):
-                ex, ey = next(eval_iter)
-                with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=(device.type == "cuda")):
-                    out = model(input_ids=ex, labels=ey)
-                tok = int(ey[:, 1:].numel())
-                if tok > 0 and math.isfinite(float(out.loss.item())):
-                    total_nll += float(out.loss.item()) * float(tok)
-                    total_tokens += tok
-    if total_tokens <= 0:
-        raise RuntimeError("Validation produced no batches.")
-    avg_eval_loss = float(total_nll / float(total_tokens))
-    return avg_eval_loss, math.exp(avg_eval_loss)
-def _compute_grad_rms(params) -> float:
-    sq_sum = 0.0
-    count = 0
-    for param in params:
-        if param.grad is None:
-            continue
-        grad = param.grad.detach()
-        sq_sum += float(torch.sum(grad.float() * grad.float()).item())
-        count += int(grad.numel())
-    if count <= 0:
-        return 0.0
-    return float(math.sqrt(sq_sum / float(count)))
-def get_optimizer(name: str, model, config: BenchmarkConfig):
-    params = [p for p in model.parameters() if p.requires_grad]
-    normalized = normalize_optimizer_name(name)
-    if normalized == "adamw":
-        return torch.optim.AdamW(
-            params,
-            lr=config.lr,
-            betas=config.betas,
-            weight_decay=config.weight_decay,
-        )
-    if normalized == "lbw_guard":
-        if Guard is None:
-            raise RuntimeError("LBW Guard package not available for lbw_guard.")
-        return Guard(
-            params,
-            lr=config.lr,
-            betas=config.betas,
-            weight_decay=config.weight_decay,
-            mode="eval",
-            auto_enabled=True,
-            stats_freq=int(config.lbw_stats_freq),
-            stress_threshold=config.lbw_stress_th,
-            spike_threshold=config.lbw_spike_th,
-            recovery_fast=config.lbw_rec_fast,
-            ema_decay=config.lbw_ema_decay,
-            use_max_rms=True,
-        )
-    raise ValueError(f"Unsupported optimizer for standalone demo runtime: {name}")
-class _SchedulerProxyOptimizer(torch.optim.Optimizer):
-    def __init__(self, param_groups: List[Dict[str, Any]]):
-        proxy_groups = []
-        for group in list(param_groups or []):
-            proxy_groups.append({"params": list(group.get("params", []) or []), "lr": float(group.get("lr", 0.0))})
-        super().__init__(proxy_groups, defaults={})
-        self.param_groups = param_groups
-    def step(self, closure=None):
-        del closure
-        return None
-def _build_scheduler_proxy_for_optimizer_like(opt: Any) -> Optional[torch.optim.Optimizer]:
-    param_groups = getattr(opt, "param_groups", None)
-    if not isinstance(param_groups, list) or not param_groups:
-        return None
-    try:
-        return _SchedulerProxyOptimizer(param_groups)
-    except Exception:
-        return None
-def _pick_lm_eval_perplexity_metrics(task_result: Dict[str, Any]) -> Dict[str, float]:
-    out: Dict[str, float] = {}
-    if not isinstance(task_result, dict):
-        return out
-    key_map = (
-        ("word_perplexity,none", "word_perplexity"),
-        ("word_perplexity", "word_perplexity"),
-        ("perplexity,none", "perplexity"),
-        ("perplexity", "perplexity"),
-        ("bits_per_byte,none", "bits_per_byte"),
-        ("bits_per_byte", "bits_per_byte"),
-    )
-    for src, dst in key_map:
-        value = task_result.get(src, None)
-        if isinstance(value, (int, float)):
-            out[dst] = float(value)
-    return out
-def _pick_lm_eval_accuracy_metrics(task_result: Dict[str, Any]) -> Dict[str, float]:
-    out: Dict[str, float] = {}
-    if not isinstance(task_result, dict):
-        return out
-    key_map = (
-        ("acc_norm,none", "acc_norm"),
-        ("acc_norm", "acc_norm"),
-        ("acc,none", "acc"),
-        ("acc", "acc"),
-        ("exact_match,none", "exact_match"),
-        ("exact_match", "exact_match"),
-    )
-    for src, dst in key_map:
-        value = task_result.get(src, None)
-        if isinstance(value, (int, float)):
-            out[dst] = float(value)
-    return out
-def _normalize_lm_eval_task_name(task_name: str) -> str:
-    task = str(task_name or "").strip()
-    if not task:
-        return task
-    normalized = task.lower().replace("-", "_")
-    aliases = {
-        "wikitext_103_raw": "wikitext_103_raw",
-        "wikitext103_raw": "wikitext_103_raw",
-        "wikitext103raw": "wikitext_103_raw",
-        "wikitext_103": "wikitext_103_raw",
-        "wikitext103": "wikitext_103_raw",
-        "wikitext_103_raw_v1": "wikitext_103_raw",
-        "wikitext": "wikitext",
-        "paloma_wikitext_103": "paloma_wikitext_103",
-        "mmlu": "mmlu",
-        "hendrycks_test": "mmlu",
-        "arc": "arc_challenge",
-        "arc_challenge": "arc_challenge",
-        "arcchallenge": "arc_challenge",
-    }
-    return aliases.get(normalized, normalized)
-def _parse_lm_eval_task_list(raw_tasks: Any) -> List[str]:
-    if raw_tasks is None:
-        return []
-    if isinstance(raw_tasks, str):
-        items = [part.strip() for part in raw_tasks.split(",")]
-    else:
-        items = [str(part).strip() for part in raw_tasks]
-    out: List[str] = []
-    seen = set()
-    for item in items:
-        if not item:
-            continue
-        normalized = _normalize_lm_eval_task_name(item)
-        if normalized and normalized not in seen:
-            seen.add(normalized)
-            out.append(normalized)
-    return out
-def _lm_eval_num_fewshot_for_task(config: BenchmarkConfig, task_name: str) -> int:
-    normalized = _normalize_lm_eval_task_name(task_name)
-    if normalized == "mmlu":
-        return int(getattr(config, "lm_eval_mmlu_fewshot", 5))
-    if normalized == "arc_challenge":
-        return int(getattr(config, "lm_eval_arc_challenge_fewshot", 25))
-    return 0
-def _lm_eval_limit_for_task(config: BenchmarkConfig, task_name: str) -> Optional[float]:
-    normalized = _normalize_lm_eval_task_name(task_name)
-    legacy_limit = getattr(config, "lm_eval_acc_limit", None)
-    if normalized == "mmlu":
-        value = getattr(config, "lm_eval_mmlu_limit", legacy_limit)
-    elif normalized == "arc_challenge":
-        value = getattr(config, "lm_eval_arc_challenge_limit", None)
-    else:
-        value = legacy_limit
-    if value is None:
-        return None
-    return float(value)
-def _load_lm_eval_results(output_path: Path) -> Dict[str, Any]:
-    candidates: List[Path] = []
-    if output_path.is_file():
-        candidates = [output_path]
-    elif output_path.is_dir():
-        candidates = sorted([p for p in output_path.rglob("*.json") if p.is_file()], key=lambda p: p.stat().st_mtime, reverse=True)
-    for cand in candidates:
-        try:
-            payload = json.loads(cand.read_text())
-            if isinstance(payload, dict) and isinstance(payload.get("results", None), dict):
-                return payload["results"]
-        except Exception:
-            continue
-    raise RuntimeError(f"Unable to parse lm_eval output from {output_path}")
-def _find_lm_eval_task_result(raw_results: Dict[str, Any], task_name: str) -> Dict[str, Any]:
-    task_key = task_name if task_name in raw_results else next(
-        (k for k in raw_results if k == task_name or k.startswith(task_name)),
-        None,
-    )
-    if task_key is None:
-        raise RuntimeError(f"lm_eval returned no results for '{task_name}'.")
-    task_result = raw_results.get(task_key, {})
-    if not isinstance(task_result, dict):
-        raise RuntimeError(f"lm_eval returned malformed results for '{task_name}'.")
-    return task_result
-def _resolve_lm_eval_command() -> List[str]:
-    lm_eval_bin = shutil.which("lm_eval")
-    if lm_eval_bin:
-        return [lm_eval_bin, "run"]
-    if importlib.util.find_spec("lm_eval") is not None:
-        return [sys.executable, "-m", "lm_eval", "run"]
-    raise RuntimeError("lm_eval not found. Install EleutherAI lm-evaluation-harness in your venv.")
-def _resolve_lm_eval_include_path() -> Optional[str]:
-    paths: List[str] = []
-    for local_tasks in (AUTOMATION_DIR / "lm_eval_tasks", TEST_ROOT / "lm_eval_tasks"):
-        if local_tasks.exists():
-            paths.append(str(local_tasks))
-    env_paths = str(os.environ.get("LM_EVAL_INCLUDE_PATH", "") or "").strip()
-    if env_paths:
-        for path in env_paths.split(":"):
-            path = path.strip()
-            if path:
-                paths.append(path)
-    return ":".join(paths) if paths else None
-def _run_lm_eval_with_retry(cmd: List[str], batch_size_value: str) -> None:
-    try:
-        subprocess.run(cmd, check=True)
-        return
-    except subprocess.CalledProcessError as exc:
-        bs = str(batch_size_value or "").strip().lower()
-        is_auto = bs.startswith("auto")
-        if (not is_auto) or ("out of memory" not in str(exc).lower() and "oom" not in str(exc).lower()):
-            raise
-        retry_cmd = list(cmd)
-        idx = retry_cmd.index("--batch_size")
-        retry_cmd[idx + 1] = "1"
-        subprocess.run(retry_cmd, check=True)
-def _prepare_adapter_dir(*, model=None, tokenizer=None) -> tuple[Path, tempfile.TemporaryDirectory]:
-    if model is None or tokenizer is None:
-        raise RuntimeError("model and tokenizer are required for lm_eval PPL.")
-    tmp_ctx = tempfile.TemporaryDirectory(prefix="lbw_demo_lmeval_")
-    out_dir = Path(tmp_ctx.name) / "peft_adapter"
-    model.save_pretrained(str(out_dir))
-    tokenizer.save_pretrained(str(out_dir))
-    return out_dir, tmp_ctx
-def _run_lm_eval_tasks_with_adapter(
-    adapter_path: Path,
-    *,
-    config: BenchmarkConfig,
-    device: torch.device,
-    tasks: Sequence[str],
-    limit: Optional[float],
-    output_name: str,
-    num_fewshot: int = 0,
-) -> Dict[str, Any]:
-    lm_eval_cmd = _resolve_lm_eval_command()
-    include_path = _resolve_lm_eval_include_path()
-    normalized_tasks = [_normalize_lm_eval_task_name(task) for task in tasks if str(task).strip()]
-    if not normalized_tasks:
-        raise RuntimeError("No lm_eval tasks were provided.")
-    out_dir = adapter_path.parent / output_name
-    out_dir.mkdir(parents=True, exist_ok=True)
-    lm_eval_dtype = "bfloat16" if device.type == "cuda" else "float32"
-    model_args = [
-        f"pretrained={config.model_name}",
-        f"peft={adapter_path}",
-        f"dtype={lm_eval_dtype}",
-        "trust_remote_code=True",
-    ]
-    batch_size_value = str(getattr(config, "lm_eval_batch_size", "1"))
-    cmd = [
-        *lm_eval_cmd,
-        "--model",
-        "hf",
-        "--model_args",
-        ",".join(model_args),
-        "--tasks",
-        ",".join(normalized_tasks),
-        "--num_fewshot",
-        str(int(num_fewshot)),
-        "--batch_size",
-        batch_size_value,
-        "--device",
-        ("cuda" if device.type == "cuda" else "cpu"),
-        "--output_path",
-        str(out_dir),
-    ]
-    if include_path is not None:
-        cmd.extend(["--include_path", include_path])
-    if limit is not None:
-        cmd.extend(["--limit", str(float(limit))])
-    _run_lm_eval_with_retry(cmd, batch_size_value)
-    return _load_lm_eval_results(out_dir)
-def _summarize_lm_eval_status(
-    requested: Dict[str, bool],
-    statuses: Dict[str, str],
-    errors: Dict[str, Optional[str]],
-) -> tuple[str, Optional[str]]:
-    active = [name for name, enabled in requested.items() if enabled]
-    if not active:
-        return "disabled", None
-    active_statuses = [str(statuses.get(name, "disabled")) for name in active]
-    joined_errors = "; ".join(
-        f"{name}: {errors[name]}"
-        for name in active
-        if str(errors.get(name) or "").strip()
-    ) or None
-    if all(status == "ok" for status in active_statuses):
-        return "ok", None
-    if any(status == "ok" for status in active_statuses):
-        return "partial", joined_errors
-    return "skipped", joined_errors
-def run_lm_eval_suite(model, tokenizer, config: BenchmarkConfig, device: torch.device) -> Dict[str, Any]:
-    """Run the configured lm_eval perplexity and accuracy checks on ``model``.
-
-    The model and tokenizer are saved to a temporary adapter directory, each
-    requested section is run through ``_run_lm_eval_tasks_with_adapter``, and
-    the temporary directory is removed before returning. A failure inside
-    either section is recorded as a "skipped" status with an error string
-    instead of being raised.
-
-    Args:
-        model: Model saved via ``_prepare_adapter_dir``.
-        tokenizer: Tokenizer saved alongside the model.
-        config: Read with ``getattr`` for the ``lm_eval_*`` options.
-        device: Forwarded to the lm_eval command builder.
-
-    Returns:
-        A flat dict holding any collected ``lm_eval/...`` and
-        ``final_eval/...`` metrics plus ``lm_eval_status``/``lm_eval_error``
-        and their per-section ``ppl``/``acc`` counterparts.
-
-    Raises:
-        RuntimeError: From ``_prepare_adapter_dir`` when ``model`` or
-            ``tokenizer`` is None.
-    """
-    requested = {
-        "ppl": bool(getattr(config, "lm_eval_ppl", False)),
-        "acc": bool(getattr(config, "lm_eval_acc", False)),
-    }
-    statuses: Dict[str, str] = {
-        "ppl": "disabled",
-        "acc": "disabled",
-    }
-    errors: Dict[str, Optional[str]] = {
-        "ppl": None,
-        "acc": None,
-    }
-    out: Dict[str, Any] = {}
-    # The adapter is saved up front, even when neither section is requested.
-    adapter_path, tmp_ctx = _prepare_adapter_dir(model=model, tokenizer=tokenizer)
-    try:
-        if requested["ppl"]:
-            statuses["ppl"] = "requested"
-            ppl_task = _normalize_lm_eval_task_name(getattr(config, "lm_eval_ppl_task", "wikitext_103_raw"))
-            try:
-                raw_results = _run_lm_eval_tasks_with_adapter(
-                    adapter_path,
-                    config=config,
-                    device=device,
-                    tasks=[ppl_task],
-                    limit=getattr(config, "lm_eval_ppl_limit", None),
-                    output_name="lm_eval_ppl_out",
-                    num_fewshot=0,
-                )
-                ppl_metrics = _pick_lm_eval_perplexity_metrics(_find_lm_eval_task_result(raw_results, ppl_task))
-                if not ppl_metrics:
-                    raise RuntimeError(f"No perplexity-like metrics found for '{ppl_task}'.")
-                # word_perplexity is the preferred headline value; plain
-                # perplexity only fills it in (setdefault) when it is absent.
-                if "word_perplexity" in ppl_metrics:
-                    out["lm_eval/final_word_perplexity"] = float(ppl_metrics["word_perplexity"])
-                    out["final_eval/perplexity_lm_eval"] = float(ppl_metrics["word_perplexity"])
-                if "perplexity" in ppl_metrics:
-                    out["lm_eval/final_perplexity"] = float(ppl_metrics["perplexity"])
-                    out.setdefault("final_eval/perplexity_lm_eval", float(ppl_metrics["perplexity"]))
-                if "bits_per_byte" in ppl_metrics:
-                    out["lm_eval/final_bits_per_byte"] = float(ppl_metrics["bits_per_byte"])
-                statuses["ppl"] = "ok"
-            except Exception as exc:
-                # Best effort: a perplexity failure must not block the accuracy section.
-                statuses["ppl"] = "skipped"
-                errors["ppl"] = str(exc).strip() or type(exc).__name__
-        if requested["acc"]:
-            statuses["acc"] = "requested"
-            acc_tasks = _parse_lm_eval_task_list(getattr(config, "lm_eval_acc_tasks", "mmlu,arc_challenge"))
-            if not acc_tasks:
-                statuses["acc"] = "skipped"
-                errors["acc"] = "No lm_eval accuracy tasks configured."
-            else:
-                try:
-                    # Metrics are staged in acc_out and merged only after every
-                    # task succeeded, so one failing task discards the
-                    # accuracy metrics of the tasks that ran before it.
-                    acc_out: Dict[str, Any] = {}
-                    for task_name in acc_tasks:
-                        raw_results = _run_lm_eval_tasks_with_adapter(
-                            adapter_path,
-                            config=config,
-                            device=device,
-                            tasks=[task_name],
-                            limit=_lm_eval_limit_for_task(config, task_name),
-                            output_name=f"lm_eval_acc_{task_name}_out",
-                            num_fewshot=_lm_eval_num_fewshot_for_task(config, task_name),
-                        )
-                        metrics = _pick_lm_eval_accuracy_metrics(_find_lm_eval_task_result(raw_results, task_name))
-                        # Only mmlu and arc_challenge contribute metrics; any
-                        # other task is executed but its result is not recorded.
-                        if task_name == "mmlu":
-                            value = metrics.get("acc")
-                            if value is None:
-                                raise RuntimeError("No `acc` metric found for `mmlu`.")
-                            acc_out["lm_eval/final_mmlu_acc"] = float(value)
-                            acc_out["final_eval/mmlu_acc_lm_eval"] = float(value)
-                        elif task_name == "arc_challenge":
-                            # Preference order: acc_norm, then acc, then exact_match.
-                            value = metrics.get("acc_norm")
-                            if value is None:
-                                value = metrics.get("acc")
-                            if value is None:
-                                value = metrics.get("exact_match")
-                            if value is None:
-                                raise RuntimeError("No accuracy-like metric found for `arc_challenge`.")
-                            acc_out["lm_eval/final_arc_challenge_acc"] = float(value)
-                            acc_out["final_eval/arc_challenge_acc_lm_eval"] = float(value)
-                    out.update(acc_out)
-                    statuses["acc"] = "ok"
-                except Exception as exc:
-                    statuses["acc"] = "skipped"
-                    errors["acc"] = str(exc).strip() or type(exc).__name__
-        overall_status, overall_error = _summarize_lm_eval_status(requested, statuses, errors)
-        out.update(
-            {
-                "lm_eval_status": overall_status,
-                "lm_eval_error": overall_error,
-                "lm_eval_ppl_status": statuses["ppl"],
-                "lm_eval_ppl_error": errors["ppl"],
-                "lm_eval_acc_status": statuses["acc"],
-                "lm_eval_acc_error": errors["acc"],
-            }
-        )
-        return out
-    finally:
-        # Always remove the temporary adapter directory (and the lm_eval
-        # output directories created next to it).
-        tmp_ctx.cleanup()
-def _current_learning_rate(opt: Any, scheduler: Optional[Any], config: BenchmarkConfig) -> float:
-    if scheduler is not None:
-        try:
-            return float(scheduler.get_last_lr()[0])
-        except Exception:
-            pass
-    try:
-        return float(opt.param_groups[0]["lr"])
-    except Exception:
-        return float(config.lr)
-def _optimizer_group_learning_rate(opt: Any, config: BenchmarkConfig) -> float:
-    try:
-        return float(opt.param_groups[0]["lr"])
-    except Exception:
-        return float(config.lr)
-def _build_scheduler(opt: Any, optimizer_name: str, config: BenchmarkConfig):
-    schedule_mode = str(getattr(config, "schedule_mode", "all_cosine") or "all_cosine").strip().lower()
-    if schedule_mode not in {"native", "all_cosine", "all_constant"}:
-        schedule_mode = "all_cosine"
-    if isinstance(opt, torch.optim.Optimizer):
-        target = opt
-    else:
-        target = _build_scheduler_proxy_for_optimizer_like(opt)
-    if target is None:
-        return None
-    if schedule_mode == "all_constant":
-        return torch.optim.lr_scheduler.LambdaLR(target, lr_lambda=lambda step: 1.0)
-    if schedule_mode == "all_cosine":
-        return get_cosine_schedule_with_warmup(target, num_warmup_steps=config.warmup_steps, num_training_steps=config.max_steps)
-    if optimizer_name == "lbw_guard":
-        return torch.optim.lr_scheduler.LambdaLR(target, lr_lambda=lambda step: 1.0)
-    return get_cosine_schedule_with_warmup(target, num_warmup_steps=config.warmup_steps, num_training_steps=config.max_steps)
-def _init_wandb_if_enabled(config: BenchmarkConfig, *, group_name: Optional[str], run_name: Optional[str]):
-    if not bool(getattr(config, "use_wandb", False)):
-        return None
-    if wandb is None:
-        print("[W&B] Disabled (wandb not installed)")
-        return None
-    try:
-        wandb.init(
-            project="LBW-Customer-Demo",
-            group=group_name,
-            name=run_name,
-            config=config.__dict__,
-            reinit=True,
-            settings=wandb.Settings(start_method="thread"),
-        )
-        return wandb
-    except Exception as exc:
-        print(f"[W&B] Disabled (wandb.init failed: {exc})")
-        return None
-def train_one_run(
-    optimizer_name: str,
-    config: BenchmarkConfig,
-    *,
-    group_name: Optional[str] = None,
-    run_name: Optional[str] = None,
-    shared_pre_bench_results=None,
-    shared_bench_dataset_bundle=None,
-) -> Dict[str, Any]:
-    """Train one model with the given optimizer on WikiText and report metrics.
-
-    Loads the tokenizer and model, builds train/validation chunks, optionally
-    attaches LoRA adapters, runs ``config.max_steps`` optimizer steps with
-    gradient accumulation and periodic sampled evaluation, then performs a
-    final validation perplexity pass and, when enabled, the lm_eval suite.
-
-    Args:
-        optimizer_name: Optimizer identifier; normalized before use.
-        config: Benchmark settings (model, device, step counts, LoRA, eval
-            and lm_eval options).
-        group_name: Optional W&B group; also echoed in the result.
-        run_name: Optional W&B run name (defaults to the normalized optimizer
-            name for W&B); echoed in the result as given.
-        shared_pre_bench_results: Accepted but unused.
-        shared_bench_dataset_bundle: Accepted but unused.
-
-    Returns:
-        A dict with final eval loss/perplexity, lm_eval outputs and statuses,
-        throughput and timing averages, dataset sizes, the last per-step
-        ``runtime_snapshot``, and governance tracker snapshots.
-
-    Raises:
-        RuntimeError: If the optimizer is not supported on ``config.device``
-            or the model has no trainable parameters.
-    """
-    # Kept in the signature but intentionally discarded here.
-    del shared_pre_bench_results, shared_bench_dataset_bundle
-    normalized = normalize_optimizer_name(optimizer_name)
-    ok, reason = check_optimizer_support(normalized, device=config.device)
-    if not ok:
-        raise RuntimeError(f"{normalized}: {reason}")
-    device = torch.device(config.device)
-    if device.type != "cuda":
-        # Silence the old-driver CUDA initialization warning on non-CUDA runs.
-        warnings.filterwarnings(
-            "ignore",
-            message="CUDA initialization: The NVIDIA driver on your system is too old.*",
-            category=UserWarning,
-        )
-    wb = _init_wandb_if_enabled(config, group_name=group_name, run_name=run_name or normalized)
-    _demo_log(config, f"Loading model and tokenizer: {config.model_name} on {device}")
-    tokenizer, model = _load_tokenizer_and_model(config.model_name, device)
-    _demo_log(config, "Model load complete")
-    # A cap of None is passed when the full split is requested.
-    train_cap = None if config.full_wikitext_train else config.max_chars
-    eval_cap = None if config.full_wikitext_eval else config.eval_chars
-    train_chunks = build_wikitext_chunks(tokenizer, config.seq_len, train_cap, "train", config=config)
-    eval_chunks = build_wikitext_chunks(tokenizer, config.seq_len, eval_cap, "validation", config=config)
-    train_iter = batch_iter(train_chunks, config.batch_size, device)
-    train_sequence_count = int(train_chunks.input_ids.size(0))
-    train_token_count = int(train_chunks.input_ids.numel())
-    eval_sequence_count = int(eval_chunks.input_ids.size(0))
-    eval_token_count = int(eval_chunks.input_ids.numel())
-    # One optimizer step consumes batch_size * grad_accum sequences.
-    sequences_per_optimizer_step = max(int(config.batch_size * config.grad_accum), 1)
-    tokens_per_optimizer_step = max(int(config.batch_size * config.seq_len * config.grad_accum), 1)
-    steps_per_train_pass = int(math.ceil(train_sequence_count / float(sequences_per_optimizer_step)))
-    if bool(getattr(config, "enable_lora", True)):
-        _demo_log(config, "Attaching LoRA adapters")
-        lora_cfg = LoraConfig(
-            r=config.lora_r,
-            lora_alpha=config.lora_alpha,
-            lora_dropout=config.lora_dropout,
-            target_modules=config.lora_target_modules,
-            task_type=TaskType.CAUSAL_LM,
-            bias="none",
-        )
-        model = get_peft_model(model, lora_cfg)
-    else:
-        _demo_log(config, "Training without LoRA adapters")
-    model.train()
-    trainable_params = [p for p in model.parameters() if p.requires_grad]
-    if not trainable_params:
-        raise RuntimeError("No trainable parameters found for training.")
-    _demo_log(config, f"Creating optimizer: {normalized}")
-    opt = get_optimizer(normalized, model, config)
-    scheduler = _build_scheduler(opt, normalized, config)
-    governance_tracker = GovernanceMetricsTracker()
-    _demo_log(config, f"Starting training for {config.max_steps} optimizer steps")
-    train_start = time.time()
-    step_wall_start = train_start
-    step_compute_start = train_start
-    train_losses: List[float] = []
-    pure_tps_history: List[float] = []
-    wall_tps_history: List[float] = []
-    pure_step_time_history: List[float] = []
-    wall_step_time_history: List[float] = []
-    # Initial snapshot; returned unchanged if no optimizer step ever runs.
-    runtime_snapshot: Dict[str, Any] = {
-        "stress_mode": "none",
-        "scale": 1.0,
-        "ratio": 1.0,
-        "grad_rms": 0.0,
-        "scheduled_lr_used": float(config.lr),
-        "scheduled_lr_next": float(config.lr),
-        "effective_lr_main_used": float(config.lr),
-        "effective_lr_weight_decay_used": float(config.lr),
-        "train_sequences": train_sequence_count,
-        "train_tokens": train_token_count,
-        "train_chars": int(train_chunks.char_count),
-        "train_cap_chars": train_chunks.cap_chars,
-        "eval_sequences": eval_sequence_count,
-        "eval_tokens": eval_token_count,
-        "eval_chars": int(eval_chunks.char_count),
-        "eval_cap_chars": eval_chunks.cap_chars,
-        "sequences_per_optimizer_step": sequences_per_optimizer_step,
-        "tokens_per_optimizer_step": tokens_per_optimizer_step,
-        "steps_per_train_pass": steps_per_train_pass,
-        "epochs_completed": 0.0,
-    }
-    global_step = 0
-    accumulation_step = 0
-    while global_step < config.max_steps:
-        xb, yb = next(train_iter)
-        # bfloat16 autocast is only enabled on CUDA.
-        with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=(device.type == "cuda")):
-            outputs = model(input_ids=xb, labels=yb)
-            # Scale so the accumulated gradient averages over micro-batches.
-            loss = outputs.loss / config.grad_accum
-        loss.backward()
-        accumulation_step += 1
-        # Keep accumulating until a full optimizer step's worth of micro-batches.
-        if accumulation_step % config.grad_accum != 0:
-            continue
-        step_number = global_step + 1
-        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
-        grad_norm_value = grad_norm.detach().item() if torch.is_tensor(grad_norm) else float(grad_norm)
-        # For lbw_guard the gradient RMS is read from optimizer state below.
-        grad_rms = 0.0 if normalized == "lbw_guard" else _compute_grad_rms(trainable_params)
-        # Undo the accumulation scaling. NOTE(review): this is the loss of the
-        # last micro-batch only, not the mean over the accumulated micro-batches.
-        loss_val = float(loss.item() * config.grad_accum)
-        # Learning rate in effect for this step, captured before the scheduler advances.
-        scheduled_lr_used = _optimizer_group_learning_rate(opt, config)
-        # NOTE(review): both branches perform the same call.
-        if normalized == "lbw_guard":
-            opt.step()
-        else:
-            opt.step()
-        if scheduler is not None:
-            scheduler.step()
-        opt.zero_grad()
-        # "Pure" time ends here: it excludes the evaluation and logging below.
-        compute_end = time.time()
-        pure_step_time = max(compute_end - step_compute_start, 1e-12)
-        tokens_per_step = int(config.batch_size * config.seq_len * config.grad_accum)
-        pure_tps = tokens_per_step / pure_step_time
-        scheduled_lr_next = _current_learning_rate(opt, scheduler, config)
-        # Neutral defaults, overridden from optimizer state for lbw_guard.
-        scale = 1.0
-        ratio = 1.0
-        ema_grad_rms = grad_rms
-        stress_mode = "none"
-        edition = normalized
-        effective_lr_main_used = scheduled_lr_used
-        effective_lr_weight_decay_used = scheduled_lr_used
-        if normalized == "lbw_guard":
-            # Read the optimizer's "lbw" state entry, tolerating its absence.
-            lbw_state = dict(getattr(opt, "state", {}).get("lbw", {}) or {})
-            scale = float(lbw_state.get("scale", lbw_state.get("lbw_scale", 1.0)))
-            ratio = float(lbw_state.get("ratio", 1.0))
-            grad_rms = float(lbw_state.get("grad_rms", grad_rms))
-            # Derived as grad_rms / ratio; presumably ratio is grad_rms over
-            # its EMA - TODO confirm against the lbw_guard optimizer.
-            ema_grad_rms = grad_rms / ratio if ratio > 0 else grad_rms
-            stress_mode = str(lbw_state.get("stress_mode", "unknown"))
-            edition = str(lbw_state.get("edition", lbw_state.get("mode", normalized)))
-            effective_lr_main_used = scheduled_lr_used * scale
-            effective_lr_weight_decay_used = scheduled_lr_used * scale
-        # The tracker receives the zero-based step index (global_step, not step_number).
-        derived_gov_metrics = governance_tracker.update_step(
-            step=global_step,
-            trainable_params=trainable_params,
-            loss_val=loss_val,
-            grad_norm=grad_norm_value,
-            grad_rms=grad_rms,
-            ema_grad_rms=ema_grad_rms,
-            ratio=ratio,
-            scale=scale,
-            stress_mode=stress_mode,
-            current_lr=scheduled_lr_used,
-        )
-        train_losses.append(loss_val)
-        pure_tps_history.append(pure_tps)
-        pure_step_time_history.append(pure_step_time)
-        epochs_completed = (step_number * sequences_per_optimizer_step) / float(train_sequence_count)
-        eval_log: Dict[str, float] = {}
-        # Progress cadence: at most every eval_every steps, and every 5 steps
-        # for runs of 50 steps or fewer, otherwise every 10.
-        progress_every = max(
-            1,
-            min(
-                int(config.eval_every),
-                5 if int(config.max_steps) <= 50 else 10,
-            ),
-        )
-        if step_number % config.eval_every == 0:
-            avg_eval_loss, perp = evaluate_perplexity(model, eval_chunks, config, device)
-            eval_log = {
-                "eval/loss": avg_eval_loss,
-                "eval/perplexity": perp,
-            }
-            # This step's wall throughput is not recorded yet, so the
-            # previous step's value (or None on the first step) is passed.
-            eval_log.update(
-                governance_tracker.update_eval(
-                    eval_loss=avg_eval_loss,
-                    eval_perplexity=perp,
-                    avg_tps_wall=(wall_tps_history[-1] if wall_tps_history else None),
-                )
-            )
-            _demo_log(
-                config,
-                f"step {step_number}/{config.max_steps}: loss={loss_val:.4f}, "
-                f"sampled_eval_loss={avg_eval_loss:.4f}, sampled_eval_ppl={perp:.4f}, "
-                f"scale={scale:.4f}, ratio={ratio:.4f}",
-            )
-            # Back to training mode; presumably evaluate_perplexity switches
-            # the model to eval mode - TODO confirm.
-            model.train()
-        elif bool(getattr(config, "print_all_metrics", False)) and (
-            step_number == 1
-            or step_number == config.max_steps
-            or step_number % progress_every == 0
-        ):
-            _demo_log(
-                config,
-                f"step {step_number}/{config.max_steps}: loss={loss_val:.4f}, scale={scale:.4f}, ratio={ratio:.4f}",
-            )
-        # Wall time additionally covers the evaluation and logging above.
-        wall_end = time.time()
-        wall_step_time = max(wall_end - step_wall_start, 1e-12)
-        wall_tps = tokens_per_step / wall_step_time
-        wall_tps_history.append(wall_tps)
-        wall_step_time_history.append(wall_step_time)
-        train_log = {
-            "train/loss": loss_val,
-            "train/grad_norm": grad_norm_value,
-            "train/tokens_per_sec_pure": pure_tps,
-            "train/tokens_per_sec_wall": wall_tps,
-            "train/step_time_pure_sec": pure_step_time,
-            "train/step_time_wall_sec": wall_step_time,
-            "train/lr": scheduled_lr_used,
-            "train/lr_used": scheduled_lr_used,
-            "train/lr_next": scheduled_lr_next,
-            "train/effective_lr_main": effective_lr_main_used,
-            "train/effective_lr_weight_decay": effective_lr_weight_decay_used,
-            "train/steps_per_train_pass": float(steps_per_train_pass),
-            "train/epochs_completed": float(epochs_completed),
-            "lbw/scale": scale,
-            "lbw/ratio": ratio,
-            "lbw/grad_rms": grad_rms,
-            "lbw/ema_grad_rms": ema_grad_rms,
-            "lbw/stress_mode": stress_mode,
-            "lbw/edition": edition,
-        }
-        train_log.update(derived_gov_metrics)
-        if wb is not None:
-            wb.log({**train_log, **eval_log}, step=step_number)
-        # Refreshed every step so the returned snapshot reflects the last one.
-        runtime_snapshot = {
-            "stress_mode": stress_mode,
-            "scale": scale,
-            "ratio": ratio,
-            "grad_rms": grad_rms,
-            "scheduled_lr_used": scheduled_lr_used,
-            "scheduled_lr_next": scheduled_lr_next,
-            "effective_lr_main_used": effective_lr_main_used,
-            "effective_lr_weight_decay_used": effective_lr_weight_decay_used,
-            "train_sequences": train_sequence_count,
-            "train_tokens": train_token_count,
-            "train_chars": int(train_chunks.char_count),
-            "train_cap_chars": train_chunks.cap_chars,
-            "eval_sequences": eval_sequence_count,
-            "eval_tokens": eval_token_count,
-            "eval_chars": int(eval_chunks.char_count),
-            "eval_cap_chars": eval_chunks.cap_chars,
-            "sequences_per_optimizer_step": sequences_per_optimizer_step,
-            "tokens_per_optimizer_step": tokens_per_optimizer_step,
-            "steps_per_train_pass": steps_per_train_pass,
-            "epochs_completed": float(epochs_completed),
-        }
-        global_step += 1
-        # Restart both timers for the next optimizer step.
-        step_wall_start = time.time()
-        step_compute_start = step_wall_start
-    training_wall_time = max(time.time() - train_start, 1e-12)
-    final_eval_is_full = bool(config.full_validation_ppl)
-    if final_eval_is_full:
-        # "Full" means every loaded validation chunk, which is itself a
-        # subset when a character cap was applied while loading.
-        final_eval_scope = "full_wikitext" if eval_chunks.cap_chars is None else "full_loaded_subset"
-        final_eval_scope_text = (
-            "over the full WikiText validation split"
-            if eval_chunks.cap_chars is None
-            else f"over the full loaded validation subset ({int(eval_chunks.char_count):,} chars; --eval-chars cap)"
-        )
-    else:
-        final_eval_scope = "sampled"
-        final_eval_scope_text = f"over {int(config.eval_batches)} sampled batches"
-    _demo_log(
-        config,
-        "Running final validation PPL " + final_eval_scope_text,
-    )
-    final_eval_start = time.time()
-    final_eval_loss, final_eval_perp = evaluate_perplexity(
-        model,
-        eval_chunks,
-        config,
-        device,
-        full_pass=final_eval_is_full,
-    )
-    final_eval_time_sec = max(time.time() - final_eval_start, 0.0)
-    # Defaults reported when lm_eval is disabled or fails outright.
-    final_eval_perp_lm_eval = None
-    final_eval_mmlu_acc_lm_eval = None
-    final_eval_arc_challenge_acc_lm_eval = None
-    lm_eval_status = "disabled"
-    lm_eval_error = None
-    lm_eval_ppl_status = "disabled"
-    lm_eval_ppl_error = None
-    lm_eval_acc_status = "disabled"
-    lm_eval_acc_error = None
-    lm_eval_time_sec = 0.0
-    if bool(config.use_lm_eval) and (bool(config.lm_eval_ppl) or bool(getattr(config, "lm_eval_acc", False))):
-        try:
-            lm_eval_start = time.time()
-            final_lm_eval_metrics = run_lm_eval_suite(model, tokenizer, config, device)
-            lm_eval_time_sec = max(time.time() - lm_eval_start, 0.0)
-            final_eval_perp_lm_eval = _safe_float(final_lm_eval_metrics.get("final_eval/perplexity_lm_eval"))
-            final_eval_mmlu_acc_lm_eval = _safe_float(final_lm_eval_metrics.get("final_eval/mmlu_acc_lm_eval"))
-            final_eval_arc_challenge_acc_lm_eval = _safe_float(
-                final_lm_eval_metrics.get("final_eval/arc_challenge_acc_lm_eval")
-            )
-            lm_eval_status = str(final_lm_eval_metrics.get("lm_eval_status") or "ok")
-            lm_eval_error = str(final_lm_eval_metrics.get("lm_eval_error") or "").strip() or None
-            lm_eval_ppl_status = str(final_lm_eval_metrics.get("lm_eval_ppl_status") or "disabled")
-            lm_eval_ppl_error = str(final_lm_eval_metrics.get("lm_eval_ppl_error") or "").strip() or None
-            lm_eval_acc_status = str(final_lm_eval_metrics.get("lm_eval_acc_status") or "disabled")
-            lm_eval_acc_error = str(final_lm_eval_metrics.get("lm_eval_acc_error") or "").strip() or None
-            if lm_eval_status in {"skipped", "partial"} and lm_eval_error:
-                print(f"[DemoRuntime] lm_eval issues: {lm_eval_error}")
-        except Exception as exc:
-            # lm_eval is best effort: record the failure and still return
-            # the training results.
-            lm_eval_time_sec = max(time.time() - lm_eval_start, 0.0)
-            lm_eval_status = "skipped"
-            lm_eval_error = str(exc).strip() or type(exc).__name__
-            print(f"[DemoRuntime] lm_eval skipped: {lm_eval_error}")
-    wall_time = max(time.time() - train_start, 1e-12)
-    # Everything after the training loop: final evaluation plus lm_eval.
-    post_training_benchmark_time_sec = max(wall_time - training_wall_time, 0.0)
-    avg_tps_wall = float(sum(wall_tps_history) / len(wall_tps_history)) if wall_tps_history else 0.0
-    final_effect_metrics = governance_tracker.update_eval(
-        eval_loss=final_eval_loss,
-        eval_perplexity=final_eval_perp,
-        avg_tps_wall=avg_tps_wall,
-    )
-    governance_snapshot = governance_tracker.snapshot()
-    _demo_log(
-        config,
-        f"Finished: "
-        f"{'final_full_eval_loss' if final_eval_is_full else 'final_eval_loss'}={final_eval_loss:.4f}, "
-        f"{'final_full_eval_ppl' if final_eval_is_full else 'final_eval_ppl'}={final_eval_perp:.4f}, "
-        f"wall_time={wall_time:.1f}s",
-    )
-    if wb is not None:
-        wb.log(
-            {
-                "final/eval_loss": final_eval_loss,
-                "final/eval_perplexity": final_eval_perp,
-                **final_effect_metrics,
-            },
-            step=config.max_steps,
-        )
-        # NOTE(review): only reached on success; an exception raised earlier
-        # leaves the W&B run unfinished.
-        wb.finish()
-    return {
-        "optimizer": normalized,
-        "group_name": group_name,
-        "run_name": run_name,
-        "model_name": config.model_name,
-        "final_eval_loss": float(final_eval_loss),
-        "final_eval_perp": float(final_eval_perp),
-        "final_eval_perp_lm_eval": final_eval_perp_lm_eval,
-        "final_eval_mmlu_acc_lm_eval": final_eval_mmlu_acc_lm_eval,
-        "final_eval_arc_challenge_acc_lm_eval": final_eval_arc_challenge_acc_lm_eval,
-        "lm_eval_status": lm_eval_status,
-        "lm_eval_error": lm_eval_error,
-        "lm_eval_ppl_status": lm_eval_ppl_status,
-        "lm_eval_ppl_error": lm_eval_ppl_error,
-        "lm_eval_acc_status": lm_eval_acc_status,
-        "lm_eval_acc_error": lm_eval_acc_error,
-        "avg_tokens_per_sec_pure": float(sum(pure_tps_history) / len(pure_tps_history)) if pure_tps_history else 0.0,
-        "avg_tokens_per_sec_wall": avg_tps_wall,
-        "avg_step_time_pure_sec": float(sum(pure_step_time_history) / len(pure_step_time_history)) if pure_step_time_history else 0.0,
-        "avg_step_time_wall_sec": float(sum(wall_step_time_history) / len(wall_step_time_history)) if wall_step_time_history else 0.0,
-        "training_wall_time_sec": float(training_wall_time),
-        "final_eval_time_sec": float(final_eval_time_sec),
-        "lm_eval_time_sec": float(lm_eval_time_sec),
-        "post_training_benchmark_time_sec": float(post_training_benchmark_time_sec),
-        "wall_time_sec": float(wall_time),
-        "train_sequence_count": int(train_sequence_count),
-        "train_token_count": int(train_token_count),
-        "train_char_count": int(train_chunks.char_count),
-        "train_cap_chars": train_chunks.cap_chars,
-        "eval_sequence_count": int(eval_sequence_count),
-        "eval_token_count": int(eval_token_count),
-        "eval_char_count": int(eval_chunks.char_count),
-        "eval_cap_chars": eval_chunks.cap_chars,
-        "full_wikitext_train": bool(config.full_wikitext_train),
-        "full_wikitext_eval": bool(config.full_wikitext_eval),
-        "full_validation_ppl": bool(config.full_validation_ppl),
-        "final_eval_full_pass": bool(final_eval_is_full),
-        "final_eval_scope": final_eval_scope,
-        "sequences_per_optimizer_step": int(sequences_per_optimizer_step),
-        "tokens_per_optimizer_step": int(tokens_per_optimizer_step),
-        "steps_per_train_pass": int(steps_per_train_pass),
-        "epochs_completed": float((global_step * sequences_per_optimizer_step) / float(train_sequence_count)),
-        "runtime_snapshot": runtime_snapshot,
-        "governance_snapshot": governance_snapshot,
-        "final_effect_metrics": final_effect_metrics,
-        "train_loss_last": (float(train_losses[-1]) if train_losses else None),
-        "schedule_mode": config.schedule_mode,
-        "max_steps": int(config.max_steps),
-    }

app.py CHANGED Viewed

@@ -2,12 +2,12 @@ from __future__ import annotations
 import csv
 import gc
-import io
 import json
 import os
 import time
 import traceback
-from contextlib import redirect_stdout
 from pathlib import Path
 from typing import Any
@@ -23,8 +23,17 @@ os.environ.setdefault("DISABLE_SAFETENSORS_CONVERSION", "1")
 import gradio as gr
 import torch
-import _demo_runtime as runtime
 RUNS_DIR = ROOT / "runs"
@@ -34,81 +43,369 @@ def _device_default() -> str:
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _safe_float(value: Any) -> float | None:
     if value is None:
         return None
     try:
-        return float(value)
     except Exception:
         return None
-def _build_config(
     *,
     model_name: str,
-    steps: int,
     lr: float,
-    seq_len: int,
-    train_chars: int,
-    eval_chars: int,
-    eval_batches: int,
     batch_size: int,
-    grad_accum: int,
-    seed: int,
-    device: str,
-) -> runtime.BenchmarkConfig:
-    config = runtime.BenchmarkConfig()
-    config.model_name = str(model_name).strip() or "Qwen/Qwen2.5-0.5B"
-    config.device = str(device or _device_default())
-    config.max_steps = int(steps)
-    config.lr = float(lr)
-    config.seq_len = int(seq_len)
-    config.batch_size = int(batch_size)
-    config.grad_accum = int(grad_accum)
-    config.warmup_steps = min(5, max(0, int(steps) // 5))
-    config.eval_every = max(1, min(int(steps), 10))
-    config.eval_batches = int(eval_batches)
-    config.max_chars = int(train_chars)
-    config.eval_chars = int(eval_chars)
-    config.full_wikitext_train = False
-    config.full_wikitext_eval = False
-    config.full_validation_ppl = False
-    config.schedule_mode = "all_cosine"
-    config.lora_r = 8
-    config.lora_alpha = 32
-    config.lora_dropout = 0.05
-    config.lbw_stats_freq = 5
-    config.lbw_stress_th = 1.1
-    config.lbw_spike_th = 1.5
-    config.lbw_rec_fast = 0.01
-    config.lbw_ema_decay = 0.95
-    config.use_wandb = False
-    config.use_lbwgov = False
-    config.print_all_metrics = True
-    config.output_dir = str((RUNS_DIR / f"run_{int(time.time())}").resolve())
-    config.use_lm_eval = False
-    config.lm_eval_ppl = False
-    config.lm_eval_acc = False
-    runtime.set_seed(int(seed), device=config.device)
-    return config
-def _result_row(result: dict[str, Any]) -> dict[str, Any]:
-    runtime_snapshot = dict(result.get("runtime_snapshot") or {})
-    governance_snapshot = dict(result.get("governance_snapshot") or {})
     return {
-        "optimizer": result.get("optimizer"),
-        "final_eval_perplexity": _safe_float(result.get("final_eval_perp")),
-        "final_eval_loss": _safe_float(result.get("final_eval_loss")),
-        "tokens_per_sec_wall": _safe_float(result.get("avg_tokens_per_sec_wall")),
-        "training_wall_time_sec": _safe_float(result.get("training_wall_time_sec")),
-        "wall_time_sec": _safe_float(result.get("wall_time_sec")),
-        "scale": _safe_float(runtime_snapshot.get("scale")),
-        "ratio": _safe_float(runtime_snapshot.get("ratio")),
-        "stress_mode": runtime_snapshot.get("stress_mode"),
-        "intervention_count": governance_snapshot.get("intervention_count"),
-        "regime_switch_count": governance_snapshot.get("regime_switch_count"),
     }
 def _gain_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
@@ -116,24 +413,30 @@ def _gain_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
     baseline = by_optimizer.get("adamw")
     if baseline is None:
         return []
-    gains = []
     for row in rows:
         if row.get("optimizer") == "adamw":
             continue
-        baseline_ppl = _safe_float(baseline.get("final_eval_perplexity"))
-        candidate_ppl = _safe_float(row.get("final_eval_perplexity"))
-        baseline_tps = _safe_float(baseline.get("tokens_per_sec_wall"))
-        candidate_tps = _safe_float(row.get("tokens_per_sec_wall"))
-        ppl_gain = None if baseline_ppl is None or candidate_ppl is None else baseline_ppl - candidate_ppl
-        speedup = None if baseline_tps in (None, 0.0) or candidate_tps is None else candidate_tps / baseline_tps
         gains.append(
             {
                 "optimizer": row.get("optimizer"),
-                "eval_perplexity_gain_vs_adamw": ppl_gain,
                 "eval_perplexity_pct_gain_vs_adamw": (
-                    None if baseline_ppl in (None, 0.0) or candidate_ppl is None else (baseline_ppl - candidate_ppl) / baseline_ppl
                 ),
-                "wall_tokens_per_sec_speedup_vs_adamw": speedup,
             }
         )
     return gains
@@ -149,220 +452,897 @@ def _write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
         writer.writerows(rows)
-def run_demo(
     model_name: str,
-    steps: int,
-    lr: float,
     seq_len: int,
     train_chars: int,
     eval_chars: int,
-    eval_batches: int,
-    batch_size: int,
-    grad_accum: int,
     seed: int,
-    run_lbw_guard: bool,
-) -> Any:
-    if not run_lbw_guard:
-        optimizers = ["adamw"]
-    else:
-        optimizers = ["adamw", "lbw_guard"]
-    device = _device_default()
-    if device == "cpu" and int(steps) > 3:
-        yield (
-            "This Space is currently running on `cpu-basic`. "
-            "For CPU smoke checks, use `1-3` steps. For larger runs, switch the Space hardware to GPU first.",
-            None,
-            None,
-        )
-        return
-    if device == "cpu" and run_lbw_guard and int(steps) > 1:
-        yield (
-            "This Space is currently running on `cpu-basic`. "
-            "An AdamW + LBW comparison runs two full model passes, so CPU mode is capped at `1` step when comparison is enabled.",
-            None,
-            None,
-        )
-        return
-    config = _build_config(
-        model_name=model_name,
-        steps=steps,
-        lr=lr,
-        seq_len=seq_len,
-        train_chars=train_chars,
-        eval_chars=eval_chars,
-        eval_batches=eval_batches,
-        batch_size=batch_size,
-        grad_accum=grad_accum,
-        seed=seed,
-        device=device,
-    )
-    run_dir = Path(config.output_dir)
     run_dir.mkdir(parents=True, exist_ok=True)
-    log_buffer = io.StringIO()
     try:
-        results = []
-        yield (
-            f"Starting run on `{device}` with `{int(steps)}` optimizer step(s) for `{', '.join(optimizers)}`.\n\n"
-            "The first run may spend time downloading the model and WikiText dataset.",
-            None,
-            None,
         )
-        with redirect_stdout(log_buffer):
-            for optimizer_name in optimizers:
-                normalized = runtime.normalize_optimizer_name(optimizer_name)
-                ok, reason = runtime.check_optimizer_support(normalized, device=config.device)
-                if not ok:
-                    raise RuntimeError(f"{normalized}: {reason}")
-                yield (
-                    f"Running `{normalized}` on `{device}`...\n\n"
-                    "Progress inside the optimizer loop is written to the Space logs and will appear here when this phase completes.",
-                    None,
-                    None,
-                )
-                runtime.set_seed(int(seed), device=config.device)
-                run_config = runtime.BenchmarkConfig(**config.__dict__)
-                run_name = f"{normalized}_{int(time.time())}"
-                result = runtime.train_one_run(
-                    normalized,
-                    run_config,
-                    group_name="LBW-Guard-HF-Direct-Runner",
-                    run_name=run_name,
-                )
-                result["optimizer"] = normalized
-                results.append(result)
-                partial_rows = [_result_row(item) for item in results]
-                next_message = "Preparing the next phase..." if len(results) < len(optimizers) else "Preparing final metrics..."
-                yield (
-                    f"Completed `{normalized}`.\n\n"
-                    f"Finished phases: `{', '.join(str(row.get('optimizer')) for row in partial_rows)}`\n\n"
-                    f"{next_message}",
-                    None,
-                    None,
-                )
-                gc.collect()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-        rows = [_result_row(result) for result in results]
         gains = _gain_rows(rows)
         payload = {
             "config": {
-                "model_name": model_name,
-                "device": device,
-                "steps": int(steps),
-                "lr": float(lr),
-                "seq_len": int(seq_len),
-                "train_chars": int(train_chars),
-                "eval_chars": int(eval_chars),
                 "eval_batches": int(eval_batches),
                 "batch_size": int(batch_size),
-                "grad_accum": int(grad_accum),
-                "seed": int(seed),
             },
-            "results": results,
-            "rows": rows,
             "gains": gains,
         }
-        json_path = run_dir / "lbw_guard_direct_runner_results.json"
-        csv_path = run_dir / "lbw_guard_direct_runner_metrics.csv"
-        gains_path = run_dir / "lbw_guard_direct_runner_gains.csv"
         json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
         _write_csv(csv_path, rows)
         _write_csv(gains_path, gains)
-        summary = [
-            f"Device: `{device}`",
             "",
             "## Metrics",
             "",
-            "| Optimizer | Final Eval PPL | Final Eval Loss | Wall Tokens/s | Wall Time (s) | Scale | Ratio | Stress Mode |",
             "| --- | --- | --- | --- | --- | --- | --- | --- |",
         ]
         for row in rows:
             summary.append(
-                "| {optimizer} | {ppl:.4f} | {loss:.4f} | {tps:.2f} | {wall:.2f} | {scale:.4f} | {ratio:.4f} | {stress} |".format(
                     optimizer=row.get("optimizer"),
-                    ppl=float(row.get("final_eval_perplexity") or 0.0),
-                    loss=float(row.get("final_eval_loss") or 0.0),
-                    tps=float(row.get("tokens_per_sec_wall") or 0.0),
-                    wall=float(row.get("wall_time_sec") or 0.0),
-                    scale=float(row.get("scale") or 0.0),
-                    ratio=float(row.get("ratio") or 0.0),
                     stress=row.get("stress_mode") or "-",
                 )
             )
-        if gains:
-            summary.extend(["", "## Gains vs AdamW", ""])
-            for gain in gains:
-                pct = _safe_float(gain.get("eval_perplexity_pct_gain_vs_adamw"))
-                speedup = _safe_float(gain.get("wall_tokens_per_sec_speedup_vs_adamw"))
-                summary.append(
-                    f"- `{gain.get('optimizer')}` PPL gain: `{_safe_float(gain.get('eval_perplexity_gain_vs_adamw'))}`, "
-                    f"PPL pct gain: `{pct * 100.0:.2f}%`" if pct is not None else f"- `{gain.get('optimizer')}` PPL pct gain unavailable."
-                )
-                if speedup is not None:
-                    summary.append(f"- `{gain.get('optimizer')}` wall tokens/s speedup: `{speedup:.3f}x`.")
-        summary.extend(["", "## Runtime Log", "", "```text", log_buffer.getvalue()[-8000:], "```"])
-        yield "\n".join(summary), str(json_path), str(csv_path)
     except Exception:
         error_text = traceback.format_exc()
         error_path = run_dir / "error.txt"
-        error_path.write_text(error_text + "\n\n" + log_buffer.getvalue(), encoding="utf-8")
-        yield f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None
 INTRO = """
-# LBW Guard Direct Runner
-Run a compact AdamW vs `lbw_guard` LoRA smoke test directly inside this Hugging Face Space.
-Use GPU hardware for real runs. CPU mode is best treated as an import/build check.
-If the Space says `cpu-basic`, keep smoke tests to `1` step or change hardware to a GPU before running larger jobs.
 """
-with gr.Blocks(title="LBW Guard Direct Runner") as demo:
     gr.Markdown(INTRO)
-    with gr.Row():
-        model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
-        run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
-    with gr.Row():
-        steps = gr.Slider(1, 20, value=1, step=1, label="Optimizer steps")
-        lr = gr.Number(value=5e-4, label="Learning rate")
-        seed = gr.Number(value=42, precision=0, label="Seed")
-    with gr.Row():
-        seq_len = gr.Dropdown([64, 128, 256], value=64, label="Sequence length")
-        batch_size = gr.Slider(1, 4, value=1, step=1, label="Batch size")
-        grad_accum = gr.Slider(1, 8, value=2, step=1, label="Gradient accumulation")
-    with gr.Row():
-        train_chars = gr.Slider(10_000, 500_000, value=50_000, step=10_000, label="Train char cap")
-        eval_chars = gr.Slider(5_000, 200_000, value=20_000, step=5_000, label="Eval char cap")
-        eval_batches = gr.Slider(1, 20, value=4, step=1, label="Eval batches")
-    run_button = gr.Button("Run Direct Smoke Test", variant="primary")
-    summary = gr.Markdown()
-    json_file = gr.File(label="Raw JSON")
-    metrics_file = gr.File(label="Metrics CSV")
-    run_button.click(
-        fn=run_demo,
-        inputs=[
-            model_name,
-            steps,
-            lr,
-            seq_len,
-            train_chars,
-            eval_chars,
-            eval_batches,
-            batch_size,
-            grad_accum,
-            seed,
-            run_lbw_guard,
-        ],
-        outputs=[summary, json_file, metrics_file],
-    )
 if __name__ == "__main__":

 import csv
 import gc
 import json
+import math
 import os
+import random
 import time
 import traceback
 from pathlib import Path
 from typing import Any
 import gradio as gr
 import torch
+from datasets import load_dataset
+from peft import LoraConfig, TaskType, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+try:
+    import lbw
+except Exception as exc:  # pragma: no cover - shown in the Space UI.
+    lbw = None
+    LBW_IMPORT_ERROR = exc
+else:
+    LBW_IMPORT_ERROR = None
 RUNS_DIR = ROOT / "runs"
     return "cuda" if torch.cuda.is_available() else "cpu"
+def _set_seed(seed: int) -> None:
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
 def _safe_float(value: Any) -> float | None:
+    """Convert *value* to a finite float, or return ``None``.
+
+    ``None`` is returned when *value* is ``None``, when ``float(value)`` raises,
+    or when the converted number is NaN or infinite.
+    """
     if value is None:
         return None
     try:
+        out = float(value)
     except Exception:
         return None
+    # Non-finite numbers are treated as "no value" so they never reach the tables.
+    if not math.isfinite(out):
+        return None
+    return out
+def _fmt_float(value: Any, digits: int = 4) -> str:
+    number = _safe_float(value)
+    return "-" if number is None else f"{number:.{digits}f}"
+def _append_log(logs: list[str], message: str) -> None:
+    """Append *message* to the in-memory *logs* list and echo it to stdout with an immediate flush."""
+    logs.append(message)
+    print(message, flush=True)
+def _build_wikitext_chunks(
+    tokenizer,
+    *,
+    split: str,
+    max_chars: int | None,
+    seq_len: int,
+    logs: list[str],
+) -> dict[str, Any]:
+    cap = None if max_chars is None else int(max_chars)
+    _append_log(
+        logs,
+        f"Preparing WikiText split={split!r}" + (f" with char cap {cap:,}" if cap is not None else " with full split"),
+    )
+    ds = load_dataset("wikitext", "wikitext-103-raw-v1", split=split)
+    pieces: list[str] = []
+    chars_used = 0
+    rows_used = 0
+    first_piece = True
+    for row in ds:
+        text = str(row.get("text", "") or "")
+        if not text.strip():
+            continue
+        piece = text if first_piece else " " + text
+        if cap is not None:
+            remain = cap - chars_used
+            if remain <= 0:
+                break
+            if len(piece) > remain:
+                piece = piece[:remain]
+        pieces.append(piece)
+        chars_used += len(piece)
+        rows_used += 1
+        first_piece = False
+        if cap is not None and chars_used >= cap:
+            break
+    token_ids = tokenizer("".join(pieces), add_special_tokens=False)["input_ids"]
+    ids = torch.tensor(token_ids, dtype=torch.long)
+    sequence_count = ids.numel() // int(seq_len)
+    if sequence_count <= 0:
+        raise RuntimeError("Not enough tokens. Increase the train/eval char cap or reduce sequence length.")
+    ids = ids[: sequence_count * int(seq_len)].view(sequence_count, int(seq_len)).contiguous()
+    _append_log(
+        logs,
+        f"Prepared split={split!r}: {chars_used:,} chars across {rows_used:,} rows -> {ids.size(0):,} sequences",
+    )
+    return {"input_ids": ids, "chars": chars_used, "rows": rows_used, "cap": cap}
+def _batch_iter(chunks: dict[str, Any], *, batch_size: int, device: torch.device):
+    ids = chunks["input_ids"]
+    i = 0
+    while True:
+        if i + int(batch_size) > ids.size(0):
+            i = 0
+        batch = ids[i : i + int(batch_size)].to(device, non_blocking=True)
+        i += int(batch_size)
+        yield batch
+def _load_lora_model(
     *,
     model_name: str,
+    device: torch.device,
+    lora_r: int,
+    lora_alpha: int,
+    lora_dropout: float,
+):
+    """Load a causal LM from *model_name*, move it to *device*, and wrap it with LoRA adapters.
+
+    The base weights are loaded as float16 on CUDA and float32 otherwise. Returns
+    the object produced by ``get_peft_model``.
+    """
+    dtype = torch.float16 if device.type == "cuda" else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+    )
+    # Only touch use_cache when the config actually defines it.
+    if getattr(model.config, "use_cache", None) is not None:
+        model.config.use_cache = False
+    model.to(device)
+    # Adapters are attached to the attention and MLP projection layers named below.
+    lora_cfg = LoraConfig(
+        r=int(lora_r),
+        lora_alpha=int(lora_alpha),
+        lora_dropout=float(lora_dropout),
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        task_type=TaskType.CAUSAL_LM,
+        bias="none",
+    )
+    return get_peft_model(model, lora_cfg)
+def _make_optimizer(
+    name: str,
+    model,
+    *,
     lr: float,
+    betas: tuple[float, float],
+    weight_decay: float,
+    lbw_stats_freq: int,
+    lbw_stress_th: float,
+    lbw_spike_th: float,
+    lbw_rec_fast: float,
+    lbw_ema_decay: float,
+):
+    """Build the optimizer called *name* over the trainable parameters of *model*.
+
+    ``"adamw"`` returns ``torch.optim.AdamW`` and ignores the ``lbw_*`` arguments;
+    ``"lbw_guard"`` returns ``lbw.Guard`` configured from them.
+
+    Raises:
+        RuntimeError: if ``"lbw_guard"`` is requested but the ``lbw`` import failed.
+        ValueError: if *name* is neither ``"adamw"`` nor ``"lbw_guard"``.
+    """
+    # Only parameters with requires_grad=True are handed to the optimizer.
+    params = [param for param in model.parameters() if param.requires_grad]
+    if name == "adamw":
+        return torch.optim.AdamW(params, lr=float(lr), betas=betas, weight_decay=float(weight_decay))
+    if name == "lbw_guard":
+        if lbw is None:
+            raise RuntimeError(f"LBW Guard package import failed: {LBW_IMPORT_ERROR}")
+        return lbw.Guard(
+            params,
+            lr=float(lr),
+            betas=betas,
+            weight_decay=float(weight_decay),
+            mode="eval",
+            auto_enabled=True,
+            stats_freq=int(lbw_stats_freq),
+            stress_threshold=float(lbw_stress_th),
+            spike_threshold=float(lbw_spike_th),
+            recovery_fast=float(lbw_rec_fast),
+            ema_decay=float(lbw_ema_decay),
+            use_max_rms=True,
+        )
+    raise ValueError(f"Unknown optimizer: {name}")
+@torch.no_grad()
+def _evaluate_ppl(
+    model,
+    eval_chunks: dict[str, Any],
+    *,
     batch_size: int,
+    eval_batches: int,
+    device: torch.device,
+    full_pass: bool,
+) -> tuple[float, float]:
+    """Compute the mean language-modelling loss and perplexity over evaluation sequences.
+
+    With ``full_pass`` every sequence in ``eval_chunks["input_ids"]`` is scored;
+    otherwise only the first ``eval_batches * batch_size`` sequences are.
+    The model is switched to eval mode and is NOT switched back; callers that keep
+    training must call ``model.train()`` themselves.
+
+    Returns:
+        ``(avg_loss, perplexity)`` where perplexity is ``exp(min(avg_loss, 20.0))``.
+    """
+    model.eval()
+    ids = eval_chunks["input_ids"]
+    max_sequences = ids.size(0) if full_pass else min(ids.size(0), int(eval_batches) * int(batch_size))
+    losses: list[float] = []
+    for start in range(0, max_sequences, int(batch_size)):
+        xb = ids[start : start + int(batch_size)].to(device, non_blocking=True)
+        # Autocast is only enabled on CUDA; on CPU the forward pass runs unchanged.
+        with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=(device.type == "cuda")):
+            loss = model(input_ids=xb, labels=xb).loss
+        losses.append(float(loss.detach().cpu()))
+    # Unweighted mean of per-batch losses (a smaller final batch counts the same as a full one).
+    # If no batch ran, avg_loss is 0.0 and the reported perplexity is 1.0.
+    avg_loss = sum(losses) / max(len(losses), 1)
+    # The exponent is clamped at 20.0 so a diverged loss cannot overflow exp().
+    return avg_loss, math.exp(min(avg_loss, 20.0))
+def _optimizer_state(opt) -> dict[str, Any]:
+    """Snapshot the ``scale``, ``ratio`` and ``stress_mode`` values stored under ``opt.state["lbw"]``.
+
+    Missing entries fall back to ``1.0``, ``1.0`` and ``"none"``, so an optimizer
+    without an ``"lbw"`` state entry reports neutral values.
+    """
+    # presumably lbw.Guard publishes its runtime state under the "lbw" key — TODO confirm against the lbw package.
+    state = dict(getattr(opt, "state", {}).get("lbw", {}) or {})
     return {
+        # "lbw_scale" is consulted as an alternative key when "scale" is absent.
+        "scale": float(state.get("scale", state.get("lbw_scale", 1.0))),
+        "ratio": float(state.get("ratio", 1.0)),
+        "stress_mode": str(state.get("stress_mode", "none")),
+    }
+def _status_markdown(
+    *,
+    device_name: str,
+    rows: list[dict[str, Any]],
+    logs: list[str],
+    phase: str,
+) -> str:
+    summary = [
+        f"Device: `{device_name}`",
+        "",
+        f"Status: {phase}",
+        "",
+        "## Results",
+        "",
+        "| Optimizer | Final Eval PPL | Final Eval Loss | Scope | Scale | Ratio | Stress Mode | Wall Time (s) |",
+        "| --- | --- | --- | --- | --- | --- | --- | --- |",
+    ]
+    if rows:
+        for row in rows:
+            summary.append(
+                "| {optimizer} | {ppl} | {loss} | {scope} | {scale} | {ratio} | {stress} | {wall} |".format(
+                    optimizer=row.get("optimizer"),
+                    ppl=_fmt_float(row.get("final_eval_ppl")),
+                    loss=_fmt_float(row.get("final_eval_loss")),
+                    scope=row.get("final_eval_scope") or "-",
+                    scale=_fmt_float(row.get("scale")),
+                    ratio=_fmt_float(row.get("ratio")),
+                    stress=row.get("stress_mode") or "-",
+                    wall=_fmt_float(row.get("wall_time_sec"), digits=2),
+                )
+            )
+    else:
+        summary.append("| - | - | - | - | - | - | - | - |")
+    gains = _gain_rows(rows)
+    if gains:
+        summary.extend(["", "## LBW vs AdamW", ""])
+        for gain in gains:
+            pct = _safe_float(gain.get("eval_perplexity_pct_gain_vs_adamw"))
+            wall_speedup = _safe_float(gain.get("wall_time_speedup_vs_adamw"))
+            summary.append(
+                f"- `{gain.get('optimizer')}` PPL gain vs AdamW: `{_fmt_float(gain.get('eval_perplexity_gain_vs_adamw'))}`"
+                + (f" (`{pct * 100.0:.2f}%`)." if pct is not None else ".")
+            )
+            if wall_speedup is not None:
+                summary.append(f"- `{gain.get('optimizer')}` wall-time speedup vs AdamW: `{wall_speedup:.3f}x`.")
+    summary.extend(["", "## Runtime Log", "", "```text", "\n".join(logs[-80:]), "```"])
+    return "\n".join(summary)
+def _run_one_optimizer_events(
+    *,
+    optimizer_name: str,
+    model_name: str,
+    train_chunks: dict[str, Any],
+    eval_chunks: dict[str, Any],
+    device: torch.device,
+    seed: int,
+    max_steps: int,
+    eval_every: int,
+    eval_batches: int,
+    seq_len: int,
+    batch_size: int,
+    lr: float,
+    betas: tuple[float, float],
+    weight_decay: float,
+    full_validation_ppl: bool,
+    lora_r: int,
+    lora_alpha: int,
+    lora_dropout: float,
+    lbw_stats_freq: int,
+    lbw_stress_th: float,
+    lbw_spike_th: float,
+    lbw_rec_fast: float,
+    lbw_ema_decay: float,
+    logs: list[str],
+):
+    """Train one LoRA model with one optimizer and stream events as a generator.
+
+    Yields ``{"type": "progress", "message": str}`` after each sampled evaluation
+    (step 1, the last step, and every ``eval_every`` steps), then a single
+    ``{"type": "result", "result": dict}`` carrying the final metrics.
+
+    ``eval_every`` must be at least 1: it is used as a modulus. The reported
+    ``wall_time_sec`` spans the training loop, the periodic sampled evaluations and
+    the final evaluation; model loading is excluded.
+    """
+    # Re-seed per optimizer so every run starts from the same RNG state.
+    _set_seed(int(seed))
+    _append_log(logs, f"Loading {model_name} with LoRA for {optimizer_name}.")
+    model = _load_lora_model(
+        model_name=model_name,
+        device=device,
+        lora_r=lora_r,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+    )
+    model.train()
+    opt = _make_optimizer(
+        optimizer_name,
+        model,
+        lr=lr,
+        betas=betas,
+        weight_decay=weight_decay,
+        lbw_stats_freq=lbw_stats_freq,
+        lbw_stress_th=lbw_stress_th,
+        lbw_spike_th=lbw_spike_th,
+        lbw_rec_fast=lbw_rec_fast,
+        lbw_ema_decay=lbw_ema_decay,
+    )
+    train_batches = _batch_iter(train_chunks, batch_size=batch_size, device=device)
+    start_time = time.time()
+    last_loss = None
+    last_eval_loss = None
+    last_eval_ppl = None
+    state = _optimizer_state(opt)
+    trainable_params = [param for param in model.parameters() if param.requires_grad]
+    for step in range(1, int(max_steps) + 1):
+        xb = next(train_batches)
+        # Autocast is only enabled on CUDA; no gradient scaler is used.
+        with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=(device.type == "cuda")):
+            loss = model(input_ids=xb, labels=xb).loss
+        loss.backward()
+        # Gradients of the trainable parameters are clipped to a total norm of 1.0.
+        torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
+        opt.step()
+        opt.zero_grad(set_to_none=True)
+        last_loss = float(loss.detach().cpu())
+        state = _optimizer_state(opt)
+        if step == 1 or step == int(max_steps) or step % int(eval_every) == 0:
+            last_eval_loss, last_eval_ppl = _evaluate_ppl(
+                model,
+                eval_chunks,
+                batch_size=batch_size,
+                eval_batches=eval_batches,
+                device=device,
+                full_pass=False,
+            )
+            message = (
+                f"{optimizer_name} step {step}/{int(max_steps)}: "
+                f"loss={last_loss:.4f}, sampled_eval_ppl={last_eval_ppl:.4f}, "
+                f"scale={state['scale']:.4f}, ratio={state['ratio']:.4f}"
+            )
+            _append_log(logs, message)
+            yield {"type": "progress", "message": message}
+            # _evaluate_ppl leaves the model in eval mode; restore training mode.
+            model.train()
+    final_full_pass = bool(full_validation_ppl)
+    # Scope label: a full pass over an uncapped split, a full pass over the capped
+    # subset that was loaded, or the same sampled evaluation used during training.
+    if final_full_pass and eval_chunks["cap"] is None:
+        final_scope = "full_wikitext"
+    elif final_full_pass:
+        final_scope = "full_loaded_subset"
+    else:
+        final_scope = "sampled"
+    _append_log(logs, f"Running final {final_scope} validation PPL for {optimizer_name}.")
+    final_loss, final_ppl = _evaluate_ppl(
+        model,
+        eval_chunks,
+        batch_size=batch_size,
+        eval_batches=eval_batches,
+        device=device,
+        full_pass=final_full_pass,
+    )
+    state = _optimizer_state(opt)
+    wall_time = time.time() - start_time
+    result = {
+        "optimizer": optimizer_name,
+        "final_eval_ppl": final_ppl,
+        "final_eval_loss": final_loss,
+        "final_eval_scope": final_scope,
+        "train_chars": train_chunks["chars"],
+        "eval_chars": eval_chunks["chars"],
+        "train_sequences": int(train_chunks["input_ids"].size(0)),
+        "eval_sequences": int(eval_chunks["input_ids"].size(0)),
+        "tokens_per_step": int(batch_size) * int(seq_len),
+        "last_train_loss": last_loss,
+        "last_sampled_eval_loss": last_eval_loss,
+        "last_sampled_eval_ppl": last_eval_ppl,
+        "scale": state["scale"],
+        "ratio": state["ratio"],
+        "stress_mode": state["stress_mode"],
+        "wall_time_sec": wall_time,
     }
+    # Drop the model and optimizer before the next run loads its own copy.
+    del model, opt
+    gc.collect()
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    yield {"type": "result", "result": result}
 def _gain_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
     baseline = by_optimizer.get("adamw")
     if baseline is None:
         return []
+    baseline_ppl = _safe_float(baseline.get("final_eval_ppl"))
+    baseline_wall = _safe_float(baseline.get("wall_time_sec"))
+    gains: list[dict[str, Any]] = []
     for row in rows:
         if row.get("optimizer") == "adamw":
             continue
+        candidate_ppl = _safe_float(row.get("final_eval_ppl"))
+        candidate_wall = _safe_float(row.get("wall_time_sec"))
         gains.append(
             {
                 "optimizer": row.get("optimizer"),
+                "eval_perplexity_gain_vs_adamw": (
+                    None if baseline_ppl is None or candidate_ppl is None else baseline_ppl - candidate_ppl
+                ),
                 "eval_perplexity_pct_gain_vs_adamw": (
+                    None
+                    if baseline_ppl in (None, 0.0) or candidate_ppl is None
+                    else (baseline_ppl - candidate_ppl) / baseline_ppl
+                ),
+                "wall_time_speedup_vs_adamw": (
+                    None
+                    if baseline_wall in (None, 0.0) or candidate_wall in (None, 0.0)
+                    else baseline_wall / candidate_wall
                 ),
             }
         )
     return gains
         writer.writerows(rows)
+def _set_lr(opt, value: float) -> None:
+    for group in getattr(opt, "param_groups", []) or []:
+        group["lr"] = float(value)
+def _scheduled_lr(cfg: dict[str, Any], step: int) -> float:
+    base_lr = float(cfg["lr"])
+    warmup = max(int(cfg.get("warmup_steps", 0)), 0)
+    max_steps = max(int(cfg["max_steps"]), 1)
+    if warmup > 0 and int(step) <= warmup:
+        return base_lr * float(step) / float(warmup)
+    mode = str(cfg.get("schedule_mode", "constant")).strip().lower()
+    if mode == "cosine":
+        progress = (int(step) - warmup) / max(max_steps - warmup, 1)
+        progress = min(max(progress, 0.0), 1.0)
+        return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))
+    return base_lr
+def _parse_float_sweep(text: str, default: list[float]) -> list[float]:
+    raw = str(text or "").replace("\n", ",").replace(";", ",").split(",")
+    values: list[float] = []
+    for item in raw:
+        item = item.strip()
+        if not item:
+            continue
+        values.append(float(item))
+    return values or list(default)
+def _parse_int_sweep(text: str, default: list[int]) -> list[int]:
+    return [int(value) for value in _parse_float_sweep(text, [float(item) for item in default])]
+def run_easy_test(
     model_name: str,
+    run_lbw_guard: bool,
+    max_steps: int,
+    eval_every: int,
+    eval_batches: int,
     seq_len: int,
+    batch_size: int,
     train_chars: int,
     eval_chars: int,
+    full_wikitext_train: bool,
+    full_wikitext_eval: bool,
+    full_validation_ppl: bool,
+    lr: float,
     seed: int,
+):
+    """Run the Easy Test (AdamW, optionally followed by lbw_guard) as a streaming generator.
+
+    Every yield is a 4-tuple ``(markdown, json_path, csv_path, gains_path)``. The
+    three paths are ``None`` until the final "Complete" update; on failure the
+    second element is the path of ``error.txt`` and the last two stay ``None``.
+
+    Artifacts are written to a fresh ``easy_test_<unix time>`` directory under
+    ``RUNS_DIR``. On CPU the run is refused unless it stays within the smoke caps
+    checked below.
+    """
+    logs: list[str] = []
+    rows: list[dict[str, Any]] = []
+    run_dir = RUNS_DIR / f"easy_test_{int(time.time())}"
     run_dir.mkdir(parents=True, exist_ok=True)
+    device_name = _device_default()
+    device = torch.device(device_name)
+    optimizers = ["adamw", "lbw_guard"] if bool(run_lbw_guard) else ["adamw"]
     try:
+        # CPU guard: anything beyond a 1-step, small-subset, sampled run is rejected up front.
+        if device.type == "cpu" and (
+            int(max_steps) > 1
+            or int(train_chars) > 20_000
+            or int(eval_chars) > 8_000
+            or bool(full_wikitext_train)
+            or bool(full_wikitext_eval)
+            or bool(full_validation_ppl)
+        ):
+            yield (
+                "This Space is currently on `cpu-basic`. CPU mode is capped to 1 step, 20k train chars, "
+                "8k eval chars, and sampled validation. Switch the Space hardware to GPU for the Easy Test defaults.",
+                None,
+                None,
+                None,
+            )
+            return
+        # The lbw_guard comparison is refused when more than one CUDA device is visible.
+        if device.type == "cuda" and bool(run_lbw_guard) and torch.cuda.device_count() > 1:
+            yield (
+                "LBW Guard should run with one visible GPU. Set the Space to single-GPU hardware or restrict CUDA_VISIBLE_DEVICES.",
+                None,
+                None,
+                None,
+            )
+            return
+        _append_log(logs, f"Device: {device_name}")
+        if device.type == "cuda":
+            _append_log(logs, f"GPU: {torch.cuda.get_device_name(0)}")
+        _append_log(logs, f"Optimizers: {', '.join(optimizers)}")
+        yield _status_markdown(device_name=device_name, rows=rows, logs=logs, phase="Loading tokenizer"), None, None, None
+        _set_seed(int(seed))
+        # A blank model field falls back to the TinyLlama checkpoint.
+        resolved_model = str(model_name).strip() or "TinyLlama/TinyLlama_v1.1"
+        tokenizer = AutoTokenizer.from_pretrained(resolved_model, use_fast=True)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # A cap of None tells _build_wikitext_chunks to consume the whole split.
+        train_cap = None if bool(full_wikitext_train) else int(train_chars)
+        eval_cap = None if bool(full_wikitext_eval) else int(eval_chars)
+        train_chunks = _build_wikitext_chunks(
+            tokenizer,
+            split="train",
+            max_chars=train_cap,
+            seq_len=int(seq_len),
+            logs=logs,
         )
+        yield _status_markdown(device_name=device_name, rows=rows, logs=logs, phase="Prepared train split"), None, None, None
+        eval_chunks = _build_wikitext_chunks(
+            tokenizer,
+            split="validation",
+            max_chars=eval_cap,
+            seq_len=int(seq_len),
+            logs=logs,
+        )
+        yield _status_markdown(device_name=device_name, rows=rows, logs=logs, phase="Prepared validation split"), None, None, None
+        # Optimizers run one after another on the same prepared data and the same seed.
+        for optimizer_name in optimizers:
+            _append_log(logs, f"=== {optimizer_name} ===")
+            yield _status_markdown(
+                device_name=device_name,
+                rows=rows,
+                logs=logs,
+                phase=f"Running {optimizer_name}",
+            ), None, None, None
+            for event in _run_one_optimizer_events(
+                optimizer_name=optimizer_name,
+                model_name=resolved_model,
+                train_chunks=train_chunks,
+                eval_chunks=eval_chunks,
+                device=device,
+                seed=int(seed),
+                max_steps=int(max_steps),
+                eval_every=max(1, int(eval_every)),
+                eval_batches=int(eval_batches),
+                seq_len=int(seq_len),
+                batch_size=int(batch_size),
+                lr=float(lr),
+                betas=(0.9, 0.999),
+                weight_decay=0.01,
+                full_validation_ppl=bool(full_validation_ppl),
+                lora_r=8,
+                lora_alpha=16,
+                lora_dropout=0.05,
+                lbw_stats_freq=10,
+                lbw_stress_th=1.1,
+                lbw_spike_th=1.5,
+                lbw_rec_fast=0.01,
+                lbw_ema_decay=0.95,
+                logs=logs,
+            ):
+                # Progress events only refresh the panel; the result event also adds a table row.
+                if event.get("type") == "result":
+                    rows.append(event["result"])
+                yield _status_markdown(
+                    device_name=device_name,
+                    rows=rows,
+                    logs=logs,
+                    phase=f"Running {optimizer_name}",
+                ), None, None, None
         gains = _gain_rows(rows)
         payload = {
+            "source": "LBW_Guard_Easy_Test_COLAB.ipynb",
             "config": {
+                "model_name": resolved_model,
+                "device": device_name,
+                "optimizers": optimizers,
+                "seed": int(seed),
+                "max_steps": int(max_steps),
+                "eval_every": int(eval_every),
                 "eval_batches": int(eval_batches),
+                "seq_len": int(seq_len),
                 "batch_size": int(batch_size),
+                "max_chars": train_cap,
+                "eval_chars": eval_cap,
+                "full_wikitext_train": bool(full_wikitext_train),
+                "full_wikitext_eval": bool(full_wikitext_eval),
+                "full_validation_ppl": bool(full_validation_ppl),
+                "lr": float(lr),
+                "betas": [0.9, 0.999],
+                "weight_decay": 0.01,
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lbw_stats_freq": 10,
+                "lbw_stress_th": 1.1,
+                "lbw_spike_th": 1.5,
+                "lbw_rec_fast": 0.01,
+                "lbw_ema_decay": 0.95,
             },
+            "results": rows,
             "gains": gains,
+            "logs": logs,
         }
+        json_path = run_dir / "lbw_guard_easy_test_results.json"
+        csv_path = run_dir / "lbw_guard_easy_test_results.csv"
+        gains_path = run_dir / "lbw_guard_easy_test_gains.csv"
         json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
         _write_csv(csv_path, rows)
         _write_csv(gains_path, gains)
+        # NOTE(review): gains is empty for an AdamW-only run; confirm _write_csv still creates
+        # the file in that case, because its path is returned to the UI below.
+        _append_log(logs, f"Wrote {csv_path}")
+        yield (
+            _status_markdown(device_name=device_name, rows=rows, logs=logs, phase="Complete"),
+            str(json_path),
+            str(csv_path),
+            str(gains_path),
+        )
+    except Exception:
+        # UI boundary: report the traceback in the panel and persist it with the logs.
+        error_text = traceback.format_exc()
+        error_path = run_dir / "error.txt"
+        error_path.write_text(error_text + "\n\n" + "\n".join(logs), encoding="utf-8")
+        yield f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None, None
+def _make_ablation_scenario(slug: str, label: str, note: str, base_config: dict[str, Any], overrides=None):
+    cfg = dict(base_config)
+    if overrides:
+        cfg.update(overrides)
+    return {
+        "slug": slug,
+        "label": label,
+        "note": note,
+        "config": cfg,
+    }
def _build_ablation_scenarios(
    *,
    selected_ablations: list[str],
    base_config: dict[str, Any],
    lr_sweep: list[float],
    step_sweep: list[int],
    lora_r_sweep: list[int],
) -> list[dict[str, Any]]:
    """Expand the selected ablation axes into a flat, ordered list of scenarios.

    Axis names are matched case-insensitively after stripping whitespace; an
    empty selection falls back to the single ``optimizer`` scenario. Each
    scenario is built with ``_make_ablation_scenario`` from ``base_config``
    plus the overrides of its axis.

    Scenarios whose slug was already emitted are skipped, so a sweep that
    repeats a value (for example ``"5e-4, 5e-4"``) yields one scenario rather
    than two with colliding slugs.

    Args:
        selected_ablations: Axis names among ``optimizer``, ``lr``,
            ``schedule``, ``steps``, ``data`` and ``lora``.
        base_config: Configuration shared by every scenario.
        lr_sweep: Learning rates for the ``lr`` axis.
        step_sweep: Step counts for the ``steps`` axis.
        lora_r_sweep: LoRA ranks for the ``lora`` axis.

    Returns:
        The scenario dicts in axis order.

    Raises:
        ValueError: If the selection produces no scenario at all.
    """
    selected = {str(item).strip().lower() for item in selected_ablations if str(item).strip()}
    if not selected:
        selected = {"optimizer"}

    scenarios: list[dict[str, Any]] = []
    seen_slugs: set[str] = set()

    def add(slug: str, label: str, note: str, overrides: dict[str, Any] | None = None) -> None:
        # Gain rows are grouped by scenario slug, so a repeated slug would
        # merge two runs into one group and train the same setup twice.
        if slug in seen_slugs:
            return
        seen_slugs.add(slug)
        scenarios.append(_make_ablation_scenario(slug, label, note, base_config, overrides))

    if "optimizer" in selected:
        add(
            "optimizer-adamw-vs-lbw-guard",
            "Optimizer: AdamW vs lbw_guard",
            "Direct optimizer comparison with the base config.",
        )
    if "lr" in selected:
        for lr in lr_sweep:
            add(
                f"lr-{lr:g}",
                f"Learning Rate: {lr:g}",
                "Learning-rate sensitivity check.",
                {"lr": float(lr)},
            )
    if "schedule" in selected:
        for mode in ("constant", "cosine"):
            add(
                f"schedule-{mode}",
                f"Schedule: {mode}",
                "Scheduler-shape sensitivity check.",
                {"schedule_mode": mode},
            )
    if "steps" in selected:
        for steps in step_sweep:
            add(
                f"steps-{steps}",
                f"Steps: {steps}",
                "Training-length sensitivity check.",
                # Evaluate roughly four times per run, never less often than every step.
                {"max_steps": int(steps), "eval_every": max(1, int(steps) // 4)},
            )
    if "data" in selected:
        for tag, train_cap, eval_cap in (
            ("small-data", 20_000, 8_000),
            ("larger-data", 80_000, 20_000),
        ):
            add(
                tag,
                f"Data Slice: {tag}",
                "WikiText slice-size sensitivity check.",
                {"max_chars": int(train_cap), "eval_chars": int(eval_cap)},
            )
    if "lora" in selected:
        for rank in lora_r_sweep:
            add(
                f"lora-r{rank}",
                f"LoRA Rank: {rank}",
                "Adapter-capacity sensitivity check.",
                # lora_alpha is kept at twice the rank, matching the base config (r=8, alpha=16).
                {"lora_r": int(rank), "lora_alpha": int(rank) * 2},
            )

    if not scenarios:
        raise ValueError("No scenarios selected. Choose optimizer, lr, schedule, steps, data, or lora.")
    return scenarios
def _ablation_status_markdown(
    *,
    device_name: str,
    rows: list[dict[str, Any]],
    logs: list[str],
    phase: str,
    plan: list[dict[str, Any]],
) -> str:
    """Render the ablation run state as a Markdown report.

    The report contains, in order: device and status lines, the scenario plan
    table, the metrics table (a dash-only placeholder row while ``rows`` is
    empty), an optional "LBW vs AdamW" bullet list built from
    ``_build_ablation_gain_rows``, and the last 100 log lines in a text fence.

    Args:
        device_name: Device string shown at the top of the report.
        rows: Per-run result dicts collected so far.
        logs: Accumulated log lines.
        phase: Current status text.
        plan: Scenario dicts, each with a ``label`` and a ``config``.

    Returns:
        The Markdown document as a single string.
    """
    lines: list[str] = [
        f"Device: `{device_name}`",
        "",
        f"Status: {phase}",
        "",
        "## Plan",
        "",
        "| Scenario | Steps | LR | Schedule | Train Chars | Eval Chars | LoRA r |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]

    # One plan row per scenario; uncapped data is shown as FULL.
    for entry in plan:
        conf = entry["config"]
        label = entry["label"]
        steps = int(conf["max_steps"])
        base_lr = float(conf["lr"])
        schedule = conf["schedule_mode"]
        train_cell = "FULL" if conf["full_wikitext_train"] else int(conf["max_chars"])
        eval_cell = "FULL" if conf["full_wikitext_eval"] else int(conf["eval_chars"])
        rank = int(conf["lora_r"])
        lines.append(
            f"| {label} | {steps} | {base_lr:g} | {schedule} | {train_cell} | {eval_cell} | {rank} |"
        )

    lines += [
        "",
        "## Metrics",
        "",
        "| Scenario | Optimizer | Final Eval PPL | Final Eval Loss | Tokens/s | Scale | Ratio | Stress Mode |",
        "| --- | --- | --- | --- | --- | --- | --- | --- |",
    ]
    if not rows:
        # Keep the table well-formed before the first result arrives.
        lines.append("| - | - | - | - | - | - | - | - |")
    for record in rows:
        scenario = record.get("scenario")
        optimizer = record.get("optimizer")
        ppl = _fmt_float(record.get("final_eval_ppl"))
        loss = _fmt_float(record.get("final_eval_loss"))
        tps = _fmt_float(record.get("tokens_per_sec_wall"), digits=2)
        scale = _fmt_float(record.get("scale"))
        ratio = _fmt_float(record.get("ratio"))
        stress = record.get("stress_mode") or "-"
        lines.append(
            f"| {scenario} | {optimizer} | {ppl} | {loss} | {tps} | {scale} | {ratio} | {stress} |"
        )

    comparisons = _build_ablation_gain_rows(rows)
    if comparisons:
        lines += ["", "## LBW vs AdamW", ""]
        for item in comparisons:
            ppl_gain = _fmt_float(item.get("ppl_gain_pct_vs_adamw"))
            loss_gain = _fmt_float(item.get("loss_gain_pct_vs_adamw"))
            speed_gain = _fmt_float(item.get("speed_gain_pct_vs_adamw"))
            lines.append(
                f"- `{item.get('scenario')}`: `{item.get('optimizer')}` "
                f"PPL gain `{ppl_gain}%`, "
                f"loss gain `{loss_gain}%`, "
                f"speed gain `{speed_gain}%`."
            )

    # Only the tail of the log is shown so the report stays readable.
    log_tail = "\n".join(logs[-100:])
    lines += ["", "## Runtime Log", "", "```text", log_tail, "```"]
    return "\n".join(lines)
def _run_ablation_optimizer_events(
    *,
    scenario_item: dict[str, Any],
    optimizer_name: str,
    model_name: str,
    train_chunks: dict[str, Any],
    eval_chunks: dict[str, Any],
    device: torch.device,
    logs: list[str],
):
    """Train one optimizer on one scenario, yielding progress and a final result.

    The generator seeds the RNGs, loads a fresh LoRA model, runs
    ``cfg["max_steps"]`` optimizer steps with gradient clipping at 1.0, and
    evaluates sampled perplexity on the first step, the last step and every
    ``cfg["eval_every"]`` steps. After training it runs the final validation
    (full pass when ``cfg["full_validation_ppl"]`` is set).

    The model and optimizer references are dropped and the CUDA cache is
    emptied in a ``finally`` block, so the cleanup also runs when training
    raises or when the consumer closes the generator early.

    Args:
        scenario_item: Scenario dict with ``slug``, ``label`` and ``config``.
        optimizer_name: Optimizer identifier passed to ``_make_optimizer``.
        model_name: Model identifier passed to ``_load_lora_model``.
        train_chunks: Training data; ``chars`` and ``input_ids`` are read here.
        eval_chunks: Validation data; ``cap``, ``chars`` and ``input_ids`` are read here.
        device: Torch device used for training and evaluation.
        logs: Shared log list, appended to in place.

    Yields:
        ``{"type": "progress", "message": str}`` after each evaluation, then a
        single ``{"type": "result", "result": dict}`` with the run metrics.
    """
    cfg = scenario_item["config"]
    slug = scenario_item["slug"]
    max_steps = int(cfg["max_steps"])
    eval_every = int(cfg["eval_every"])
    batch_size = int(cfg["batch_size"])
    eval_batches = int(cfg["eval_batches"])

    _set_seed(int(cfg["seed"]))
    _append_log(logs, f"Loading {model_name} with LoRA for {slug} / {optimizer_name}.")
    model = _load_lora_model(
        model_name=model_name,
        device=device,
        lora_r=int(cfg["lora_r"]),
        lora_alpha=int(cfg["lora_alpha"]),
        lora_dropout=float(cfg["lora_dropout"]),
    )
    opt = None
    try:
        model.train()
        opt = _make_optimizer(
            optimizer_name,
            model,
            lr=float(cfg["lr"]),
            betas=tuple(cfg["betas"]),
            weight_decay=float(cfg["weight_decay"]),
            lbw_stats_freq=int(cfg["lbw_stats_freq"]),
            lbw_stress_th=float(cfg["lbw_stress_th"]),
            lbw_spike_th=float(cfg["lbw_spike_th"]),
            lbw_rec_fast=float(cfg["lbw_rec_fast"]),
            lbw_ema_decay=float(cfg["lbw_ema_decay"]),
        )
        train_batches = _batch_iter(train_chunks, batch_size=batch_size, device=device)
        trainable_params = [param for param in model.parameters() if param.requires_grad]
        start_time = time.time()
        losses: list[float] = []
        eval_loss = None
        eval_ppl = None
        last_lr = float(cfg["lr"])
        state = _optimizer_state(opt)

        for step in range(1, max_steps + 1):
            # The learning rate is set explicitly every step from the schedule helper.
            last_lr = _scheduled_lr(cfg, step)
            _set_lr(opt, last_lr)
            xb = next(train_batches)
            # NOTE(review): float16 autocast is used without a GradScaler — confirm
            # that omitting gradient scaling is intentional for this setup.
            with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=(device.type == "cuda")):
                loss = model(input_ids=xb, labels=xb).loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
            opt.step()
            opt.zero_grad(set_to_none=True)
            loss_value = float(loss.detach().cpu())
            losses.append(loss_value)

            if step == 1 or step == max_steps or step % eval_every == 0:
                eval_loss, eval_ppl = _evaluate_ppl(
                    model,
                    eval_chunks,
                    batch_size=batch_size,
                    eval_batches=eval_batches,
                    device=device,
                    full_pass=False,
                )
                state = _optimizer_state(opt)
                message = (
                    f"[{scenario_item['slug']}] {optimizer_name} step {step}/{cfg['max_steps']}: "
                    f"loss={loss_value:.4f}, sampled_eval_ppl={eval_ppl:.4f}, "
                    f"lr={last_lr:.2e}, scale={state['scale']:.4f}, ratio={state['ratio']:.4f}"
                )
                _append_log(logs, message)
                yield {"type": "progress", "message": message}
                # Switch back to training mode after the evaluation pass.
                model.train()

        final_full_pass = bool(cfg["full_validation_ppl"])
        if final_full_pass and eval_chunks["cap"] is None:
            final_scope = "full_wikitext"
        elif final_full_pass:
            final_scope = "full_loaded_subset"
        else:
            final_scope = "sampled"
        _append_log(logs, f"Running final {final_scope} validation PPL for {slug} / {optimizer_name}.")
        final_loss, final_ppl = _evaluate_ppl(
            model,
            eval_chunks,
            batch_size=batch_size,
            eval_batches=eval_batches,
            device=device,
            full_pass=final_full_pass,
        )
        state = _optimizer_state(opt)
        # Wall time spans training plus every evaluation, including the final one.
        wall_time = max(time.time() - start_time, 1e-9)
        trained_tokens = max_steps * batch_size * int(cfg["seq_len"])
        result = {
            "scenario_slug": scenario_item["slug"],
            "scenario": scenario_item["label"],
            "optimizer": optimizer_name,
            "final_eval_ppl": final_ppl,
            "final_eval_loss": final_loss,
            "train_loss_last": losses[-1] if losses else None,
            "last_sampled_eval_loss": eval_loss,
            "last_sampled_eval_ppl": eval_ppl,
            "final_eval_scope": final_scope,
            "max_steps": max_steps,
            "lr": float(cfg["lr"]),
            "scheduled_lr_last": float(last_lr),
            "schedule_mode": str(cfg["schedule_mode"]),
            "batch_size": batch_size,
            "seq_len": int(cfg["seq_len"]),
            "lora_r": int(cfg["lora_r"]),
            "train_chars": int(train_chunks["chars"]),
            "eval_chars": int(eval_chunks["chars"]),
            "train_sequences": int(train_chunks["input_ids"].size(0)),
            "eval_sequences": int(eval_chunks["input_ids"].size(0)),
            "scale": state["scale"],
            "ratio": state["ratio"],
            "stress_mode": state["stress_mode"],
            "wall_time_sec": wall_time,
            "tokens_per_sec_wall": trained_tokens / wall_time,
        }
    finally:
        # Release the model and optimizer on every exit path (success, error,
        # or generator close) before the next run loads its own model.
        del model, opt
        gc.collect()
        if device.type == "cuda":
            torch.cuda.empty_cache()
    yield {"type": "result", "result": result}
def _build_ablation_gain_rows(metrics: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Compare every non-AdamW run with the AdamW run of the same scenario.

    Rows are bucketed by ``scenario_slug``. Buckets without an ``adamw`` row
    are skipped; within a bucket the first ``adamw`` row is the baseline.
    Each percentage is ``None`` when the baseline is missing or zero, or when
    the candidate value is missing.

    Args:
        metrics: Per-run result dicts.

    Returns:
        One gain dict per non-AdamW run that has an AdamW baseline.
    """

    def pct_vs_baseline(base, cand, *, higher_is_better):
        # Positive means the candidate beat the baseline for that metric.
        if base is None or base == 0.0 or cand is None:
            return None
        delta = (cand - base) if higher_is_better else (base - cand)
        return delta / base * 100.0

    by_scenario: dict[str, list[dict[str, Any]]] = {}
    for record in metrics:
        key = str(record.get("scenario_slug"))
        if key not in by_scenario:
            by_scenario[key] = []
        by_scenario[key].append(record)

    comparisons: list[dict[str, Any]] = []
    for slug, members in by_scenario.items():
        reference = None
        for member in members:
            if member.get("optimizer") == "adamw":
                reference = member
                break
        if reference is None:
            continue

        ref_ppl = _safe_float(reference.get("final_eval_ppl"))
        ref_loss = _safe_float(reference.get("final_eval_loss"))
        ref_tps = _safe_float(reference.get("tokens_per_sec_wall"))

        challengers = [member for member in members if member.get("optimizer") != "adamw"]
        for challenger in challengers:
            cand_ppl = _safe_float(challenger.get("final_eval_ppl"))
            cand_loss = _safe_float(challenger.get("final_eval_loss"))
            cand_tps = _safe_float(challenger.get("tokens_per_sec_wall"))
            comparisons.append(
                {
                    "scenario_slug": slug,
                    "scenario": challenger.get("scenario"),
                    "optimizer": challenger.get("optimizer"),
                    "adamw_final_eval_ppl": ref_ppl,
                    "optimizer_final_eval_ppl": cand_ppl,
                    # Lower perplexity and loss are better; higher throughput is better.
                    "ppl_gain_pct_vs_adamw": pct_vs_baseline(ref_ppl, cand_ppl, higher_is_better=False),
                    "loss_gain_pct_vs_adamw": pct_vs_baseline(ref_loss, cand_loss, higher_is_better=False),
                    "speed_gain_pct_vs_adamw": pct_vs_baseline(ref_tps, cand_tps, higher_is_better=True),
                    "adamw_tokens_per_sec_wall": ref_tps,
                    "optimizer_tokens_per_sec_wall": cand_tps,
                    "lbw_scale": challenger.get("scale"),
                    "lbw_ratio": challenger.get("ratio"),
                    "lbw_stress_mode": challenger.get("stress_mode"),
                }
            )
    return comparisons
def _ablation_exceeds_cpu_cap(scenarios: list[dict[str, Any]]) -> bool:
    """Return True when the expanded scenario plan is too heavy for CPU hardware."""
    if len(scenarios) > 1:
        return True
    # Check the per-scenario configs, not the base UI values: a sweep may
    # override max_steps or the data caps for the single scenario it produces.
    return any(
        int(cfg["max_steps"]) > 1
        or int(cfg["max_chars"]) > 20_000
        or int(cfg["eval_chars"]) > 8_000
        or bool(cfg["full_wikitext_train"])
        or bool(cfg["full_wikitext_eval"])
        or bool(cfg["full_validation_ppl"])
        for cfg in (item["config"] for item in scenarios)
    )


def run_ablation_test(
    model_name: str,
    selected_ablations: list[str],
    run_lbw_guard: bool,
    max_steps: int,
    eval_every: int,
    eval_batches: int,
    seq_len: int,
    batch_size: int,
    train_chars: int,
    eval_chars: int,
    full_wikitext_train: bool,
    full_wikitext_eval: bool,
    full_validation_ppl: bool,
    lr: float,
    schedule_mode: str,
    warmup_steps: int,
    seed: int,
    lr_sweep_text: str,
    step_sweep_text: str,
    lora_r_sweep_text: str,
):
    """Run the ablation suite as a streaming generator callback.

    The function builds the base config from its arguments, expands the
    selected ablations into scenarios, and trains every optimizer on every
    scenario. It finally writes a JSON payload plus metrics and gains CSV
    files into a fresh run directory under ``RUNS_DIR``.

    On CPU the run is refused unless the expanded plan is a single scenario
    of one step within the small data caps and with sampled validation. With
    LBW Guard enabled on CUDA the run is refused when more than one GPU is
    visible.

    Yields:
        4-tuples ``(markdown, json_path, metrics_csv_path, gains_csv_path)``.
        The three paths are ``None`` until the run completes. On failure the
        tuple is ``(error markdown, error.txt path, None, None)``.
    """
    logs: list[str] = []
    rows: list[dict[str, Any]] = []
    run_dir = RUNS_DIR / f"ablation_test_{int(time.time())}"
    run_dir.mkdir(parents=True, exist_ok=True)
    device_name = _device_default()
    device = torch.device(device_name)
    optimizers = ["adamw", "lbw_guard"] if bool(run_lbw_guard) else ["adamw"]
    try:
        base_config = {
            "seed": int(seed),
            "max_steps": int(max_steps),
            "eval_every": max(1, int(eval_every)),
            "eval_batches": int(eval_batches),
            "seq_len": int(seq_len),
            "batch_size": int(batch_size),
            "max_chars": int(train_chars),
            "eval_chars": int(eval_chars),
            "full_wikitext_train": bool(full_wikitext_train),
            "full_wikitext_eval": bool(full_wikitext_eval),
            "full_validation_ppl": bool(full_validation_ppl),
            "lr": float(lr),
            "betas": (0.9, 0.999),
            "weight_decay": 0.01,
            "warmup_steps": int(warmup_steps),
            "schedule_mode": str(schedule_mode or "constant").strip().lower(),
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lbw_stats_freq": 10,
            "lbw_stress_th": 1.1,
            "lbw_spike_th": 1.5,
            "lbw_rec_fast": 0.01,
            "lbw_ema_decay": 0.95,
        }
        lr_sweep = _parse_float_sweep(lr_sweep_text, [1e-3, 5e-4])
        step_sweep = _parse_int_sweep(step_sweep_text, [100, 200])
        lora_r_sweep = _parse_int_sweep(lora_r_sweep_text, [4, 8, 16])
        ablation_names = list(selected_ablations or ["optimizer"])
        scenarios = _build_ablation_scenarios(
            selected_ablations=ablation_names,
            base_config=base_config,
            lr_sweep=lr_sweep,
            step_sweep=step_sweep,
            lora_r_sweep=lora_r_sweep,
        )

        if device.type == "cpu" and _ablation_exceeds_cpu_cap(scenarios):
            yield (
                "This Space is currently on `cpu-basic`. CPU ablation mode is capped to one optimizer scenario, "
                "1 step, 20k train chars, 8k eval chars, and sampled validation. Switch the Space hardware to GPU for ablations.",
                None,
                None,
                None,
            )
            return
        if device.type == "cuda" and bool(run_lbw_guard) and torch.cuda.device_count() > 1:
            yield (
                "LBW Guard should run with one visible GPU. Set the Space to single-GPU hardware or restrict CUDA_VISIBLE_DEVICES.",
                None,
                None,
                None,
            )
            return

        # `or ""` keeps a missing model name from becoming the literal id "None".
        resolved_model = str(model_name or "").strip() or "Qwen/Qwen2.5-0.5B"
        _append_log(logs, f"Device: {device_name}")
        if device.type == "cuda":
            _append_log(logs, f"GPU: {torch.cuda.get_device_name(0)}")
        _append_log(logs, f"Selected ablations: {', '.join(ablation_names)}")
        _append_log(logs, f"Optimizers: {', '.join(optimizers)}")
        yield _ablation_status_markdown(
            device_name=device_name,
            rows=rows,
            logs=logs,
            phase="Loading tokenizer",
            plan=scenarios,
        ), None, None, None

        tokenizer = AutoTokenizer.from_pretrained(resolved_model, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Tokenized data is shared between scenarios with the same sequence
        # length and data caps, so each combination is built only once.
        data_cache: dict[tuple[int, int | None, int | None], dict[str, dict[str, Any]]] = {}
        for scenario_item in scenarios:
            cfg = scenario_item["config"]
            train_cap = None if cfg["full_wikitext_train"] else int(cfg["max_chars"])
            eval_cap = None if cfg["full_wikitext_eval"] else int(cfg["eval_chars"])
            cache_key = (int(cfg["seq_len"]), train_cap, eval_cap)
            if cache_key not in data_cache:
                data_cache[cache_key] = {
                    "train": _build_wikitext_chunks(
                        tokenizer,
                        split="train",
                        max_chars=train_cap,
                        seq_len=int(cfg["seq_len"]),
                        logs=logs,
                    ),
                    "eval": _build_wikitext_chunks(
                        tokenizer,
                        split="validation",
                        max_chars=eval_cap,
                        seq_len=int(cfg["seq_len"]),
                        logs=logs,
                    ),
                }
            _append_log(logs, f"=== Scenario: {scenario_item['label']} ===")
            for optimizer_name in optimizers:
                _append_log(logs, f"--- {optimizer_name} ---")
                phase = f"Running {scenario_item['label']} / {optimizer_name}"
                yield _ablation_status_markdown(
                    device_name=device_name,
                    rows=rows,
                    logs=logs,
                    phase=phase,
                    plan=scenarios,
                ), None, None, None
                for event in _run_ablation_optimizer_events(
                    scenario_item=scenario_item,
                    optimizer_name=optimizer_name,
                    model_name=resolved_model,
                    train_chunks=data_cache[cache_key]["train"],
                    eval_chunks=data_cache[cache_key]["eval"],
                    device=device,
                    logs=logs,
                ):
                    if event.get("type") == "result":
                        rows.append(event["result"])
                    # Refresh the report after every event so progress streams to the UI.
                    yield _ablation_status_markdown(
                        device_name=device_name,
                        rows=rows,
                        logs=logs,
                        phase=phase,
                        plan=scenarios,
                    ), None, None, None

        gains = _build_ablation_gain_rows(rows)
        payload = {
            "source": "LBW_Guard_Ablation_Test_COLAB.ipynb",
            "model_name": resolved_model,
            "device": device_name,
            "optimizers": optimizers,
            "selected_ablations": ablation_names,
            "base_config": base_config,
            "scenarios": scenarios,
            "results": rows,
            "gains": gains,
            "logs": logs,
        }
        json_path = run_dir / "lbw_guard_ablation_results.json"
        metrics_path = run_dir / "lbw_guard_ablation_metrics.csv"
        gains_path = run_dir / "lbw_guard_ablation_gains.csv"
        json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        _write_csv(metrics_path, rows)
        _write_csv(gains_path, gains)
        _append_log(logs, f"Wrote {metrics_path}")
        _append_log(logs, f"Wrote {gains_path}")
        yield (
            _ablation_status_markdown(device_name=device_name, rows=rows, logs=logs, phase="Complete", plan=scenarios),
            str(json_path),
            str(metrics_path),
            str(gains_path),
        )
    except Exception:
        # Top-level boundary for the UI callback: save the traceback and the
        # log next to the run, then show the failure instead of raising.
        error_text = traceback.format_exc()
        error_path = run_dir / "error.txt"
        error_path.write_text(error_text + "\n\n" + "\n".join(logs), encoding="utf-8")
        yield f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None, None
 INTRO = """
+# LBW Guard Colab Tests
+Runs notebook-faithful Hugging Face Space versions of:
+- `LBW_Guard_Easy_Test_COLAB.ipynb`
+- `LBW_Guard_Ablation_Test_COLAB.ipynb`
+Current hardware is detected at run time. GPU is recommended for the default Easy Test.
 """
+with gr.Blocks(title="LBW Guard Colab Tests") as demo:
     gr.Markdown(INTRO)
+    with gr.Tabs():
+        with gr.Tab("Easy Test"):
+            with gr.Row():
+                easy_model_name = gr.Textbox(value="TinyLlama/TinyLlama_v1.1", label="Model")
+                easy_run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
+            with gr.Row():
+                easy_max_steps = gr.Slider(1, 1000, value=5, step=1, label="Optimizer steps")
+                easy_eval_every = gr.Slider(1, 200, value=5, step=1, label="Eval every")
+                easy_eval_batches = gr.Slider(1, 128, value=8, step=1, label="Eval batches")
+            with gr.Row():
+                easy_seq_len = gr.Dropdown([64, 128, 256, 512], value=64, label="Sequence length")
+                easy_batch_size = gr.Slider(1, 8, value=1, step=1, label="Batch size")
+                easy_lr = gr.Number(value=5e-4, label="Learning rate")
+            with gr.Row():
+                easy_train_chars = gr.Slider(5_000, 2_000_000, value=20_000, step=5_000, label="Train char cap")
+                easy_eval_chars = gr.Slider(1_000, 500_000, value=8_000, step=1_000, label="Eval char cap")
+                easy_seed = gr.Number(value=42, precision=0, label="Seed")
+            with gr.Row():
+                easy_full_wikitext_train = gr.Checkbox(value=False, label="Full WikiText train")
+                easy_full_wikitext_eval = gr.Checkbox(value=False, label="Full WikiText eval")
+                easy_full_validation_ppl = gr.Checkbox(value=False, label="Full validation PPL")
+            easy_run_button = gr.Button("Run Easy Test", variant="primary")
+            easy_summary = gr.Markdown()
+            easy_json_file = gr.File(label="Raw JSON")
+            easy_results_file = gr.File(label="Results CSV")
+            easy_gains_file = gr.File(label="Gains CSV")
+            easy_run_button.click(
+                fn=run_easy_test,
+                inputs=[
+                    easy_model_name,
+                    easy_run_lbw_guard,
+                    easy_max_steps,
+                    easy_eval_every,
+                    easy_eval_batches,
+                    easy_seq_len,
+                    easy_batch_size,
+                    easy_train_chars,
+                    easy_eval_chars,
+                    easy_full_wikitext_train,
+                    easy_full_wikitext_eval,
+                    easy_full_validation_ppl,
+                    easy_lr,
+                    easy_seed,
+                ],
+                outputs=[easy_summary, easy_json_file, easy_results_file, easy_gains_file],
+            )
+        with gr.Tab("Ablation Test"):
+            with gr.Row():
+                ablation_model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
+                ablation_run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
+            selected_ablations = gr.CheckboxGroup(
+                choices=["optimizer", "lr", "schedule", "steps", "data", "lora"],
+                value=["optimizer"],
+                label="Ablations",
+            )
+            with gr.Row():
+                ablation_max_steps = gr.Slider(1, 1000, value=200, step=1, label="Base optimizer steps")
+                ablation_eval_every = gr.Slider(1, 200, value=50, step=1, label="Eval every")
+                ablation_eval_batches = gr.Slider(1, 128, value=8, step=1, label="Eval batches")
+            with gr.Row():
+                ablation_seq_len = gr.Dropdown([64, 128, 256, 512], value=64, label="Sequence length")
+                ablation_batch_size = gr.Slider(1, 8, value=1, step=1, label="Batch size")
+                ablation_lr = gr.Number(value=5e-4, label="Base learning rate")
+            with gr.Row():
+                ablation_train_chars = gr.Slider(5_000, 2_000_000, value=20_000, step=5_000, label="Train char cap")
+                ablation_eval_chars = gr.Slider(1_000, 500_000, value=8_000, step=1_000, label="Eval char cap")
+                ablation_seed = gr.Number(value=42, precision=0, label="Seed")
+            with gr.Row():
+                ablation_schedule_mode = gr.Dropdown(["constant", "cosine"], value="constant", label="Base schedule")
+                ablation_warmup_steps = gr.Slider(0, 100, value=10, step=1, label="Warmup steps")
+            with gr.Row():
+                ablation_full_wikitext_train = gr.Checkbox(value=False, label="Full WikiText train")
+                ablation_full_wikitext_eval = gr.Checkbox(value=False, label="Full WikiText eval")
+                ablation_full_validation_ppl = gr.Checkbox(value=False, label="Full validation PPL")
+            with gr.Row():
+                lr_sweep_text = gr.Textbox(value="1e-3, 5e-4", label="LR sweep")
+                step_sweep_text = gr.Textbox(value="100, 200", label="Step sweep")
+                lora_r_sweep_text = gr.Textbox(value="4, 8, 16", label="LoRA r sweep")
+            ablation_run_button = gr.Button("Run Ablation Test", variant="primary")
+            ablation_summary = gr.Markdown()
+            ablation_json_file = gr.File(label="Raw JSON")
+            ablation_metrics_file = gr.File(label="Metrics CSV")
+            ablation_gains_file = gr.File(label="Gains CSV")
+            ablation_run_button.click(
+                fn=run_ablation_test,
+                inputs=[
+                    ablation_model_name,
+                    selected_ablations,
+                    ablation_run_lbw_guard,
+                    ablation_max_steps,
+                    ablation_eval_every,
+                    ablation_eval_batches,
+                    ablation_seq_len,
+                    ablation_batch_size,
+                    ablation_train_chars,
+                    ablation_eval_chars,
+                    ablation_full_wikitext_train,
+                    ablation_full_wikitext_eval,
+                    ablation_full_validation_ppl,
+                    ablation_lr,
+                    ablation_schedule_mode,
+                    ablation_warmup_steps,
+                    ablation_seed,
+                    lr_sweep_text,
+                    step_sweep_text,
+                    lora_r_sweep_text,
+                ],
+                outputs=[ablation_summary, ablation_json_file, ablation_metrics_file, ablation_gains_file],
+            )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 torch
-transformers
-datasets
-peft
-accelerate
 lbw-guard==1.1.3

 torch
+transformers>=4.45
+datasets>=2.20
+peft>=0.12
+accelerate>=0.33
+sentencepiece
 lbw-guard==1.1.3