Spaces:

Qluon
/

lbw-guard-direct-runner

Running

App Files Community

Radianis committed 2 days ago

Commit

ff99487

1 Parent(s): df976c3

Guard CPU runner settings and stream status

Browse files

Files changed (1) hide show

app.py +44 -4

app.py CHANGED Viewed

@@ -161,12 +161,29 @@ def run_demo(
     grad_accum: int,
     seed: int,
     run_lbw_guard: bool,
-) -> tuple[str, str | None, str | None]:
     if not run_lbw_guard:
         optimizers = ["adamw"]
     else:
         optimizers = ["adamw", "lbw_guard"]
     device = _device_default()
     config = _build_config(
         model_name=model_name,
         steps=steps,
@@ -186,12 +203,24 @@ def run_demo(
     log_buffer = io.StringIO()
     try:
         results = []
         with redirect_stdout(log_buffer):
             for optimizer_name in optimizers:
                 normalized = runtime.normalize_optimizer_name(optimizer_name)
                 ok, reason = runtime.check_optimizer_support(normalized, device=config.device)
                 if not ok:
                     raise RuntimeError(f"{normalized}: {reason}")
                 runtime.set_seed(int(seed), device=config.device)
                 run_config = runtime.BenchmarkConfig(**config.__dict__)
                 run_name = f"{normalized}_{int(time.time())}"
@@ -203,6 +232,15 @@ def run_demo(
                 )
                 result["optimizer"] = normalized
                 results.append(result)
                 gc.collect()
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
@@ -267,12 +305,12 @@ def run_demo(
                 if speedup is not None:
                     summary.append(f"- `{gain.get('optimizer')}` wall tokens/s speedup: `{speedup:.3f}x`.")
         summary.extend(["", "## Runtime Log", "", "```text", log_buffer.getvalue()[-8000:], "```"])
-        return "\n".join(summary), str(json_path), str(csv_path)
     except Exception:
         error_text = traceback.format_exc()
         error_path = run_dir / "error.txt"
         error_path.write_text(error_text + "\n\n" + log_buffer.getvalue(), encoding="utf-8")
-        return f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None
 INTRO = """
@@ -281,6 +319,8 @@ INTRO = """
 Run a compact AdamW vs `lbw_guard` LoRA smoke test directly inside this Hugging Face Space.
 Use GPU hardware for real runs. CPU mode is best treated as an import/build check.
 """
@@ -290,7 +330,7 @@ with gr.Blocks(title="LBW Guard Direct Runner") as demo:
         model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
         run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
     with gr.Row():
-        steps = gr.Slider(1, 100, value=5, step=1, label="Optimizer steps")
         lr = gr.Number(value=5e-4, label="Learning rate")
         seed = gr.Number(value=42, precision=0, label="Seed")
     with gr.Row():

     grad_accum: int,
     seed: int,
     run_lbw_guard: bool,
+) -> Any:
     if not run_lbw_guard:
         optimizers = ["adamw"]
     else:
         optimizers = ["adamw", "lbw_guard"]
     device = _device_default()
+    if device == "cpu" and int(steps) > 3:
+        yield (
+            "This Space is currently running on `cpu-basic`. "
+            "For CPU smoke checks, use `1-3` steps. For larger runs, switch the Space hardware to GPU first.",
+            None,
+            None,
+        )
+        return
+    if device == "cpu" and run_lbw_guard and int(steps) > 1:
+        yield (
+            "This Space is currently running on `cpu-basic`. "
+            "An AdamW + LBW comparison runs two full model passes, so CPU mode is capped at `1` step when comparison is enabled.",
+            None,
+            None,
+        )
+        return
     config = _build_config(
         model_name=model_name,
         steps=steps,
     log_buffer = io.StringIO()
     try:
         results = []
+        yield (
+            f"Starting run on `{device}` with `{int(steps)}` optimizer step(s) for `{', '.join(optimizers)}`.\n\n"
+            "The first run may spend time downloading the model and WikiText dataset.",
+            None,
+            None,
+        )
         with redirect_stdout(log_buffer):
             for optimizer_name in optimizers:
                 normalized = runtime.normalize_optimizer_name(optimizer_name)
                 ok, reason = runtime.check_optimizer_support(normalized, device=config.device)
                 if not ok:
                     raise RuntimeError(f"{normalized}: {reason}")
+                yield (
+                    f"Running `{normalized}` on `{device}`...\n\n"
+                    "Progress inside the optimizer loop is written to the Space logs and will appear here when this phase completes.",
+                    None,
+                    None,
+                )
                 runtime.set_seed(int(seed), device=config.device)
                 run_config = runtime.BenchmarkConfig(**config.__dict__)
                 run_name = f"{normalized}_{int(time.time())}"
                 )
                 result["optimizer"] = normalized
                 results.append(result)
+                partial_rows = [_result_row(item) for item in results]
+                next_message = "Preparing the next phase..." if len(results) < len(optimizers) else "Preparing final metrics..."
+                yield (
+                    f"Completed `{normalized}`.\n\n"
+                    f"Finished phases: `{', '.join(str(row.get('optimizer')) for row in partial_rows)}`\n\n"
+                    f"{next_message}",
+                    None,
+                    None,
+                )
                 gc.collect()
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 if speedup is not None:
                     summary.append(f"- `{gain.get('optimizer')}` wall tokens/s speedup: `{speedup:.3f}x`.")
         summary.extend(["", "## Runtime Log", "", "```text", log_buffer.getvalue()[-8000:], "```"])
+        yield "\n".join(summary), str(json_path), str(csv_path)
     except Exception:
         error_text = traceback.format_exc()
         error_path = run_dir / "error.txt"
         error_path.write_text(error_text + "\n\n" + log_buffer.getvalue(), encoding="utf-8")
+        yield f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None
 INTRO = """
 Run a compact AdamW vs `lbw_guard` LoRA smoke test directly inside this Hugging Face Space.
 Use GPU hardware for real runs. CPU mode is best treated as an import/build check.
+If the Space says `cpu-basic`, keep smoke tests to `1` step or change hardware to a GPU before running larger jobs.
 """
         model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
         run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
     with gr.Row():
+        steps = gr.Slider(1, 20, value=1, step=1, label="Optimizer steps")
         lr = gr.Number(value=5e-4, label="Learning rate")
         seed = gr.Number(value=42, precision=0, label="Seed")
     with gr.Row():