Radianis committed on
Commit
ff99487
·
1 Parent(s): df976c3

Guard CPU runner settings and stream status

Browse files
Files changed (1) hide show
  1. app.py +44 -4
app.py CHANGED
@@ -161,12 +161,29 @@ def run_demo(
161
  grad_accum: int,
162
  seed: int,
163
  run_lbw_guard: bool,
164
- ) -> tuple[str, str | None, str | None]:
165
  if not run_lbw_guard:
166
  optimizers = ["adamw"]
167
  else:
168
  optimizers = ["adamw", "lbw_guard"]
169
  device = _device_default()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  config = _build_config(
171
  model_name=model_name,
172
  steps=steps,
@@ -186,12 +203,24 @@ def run_demo(
186
  log_buffer = io.StringIO()
187
  try:
188
  results = []
 
 
 
 
 
 
189
  with redirect_stdout(log_buffer):
190
  for optimizer_name in optimizers:
191
  normalized = runtime.normalize_optimizer_name(optimizer_name)
192
  ok, reason = runtime.check_optimizer_support(normalized, device=config.device)
193
  if not ok:
194
  raise RuntimeError(f"{normalized}: {reason}")
 
 
 
 
 
 
195
  runtime.set_seed(int(seed), device=config.device)
196
  run_config = runtime.BenchmarkConfig(**config.__dict__)
197
  run_name = f"{normalized}_{int(time.time())}"
@@ -203,6 +232,15 @@ def run_demo(
203
  )
204
  result["optimizer"] = normalized
205
  results.append(result)
 
 
 
 
 
 
 
 
 
206
  gc.collect()
207
  if torch.cuda.is_available():
208
  torch.cuda.empty_cache()
@@ -267,12 +305,12 @@ def run_demo(
267
  if speedup is not None:
268
  summary.append(f"- `{gain.get('optimizer')}` wall tokens/s speedup: `{speedup:.3f}x`.")
269
  summary.extend(["", "## Runtime Log", "", "```text", log_buffer.getvalue()[-8000:], "```"])
270
- return "\n".join(summary), str(json_path), str(csv_path)
271
  except Exception:
272
  error_text = traceback.format_exc()
273
  error_path = run_dir / "error.txt"
274
  error_path.write_text(error_text + "\n\n" + log_buffer.getvalue(), encoding="utf-8")
275
- return f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None
276
 
277
 
278
  INTRO = """
@@ -281,6 +319,8 @@ INTRO = """
281
  Run a compact AdamW vs `lbw_guard` LoRA smoke test directly inside this Hugging Face Space.
282
 
283
  Use GPU hardware for real runs. CPU mode is best treated as an import/build check.
 
 
284
  """
285
 
286
 
@@ -290,7 +330,7 @@ with gr.Blocks(title="LBW Guard Direct Runner") as demo:
290
  model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
291
  run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
292
  with gr.Row():
293
- steps = gr.Slider(1, 100, value=5, step=1, label="Optimizer steps")
294
  lr = gr.Number(value=5e-4, label="Learning rate")
295
  seed = gr.Number(value=42, precision=0, label="Seed")
296
  with gr.Row():
 
161
  grad_accum: int,
162
  seed: int,
163
  run_lbw_guard: bool,
164
+ ) -> Any:
165
  if not run_lbw_guard:
166
  optimizers = ["adamw"]
167
  else:
168
  optimizers = ["adamw", "lbw_guard"]
169
  device = _device_default()
170
+ if device == "cpu" and int(steps) > 3:
171
+ yield (
172
+ "This Space is currently running on `cpu-basic`. "
173
+ "For CPU smoke checks, use `1-3` steps. For larger runs, switch the Space hardware to GPU first.",
174
+ None,
175
+ None,
176
+ )
177
+ return
178
+ if device == "cpu" and run_lbw_guard and int(steps) > 1:
179
+ yield (
180
+ "This Space is currently running on `cpu-basic`. "
181
+ "An AdamW + LBW comparison runs two full model passes, so CPU mode is capped at `1` step when comparison is enabled.",
182
+ None,
183
+ None,
184
+ )
185
+ return
186
+
187
  config = _build_config(
188
  model_name=model_name,
189
  steps=steps,
 
203
  log_buffer = io.StringIO()
204
  try:
205
  results = []
206
+ yield (
207
+ f"Starting run on `{device}` with `{int(steps)}` optimizer step(s) for `{', '.join(optimizers)}`.\n\n"
208
+ "The first run may spend time downloading the model and WikiText dataset.",
209
+ None,
210
+ None,
211
+ )
212
  with redirect_stdout(log_buffer):
213
  for optimizer_name in optimizers:
214
  normalized = runtime.normalize_optimizer_name(optimizer_name)
215
  ok, reason = runtime.check_optimizer_support(normalized, device=config.device)
216
  if not ok:
217
  raise RuntimeError(f"{normalized}: {reason}")
218
+ yield (
219
+ f"Running `{normalized}` on `{device}`...\n\n"
220
+ "Progress inside the optimizer loop is written to the Space logs and will appear here when this phase completes.",
221
+ None,
222
+ None,
223
+ )
224
  runtime.set_seed(int(seed), device=config.device)
225
  run_config = runtime.BenchmarkConfig(**config.__dict__)
226
  run_name = f"{normalized}_{int(time.time())}"
 
232
  )
233
  result["optimizer"] = normalized
234
  results.append(result)
235
+ partial_rows = [_result_row(item) for item in results]
236
+ next_message = "Preparing the next phase..." if len(results) < len(optimizers) else "Preparing final metrics..."
237
+ yield (
238
+ f"Completed `{normalized}`.\n\n"
239
+ f"Finished phases: `{', '.join(str(row.get('optimizer')) for row in partial_rows)}`\n\n"
240
+ f"{next_message}",
241
+ None,
242
+ None,
243
+ )
244
  gc.collect()
245
  if torch.cuda.is_available():
246
  torch.cuda.empty_cache()
 
305
  if speedup is not None:
306
  summary.append(f"- `{gain.get('optimizer')}` wall tokens/s speedup: `{speedup:.3f}x`.")
307
  summary.extend(["", "## Runtime Log", "", "```text", log_buffer.getvalue()[-8000:], "```"])
308
+ yield "\n".join(summary), str(json_path), str(csv_path)
309
  except Exception:
310
  error_text = traceback.format_exc()
311
  error_path = run_dir / "error.txt"
312
  error_path.write_text(error_text + "\n\n" + log_buffer.getvalue(), encoding="utf-8")
313
+ yield f"Run failed.\n\n```text\n{error_text}\n```", str(error_path), None
314
 
315
 
316
  INTRO = """
 
319
  Run a compact AdamW vs `lbw_guard` LoRA smoke test directly inside this Hugging Face Space.
320
 
321
  Use GPU hardware for real runs. CPU mode is best treated as an import/build check.
322
+
323
+ If the Space says `cpu-basic`, keep smoke tests to `1` step or change hardware to a GPU before running larger jobs.
324
  """
325
 
326
 
 
330
  model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Model")
331
  run_lbw_guard = gr.Checkbox(value=True, label="Run LBW Guard comparison")
332
  with gr.Row():
333
+ steps = gr.Slider(1, 20, value=1, step=1, label="Optimizer steps")
334
  lr = gr.Number(value=5e-4, label="Learning rate")
335
  seed = gr.Number(value=42, precision=0, label="Seed")
336
  with gr.Row():