Radianis committed on
Commit
f7a0325
·
0 Parent(s):

Add LBW Guard Colab Space

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ .DS_Store
4
+ .env
LBW_Guard_Ablation_Test_COLAB.ipynb ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "Copyright (c) Qluon Inc. All rights reserved.\n",
8
+ "\n",
9
+ "Provided for Learn-By-Wire Guard evaluation and customer testing under the applicable Qluon license terms.\n",
10
+ "\n",
11
+ "# LBW Guard Ablation Colab\n",
12
+ "\n",
13
+ "This notebook is a black-box ablation test for `lbw_guard` in a lighter Colab form:\n",
14
+ "\n",
15
+ "1. Build one or more ablation scenarios.\n",
16
+ "2. Run the same model, data slice, and training loop for `adamw` and `lbw_guard`.\n",
17
+ "3. Write common metrics and LBW-vs-AdamW gain tables.\n",
18
+ "\n",
19
+ "It does not import local source folders. The only LBW code used is the installed `LBW-Guard` package that provides `lbw.Guard`.\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# @title 1. Install public dependencies and LBW Guard\n",
29
+ "import subprocess\n",
30
+ "import sys\n",
31
+ "\n",
32
+ "public_deps = [\n",
33
+ " \"transformers>=4.45\",\n",
34
+ " \"datasets>=2.20\",\n",
35
+ " \"peft>=0.12\",\n",
36
+ " \"accelerate>=0.33\",\n",
37
+ " \"sentencepiece\",\n",
38
+ " \"pandas\",\n",
39
+ "]\n",
40
+ "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"--upgrade\", *public_deps])\n",
41
+ "\n",
42
+ "# Colab can include an old torchao build. Newer PEFT versions reject it,\n",
43
+ "# and this notebook does not need torchao for LoRA, so remove it if present.\n",
44
+ "subprocess.call([sys.executable, \"-m\", \"pip\", \"uninstall\", \"-y\", \"-q\", \"torchao\"])\n",
45
+ "\n",
46
+ "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"LBW-Guard\"])\n",
47
+ "print(\"Dependency install complete. If this cell changed packages, restart runtime and run all cells once.\")\n"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# @title 2. Configure ablation plan\n",
57
+ "import importlib\n",
58
+ "from copy import deepcopy\n",
59
+ "\n",
60
+ "import torch\n",
61
+ "\n",
62
+ "lbw = importlib.import_module(\"lbw\")\n",
63
+ "print(\"lbw module:\", lbw.__file__)\n",
64
+ "print(\"lbw.Guard:\", lbw.Guard)\n",
65
+ "\n",
66
+ "MODEL_NAME = \"Qwen/Qwen2.5-0.5B\"\n",
67
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
68
+ "OPTIMIZERS = [\"adamw\", \"lbw_guard\"]\n",
69
+ "\n",
70
+ "# Keep the default close to the local ablation test objective, but small enough for Colab.\n",
71
+ "# Add \"lr\", \"schedule\", \"steps\", \"data\", or \"lora\" for a wider matrix.\n",
72
+ "ABLATIONS = [\"optimizer\"]\n",
73
+ "\n",
74
+ "BASE_CONFIG = {\n",
75
+ " \"seed\": 42,\n",
76
+ " \"max_steps\": 200,\n",
77
+ " \"eval_every\": 50,\n",
78
+ " \"eval_batches\": 8,\n",
79
+ " \"seq_len\": 64,\n",
80
+ " \"batch_size\": 1,\n",
81
+ " \"max_chars\": 20000,\n",
82
+ " \"eval_chars\": 8000,\n",
83
+ " \"full_wikitext_train\": False,\n",
84
+ " \"full_wikitext_eval\": False,\n",
85
+ " \"full_validation_ppl\": False,\n",
86
+ " \"lr\": 5e-4,\n",
87
+ " \"betas\": (0.9, 0.999),\n",
88
+ " \"weight_decay\": 0.01,\n",
89
+ " \"warmup_steps\": 10,\n",
90
+ " \"schedule_mode\": \"constant\", # constant or cosine\n",
91
+ " \"lora_r\": 8,\n",
92
+ " \"lora_alpha\": 16,\n",
93
+ " \"lora_dropout\": 0.05,\n",
94
+ " \"lbw_stats_freq\": 10,\n",
95
+ " \"lbw_stress_th\": 1.1,\n",
96
+ " \"lbw_spike_th\": 1.5,\n",
97
+ " \"lbw_rec_fast\": 0.01,\n",
98
+ " \"lbw_ema_decay\": 0.95,\n",
99
+ "}\n",
100
+ "\n",
101
+ "LR_SWEEP = [1e-3, 5e-4]\n",
102
+ "SCHEDULE_SWEEP = [\"constant\", \"cosine\"]\n",
103
+ "STEP_SWEEP = [100, 200]\n",
104
+ "DATA_SWEEP = [\n",
105
+ " {\"max_chars\": 20000, \"eval_chars\": 8000, \"label\": \"small-data\"},\n",
106
+ " {\"max_chars\": 80000, \"eval_chars\": 20000, \"label\": \"larger-data\"},\n",
107
+ "]\n",
108
+ "LORA_R_SWEEP = [4, 8, 16]\n",
109
+ "\n",
110
+ "print(\"Device:\", DEVICE)\n",
111
+ "if DEVICE == \"cuda\":\n",
112
+ " print(\"GPU:\", torch.cuda.get_device_name(0))\n",
113
+ "print(\"Selected ablations:\", ABLATIONS)\n",
114
+ "print(\"Default optimizer steps:\", BASE_CONFIG[\"max_steps\"])\n"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# @title 3. Define ablation scenarios\n",
124
+ "import pandas as pd\n",
125
+ "\n",
126
+ "\n",
127
+ "def scenario(slug, label, note, overrides=None):\n",
128
+ " cfg = deepcopy(BASE_CONFIG)\n",
129
+ " if overrides:\n",
130
+ " cfg.update(overrides)\n",
131
+ " return {\n",
132
+ " \"slug\": slug,\n",
133
+ " \"label\": label,\n",
134
+ " \"note\": note,\n",
135
+ " \"config\": cfg,\n",
136
+ " \"optimizers\": list(OPTIMIZERS),\n",
137
+ " }\n",
138
+ "\n",
139
+ "\n",
140
+ "def build_scenarios():\n",
141
+ " selected = {str(item).strip().lower() for item in ABLATIONS}\n",
142
+ " scenarios = []\n",
143
+ "\n",
144
+ " if \"optimizer\" in selected:\n",
145
+ " scenarios.append(scenario(\n",
146
+ " \"optimizer-adamw-vs-lbw-guard\",\n",
147
+ " \"Optimizer: AdamW vs lbw_guard\",\n",
148
+ " \"Direct optimizer comparison with the base config.\",\n",
149
+ " ))\n",
150
+ "\n",
151
+ " if \"lr\" in selected:\n",
152
+ " for lr in LR_SWEEP:\n",
153
+ " scenarios.append(scenario(\n",
154
+ " f\"lr-{lr:g}\",\n",
155
+ " f\"Learning Rate: {lr:g}\",\n",
156
+ " \"Learning-rate sensitivity check.\",\n",
157
+ " {\"lr\": float(lr)},\n",
158
+ " ))\n",
159
+ "\n",
160
+ " if \"schedule\" in selected:\n",
161
+ " for mode in SCHEDULE_SWEEP:\n",
162
+ " scenarios.append(scenario(\n",
163
+ " f\"schedule-{mode}\",\n",
164
+ " f\"Schedule: {mode}\",\n",
165
+ " \"Scheduler-shape sensitivity check.\",\n",
166
+ " {\"schedule_mode\": mode},\n",
167
+ " ))\n",
168
+ "\n",
169
+ " if \"steps\" in selected:\n",
170
+ " for steps in STEP_SWEEP:\n",
171
+ " scenarios.append(scenario(\n",
172
+ " f\"steps-{steps}\",\n",
173
+ " f\"Steps: {steps}\",\n",
174
+ " \"Training-length sensitivity check.\",\n",
175
+ " {\"max_steps\": int(steps), \"eval_every\": max(1, int(steps) // 4)},\n",
176
+ " ))\n",
177
+ "\n",
178
+ " if \"data\" in selected:\n",
179
+ " for item in DATA_SWEEP:\n",
180
+ " label = item.get(\"label\", f\"data-{item['max_chars']}\")\n",
181
+ " scenarios.append(scenario(\n",
182
+ " label,\n",
183
+ " f\"Data Slice: {label}\",\n",
184
+ " \"WikiText slice-size sensitivity check.\",\n",
185
+ " {\"max_chars\": int(item[\"max_chars\"]), \"eval_chars\": int(item[\"eval_chars\"])},\n",
186
+ " ))\n",
187
+ "\n",
188
+ " if \"lora\" in selected:\n",
189
+ " for rank in LORA_R_SWEEP:\n",
190
+ " scenarios.append(scenario(\n",
191
+ " f\"lora-r{rank}\",\n",
192
+ " f\"LoRA Rank: {rank}\",\n",
193
+ " \"Adapter-capacity sensitivity check.\",\n",
194
+ " {\"lora_r\": int(rank), \"lora_alpha\": int(rank) * 2},\n",
195
+ " ))\n",
196
+ "\n",
197
+ " if not scenarios:\n",
198
+ " raise ValueError(\"No scenarios selected. Set ABLATIONS to include optimizer, lr, schedule, steps, data, or lora.\")\n",
199
+ " return scenarios\n",
200
+ "\n",
201
+ "\n",
202
+ "SCENARIOS = build_scenarios()\n",
203
+ "plan_rows = []\n",
204
+ "for item in SCENARIOS:\n",
205
+ " cfg = item[\"config\"]\n",
206
+ " plan_rows.append({\n",
207
+ " \"scenario\": item[\"label\"],\n",
208
+ " \"optimizers\": \",\".join(item[\"optimizers\"]),\n",
209
+ " \"steps\": cfg[\"max_steps\"],\n",
210
+ " \"lr\": cfg[\"lr\"],\n",
211
+ " \"schedule\": cfg[\"schedule_mode\"],\n",
212
+ " \"train_chars\": \"FULL\" if cfg[\"full_wikitext_train\"] else cfg[\"max_chars\"],\n",
213
+ " \"eval_chars\": \"FULL\" if cfg[\"full_wikitext_eval\"] else cfg[\"eval_chars\"],\n",
214
+ " \"lora_r\": cfg[\"lora_r\"],\n",
215
+ " \"note\": item[\"note\"],\n",
216
+ " })\n",
217
+ "\n",
218
+ "plan_df = pd.DataFrame(plan_rows)\n",
219
+ "display(plan_df)\n"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": null,
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "# @title 4. Define data, model, optimizer, and metric helpers\n",
229
+ "import gc\n",
230
+ "import math\n",
231
+ "import random\n",
232
+ "import time\n",
233
+ "\n",
234
+ "from datasets import load_dataset\n",
235
+ "from peft import LoraConfig, TaskType, get_peft_model\n",
236
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
237
+ "\n",
238
+ "TOKENIZER = None\n",
239
+ "DATA_CACHE = {}\n",
240
+ "\n",
241
+ "\n",
242
+ "def set_seed(seed):\n",
243
+ " random.seed(int(seed))\n",
244
+ " torch.manual_seed(int(seed))\n",
245
+ " if torch.cuda.is_available():\n",
246
+ " torch.cuda.manual_seed_all(int(seed))\n",
247
+ "\n",
248
+ "\n",
249
+ "def get_tokenizer():\n",
250
+ " global TOKENIZER\n",
251
+ " if TOKENIZER is None:\n",
252
+ " TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)\n",
253
+ " if TOKENIZER.pad_token is None:\n",
254
+ " TOKENIZER.pad_token = TOKENIZER.eos_token\n",
255
+ " return TOKENIZER\n",
256
+ "\n",
257
+ "\n",
258
+ "def build_wikitext_chunks(tokenizer, split, seq_len, max_chars):\n",
259
+ " cap = None if max_chars is None else int(max_chars)\n",
260
+ " print(f\"Preparing WikiText split={split!r}\" + (f\" with char cap {cap:,}\" if cap is not None else \" with full split\"))\n",
261
+ " ds = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=split)\n",
262
+ " pieces = []\n",
263
+ " chars_used = 0\n",
264
+ " rows_used = 0\n",
265
+ " first_piece = True\n",
266
+ " for row in ds:\n",
267
+ " text = str(row.get(\"text\", \"\") or \"\")\n",
268
+ " if not text.strip():\n",
269
+ " continue\n",
270
+ " piece = text if first_piece else \" \" + text\n",
271
+ " if cap is not None:\n",
272
+ " remain = cap - chars_used\n",
273
+ " if remain <= 0:\n",
274
+ " break\n",
275
+ " if len(piece) > remain:\n",
276
+ " piece = piece[:remain]\n",
277
+ " pieces.append(piece)\n",
278
+ " chars_used += len(piece)\n",
279
+ " rows_used += 1\n",
280
+ " first_piece = False\n",
281
+ " if cap is not None and chars_used >= cap:\n",
282
+ " break\n",
283
+ " text = \"\".join(pieces)\n",
284
+ " token_ids = tokenizer(text, add_special_tokens=False)[\"input_ids\"]\n",
285
+ " ids = torch.tensor(token_ids, dtype=torch.long)\n",
286
+ " n = ids.numel() // int(seq_len)\n",
287
+ " if n <= 0:\n",
288
+ " raise RuntimeError(\"Not enough tokens. Increase max_chars or reduce seq_len.\")\n",
289
+ " ids = ids[: n * int(seq_len)].view(n, int(seq_len)).contiguous()\n",
290
+ " print(f\"Prepared split={split!r}: {chars_used:,} chars across {rows_used:,} rows -> {ids.size(0):,} sequences\")\n",
291
+ " return {\"input_ids\": ids, \"chars\": chars_used, \"rows\": rows_used, \"cap\": cap, \"seq_len\": int(seq_len)}\n",
292
+ "\n",
293
+ "\n",
294
+ "def get_chunks(cfg):\n",
295
+ " tokenizer = get_tokenizer()\n",
296
+ " train_cap = None if cfg[\"full_wikitext_train\"] else int(cfg[\"max_chars\"])\n",
297
+ " eval_cap = None if cfg[\"full_wikitext_eval\"] else int(cfg[\"eval_chars\"])\n",
298
+ " key = (int(cfg[\"seq_len\"]), train_cap, eval_cap)\n",
299
+ " if key not in DATA_CACHE:\n",
300
+ " DATA_CACHE[key] = {\n",
301
+ " \"train\": build_wikitext_chunks(tokenizer, \"train\", cfg[\"seq_len\"], train_cap),\n",
302
+ " \"eval\": build_wikitext_chunks(tokenizer, \"validation\", cfg[\"seq_len\"], eval_cap),\n",
303
+ " }\n",
304
+ " return DATA_CACHE[key][\"train\"], DATA_CACHE[key][\"eval\"]\n",
305
+ "\n",
306
+ "\n",
307
+ "def batch_iter(chunks, batch_size):\n",
308
+ " ids = chunks[\"input_ids\"]\n",
309
+ " i = 0\n",
310
+ " batch_size = int(batch_size)\n",
311
+ " while True:\n",
312
+ " if i + batch_size > ids.size(0):\n",
313
+ " i = 0\n",
314
+ " batch = ids[i : i + batch_size].to(DEVICE, non_blocking=True)\n",
315
+ " i += batch_size\n",
316
+ " yield batch\n",
317
+ "\n",
318
+ "\n",
319
+ "def load_lora_model(cfg):\n",
320
+ " dtype = torch.float16 if DEVICE == \"cuda\" else torch.float32\n",
321
+ " model = AutoModelForCausalLM.from_pretrained(\n",
322
+ " MODEL_NAME,\n",
323
+ " dtype=dtype,\n",
324
+ " low_cpu_mem_usage=True,\n",
325
+ " use_safetensors=True,\n",
326
+ " )\n",
327
+ " if getattr(model.config, \"use_cache\", None) is not None:\n",
328
+ " model.config.use_cache = False\n",
329
+ " model.to(DEVICE)\n",
330
+ " lora_cfg = LoraConfig(\n",
331
+ " r=int(cfg[\"lora_r\"]),\n",
332
+ " lora_alpha=int(cfg[\"lora_alpha\"]),\n",
333
+ " lora_dropout=float(cfg[\"lora_dropout\"]),\n",
334
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
335
+ " task_type=TaskType.CAUSAL_LM,\n",
336
+ " bias=\"none\",\n",
337
+ " )\n",
338
+ " return get_peft_model(model, lora_cfg)\n",
339
+ "\n",
340
+ "\n",
341
+ "def make_optimizer(name, model, cfg):\n",
342
+ " params = [p for p in model.parameters() if p.requires_grad]\n",
343
+ " if name == \"adamw\":\n",
344
+ " return torch.optim.AdamW(params, lr=float(cfg[\"lr\"]), betas=tuple(cfg[\"betas\"]), weight_decay=float(cfg[\"weight_decay\"]))\n",
345
+ " if name == \"lbw_guard\":\n",
346
+ " return lbw.Guard(\n",
347
+ " params,\n",
348
+ " lr=float(cfg[\"lr\"]),\n",
349
+ " betas=tuple(cfg[\"betas\"]),\n",
350
+ " weight_decay=float(cfg[\"weight_decay\"]),\n",
351
+ " mode=\"eval\",\n",
352
+ " auto_enabled=True,\n",
353
+ " stats_freq=int(cfg[\"lbw_stats_freq\"]),\n",
354
+ " stress_threshold=float(cfg[\"lbw_stress_th\"]),\n",
355
+ " spike_threshold=float(cfg[\"lbw_spike_th\"]),\n",
356
+ " recovery_fast=float(cfg[\"lbw_rec_fast\"]),\n",
357
+ " ema_decay=float(cfg[\"lbw_ema_decay\"]),\n",
358
+ " use_max_rms=True,\n",
359
+ " )\n",
360
+ " raise ValueError(f\"Unknown optimizer: {name}\")\n",
361
+ "\n",
362
+ "\n",
363
+ "def scheduled_lr(cfg, step):\n",
364
+ " base_lr = float(cfg[\"lr\"])\n",
365
+ " warmup = max(int(cfg.get(\"warmup_steps\", 0)), 0)\n",
366
+ " max_steps = max(int(cfg[\"max_steps\"]), 1)\n",
367
+ " if warmup > 0 and step <= warmup:\n",
368
+ " return base_lr * float(step) / float(warmup)\n",
369
+ " mode = str(cfg.get(\"schedule_mode\", \"constant\")).lower()\n",
370
+ " if mode == \"cosine\":\n",
371
+ " progress = (step - warmup) / max(max_steps - warmup, 1)\n",
372
+ " progress = min(max(progress, 0.0), 1.0)\n",
373
+ " return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))\n",
374
+ " return base_lr\n",
375
+ "\n",
376
+ "\n",
377
+ "def set_lr(opt, value):\n",
378
+ " for group in opt.param_groups:\n",
379
+ " group[\"lr\"] = float(value)\n",
380
+ "\n",
381
+ "\n",
382
+ "@torch.no_grad()\n",
383
+ "def evaluate_ppl(model, eval_chunks, cfg, full_pass=False):\n",
384
+ " model.eval()\n",
385
+ " ids = eval_chunks[\"input_ids\"]\n",
386
+ " batch_size = int(cfg[\"batch_size\"])\n",
387
+ " max_sequences = ids.size(0) if full_pass else min(ids.size(0), int(cfg[\"eval_batches\"]) * batch_size)\n",
388
+ " losses = []\n",
389
+ " for start in range(0, max_sequences, batch_size):\n",
390
+ " xb = ids[start : start + batch_size].to(DEVICE, non_blocking=True)\n",
391
+ " with torch.autocast(device_type=DEVICE, dtype=torch.float16, enabled=(DEVICE == \"cuda\")):\n",
392
+ " loss = model(input_ids=xb, labels=xb).loss\n",
393
+ " losses.append(float(loss.detach().cpu()))\n",
394
+ " avg_loss = sum(losses) / max(len(losses), 1)\n",
395
+ " return avg_loss, math.exp(min(avg_loss, 20.0))\n",
396
+ "\n",
397
+ "\n",
398
+ "def optimizer_state(opt):\n",
399
+ " state = dict(getattr(opt, \"state\", {}).get(\"lbw\", {}) or {})\n",
400
+ " return {\n",
401
+ " \"scale\": float(state.get(\"scale\", state.get(\"lbw_scale\", 1.0))),\n",
402
+ " \"ratio\": float(state.get(\"ratio\", 1.0)),\n",
403
+ " \"stress_mode\": str(state.get(\"stress_mode\", \"none\")),\n",
404
+ " }\n"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "id": "107c58b1",
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "# @title 5. Run ablation matrix\n",
415
+ "\n",
416
+ "def run_one_optimizer(scenario_item, optimizer_name):\n",
417
+ " cfg = scenario_item[\"config\"]\n",
418
+ " train_chunks, eval_chunks = get_chunks(cfg)\n",
419
+ " set_seed(cfg[\"seed\"])\n",
420
+ " model = load_lora_model(cfg)\n",
421
+ " model.train()\n",
422
+ " opt = make_optimizer(optimizer_name, model, cfg)\n",
423
+ " train_batches = batch_iter(train_chunks, cfg[\"batch_size\"])\n",
424
+ "\n",
425
+ " start_time = time.time()\n",
426
+ " losses = []\n",
427
+ " eval_loss = None\n",
428
+ " eval_ppl = None\n",
429
+ " last_lr = float(cfg[\"lr\"])\n",
430
+ "\n",
431
+ " for step in range(1, int(cfg[\"max_steps\"]) + 1):\n",
432
+ " last_lr = scheduled_lr(cfg, step)\n",
433
+ " set_lr(opt, last_lr)\n",
434
+ " xb = next(train_batches)\n",
435
+ " with torch.autocast(device_type=DEVICE, dtype=torch.float16, enabled=(DEVICE == \"cuda\")):\n",
436
+ " loss = model(input_ids=xb, labels=xb).loss\n",
437
+ " loss.backward()\n",
438
+ " torch.nn.utils.clip_grad_norm_([p for p in model.parameters() if p.requires_grad], 1.0)\n",
439
+ " opt.step()\n",
440
+ " opt.zero_grad(set_to_none=True)\n",
441
+ " loss_value = float(loss.detach().cpu())\n",
442
+ " losses.append(loss_value)\n",
443
+ "\n",
444
+ " if step == 1 or step == int(cfg[\"max_steps\"]) or step % int(cfg[\"eval_every\"]) == 0:\n",
445
+ " eval_loss, eval_ppl = evaluate_ppl(model, eval_chunks, cfg, full_pass=False)\n",
446
+ " state = optimizer_state(opt)\n",
447
+ " print(\n",
448
+ " f\"[{scenario_item['slug']}] {optimizer_name} step {step}/{cfg['max_steps']}: \"\n",
449
+ " f\"loss={loss_value:.4f}, sampled_eval_ppl={eval_ppl:.4f}, \"\n",
450
+ " f\"lr={last_lr:.2e}, scale={state['scale']:.4f}, ratio={state['ratio']:.4f}\"\n",
451
+ " )\n",
452
+ " model.train()\n",
453
+ "\n",
454
+ " final_full_pass = bool(cfg[\"full_validation_ppl\"])\n",
455
+ " final_scope = \"full_wikitext\" if final_full_pass and eval_chunks[\"cap\"] is None else (\"full_loaded_subset\" if final_full_pass else \"sampled\")\n",
456
+ " final_loss, final_ppl = evaluate_ppl(model, eval_chunks, cfg, full_pass=final_full_pass)\n",
457
+ " state = optimizer_state(opt)\n",
458
+ " wall_time = max(time.time() - start_time, 1e-9)\n",
459
+ " trained_tokens = int(cfg[\"max_steps\"]) * int(cfg[\"batch_size\"]) * int(cfg[\"seq_len\"])\n",
460
+ "\n",
461
+ " result = {\n",
462
+ " \"scenario_slug\": scenario_item[\"slug\"],\n",
463
+ " \"scenario\": scenario_item[\"label\"],\n",
464
+ " \"optimizer\": optimizer_name,\n",
465
+ " \"final_eval_ppl\": final_ppl,\n",
466
+ " \"final_eval_loss\": final_loss,\n",
467
+ " \"train_loss_last\": losses[-1] if losses else None,\n",
468
+ " \"final_eval_scope\": final_scope,\n",
469
+ " \"max_steps\": int(cfg[\"max_steps\"]),\n",
470
+ " \"lr\": float(cfg[\"lr\"]),\n",
471
+ " \"scheduled_lr_last\": float(last_lr),\n",
472
+ " \"schedule_mode\": cfg[\"schedule_mode\"],\n",
473
+ " \"batch_size\": int(cfg[\"batch_size\"]),\n",
474
+ " \"seq_len\": int(cfg[\"seq_len\"]),\n",
475
+ " \"lora_r\": int(cfg[\"lora_r\"]),\n",
476
+ " \"train_chars\": int(train_chunks[\"chars\"]),\n",
477
+ " \"eval_chars\": int(eval_chunks[\"chars\"]),\n",
478
+ " \"train_sequences\": int(train_chunks[\"input_ids\"].size(0)),\n",
479
+ " \"eval_sequences\": int(eval_chunks[\"input_ids\"].size(0)),\n",
480
+ " \"scale\": state[\"scale\"],\n",
481
+ " \"ratio\": state[\"ratio\"],\n",
482
+ " \"stress_mode\": state[\"stress_mode\"],\n",
483
+ " \"wall_time_sec\": wall_time,\n",
484
+ " \"tokens_per_sec_wall\": trained_tokens / wall_time,\n",
485
+ " }\n",
486
+ "\n",
487
+ " del model, opt\n",
488
+ " gc.collect()\n",
489
+ " if DEVICE == \"cuda\":\n",
490
+ " torch.cuda.empty_cache()\n",
491
+ " return result\n",
492
+ "\n",
493
+ "\n",
494
+ "results = []\n",
495
+ "for scenario_item in SCENARIOS:\n",
496
+ " print(\"\\n=== Scenario:\", scenario_item[\"label\"], \"===\")\n",
497
+ " for optimizer_name in scenario_item[\"optimizers\"]:\n",
498
+ " print(\"\\n---\", optimizer_name, \"---\")\n",
499
+ " results.append(run_one_optimizer(scenario_item, optimizer_name))\n",
500
+ "\n",
501
+ "metrics_df = pd.DataFrame(results)\n",
502
+ "display(metrics_df)\n",
503
+ "metrics_path = \"/content/lbw_guard_ablation_metrics.csv\"\n",
504
+ "metrics_df.to_csv(metrics_path, index=False)\n",
505
+ "print(\"Wrote\", metrics_path)\n"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": null,
511
+ "metadata": {},
512
+ "outputs": [],
513
+ "source": [
514
+ "# @title 6. Compute LBW-vs-AdamW gains\n",
515
+ "\n",
516
+ "def build_gain_rows(metrics):\n",
517
+ " gain_rows = []\n",
518
+ " for scenario_slug, group in metrics.groupby(\"scenario_slug\"):\n",
519
+ " baseline_rows = group[group[\"optimizer\"] == \"adamw\"]\n",
520
+ " if baseline_rows.empty:\n",
521
+ " continue\n",
522
+ " baseline = baseline_rows.iloc[0]\n",
523
+ " for _, row in group.iterrows():\n",
524
+ " if row[\"optimizer\"] == \"adamw\":\n",
525
+ " continue\n",
526
+ " ppl_gain_pct = (baseline[\"final_eval_ppl\"] - row[\"final_eval_ppl\"]) / baseline[\"final_eval_ppl\"] * 100.0\n",
527
+ " loss_gain_pct = (baseline[\"final_eval_loss\"] - row[\"final_eval_loss\"]) / baseline[\"final_eval_loss\"] * 100.0\n",
528
+ " speed_gain_pct = (row[\"tokens_per_sec_wall\"] - baseline[\"tokens_per_sec_wall\"]) / baseline[\"tokens_per_sec_wall\"] * 100.0\n",
529
+ " gain_rows.append({\n",
530
+ " \"scenario_slug\": scenario_slug,\n",
531
+ " \"scenario\": row[\"scenario\"],\n",
532
+ " \"optimizer\": row[\"optimizer\"],\n",
533
+ " \"adamw_final_eval_ppl\": baseline[\"final_eval_ppl\"],\n",
534
+ " \"optimizer_final_eval_ppl\": row[\"final_eval_ppl\"],\n",
535
+ " \"ppl_gain_pct_vs_adamw\": ppl_gain_pct,\n",
536
+ " \"loss_gain_pct_vs_adamw\": loss_gain_pct,\n",
537
+ " \"speed_gain_pct_vs_adamw\": speed_gain_pct,\n",
538
+ " \"adamw_tokens_per_sec_wall\": baseline[\"tokens_per_sec_wall\"],\n",
539
+ " \"optimizer_tokens_per_sec_wall\": row[\"tokens_per_sec_wall\"],\n",
540
+ " \"lbw_scale\": row[\"scale\"],\n",
541
+ " \"lbw_ratio\": row[\"ratio\"],\n",
542
+ " \"lbw_stress_mode\": row[\"stress_mode\"],\n",
543
+ " })\n",
544
+ " return gain_rows\n",
545
+ "\n",
546
+ "\n",
547
+ "gains_df = pd.DataFrame(build_gain_rows(metrics_df))\n",
548
+ "display(gains_df if not gains_df.empty else pd.DataFrame([{\"message\": \"No gain rows. Keep adamw and lbw_guard in OPTIMIZERS.\"}]))\n",
549
+ "gains_path = \"/content/lbw_guard_ablation_gains.csv\"\n",
550
+ "gains_df.to_csv(gains_path, index=False)\n",
551
+ "print(\"Wrote\", gains_path)\n"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "markdown",
556
+ "metadata": {},
557
+ "source": [
558
+ "## How to read this ablation\n",
559
+ "\n",
560
+ "- `scenario`: the ablation condition being tested.\n",
561
+ "- `optimizer`: `adamw` is the baseline; `lbw_guard` is the TCG optimizer under test.\n",
562
+ "- `final_eval_ppl`: lower is better for this WikiText smoke benchmark.\n",
563
+ "- `ppl_gain_pct_vs_adamw`: positive means `lbw_guard` achieved lower perplexity than AdamW in that scenario.\n",
564
+ "- `scale`: LBW Guard's control scale for the effective update.\n",
565
+ "- `ratio`: LBW Guard's gradient stress ratio.\n",
566
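+ "- `stress_mode`: LBW Guard's current controller regime. For `adamw` rows, `scale`, `ratio`, and `stress_mode` are placeholder defaults (`1.0`, `1.0`, `none`) because AdamW has no LBW state.\n",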
567
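+ "- `final_eval_scope`: `sampled` (only the first `eval_batches * batch_size` validation sequences), `full_loaded_subset` (every sequence of the character-capped validation slice), or `full_wikitext` (every sequence of the uncapped validation split).\n",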
568
+ "\n",
569
+ "For true full WikiText validation PPL, set both values in `BASE_CONFIG`:\n",
570
+ "\n",
571
+ "```python\n",
572
+ "\"full_wikitext_eval\": True,\n",
573
+ "\"full_validation_ppl\": True,\n",
574
+ "```\n",
575
+ "\n",
576
+ "For a wider ablation matrix, change:\n",
577
+ "\n",
578
+ "```python\n",
579
+ "ABLATIONS = [\"optimizer\", \"lr\", \"schedule\", \"steps\", \"data\", \"lora\"]\n",
580
+ "```\n",
581
+ "\n",
582
+ "That will take longer because each scenario runs both AdamW and `lbw_guard`.\n"
583
+ ]
584
+ }
585
+ ],
586
+ "metadata": {
587
+ "accelerator": "GPU",
588
+ "colab": {
589
+ "name": "LBW_Guard_Ablation_Test_COLAB.ipynb",
590
+ "provenance": []
591
+ },
592
+ "kernelspec": {
593
+ "display_name": "Python 3",
594
+ "name": "python3"
595
+ },
596
+ "language_info": {
597
+ "name": "python"
598
+ }
599
+ },
600
+ "nbformat": 4,
601
+ "nbformat_minor": 5
602
+ }
LBW_Guard_Easy_Test_COLAB.ipynb ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7646fe20",
6
+ "metadata": {},
7
+ "source": [
8
+ "Copyright (c) Qluon Inc. All rights reserved.\n",
9
+ "\n",
10
+ "Provided for Learn-By-Wire Guard evaluation and customer testing under the applicable Qluon license terms.\n",
11
+ "\n",
12
+ "# LBW Guard Easy Test Colab\n",
13
+ "\n",
14
+ "This is a black-box smoke test for `lbw_guard` commercial evaluation. It compares `torch.optim.AdamW` against `lbw.Guard` on a small WikiText-103 LoRA run.\n",
15
+ "\n",
16
+ "It does not import local source folders. The only LBW code used is the installed `LBW-Guard` package that provides `lbw.Guard`."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "7ce8f05d",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# @title 1. Install public dependencies\n",
27
+ "import subprocess\n",
28
+ "import sys\n",
29
+ "\n",
30
+ "deps = [\n",
31
+ " \"transformers>=4.45\",\n",
32
+ " \"datasets>=2.20\",\n",
33
+ " \"peft>=0.12\",\n",
34
+ " \"accelerate>=0.33\",\n",
35
+ " \"sentencepiece\",\n",
36
+ " \"pandas\",\n",
37
+ "]\n",
38
+ "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", *deps])\n",
39
+ "print(\"Public dependency install complete.\")\n"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "0d73bff3",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "# @title 2. Install LBW Guard package\n",
50
+ "import importlib\n",
51
+ "import subprocess\n",
52
+ "import sys\n",
53
+ "\n",
54
+ "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"LBW-Guard\"])\n",
55
+ "lbw = importlib.import_module(\"lbw\")\n",
56
+ "\n",
57
+ "print(\"lbw module:\", lbw.__file__)\n",
58
+ "print(\"lbw.Guard:\", lbw.Guard)\n"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "# @title 3. Configure the easy test\n",
68
+ "import torch\n",
69
+ "\n",
70
+ "MODEL_NAME = \"TinyLlama/TinyLlama_v1.1\"\n",
71
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
72
+ "\n",
73
+ "OPTIMIZERS = [\"adamw\", \"lbw_guard\"]\n",
74
+ "SEED = 42\n",
75
+ "MAX_STEPS = 5\n",
76
+ "EVAL_EVERY = 5\n",
77
+ "EVAL_BATCHES = 8\n",
78
+ "\n",
79
+ "SEQ_LEN = 64\n",
80
+ "BATCH_SIZE = 1\n",
81
+ "MAX_CHARS = 20000\n",
82
+ "EVAL_CHARS = 8000\n",
83
+ "\n",
84
+ "FULL_WIKITEXT_TRAIN = False\n",
85
+ "FULL_WIKITEXT_EVAL = False\n",
86
+ "FULL_VALIDATION_PPL = False\n",
87
+ "\n",
88
+ "LR = 5e-4\n",
89
+ "BETAS = (0.9, 0.999)\n",
90
+ "WEIGHT_DECAY = 0.01\n",
91
+ "\n",
92
+ "LBW_STATS_FREQ = 10\n",
93
+ "LBW_STRESS_TH = 1.1\n",
94
+ "LBW_SPIKE_TH = 1.5\n",
95
+ "LBW_REC_FAST = 0.01\n",
96
+ "LBW_EMA_DECAY = 0.95\n",
97
+ "\n",
98
+ "print(\"Device:\", DEVICE)\n",
99
+ "if DEVICE == \"cuda\":\n",
100
+ " print(\"GPU:\", torch.cuda.get_device_name(0))\n",
101
+ "print(\"For true full WikiText validation PPL, set FULL_WIKITEXT_EVAL=True and FULL_VALIDATION_PPL=True.\")\n"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "# @title 4. Define data, model, training, and evaluation helpers\n",
111
+ "import gc\n",
112
+ "import math\n",
113
+ "import random\n",
114
+ "import time\n",
115
+ "\n",
116
+ "import pandas as pd\n",
117
+ "from datasets import load_dataset\n",
118
+ "from peft import LoraConfig, TaskType, get_peft_model\n",
119
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
120
+ "\n",
121
+ "def set_seed(seed):\n",
122
+ " random.seed(seed)\n",
123
+ " torch.manual_seed(seed)\n",
124
+ " if torch.cuda.is_available():\n",
125
+ " torch.cuda.manual_seed_all(seed)\n",
126
+ "\n",
127
+ "def build_wikitext_chunks(tokenizer, split, max_chars):\n",
128
+ " cap = None if max_chars is None else int(max_chars)\n",
129
+ " print(f\"Preparing WikiText split={split!r}\" + (f\" with char cap {cap:,}\" if cap is not None else \" with full split\"))\n",
130
+ " ds = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=split)\n",
131
+ " pieces = []\n",
132
+ " chars_used = 0\n",
133
+ " rows_used = 0\n",
134
+ " first_piece = True\n",
135
+ " for row in ds:\n",
136
+ " text = str(row.get(\"text\", \"\") or \"\")\n",
137
+ " if not text.strip():\n",
138
+ " continue\n",
139
+ " piece = text if first_piece else \" \" + text\n",
140
+ " if cap is not None:\n",
141
+ " remain = cap - chars_used\n",
142
+ " if remain <= 0:\n",
143
+ " break\n",
144
+ " if len(piece) > remain:\n",
145
+ " piece = piece[:remain]\n",
146
+ " pieces.append(piece)\n",
147
+ " chars_used += len(piece)\n",
148
+ " rows_used += 1\n",
149
+ " first_piece = False\n",
150
+ " if cap is not None and chars_used >= cap:\n",
151
+ " break\n",
152
+ " text = \"\".join(pieces)\n",
153
+ " token_ids = tokenizer(text, add_special_tokens=False)[\"input_ids\"]\n",
154
+ " ids = torch.tensor(token_ids, dtype=torch.long)\n",
155
+ " n = ids.numel() // SEQ_LEN\n",
156
+ " if n <= 0:\n",
157
+ " raise RuntimeError(\"Not enough tokens. Increase MAX_CHARS or reduce SEQ_LEN.\")\n",
158
+ " ids = ids[: n * SEQ_LEN].view(n, SEQ_LEN).contiguous()\n",
159
+ " print(f\"Prepared split={split!r}: {chars_used:,} chars across {rows_used:,} rows -> {ids.size(0):,} sequences\")\n",
160
+ " return {\"input_ids\": ids, \"chars\": chars_used, \"rows\": rows_used, \"cap\": cap}\n",
161
+ "\n",
162
def batch_iter(chunks):
    """Yield training batches forever, cycling over ``chunks["input_ids"]``.

    The cursor jumps back to row 0 whenever a full ``BATCH_SIZE`` slice no
    longer fits, so trailing rows that cannot fill a batch are never served
    (unless the whole tensor is shorter than one batch, in which case that
    short slice is yielded every time).

    Args:
        chunks: Mapping whose ``"input_ids"`` entry is a 2-D tensor of rows.

    Yields:
        A slice of up to ``BATCH_SIZE`` rows moved to ``DEVICE``.
    """
    sequences = chunks["input_ids"]
    total = sequences.size(0)
    cursor = 0
    while True:
        stop = cursor + BATCH_SIZE
        if stop > total:
            # Not enough rows left for a full batch: restart from the top.
            cursor, stop = 0, BATCH_SIZE
        yield sequences[cursor:stop].to(DEVICE, non_blocking=True)
        cursor = stop
171
+ "\n",
172
def load_lora_model():
    """Load ``MODEL_NAME`` as a causal LM and wrap it with LoRA adapters.

    Weights are loaded in float16 when ``DEVICE`` is ``"cuda"`` and float32
    otherwise; ``use_cache`` is switched off when the config exposes it, and
    the model is moved to ``DEVICE`` before the adapters are attached.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    on_gpu = DEVICE == "cuda"
    weight_dtype = torch.float16 if on_gpu else torch.float32
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=weight_dtype,
        low_cpu_mem_usage=True,
    )
    # Only touch use_cache when the config actually defines a value for it.
    if getattr(base.config, "use_cache", None) is not None:
        base.config.use_cache = False
    base.to(DEVICE)

    attention_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp_modules = ["gate_proj", "up_proj", "down_proj"]
    adapter_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=attention_modules + mlp_modules,
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    return get_peft_model(base, adapter_config)
187
+ "\n",
188
def make_optimizer(name, model):
    """Build the requested optimizer over the model's trainable parameters.

    Args:
        name: ``"adamw"`` for ``torch.optim.AdamW`` or ``"lbw_guard"`` for
            ``lbw.Guard``.
        model: Module whose parameters with ``requires_grad`` are optimised.

    Returns:
        The constructed optimizer.

    Raises:
        ValueError: If ``name`` is neither of the supported optimizers.
    """
    trainable = [param for param in model.parameters() if param.requires_grad]
    if name == "adamw":
        optimizer = torch.optim.AdamW(trainable, lr=LR, betas=BETAS, weight_decay=WEIGHT_DECAY)
    elif name == "lbw_guard":
        # Same lr/betas/weight_decay as the AdamW baseline, plus Guard settings.
        guard_options = {
            "lr": LR,
            "betas": BETAS,
            "weight_decay": WEIGHT_DECAY,
            "mode": "eval",
            "auto_enabled": True,
            "stats_freq": LBW_STATS_FREQ,
            "stress_threshold": LBW_STRESS_TH,
            "spike_threshold": LBW_SPIKE_TH,
            "recovery_fast": LBW_REC_FAST,
            "ema_decay": LBW_EMA_DECAY,
            "use_max_rms": True,
        }
        optimizer = lbw.Guard(trainable, **guard_options)
    else:
        raise ValueError(f"Unknown optimizer: {name}")
    return optimizer
208
+ "\n",
209
@torch.no_grad()
def evaluate_ppl(model, eval_chunks, full_pass=False):
    """Compute mean evaluation loss and perplexity over packed sequences.

    Each batch loss is weighted by the number of sequences in that batch, so
    a short final batch (when the sequence count is not a multiple of
    ``BATCH_SIZE``) contributes in proportion to its size instead of counting
    as much as a full batch.

    Args:
        model: Causal LM called as ``model(input_ids=..., labels=...)`` whose
            output exposes a scalar ``loss``. It is switched to eval mode and
            left there; callers must restore train mode themselves.
        eval_chunks: Mapping whose ``"input_ids"`` entry holds the sequences.
        full_pass: When true, evaluate every sequence; otherwise only the
            first ``EVAL_BATCHES * BATCH_SIZE`` sequences.

    Returns:
        ``(avg_loss, perplexity)`` where perplexity is ``exp(avg_loss)`` with
        the exponent capped at 20.0 to avoid overflow. When no batch is
        evaluated the result is ``(0.0, 1.0)``.
    """
    model.eval()
    ids = eval_chunks["input_ids"]
    max_sequences = ids.size(0) if full_pass else min(ids.size(0), EVAL_BATCHES * BATCH_SIZE)
    loss_sum = 0.0
    sequences_seen = 0
    for start in range(0, max_sequences, BATCH_SIZE):
        xb = ids[start : start + BATCH_SIZE].to(DEVICE, non_blocking=True)
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=(DEVICE == "cuda")):
            loss = model(input_ids=xb, labels=xb).loss
        # Every row has the same length, so weighting the batch-mean loss by
        # the row count reproduces the mean over all evaluated sequences.
        batch_rows = xb.size(0)
        loss_sum += float(loss.detach().cpu()) * batch_rows
        sequences_seen += batch_rows
    avg_loss = loss_sum / max(sequences_seen, 1)
    return avg_loss, math.exp(min(avg_loss, 20.0))
222
+ "\n",
223
def optimizer_state(opt):
    """Extract the LBW control readings stored under ``opt.state["lbw"]``.

    Missing attributes, a missing ``"lbw"`` entry, or a falsy entry all fall
    back to neutral defaults, which is what a plain optimizer reports.

    Args:
        opt: Object that may expose a mapping-like ``state`` attribute.

    Returns:
        Dict with ``scale`` (float; read from ``"scale"``, else
        ``"lbw_scale"``, else 1.0), ``ratio`` (float, default 1.0) and
        ``stress_mode`` (str, default ``"none"``).
    """
    raw = getattr(opt, "state", {}).get("lbw", {})
    lbw_state = dict(raw) if raw else {}

    if "scale" in lbw_state:
        scale = lbw_state["scale"]
    else:
        scale = lbw_state.get("lbw_scale", 1.0)
    ratio = lbw_state.get("ratio", 1.0)
    stress_mode = lbw_state.get("stress_mode", "none")

    return {
        "scale": float(scale),
        "ratio": float(ratio),
        "stress_mode": str(stress_mode),
    }
230
+ "\n",
231
def run_one_optimizer(name, train_chunks, eval_chunks):
    """Train a fresh LoRA model with one optimizer and report its metrics.

    The run reseeds with ``SEED``, loads a new model, trains for ``MAX_STEPS``
    steps with gradient clipping at norm 1.0, prints a sampled evaluation on
    the first step, the last step and every ``EVAL_EVERY`` steps, then runs a
    final evaluation whose scope depends on ``FULL_VALIDATION_PPL``.

    Args:
        name: Optimizer name understood by ``make_optimizer``.
        train_chunks: Training chunks as produced by ``build_wikitext_chunks``.
        eval_chunks: Evaluation chunks as produced by ``build_wikitext_chunks``.

    Returns:
        Dict of final metrics, optimizer control state and wall-clock seconds
        (measured from just before the first step, including evaluations).
    """
    set_seed(SEED)
    model = load_lora_model()
    model.train()
    opt = make_optimizer(name, model)
    batches = batch_iter(train_chunks)
    started = time.time()

    for step in range(1, MAX_STEPS + 1):
        xb = next(batches)
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=(DEVICE == "cuda")):
            loss = model(input_ids=xb, labels=xb).loss
        loss.backward()
        trainable = [p for p in model.parameters() if p.requires_grad]
        torch.nn.utils.clip_grad_norm_(trainable, 1.0)
        opt.step()
        opt.zero_grad(set_to_none=True)
        train_loss = float(loss.detach().cpu())

        is_report_step = step == 1 or step == MAX_STEPS or step % EVAL_EVERY == 0
        if not is_report_step:
            continue
        control = optimizer_state(opt)
        _, sampled_ppl = evaluate_ppl(model, eval_chunks, full_pass=False)
        print(f"{name} step {step}/{MAX_STEPS}: loss={train_loss:.4f}, sampled_eval_ppl={sampled_ppl:.4f}, scale={control['scale']:.4f}, ratio={control['ratio']:.4f}")
        # evaluate_ppl leaves the model in eval mode; switch back for training.
        model.train()

    final_full_pass = bool(FULL_VALIDATION_PPL)
    if not final_full_pass:
        final_scope = "sampled"
    elif eval_chunks["cap"] is None:
        final_scope = "full_wikitext"
    else:
        final_scope = "full_loaded_subset"
    final_loss, final_ppl = evaluate_ppl(model, eval_chunks, full_pass=final_full_pass)
    control = optimizer_state(opt)
    elapsed = time.time() - started

    result = {
        "optimizer": name,
        "final_eval_ppl": final_ppl,
        "final_eval_loss": final_loss,
        "final_eval_scope": final_scope,
        "train_chars": train_chunks["chars"],
        "eval_chars": eval_chunks["chars"],
        "scale": control["scale"],
        "ratio": control["ratio"],
        "stress_mode": control["stress_mode"],
        "wall_time_sec": elapsed,
    }

    # Drop the model and optimizer before the next run to free memory.
    del model, opt
    gc.collect()
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    return result
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
# @title 5. Run AdamW vs lbw_guard on WikiText-103
set_seed(SEED)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    # Reuse the end-of-sequence token when the tokenizer has no pad token.
    tokenizer.pad_token = tokenizer.eos_token

# A cap of None tells build_wikitext_chunks to consume the whole split.
train_cap = MAX_CHARS if not FULL_WIKITEXT_TRAIN else None
eval_cap = EVAL_CHARS if not FULL_WIKITEXT_EVAL else None
train_chunks = build_wikitext_chunks(tokenizer, "train", train_cap)
eval_chunks = build_wikitext_chunks(tokenizer, "validation", eval_cap)

# Every optimizer sees the same prepared train and validation chunks.
results = []
for optimizer_name in OPTIMIZERS:
    print("\n===", optimizer_name, "===")
    run_result = run_one_optimizer(optimizer_name, train_chunks, eval_chunks)
    results.append(run_result)

RESULTS_CSV = "/content/lbw_guard_easy_test_results.csv"
df = pd.DataFrame(results)
display(df)
df.to_csv(RESULTS_CSV, index=False)
print(f"Wrote {RESULTS_CSV}")
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "markdown",
309
+ "metadata": {},
310
+ "source": [
311
+ "## Reading the result\n",
312
+ "\n",
313
+ "- `final_eval_ppl`: lower is better for this smoke test.\n",
314
+ "- `final_eval_scope`: `sampled`, `full_loaded_subset`, or `full_wikitext`.\n",
315
+ "- `scale`: the LBW Guard control scale applied to the effective update. AdamW stays at `1.0`.\n",
316
+ "- `ratio`: the LBW Guard gradient stress ratio. AdamW stays at `1.0`.\n",
317
+ "\n",
318
+ "This default is intentionally tiny. Use it to check installation and behavior, not to claim final benchmark quality."
319
+ ]
320
+ }
321
+ ],
322
+ "metadata": {
323
+ "accelerator": "GPU",
324
+ "colab": {
325
+ "name": "LBW_Guard_Easy_Test_COLAB.ipynb",
326
+ "provenance": []
327
+ },
328
+ "kernelspec": {
329
+ "display_name": "Python 3",
330
+ "name": "python3"
331
+ },
332
+ "language_info": {
333
+ "name": "python"
334
+ }
335
+ },
336
+ "nbformat": 4,
337
+ "nbformat_minor": 5
338
+ }
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: LBW Guard Colab Tests
3
+ emoji: 🧪
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ python_version: "3.10"
8
+ app_file: app.py
9
+ suggested_hardware: t4-medium
10
+ models:
11
+ - Qwen/Qwen2.5-0.5B
12
+ datasets:
13
+ - Salesforce/wikitext
14
+ tags:
15
+ - optimizer
16
+ - training
17
+ - colab
18
+ - gradio
19
+ ---
20
+
21
+ Copyright (c) Qluon Inc. All rights reserved.
22
+
23
+ Provided for Learn-By-Wire Guard evaluation and customer testing under the applicable Qluon license terms.
24
+
25
+ # LBW Guard Colab Tests
26
+
27
+ This Space packages the LBW Guard customer Colab notebooks:
28
+
29
+ - `LBW_Guard_Easy_Test_COLAB.ipynb`: fast AdamW vs `lbw_guard` smoke test.
30
+ - `LBW_Guard_Ablation_Test_COLAB.ipynb`: ablation-style AdamW baseline and `lbw_guard` comparison.
31
+
32
+ Use the app to download either notebook, then open it in Google Colab with a GPU runtime.
33
+
34
+ The notebooks install `LBW-Guard` from its Python package distribution and do not import local source folders.
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+
7
+
8
# Notebooks are shipped next to this script inside the Space repository.
ROOT = Path(__file__).resolve().parent
EASY_NOTEBOOK = ROOT / "LBW_Guard_Easy_Test_COLAB.ipynb"
ABLATION_NOTEBOOK = ROOT / "LBW_Guard_Ablation_Test_COLAB.ipynb"

INTRO = """
# LBW Guard Colab Tests

Download a notebook, open it in Google Colab, set the runtime to GPU, and run the cells top to bottom.

The notebooks install `LBW-Guard` and public dependencies at runtime. They do not import local source folders.
"""

EASY_DETAILS = """
## Easy Test

Fast customer smoke test with one AdamW run and one `lbw_guard` run.
"""

ABLATION_DETAILS = """
## Ablation Test

Scenario matrix with AdamW as baseline, `lbw_guard` comparison, metrics CSV, and gains CSV.
"""

# One entry per download column: (markdown blurb, notebook path, file label).
_DOWNLOADS = (
    (EASY_DETAILS, EASY_NOTEBOOK, "LBW Guard Easy Test Colab"),
    (ABLATION_DETAILS, ABLATION_NOTEBOOK, "LBW Guard Ablation Test Colab"),
)


with gr.Blocks(title="LBW Guard Colab Tests") as demo:
    gr.Markdown(INTRO)
    with gr.Row():
        # Columns are created in table order: Easy Test first, then Ablation.
        for details, notebook, label in _DOWNLOADS:
            with gr.Column():
                gr.Markdown(details)
                gr.File(value=str(notebook), label=label, interactive=False)


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers
4
+ datasets
5
+ peft
6
+ accelerate
7
+ pandas
8
+ LBW-Guard