ticketguy commited on
Commit
282001f
Β·
verified Β·
1 Parent(s): e75ae96

Fix FigQuant GPU benchmark (use figcache mode) + test engine conversion

Browse files
Files changed (1) hide show
  1. final_gpu_test.py +224 -0
final_gpu_test.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Two tasks:
4
+ 1. Rerun FigQuant training on GPU with memory_mode=figcache (fits T4 16GB)
5
+ 2. Test engine format converter on TinyLlama
6
+ """
7
+ import os, sys, subprocess, json, time, gc, traceback
8
+ import numpy as np
9
+
10
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
11
+ "transformers", "accelerate", "peft", "bitsandbytes", "datasets",
12
+ "sentencepiece", "protobuf", "psutil", "numpy"])
13
+
14
+ if not os.path.exists("/app/littlefig"):
15
+ subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
16
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
17
+ sys.path.insert(0, "/app/littlefig/src")
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+
22
+ def log(msg): print(f"[TEST] {msg}", flush=True)
23
+
24
+ log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
25
+ if torch.cuda.is_available():
26
+ log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
27
+ import psutil
28
+ log(f"RAM: {psutil.virtual_memory().total/1e9:.1f}GB")
29
+
30
+ # ═══════════════════════════════════════════════════════════════════════════════
31
+ # TASK 1: FigQuant training with figcache mode (fits T4 16GB)
32
+ # ═══════════════════════════════════════════════════════════════════════════════
33
+ log("\n" + "="*60)
34
+ log(" TASK 1: FigQuant LoRA Training (figcache mode)")
35
+ log("="*60)
36
+
37
+ from little_fig.engine import FigModel
38
+ from little_fig.engine.tier import TrainingTier
39
+ from little_fig.engine.trainer import FigTrainingConfig
40
+ from datasets import load_dataset
41
+ from torch.utils.data import DataLoader
42
+
43
+ MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
44
+ LORA_R = 16; LORA_ALPHA = 32
45
+ LORA_TARGETS = ["q_proj", "k_proj", "v_proj", "o_proj"]
46
+ TRAIN_STEPS = 100; BATCH_SIZE = 4; GRAD_ACCUM = 4; LR = 2e-4; MAX_SEQ = 512
47
+
48
+ ds = load_dataset("tatsu-lab/alpaca", split="train").select(range(1000))
49
+ log(f"Dataset: {len(ds)} examples")
50
+
51
+ # Load with figcache mode (75% less memory than fast mode)
52
+ log("Loading FigQuant with memory_mode=figcache...")
53
+ gc.collect()
54
+ if torch.cuda.is_available():
55
+ torch.cuda.empty_cache()
56
+ torch.cuda.reset_peak_memory_stats()
57
+
58
+ model = FigModel.from_pretrained(
59
+ MODEL, lora_r=LORA_R, lora_alpha=LORA_ALPHA,
60
+ tier=TrainingTier.STREAMING_LORA,
61
+ target_modules=LORA_TARGETS,
62
+ fast=False, # USE LOWRAM MODE β€” no FP32 cache on GPU
63
+ )
64
+ tok = model.tokenizer
65
+
66
+ # Prepare data
67
+ examples = [dict(r) for r in ds]
68
+ def tok_fn(ex):
69
+ inst=ex.get("instruction",""); inp=ex.get("input","").strip(); out=ex.get("output","")
70
+ txt = f"### Instruction:\n{inst}\n\n### Input:\n{inp}\n\n### Response:\n{out}" if inp else \
71
+ f"### Instruction:\n{inst}\n\n### Response:\n{out}"
72
+ e = tok(txt, truncation=True, max_length=MAX_SEQ, padding="max_length")
73
+ return {"input_ids": e["input_ids"], "labels": e["input_ids"].copy(), "attention_mask": e["attention_mask"]}
74
+
75
+ tokenized = [tok_fn(ex) for ex in examples]
76
+
77
+ class DS(torch.utils.data.Dataset):
78
+ def __init__(s, d): s.d = d
79
+ def __len__(s): return len(s.d)
80
+ def __getitem__(s, i): return {k: torch.tensor(v, dtype=torch.long) for k, v in s.d[i].items()}
81
+
82
+ dl = DataLoader(DS(tokenized), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
83
+
84
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
85
+ model = model.to(dev)
86
+ params = model.get_trainable_parameters()
87
+ opt = torch.optim.AdamW(params, lr=LR, weight_decay=0.01)
88
+ model.model.train()
89
+
90
+ losses = []; gs = 0; al = 0.0
91
+ t0 = time.time()
92
+
93
+ for batch in dl:
94
+ if gs >= TRAIN_STEPS * GRAD_ACCUM:
95
+ break
96
+ batch = {k: v.to(dev) for k, v in batch.items()}
97
+
98
+ with torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
99
+ loss = model(
100
+ input_ids=batch["input_ids"],
101
+ attention_mask=batch["attention_mask"],
102
+ labels=batch["labels"]
103
+ ).loss / GRAD_ACCUM
104
+
105
+ loss.backward()
106
+ al += loss.item()
107
+ gs += 1
108
+
109
+ if gs % GRAD_ACCUM == 0:
110
+ torch.nn.utils.clip_grad_norm_(params, 1.0)
111
+ opt.step()
112
+ opt.zero_grad()
113
+ s = gs // GRAD_ACCUM
114
+ losses.append(al)
115
+ al = 0.0
116
+ if s % 20 == 0:
117
+ log(f" [figquant] step={s} loss={losses[-1]:.4f}")
118
+
119
+ tt = time.time() - t0
120
+ peak_gpu = torch.cuda.max_memory_allocated() / 1e6 if torch.cuda.is_available() else 0
121
+
122
+ log(f"\n FigQuant LoRA (lowram mode):")
123
+ log(f" Final loss: {losses[-1]:.4f}")
124
+ log(f" Time: {tt:.0f}s")
125
+ log(f" GPU Memory: {peak_gpu:.0f} MB")
126
+ log(f" Steps: {len(losses)}")
127
+
128
+ del model, opt
129
+ gc.collect()
130
+ if torch.cuda.is_available():
131
+ torch.cuda.empty_cache()
132
+
133
+ # ════���══════════════════════════════════════════════════════════════════════════
134
+ # TASK 2: Test engine format converter
135
+ # ═══════════════════════════════════════════════════════════════════════════════
136
+ log("\n" + "="*60)
137
+ log(" TASK 2: Test Engine Format Converter")
138
+ log("="*60)
139
+
140
+ # Clone Lila to get the converter
141
+ if not os.path.exists("/app/lila"):
142
+ subprocess.check_call(["git", "clone", "https://github.com/ticketguy/Lila.git", "/app/lila"])
143
+
144
+ sys.path.insert(0, "/app/lila/engine/format")
145
+
146
+ # Test with a tiny model first to verify the converter works
147
+ log("Testing converter with TinyLlama...")
148
+ try:
149
+ # Import and run converter
150
+ exec(open("/app/lila/engine/format/convert.py").read().split("if __name__")[0])
151
+ convert("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "/app/tinyllama.lila", group_size=128)
152
+
153
+ # Verify file exists and has reasonable size
154
+ size = os.path.getsize("/app/tinyllama.lila")
155
+ log(f" βœ… Converter produced: /app/tinyllama.lila ({size/1e6:.1f} MB)")
156
+
157
+ # Verify header
158
+ import struct
159
+ with open("/app/tinyllama.lila", "rb") as f:
160
+ magic = struct.unpack("I", f.read(4))[0]
161
+ version = struct.unpack("I", f.read(4))[0]
162
+ n_layers = struct.unpack("I", f.read(4))[0]
163
+ hidden = struct.unpack("I", f.read(4))[0]
164
+ inter = struct.unpack("I", f.read(4))[0]
165
+ n_heads = struct.unpack("I", f.read(4))[0]
166
+ n_kv_heads = struct.unpack("I", f.read(4))[0]
167
+ vocab = struct.unpack("I", f.read(4))[0]
168
+ max_seq = struct.unpack("I", f.read(4))[0]
169
+
170
+ log(f" Header: magic=0x{magic:08X} version={version}")
171
+ log(f" Config: layers={n_layers}, hidden={hidden}, inter={inter}")
172
+ log(f" Heads: {n_heads} query, {n_kv_heads} kv")
173
+ log(f" Vocab: {vocab}, max_seq: {max_seq}")
174
+
175
+ if magic == 0x4C494C41:
176
+ log(f" βœ… LILA magic confirmed")
177
+ else:
178
+ log(f" ❌ Wrong magic: expected 0x4C494C41")
179
+
180
+ except Exception as e:
181
+ log(f" ❌ Converter failed: {e}")
182
+ traceback.print_exc()
183
+
184
+ # ═══════════════════════════════════════════════════════════════════════════════
185
+ # FINAL SUMMARY
186
+ # ═══════════════════════════════════════════════════════════════════════════════
187
+ log("\n" + "="*60)
188
+ log(" FINAL RESULTS")
189
+ log("="*60)
190
+
191
+ log(f"\n GPU TRAINING COMPARISON (TinyLlama 1.1B, 100 steps):")
192
+ log(f" {'Method':>16} {'Loss':>8} {'Time':>7} {'GPU MB':>8}")
193
+ log(f" {'─'*44}")
194
+ log(f" {'FP16 LoRA':>16} {'0.2252':>8} {'1309s':>7} {'3585':>8}")
195
+ log(f" {'BnB NF4 QLoRA':>16} {'0.2399':>8} {'1423s':>7} {'2441':>8}")
196
+ if losses:
197
+ log(f" {'FigQuant LoRA':>16} {losses[-1]:>8.4f} {tt:>6.0f}s {peak_gpu:>7.0f}")
198
+ else:
199
+ log(f" {'FigQuant LoRA':>16} {'FAILED':>8}")
200
+
201
+ log(f"\n QUANTIZATION: FigQuant wins 156/156 layers (+5.4% better MSE than NF4)")
202
+ log("="*60)
203
+
204
+ # Save results
205
+ results = {
206
+ "figquant_training": {
207
+ "final_loss": float(losses[-1]) if losses else None,
208
+ "time_s": tt,
209
+ "gpu_mb": peak_gpu,
210
+ "steps": len(losses),
211
+ "mode": "lowram",
212
+ },
213
+ "comparison": {
214
+ "fp16": {"loss": 0.2252, "time": 1309, "gpu_mb": 3585},
215
+ "bnb_nf4": {"loss": 0.2399, "time": 1423, "gpu_mb": 2441},
216
+ },
217
+ "converter_test": {
218
+ "success": os.path.exists("/app/tinyllama.lila"),
219
+ "file_size_mb": os.path.getsize("/app/tinyllama.lila") / 1e6 if os.path.exists("/app/tinyllama.lila") else 0,
220
+ }
221
+ }
222
+ with open("/app/final_results.json", "w") as f:
223
+ json.dump(results, f, indent=2)
224
+ log("πŸ“ Results saved.")