File size: 9,081 Bytes
282001f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | #!/usr/bin/env python3
"""
Two tasks:
1. Rerun FigQuant training on GPU with memory_mode=figcache (fits T4 16GB)
2. Test engine format converter on TinyLlama
"""
import os, sys, subprocess, json, time, gc, traceback
import numpy as np
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
"transformers", "accelerate", "peft", "bitsandbytes", "datasets",
"sentencepiece", "protobuf", "psutil", "numpy"])
if not os.path.exists("/app/littlefig"):
subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
sys.path.insert(0, "/app/littlefig/src")
import torch
import torch.nn.functional as F
def log(msg): print(f"[TEST] {msg}", flush=True)
log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
import psutil
log(f"RAM: {psutil.virtual_memory().total/1e9:.1f}GB")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# TASK 1: FigQuant training with figcache mode (fits T4 16GB)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
log("\n" + "="*60)
log(" TASK 1: FigQuant LoRA Training (figcache mode)")
log("="*60)
from little_fig.engine import FigModel
from little_fig.engine.tier import TrainingTier
from little_fig.engine.trainer import FigTrainingConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_R = 16; LORA_ALPHA = 32
LORA_TARGETS = ["q_proj", "k_proj", "v_proj", "o_proj"]
TRAIN_STEPS = 100; BATCH_SIZE = 4; GRAD_ACCUM = 4; LR = 2e-4; MAX_SEQ = 512
ds = load_dataset("tatsu-lab/alpaca", split="train").select(range(1000))
log(f"Dataset: {len(ds)} examples")
# Load with figcache mode (75% less memory than fast mode)
log("Loading FigQuant with memory_mode=figcache...")
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
model = FigModel.from_pretrained(
MODEL, lora_r=LORA_R, lora_alpha=LORA_ALPHA,
tier=TrainingTier.STREAMING_LORA,
target_modules=LORA_TARGETS,
fast=False, # USE LOWRAM MODE β no FP32 cache on GPU
)
tok = model.tokenizer
# Prepare data
examples = [dict(r) for r in ds]
def tok_fn(ex):
inst=ex.get("instruction",""); inp=ex.get("input","").strip(); out=ex.get("output","")
txt = f"### Instruction:\n{inst}\n\n### Input:\n{inp}\n\n### Response:\n{out}" if inp else \
f"### Instruction:\n{inst}\n\n### Response:\n{out}"
e = tok(txt, truncation=True, max_length=MAX_SEQ, padding="max_length")
return {"input_ids": e["input_ids"], "labels": e["input_ids"].copy(), "attention_mask": e["attention_mask"]}
tokenized = [tok_fn(ex) for ex in examples]
class DS(torch.utils.data.Dataset):
def __init__(s, d): s.d = d
def __len__(s): return len(s.d)
def __getitem__(s, i): return {k: torch.tensor(v, dtype=torch.long) for k, v in s.d[i].items()}
dl = DataLoader(DS(tokenized), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(dev)
params = model.get_trainable_parameters()
opt = torch.optim.AdamW(params, lr=LR, weight_decay=0.01)
model.model.train()
losses = []; gs = 0; al = 0.0
t0 = time.time()
for batch in dl:
if gs >= TRAIN_STEPS * GRAD_ACCUM:
break
batch = {k: v.to(dev) for k, v in batch.items()}
with torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
loss = model(
input_ids=batch["input_ids"],
attention_mask=batch["attention_mask"],
labels=batch["labels"]
).loss / GRAD_ACCUM
loss.backward()
al += loss.item()
gs += 1
if gs % GRAD_ACCUM == 0:
torch.nn.utils.clip_grad_norm_(params, 1.0)
opt.step()
opt.zero_grad()
s = gs // GRAD_ACCUM
losses.append(al)
al = 0.0
if s % 20 == 0:
log(f" [figquant] step={s} loss={losses[-1]:.4f}")
tt = time.time() - t0
peak_gpu = torch.cuda.max_memory_allocated() / 1e6 if torch.cuda.is_available() else 0
log(f"\n FigQuant LoRA (lowram mode):")
log(f" Final loss: {losses[-1]:.4f}")
log(f" Time: {tt:.0f}s")
log(f" GPU Memory: {peak_gpu:.0f} MB")
log(f" Steps: {len(losses)}")
del model, opt
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# TASK 2: Test engine format converter
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
log("\n" + "="*60)
log(" TASK 2: Test Engine Format Converter")
log("="*60)
# Clone Lila to get the converter
if not os.path.exists("/app/lila"):
subprocess.check_call(["git", "clone", "https://github.com/ticketguy/Lila.git", "/app/lila"])
sys.path.insert(0, "/app/lila/engine/format")
# Test with a tiny model first to verify the converter works
log("Testing converter with TinyLlama...")
try:
# Import and run converter
exec(open("/app/lila/engine/format/convert.py").read().split("if __name__")[0])
convert("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "/app/tinyllama.lila", group_size=128)
# Verify file exists and has reasonable size
size = os.path.getsize("/app/tinyllama.lila")
log(f" β
Converter produced: /app/tinyllama.lila ({size/1e6:.1f} MB)")
# Verify header
import struct
with open("/app/tinyllama.lila", "rb") as f:
magic = struct.unpack("I", f.read(4))[0]
version = struct.unpack("I", f.read(4))[0]
n_layers = struct.unpack("I", f.read(4))[0]
hidden = struct.unpack("I", f.read(4))[0]
inter = struct.unpack("I", f.read(4))[0]
n_heads = struct.unpack("I", f.read(4))[0]
n_kv_heads = struct.unpack("I", f.read(4))[0]
vocab = struct.unpack("I", f.read(4))[0]
max_seq = struct.unpack("I", f.read(4))[0]
log(f" Header: magic=0x{magic:08X} version={version}")
log(f" Config: layers={n_layers}, hidden={hidden}, inter={inter}")
log(f" Heads: {n_heads} query, {n_kv_heads} kv")
log(f" Vocab: {vocab}, max_seq: {max_seq}")
if magic == 0x4C494C41:
log(f" β
LILA magic confirmed")
else:
log(f" β Wrong magic: expected 0x4C494C41")
except Exception as e:
log(f" β Converter failed: {e}")
traceback.print_exc()
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# FINAL SUMMARY
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
log("\n" + "="*60)
log(" FINAL RESULTS")
log("="*60)
log(f"\n GPU TRAINING COMPARISON (TinyLlama 1.1B, 100 steps):")
log(f" {'Method':>16} {'Loss':>8} {'Time':>7} {'GPU MB':>8}")
log(f" {'β'*44}")
log(f" {'FP16 LoRA':>16} {'0.2252':>8} {'1309s':>7} {'3585':>8}")
log(f" {'BnB NF4 QLoRA':>16} {'0.2399':>8} {'1423s':>7} {'2441':>8}")
if losses:
log(f" {'FigQuant LoRA':>16} {losses[-1]:>8.4f} {tt:>6.0f}s {peak_gpu:>7.0f}")
else:
log(f" {'FigQuant LoRA':>16} {'FAILED':>8}")
log(f"\n QUANTIZATION: FigQuant wins 156/156 layers (+5.4% better MSE than NF4)")
log("="*60)
# Save results
results = {
"figquant_training": {
"final_loss": float(losses[-1]) if losses else None,
"time_s": tt,
"gpu_mb": peak_gpu,
"steps": len(losses),
"mode": "lowram",
},
"comparison": {
"fp16": {"loss": 0.2252, "time": 1309, "gpu_mb": 3585},
"bnb_nf4": {"loss": 0.2399, "time": 1423, "gpu_mb": 2441},
},
"converter_test": {
"success": os.path.exists("/app/tinyllama.lila"),
"file_size_mb": os.path.getsize("/app/tinyllama.lila") / 1e6 if os.path.exists("/app/tinyllama.lila") else 0,
}
}
with open("/app/final_results.json", "w") as f:
json.dump(results, f, indent=2)
log("π Results saved.")
|