ticketguy
/

littlefig-bench

ml-intern

Model card Files Files and versions

xet

Community

ticketguy commited on about 22 hours ago

Commit

282001f

verified ·

1 Parent(s): e75ae96

Fix FigQuant GPU benchmark (use figcache mode) + test engine conversion

Browse files

Files changed (1) hide show

final_gpu_test.py +224 -0

final_gpu_test.py ADDED Viewed

	@@ -0,0 +1,224 @@

+#!/usr/bin/env python3
+"""
+Two tasks:
+1. Rerun FigQuant training on GPU with memory_mode=figcache (fits T4 16GB)
+2. Test engine format converter on TinyLlama
+"""
+import os, sys, subprocess, json, time, gc, traceback
+import numpy as np
+subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
+    "transformers", "accelerate", "peft", "bitsandbytes", "datasets",
+    "sentencepiece", "protobuf", "psutil", "numpy"])
+if not os.path.exists("/app/littlefig"):
+    subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
+sys.path.insert(0, "/app/littlefig/src")
+import torch
+import torch.nn.functional as F
+def log(msg): print(f"[TEST] {msg}", flush=True)
+log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
+if torch.cuda.is_available():
+    log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
+import psutil
+log(f"RAM: {psutil.virtual_memory().total/1e9:.1f}GB")
+# ═══════════════════════════════════════════════════════════════════════════════
+# TASK 1: FigQuant training with figcache mode (fits T4 16GB)
+# ═══════════════════════════════════════════════════════════════════════════════
+log("\n" + "="*60)
+log("  TASK 1: FigQuant LoRA Training (figcache mode)")
+log("="*60)
+from little_fig.engine import FigModel
+from little_fig.engine.tier import TrainingTier
+from little_fig.engine.trainer import FigTrainingConfig
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+LORA_R = 16; LORA_ALPHA = 32
+LORA_TARGETS = ["q_proj", "k_proj", "v_proj", "o_proj"]
+TRAIN_STEPS = 100; BATCH_SIZE = 4; GRAD_ACCUM = 4; LR = 2e-4; MAX_SEQ = 512
+ds = load_dataset("tatsu-lab/alpaca", split="train").select(range(1000))
+log(f"Dataset: {len(ds)} examples")
+# Load with figcache mode (75% less memory than fast mode)
+log("Loading FigQuant with memory_mode=figcache...")
+gc.collect()
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+model = FigModel.from_pretrained(
+    MODEL, lora_r=LORA_R, lora_alpha=LORA_ALPHA,
+    tier=TrainingTier.STREAMING_LORA,
+    target_modules=LORA_TARGETS,
+    fast=False,  # USE LOWRAM MODE — no FP32 cache on GPU
+)
+tok = model.tokenizer
+# Prepare data
+examples = [dict(r) for r in ds]
+def tok_fn(ex):
+    inst=ex.get("instruction",""); inp=ex.get("input","").strip(); out=ex.get("output","")
+    txt = f"### Instruction:\n{inst}\n\n### Input:\n{inp}\n\n### Response:\n{out}" if inp else \
+          f"### Instruction:\n{inst}\n\n### Response:\n{out}"
+    e = tok(txt, truncation=True, max_length=MAX_SEQ, padding="max_length")
+    return {"input_ids": e["input_ids"], "labels": e["input_ids"].copy(), "attention_mask": e["attention_mask"]}
+tokenized = [tok_fn(ex) for ex in examples]
+class DS(torch.utils.data.Dataset):
+    def __init__(s, d): s.d = d
+    def __len__(s): return len(s.d)
+    def __getitem__(s, i): return {k: torch.tensor(v, dtype=torch.long) for k, v in s.d[i].items()}
+dl = DataLoader(DS(tokenized), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
+dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(dev)
+params = model.get_trainable_parameters()
+opt = torch.optim.AdamW(params, lr=LR, weight_decay=0.01)
+model.model.train()
+losses = []; gs = 0; al = 0.0
+t0 = time.time()
+for batch in dl:
+    if gs >= TRAIN_STEPS * GRAD_ACCUM:
+        break
+    batch = {k: v.to(dev) for k, v in batch.items()}
+    with torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
+        loss = model(
+            input_ids=batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+            labels=batch["labels"]
+        ).loss / GRAD_ACCUM
+    loss.backward()
+    al += loss.item()
+    gs += 1
+    if gs % GRAD_ACCUM == 0:
+        torch.nn.utils.clip_grad_norm_(params, 1.0)
+        opt.step()
+        opt.zero_grad()
+        s = gs // GRAD_ACCUM
+        losses.append(al)
+        al = 0.0
+        if s % 20 == 0:
+            log(f"  [figquant] step={s} loss={losses[-1]:.4f}")
+tt = time.time() - t0
+peak_gpu = torch.cuda.max_memory_allocated() / 1e6 if torch.cuda.is_available() else 0
+log(f"\n  FigQuant LoRA (lowram mode):")
+log(f"    Final loss: {losses[-1]:.4f}")
+log(f"    Time: {tt:.0f}s")
+log(f"    GPU Memory: {peak_gpu:.0f} MB")
+log(f"    Steps: {len(losses)}")
+del model, opt
+gc.collect()
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+# ════���══════════════════════════════════════════════════════════════════════════
+# TASK 2: Test engine format converter
+# ═══════════════════════════════════════════════════════════════════════════════
+log("\n" + "="*60)
+log("  TASK 2: Test Engine Format Converter")
+log("="*60)
+# Clone Lila to get the converter
+if not os.path.exists("/app/lila"):
+    subprocess.check_call(["git", "clone", "https://github.com/ticketguy/Lila.git", "/app/lila"])
+sys.path.insert(0, "/app/lila/engine/format")
+# Test with a tiny model first to verify the converter works
+log("Testing converter with TinyLlama...")
+try:
+    # Import and run converter
+    exec(open("/app/lila/engine/format/convert.py").read().split("if __name__")[0])
+    convert("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "/app/tinyllama.lila", group_size=128)
+    # Verify file exists and has reasonable size
+    size = os.path.getsize("/app/tinyllama.lila")
+    log(f"  ✅ Converter produced: /app/tinyllama.lila ({size/1e6:.1f} MB)")
+    # Verify header
+    import struct
+    with open("/app/tinyllama.lila", "rb") as f:
+        magic = struct.unpack("I", f.read(4))[0]
+        version = struct.unpack("I", f.read(4))[0]
+        n_layers = struct.unpack("I", f.read(4))[0]
+        hidden = struct.unpack("I", f.read(4))[0]
+        inter = struct.unpack("I", f.read(4))[0]
+        n_heads = struct.unpack("I", f.read(4))[0]
+        n_kv_heads = struct.unpack("I", f.read(4))[0]
+        vocab = struct.unpack("I", f.read(4))[0]
+        max_seq = struct.unpack("I", f.read(4))[0]
+    log(f"  Header: magic=0x{magic:08X} version={version}")
+    log(f"  Config: layers={n_layers}, hidden={hidden}, inter={inter}")
+    log(f"  Heads: {n_heads} query, {n_kv_heads} kv")
+    log(f"  Vocab: {vocab}, max_seq: {max_seq}")
+    if magic == 0x4C494C41:
+        log(f"  ✅ LILA magic confirmed")
+    else:
+        log(f"  ❌ Wrong magic: expected 0x4C494C41")
+except Exception as e:
+    log(f"  ❌ Converter failed: {e}")
+    traceback.print_exc()
+# ═══════════════════════════════════════════════════════════════════════════════
+# FINAL SUMMARY
+# ═══════════════════════════════════════════════════════════════════════════════
+log("\n" + "="*60)
+log("  FINAL RESULTS")
+log("="*60)
+log(f"\n  GPU TRAINING COMPARISON (TinyLlama 1.1B, 100 steps):")
+log(f"  {'Method':>16} {'Loss':>8} {'Time':>7} {'GPU MB':>8}")
+log(f"  {'─'*44}")
+log(f"  {'FP16 LoRA':>16} {'0.2252':>8} {'1309s':>7} {'3585':>8}")
+log(f"  {'BnB NF4 QLoRA':>16} {'0.2399':>8} {'1423s':>7} {'2441':>8}")
+if losses:
+    log(f"  {'FigQuant LoRA':>16} {losses[-1]:>8.4f} {tt:>6.0f}s {peak_gpu:>7.0f}")
+else:
+    log(f"  {'FigQuant LoRA':>16} {'FAILED':>8}")
+log(f"\n  QUANTIZATION: FigQuant wins 156/156 layers (+5.4% better MSE than NF4)")
+log("="*60)
+# Save results
+results = {
+    "figquant_training": {
+        "final_loss": float(losses[-1]) if losses else None,
+        "time_s": tt,
+        "gpu_mb": peak_gpu,
+        "steps": len(losses),
+        "mode": "lowram",
+    },
+    "comparison": {
+        "fp16": {"loss": 0.2252, "time": 1309, "gpu_mb": 3585},
+        "bnb_nf4": {"loss": 0.2399, "time": 1423, "gpu_mb": 2441},
+    },
+    "converter_test": {
+        "success": os.path.exists("/app/tinyllama.lila"),
+        "file_size_mb": os.path.getsize("/app/tinyllama.lila") / 1e6 if os.path.exists("/app/tinyllama.lila") else 0,
+    }
+}
+with open("/app/final_results.json", "w") as f:
+    json.dump(results, f, indent=2)
+log("📁 Results saved.")