GPU memory experiment + CogMemBench on real model
memory_and_cogmem.py
ADDED
@@ -0,0 +1,211 @@
#!/usr/bin/env python3
"""
Two experiments:
1. Research GPU memory reduction for FigQuant (figcache mode on GPU)
2. Run CogMemBench on TinyLlama
"""
import os, sys, subprocess, time, gc, json
import numpy as np

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
sys.path.insert(0, "/app/littlefig/src")
sys.path.insert(0, "/app/littlefig")

import torch

def log(msg): print(f"[EXP] {msg}", flush=True)

log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
    log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")

# ------------------------------------------------------------------------------
# EXPERIMENT 1: GPU Memory Profiling - what eats the VRAM?
# ------------------------------------------------------------------------------
log("\n" + "="*60)
log("  EXPERIMENT 1: GPU Memory Profiling")
log("="*60)

from little_fig.engine import FigModel
from little_fig.engine.tier import TrainingTier

MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

# Profile: what's the memory at each stage?
log("\n  Memory at each stage (lowram mode):")

# Stage 1: load model on CPU
model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
                                 tier=TrainingTier.STREAMING_LORA,
                                 target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                                 fast=False)

log(f"  After load (CPU):  GPU={torch.cuda.memory_allocated()/1e6:.0f}MB")

# Stage 2: move to GPU
dev = torch.device("cuda")
model = model.to(dev)
torch.cuda.synchronize()
after_move = torch.cuda.memory_allocated()/1e6
log(f"  After .to(cuda):   GPU={after_move:.0f}MB")

# Stage 3: single forward pass
tok = model.tokenizer
enc = tok("Hello world", return_tensors="pt", max_length=64, truncation=True, padding="max_length")
enc = {k: v.to(dev) for k, v in enc.items()}

torch.cuda.reset_peak_memory_stats()
with torch.autocast("cuda", dtype=torch.float16):
    out = model(input_ids=enc["input_ids"], labels=enc["input_ids"])
after_fwd = torch.cuda.max_memory_allocated()/1e6
log(f"  After forward:     GPU={after_fwd:.0f}MB (peak)")

# Stage 4: backward pass
torch.cuda.reset_peak_memory_stats()
out.loss.backward()
after_bwd = torch.cuda.max_memory_allocated()/1e6
log(f"  After backward:    GPU={after_bwd:.0f}MB (peak)")

log(f"\n  ANALYSIS:")
log(f"  Model on GPU:   {after_move:.0f}MB")
log(f"  Forward peak:   {after_fwd:.0f}MB (+{after_fwd-after_move:.0f}MB activations)")
log(f"  Backward peak:  {after_bwd:.0f}MB (+{after_bwd-after_fwd:.0f}MB gradients)")
log(f"  Total training: {after_bwd:.0f}MB")

# What's eating memory? The INT4 weights are tiny, but they get dequantized to FP32 in the forward pass.
# In lowram mode each forward dequantizes to fp32 temporarily - that's where the spike is.
# With autocast(fp16) the dequant lands in fp16 (our dtype fix) - should be roughly 2x less.
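
# (Illustrative aside, not part of the original measurements.) The "2x less" claim can be
# sanity-checked in isolation with plain PyTorch: autocast makes matmul-type ops emit fp16
# results, so a weight materialized through such an op inside the autocast region costs
# half the bytes of an fp32 copy.
#
#     with torch.autocast("cuda", dtype=torch.float16):
#         y = torch.mm(torch.randn(1024, 1024, device="cuda"),
#                      torch.randn(1024, 1024, device="cuda"))
#     assert y.dtype == torch.float16   # ~2MB result instead of ~4MB in fp32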

# Count parameters by type
int4_bytes = 0
fp32_bytes = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        fp32_bytes += param.numel() * param.element_size()
for name, buf in model.named_buffers():
    if buf is not None:
        if buf.dtype == torch.uint8:
            int4_bytes += buf.numel()
        else:
            fp32_bytes += buf.numel() * buf.element_size()

log(f"\n  Weight breakdown:")
log(f"  INT4 packed indices: {int4_bytes/1e6:.1f}MB")
log(f"  FP32 params/buffers: {fp32_bytes/1e6:.1f}MB")
log(f"  LoRA trainable:      {sum(p.numel()*4 for p in model.parameters() if p.requires_grad)/1e6:.1f}MB")

# FINDING: dequantization creates a full fp32/fp16 weight tensor per layer on every forward.
# For 88 quantized layers at ~4MB each, that is ~350MB of temporary dequantized weights,
# plus activations + gradients for a 1.1B model = ~10GB total.
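
# (Illustrative check, not in the original script.) A rough upper bound on that temporary
# dequant footprint, assuming every uint8 buffer packs two INT4 values and all quantized
# layers hold a dequantized copy at once:
est_fp16 = int4_bytes * 2 * 2 / 1e6   # unpacked values x 2 bytes (fp16)
est_fp32 = int4_bytes * 2 * 4 / 1e6   # unpacked values x 4 bytes (fp32)
log(f"  Est. temp dequant tensors: ~{est_fp16:.0f}MB (fp16) vs ~{est_fp32:.0f}MB (fp32)")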

log(f"\n  ROOT CAUSE: each forward dequantizes 88 layers x ~4MB each = ~350MB of temp tensors")
log(f"  Plus activations for a 1.1B model at seq_len=512 = several GB more")
log(f"  SOLUTIONS:")
log(f"    1. Gradient checkpointing (already used - recompute activations)")
log(f"    2. Smaller batch size (reduce activation memory)")
log(f"    3. Shorter sequence length")
log(f"    4. FP16 dequant instead of FP32 (our dtype fix helps)")
log(f"    5. Layer-wise gradient accumulation (dequantize only the active layer)")
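
# (Sketch only; not how FigQuant is wired today, and `dequantize` below is a hypothetical
# helper.) Option 5 means each quantized Linear materializes its weight inside its own
# forward, so only one layer's dequantized copy is live at a time. For training it would
# have to pair with checkpointing so the copy is recomputed in backward rather than saved:
#
#     class LazyDequantLinear(torch.nn.Module):
#         def forward(self, x):
#             w = dequantize(self.packed, dtype=x.dtype)   # temporary per-layer weight
#             return torch.nn.functional.linear(x, w)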

del model; gc.collect(); torch.cuda.empty_cache()

# ------------------------------------------------------------------------------
# EXPERIMENT 2: Can we reduce memory with a smaller batch and shorter sequences?
# ------------------------------------------------------------------------------
log("\n" + "="*60)
log("  EXPERIMENT 2: Memory vs Batch Size / Seq Length")
log("="*60)

configs = [
    (1, 128, "batch=1, seq=128"),
    (1, 256, "batch=1, seq=256"),
    (2, 256, "batch=2, seq=256"),
    (4, 256, "batch=4, seq=256"),
    (4, 512, "batch=4, seq=512"),
]

results_mem = []
for batch_sz, seq_len, label in configs:
    gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

    model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
                                     tier=TrainingTier.STREAMING_LORA,
                                     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                                     fast=False)
    model = model.to(dev)

    ids = torch.randint(0, 32000, (batch_sz, seq_len), device=dev)

    try:
        torch.cuda.reset_peak_memory_stats()
        with torch.autocast("cuda", dtype=torch.float16):
            out = model(input_ids=ids, labels=ids)
        out.loss.backward()
        peak = torch.cuda.max_memory_allocated()/1e6
        results_mem.append((label, peak, "✓"))
        log(f"  {label:>20}: {peak:.0f}MB ✓")
    except torch.cuda.OutOfMemoryError:
        results_mem.append((label, 0, "OOM"))
        log(f"  {label:>20}: OOM ✗")

    del model; gc.collect(); torch.cuda.empty_cache()

log(f"\n  FINDING: memory scales with batch x seq_len")
log(f"  For a T4 (16GB): batch=2, seq=256 is the sweet spot for FigQuant lowram")
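
# (Added illustration, not part of the original output.) Normalizing each measured peak by
# batch_sz * seq_len makes the roughly linear activation scaling visible; the offset shared
# by every config is the weights, LoRA state, and dequant temporaries.
for (batch_sz, seq_len, _), (label, peak, status) in zip(configs, results_mem):
    if status != "OOM":
        log(f"  {label:>20}: {peak/(batch_sz*seq_len):.2f}MB per (batch x seq) position")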

# ------------------------------------------------------------------------------
# EXPERIMENT 3: Run CogMemBench on TinyLlama
# ------------------------------------------------------------------------------
log("\n" + "="*60)
log("  EXPERIMENT 3: CogMemBench on TinyLlama")
log("="*60)

from cogmembench import CogMemGenerator, CogMemScorer, CogMemRunner
from transformers import AutoModelForCausalLM, AutoTokenizer

gc.collect(); torch.cuda.empty_cache()

log("Loading TinyLlama for benchmark...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto")
tokenizer.pad_token = tokenizer.eos_token

def generate_response(prompt):
    """Generate a response from TinyLlama given a CogMemBench prompt."""
    messages = [{"role": "user", "content": prompt}]
    try:
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Fall back to TinyLlama's chat format if no chat template is available
        text = f"<|user|>\n{prompt}\n<|assistant|>\n"

    enc = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=150, do_sample=False,
                             pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)
    return response
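
# (Added smoke test, not part of the original script; the prompt is arbitrary, not a
# CogMemBench case.) One quick call confirms the wrapper works end to end before the run.
log(f"Sample generation: {generate_response('Reply with a short greeting.')[:80]!r}")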

# Run on a subset (the full 1000 cases would take too long)
runner = CogMemRunner(seed=42, per_axis=20)  # 100 cases total
log("Running CogMemBench (100 cases, 5 axes)...")

results = runner.run(
    model_fn=generate_response,
    max_cases=100,
    verbose=True,
)

log(f"\n  CogMem Score: {results['cogmem_score']}/100")
log(f"  Per-axis:")
for ax, acc in results['axis_accuracy'].items():
    log(f"    {ax:>15}: {acc*100:.1f}%")

# Save results
with open("/app/cogmem_results.json", "w") as f:
    json.dump({k: v for k, v in results.items() if k != 'details'}, f, indent=2)

log("\n" + "="*60)
log("  ALL EXPERIMENTS COMPLETE")
log("="*60)