littlefig-bench / memory_and_cogmem.py
ticketguy's picture
GPU memory experiment + CogMemBench on real model
ae7a539 verified
#!/usr/bin/env python3
"""
Two experiments:
1. Research GPU memory reduction for FigQuant (figcache mode on GPU)
2. Run CogMemBench on TinyLlama
"""
import os, sys, subprocess, time, gc, json
import numpy as np
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
"transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
sys.path.insert(0, "/app/littlefig/src")
sys.path.insert(0, "/app/littlefig")
import torch
def log(msg): print(f"[EXP] {msg}", flush=True)
log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 1: GPU Memory Profiling β€” what eats the VRAM?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log(" EXPERIMENT 1: GPU Memory Profiling")
log("="*60)
from little_fig.engine import FigModel
from little_fig.engine.tier import TrainingTier
MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
# Profile: what's the memory at each stage?
log("\n Memory at each stage (lowram mode):")
# Stage 1: load model on CPU
model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
fast=False)
log(f" After load (CPU): GPU={torch.cuda.memory_allocated()/1e6:.0f}MB")
# Stage 2: move to GPU
dev = torch.device("cuda")
model = model.to(dev)
torch.cuda.synchronize()
after_move = torch.cuda.memory_allocated()/1e6
log(f" After .to(cuda): GPU={after_move:.0f}MB")
# Stage 3: single forward pass
tok = model.tokenizer
enc = tok("Hello world", return_tensors="pt", max_length=64, truncation=True, padding="max_length")
enc = {k: v.to(dev) for k, v in enc.items()}
torch.cuda.reset_peak_memory_stats()
with torch.autocast("cuda", dtype=torch.float16):
out = model(input_ids=enc["input_ids"], labels=enc["input_ids"])
after_fwd = torch.cuda.max_memory_allocated()/1e6
log(f" After forward: GPU={after_fwd:.0f}MB (peak)")
# Stage 4: backward pass
torch.cuda.reset_peak_memory_stats()
out.loss.backward()
after_bwd = torch.cuda.max_memory_allocated()/1e6
log(f" After backward: GPU={after_bwd:.0f}MB (peak)")
log(f"\n ANALYSIS:")
log(f" Model on GPU: {after_move:.0f}MB")
log(f" Forward peak: {after_fwd:.0f}MB (+{after_fwd-after_move:.0f}MB activations)")
log(f" Backward peak: {after_bwd:.0f}MB (+{after_bwd-after_fwd:.0f}MB gradients)")
log(f" Total training: {after_bwd:.0f}MB")
# What's eating memory? The INT4 weights are tiny, but they get dequantized to FP32 in forward
# In lowram mode: each forward dequants to fp32 temporarily β†’ that's where the spike is
# With autocast(fp16): the dequant goes to fp16 (our dtype fix) β†’ should be 2Γ— less
# Count parameters by type
int4_bytes = 0
fp32_bytes = 0
for name, param in model.named_parameters():
if param.requires_grad:
fp32_bytes += param.numel() * param.element_size()
for name, buf in model.named_buffers():
if buf is not None:
if buf.dtype == torch.uint8:
int4_bytes += buf.numel()
else:
fp32_bytes += buf.numel() * buf.element_size()
log(f"\n Weight breakdown:")
log(f" INT4 packed indices: {int4_bytes/1e6:.1f}MB")
log(f" FP32 params/buffers: {fp32_bytes/1e6:.1f}MB")
log(f" LoRA trainable: {sum(p.numel()*4 for p in model.parameters() if p.requires_grad)/1e6:.1f}MB")
# FINDING: The issue is that dequant creates full fp32/fp16 weight tensors per layer per forward
# For 88 quantized layers at ~4MB each = ~350MB of temporary dequantized weights
# Plus activations + gradients for a 1.1B model = total ~10GB
log(f"\n ROOT CAUSE: Each forward dequantizes 88 layers Γ— ~4MB each = ~350MB temp tensors")
log(f" Plus activations for 1.1B model at seq_len=512 = ~several GB")
log(f" SOLUTIONS:")
log(f" 1. Gradient checkpointing (already used β€” recompute activations)")
log(f" 2. Smaller batch size (reduce activation memory)")
log(f" 3. Shorter sequence length")
log(f" 4. FP16 dequant instead of FP32 (our dtype fix helps)")
log(f" 5. Layer-wise gradient accumulation (dequant only active layer)")
del model; gc.collect(); torch.cuda.empty_cache()
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 2: Can we reduce memory by using smaller batch + shorter seq?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log(" EXPERIMENT 2: Memory vs Batch Size/Seq Length")
log("="*60)
configs = [
(1, 128, "batch=1, seq=128"),
(1, 256, "batch=1, seq=256"),
(2, 256, "batch=2, seq=256"),
(4, 256, "batch=4, seq=256"),
(4, 512, "batch=4, seq=512"),
]
results_mem = []
for batch_sz, seq_len, label in configs:
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
fast=False)
model = model.to(dev)
ids = torch.randint(0, 32000, (batch_sz, seq_len), device=dev)
try:
torch.cuda.reset_peak_memory_stats()
with torch.autocast("cuda", dtype=torch.float16):
out = model(input_ids=ids, labels=ids)
out.loss.backward()
peak = torch.cuda.max_memory_allocated()/1e6
results_mem.append((label, peak, "βœ“"))
log(f" {label:>20}: {peak:.0f}MB βœ“")
except torch.cuda.OutOfMemoryError:
results_mem.append((label, 0, "OOM"))
log(f" {label:>20}: OOM βœ—")
del model; gc.collect(); torch.cuda.empty_cache()
log(f"\n FINDING: Memory scales with batch Γ— seq_len")
log(f" For T4 (16GB): batch=2, seq=256 is the sweet spot for FigQuant lowram")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 3: Run CogMemBench on TinyLlama
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log(" EXPERIMENT 3: CogMemBench on TinyLlama")
log("="*60)
from cogmembench import CogMemGenerator, CogMemScorer, CogMemRunner
from transformers import AutoModelForCausalLM, AutoTokenizer
gc.collect(); torch.cuda.empty_cache()
log("Loading TinyLlama for benchmark...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto")
tokenizer.pad_token = tokenizer.eos_token
def generate_response(prompt):
"""Generate a response from TinyLlama given a CogMemBench prompt."""
messages = [{"role": "user", "content": prompt}]
try:
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
except:
text = f"<|user|>\n{prompt}\n<|assistant|>\n"
enc = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
with torch.no_grad():
out = model.generate(**enc, max_new_tokens=150, do_sample=False,
pad_token_id=tokenizer.eos_token_id)
response = tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)
return response
# Run on a subset (full 1000 would take too long)
runner = CogMemRunner(seed=42, per_axis=20) # 100 total cases
log("Running CogMemBench (100 cases, 5 axes)...")
results = runner.run(
model_fn=generate_response,
max_cases=100,
verbose=True,
)
log(f"\n CogMem Score: {results['cogmem_score']}/100")
log(f" Per-axis:")
for ax, acc in results['axis_accuracy'].items():
log(f" {ax:>15}: {acc*100:.1f}%")
# Save results
with open("/app/cogmem_results.json", "w") as f:
json.dump({k: v for k, v in results.items() if k != 'details'}, f, indent=2)
log("\n" + "="*60)
log(" ALL EXPERIMENTS COMPLETE")
log("="*60)