littlefig-bench / memory_and_cogmem.py

GPU memory experiment + CogMemBench on real model

ae7a539 verified about 3 hours ago

9.31 kB

	#!/usr/bin/env python3
	"""
	Two experiments:
	1. Research GPU memory reduction for FigQuant (figcache mode on GPU)
	2. Run CogMemBench on TinyLlama
	"""
	import os, sys, subprocess, time, gc, json
	import numpy as np

	subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
	"transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
	subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
	subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
	sys.path.insert(0, "/app/littlefig/src")
	sys.path.insert(0, "/app/littlefig")

	import torch

	def log(msg): print(f"[EXP] {msg}", flush=True)

	log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
	if torch.cuda.is_available():
	log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT 1: GPU Memory Profiling — what eats the VRAM?
	# ═══════════════════════════════════════════════════════════════════════════════
	log("\n" + "="*60)
	log(" EXPERIMENT 1: GPU Memory Profiling")
	log("="*60)

	from little_fig.engine import FigModel
	from little_fig.engine.tier import TrainingTier

	MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	# Profile: what's the memory at each stage?
	log("\n Memory at each stage (lowram mode):")

	# Stage 1: load model on CPU
	model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
	tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
	fast=False)

	log(f" After load (CPU): GPU={torch.cuda.memory_allocated()/1e6:.0f}MB")

	# Stage 2: move to GPU
	dev = torch.device("cuda")
	model = model.to(dev)
	torch.cuda.synchronize()
	after_move = torch.cuda.memory_allocated()/1e6
	log(f" After .to(cuda): GPU={after_move:.0f}MB")

	# Stage 3: single forward pass
	tok = model.tokenizer
	enc = tok("Hello world", return_tensors="pt", max_length=64, truncation=True, padding="max_length")
	enc = {k: v.to(dev) for k, v in enc.items()}

	torch.cuda.reset_peak_memory_stats()
	with torch.autocast("cuda", dtype=torch.float16):
	out = model(input_ids=enc["input_ids"], labels=enc["input_ids"])
	after_fwd = torch.cuda.max_memory_allocated()/1e6
	log(f" After forward: GPU={after_fwd:.0f}MB (peak)")

	# Stage 4: backward pass
	torch.cuda.reset_peak_memory_stats()
	out.loss.backward()
	after_bwd = torch.cuda.max_memory_allocated()/1e6
	log(f" After backward: GPU={after_bwd:.0f}MB (peak)")

	log(f"\n ANALYSIS:")
	log(f" Model on GPU: {after_move:.0f}MB")
	log(f" Forward peak: {after_fwd:.0f}MB (+{after_fwd-after_move:.0f}MB activations)")
	log(f" Backward peak: {after_bwd:.0f}MB (+{after_bwd-after_fwd:.0f}MB gradients)")
	log(f" Total training: {after_bwd:.0f}MB")

	# What's eating memory? The INT4 weights are tiny, but they get dequantized to FP32 in forward
	# In lowram mode: each forward dequants to fp32 temporarily → that's where the spike is
	# With autocast(fp16): the dequant goes to fp16 (our dtype fix) → should be 2× less

	# Count parameters by type
	int4_bytes = 0
	fp32_bytes = 0
	for name, param in model.named_parameters():
	if param.requires_grad:
	fp32_bytes += param.numel() * param.element_size()
	for name, buf in model.named_buffers():
	if buf is not None:
	if buf.dtype == torch.uint8:
	int4_bytes += buf.numel()
	else:
	fp32_bytes += buf.numel() * buf.element_size()

	log(f"\n Weight breakdown:")
	log(f" INT4 packed indices: {int4_bytes/1e6:.1f}MB")
	log(f" FP32 params/buffers: {fp32_bytes/1e6:.1f}MB")
	log(f" LoRA trainable: {sum(p.numel()*4 for p in model.parameters() if p.requires_grad)/1e6:.1f}MB")

	# FINDING: The issue is that dequant creates full fp32/fp16 weight tensors per layer per forward
	# For 88 quantized layers at ~4MB each = ~350MB of temporary dequantized weights
	# Plus activations + gradients for a 1.1B model = total ~10GB

	log(f"\n ROOT CAUSE: Each forward dequantizes 88 layers × ~4MB each = ~350MB temp tensors")
	log(f" Plus activations for 1.1B model at seq_len=512 = ~several GB")
	log(f" SOLUTIONS:")
	log(f" 1. Gradient checkpointing (already used — recompute activations)")
	log(f" 2. Smaller batch size (reduce activation memory)")
	log(f" 3. Shorter sequence length")
	log(f" 4. FP16 dequant instead of FP32 (our dtype fix helps)")
	log(f" 5. Layer-wise gradient accumulation (dequant only active layer)")

	del model; gc.collect(); torch.cuda.empty_cache()

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT 2: Can we reduce memory by using smaller batch + shorter seq?
	# ═══════════════════════════════════════════════════════════════════════════════
	log("\n" + "="*60)
	log(" EXPERIMENT 2: Memory vs Batch Size/Seq Length")
	log("="*60)

	configs = [
	(1, 128, "batch=1, seq=128"),
	(1, 256, "batch=1, seq=256"),
	(2, 256, "batch=2, seq=256"),
	(4, 256, "batch=4, seq=256"),
	(4, 512, "batch=4, seq=512"),
	]

	results_mem = []
	for batch_sz, seq_len, label in configs:
	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
	tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
	fast=False)
	model = model.to(dev)

	ids = torch.randint(0, 32000, (batch_sz, seq_len), device=dev)

	try:
	torch.cuda.reset_peak_memory_stats()
	with torch.autocast("cuda", dtype=torch.float16):
	out = model(input_ids=ids, labels=ids)
	out.loss.backward()
	peak = torch.cuda.max_memory_allocated()/1e6
	results_mem.append((label, peak, "✓"))
	log(f" {label:>20}: {peak:.0f}MB ✓")
	except torch.cuda.OutOfMemoryError:
	results_mem.append((label, 0, "OOM"))
	log(f" {label:>20}: OOM ✗")

	del model; gc.collect(); torch.cuda.empty_cache()

	log(f"\n FINDING: Memory scales with batch × seq_len")
	log(f" For T4 (16GB): batch=2, seq=256 is the sweet spot for FigQuant lowram")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT 3: Run CogMemBench on TinyLlama
	# ═══════════════════════════════════════════════════════════════════════════════
	log("\n" + "="*60)
	log(" EXPERIMENT 3: CogMemBench on TinyLlama")
	log("="*60)

	from cogmembench import CogMemGenerator, CogMemScorer, CogMemRunner
	from transformers import AutoModelForCausalLM, AutoTokenizer

	gc.collect(); torch.cuda.empty_cache()

	log("Loading TinyLlama for benchmark...")
	tokenizer = AutoTokenizer.from_pretrained(MODEL)
	model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto")
	tokenizer.pad_token = tokenizer.eos_token

	def generate_response(prompt):
	"""Generate a response from TinyLlama given a CogMemBench prompt."""
	messages = [{"role": "user", "content": prompt}]
	try:
	text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	except:
	text = f"<\|user\|>\n{prompt}\n<\|assistant\|>\n"

	enc = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
	with torch.no_grad():
	out = model.generate(**enc, max_new_tokens=150, do_sample=False,
	pad_token_id=tokenizer.eos_token_id)
	response = tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)
	return response

	# Run on a subset (full 1000 would take too long)
	runner = CogMemRunner(seed=42, per_axis=20) # 100 total cases
	log("Running CogMemBench (100 cases, 5 axes)...")

	results = runner.run(
	model_fn=generate_response,
	max_cases=100,
	verbose=True,
	)

	log(f"\n CogMem Score: {results['cogmem_score']}/100")
	log(f" Per-axis:")
	for ax, acc in results['axis_accuracy'].items():
	log(f" {ax:>15}: {acc*100:.1f}%")

	# Save results
	with open("/app/cogmem_results.json", "w") as f:
	json.dump({k: v for k, v in results.items() if k != 'details'}, f, indent=2)

	log("\n" + "="*60)
	log(" ALL EXPERIMENTS COMPLETE")
	log("="*60)