#!/usr/bin/env python3
"""
Three experiments:
1. Profile GPU memory for FigQuant's lowram mode on GPU: what eats the VRAM?
2. Measure peak training memory across batch size / sequence length combos.
3. Run CogMemBench on TinyLlama.
"""
import os, sys, subprocess, gc, json

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
    "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
sys.path.insert(0, "/app/littlefig/src")
sys.path.insert(0, "/app/littlefig")

import torch

def log(msg): print(f"[EXP] {msg}", flush=True)

log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
    log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
else:
    sys.exit("These experiments require a CUDA GPU")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 1: GPU Memory Profiling β€” what eats the VRAM?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log("  EXPERIMENT 1: GPU Memory Profiling")
log("="*60)

from little_fig.engine import FigModel
from little_fig.engine.tier import TrainingTier

MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

# Profile: what's the memory at each stage?
log("\n  Memory at each stage (lowram mode):")

# Stage 1: load model on CPU
model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
    tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
    fast=False)

log(f"  After load (CPU):     GPU={torch.cuda.memory_allocated()/1e6:.0f}MB")

# Stage 2: move to GPU
dev = torch.device("cuda")
model = model.to(dev)
torch.cuda.synchronize()
after_move = torch.cuda.memory_allocated()/1e6
log(f"  After .to(cuda):      GPU={after_move:.0f}MB")

# Stage 3: single forward pass
tok = model.tokenizer
if tok.pad_token is None:  # padding="max_length" requires a pad token
    tok.pad_token = tok.eos_token
enc = tok("Hello world", return_tensors="pt", max_length=64, truncation=True, padding="max_length")
enc = {k: v.to(dev) for k, v in enc.items()}

torch.cuda.reset_peak_memory_stats()
with torch.autocast("cuda", dtype=torch.float16):
    out = model(input_ids=enc["input_ids"], labels=enc["input_ids"])
after_fwd = torch.cuda.max_memory_allocated()/1e6
log(f"  After forward:        GPU={after_fwd:.0f}MB (peak)")

# Stage 4: backward pass
torch.cuda.reset_peak_memory_stats()
out.loss.backward()
after_bwd = torch.cuda.max_memory_allocated()/1e6
log(f"  After backward:       GPU={after_bwd:.0f}MB (peak)")

log(f"\n  ANALYSIS:")
log(f"    Model on GPU:       {after_move:.0f}MB")
log(f"    Forward peak:       {after_fwd:.0f}MB (+{after_fwd-after_move:.0f}MB activations)")
log(f"    Backward peak:      {after_bwd:.0f}MB (+{after_bwd-after_fwd:.0f}MB gradients)")
log(f"    Total training:     {after_bwd:.0f}MB")

# What's eating memory? The INT4 weights themselves are tiny, but every forward
# pass dequantizes them into full-width tensors. In lowram mode the dequant goes
# to fp32 temporarily → that's where the spike is. Under autocast(fp16) the
# dequant lands in fp16 (our dtype fix) → the spike should be about 2× smaller.
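
# A minimal sketch of that dequant step, assuming a common packing scheme
# (two 4-bit codebook indices per uint8, low nibble first). little_fig's
# actual layout and kernel may differ; this is illustrative, not its API.
def _dequant_int4_sketch(packed, codebook, dtype=torch.float16):
    lo = (packed & 0x0F).long()          # first index stored in each byte
    hi = ((packed >> 4) & 0x0F).long()   # second index
    idx = torch.stack((lo, hi), dim=-1).reshape(-1)
    # The lookup materializes a full-width tensor twice the packed byte count:
    return codebook.to(dtype)[idx]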

# Count bytes by storage type: uint8 buffers hold the packed INT4 indices
# (two 4-bit values per byte); everything else is full-precision.
int4_bytes = 0
fp32_bytes = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        fp32_bytes += param.numel() * param.element_size()
for name, buf in model.named_buffers():
    if buf is not None:
        if buf.dtype == torch.uint8:
            int4_bytes += buf.numel()  # one byte per packed pair of weights
        else:
            fp32_bytes += buf.numel() * buf.element_size()

log(f"\n  Weight breakdown:")
log(f"    INT4 packed indices: {int4_bytes/1e6:.1f}MB")
log(f"    FP32 params/buffers: {fp32_bytes/1e6:.1f}MB")
log(f"    LoRA trainable:     {sum(p.numel()*4 for p in model.parameters() if p.requires_grad)/1e6:.1f}MB")

# FINDING: dequant creates a full fp32/fp16 weight tensor per layer per forward.
# For 88 quantized layers at ~4MB each, that is ~350MB of temporary dequantized
# weights, plus activations + gradients for a 1.1B model = total ~10GB.

log(f"\n  ROOT CAUSE: Each forward dequantizes 88 layers × ~4MB each = ~350MB temp tensors")
log(f"  Plus activations for a 1.1B model at seq_len=512 = several GB more")
log(f"  SOLUTIONS:")
log(f"    1. Gradient checkpointing (already used; recomputes activations)")
log(f"    2. Smaller batch size (reduces activation memory)")
log(f"    3. Shorter sequence length")
log(f"    4. FP16 dequant instead of FP32 (our dtype fix helps)")
log(f"    5. Layer-wise gradient accumulation (dequant only the active layer)")

del model; gc.collect(); torch.cuda.empty_cache()

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 2: Can we reduce memory by using smaller batch + shorter seq?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log("  EXPERIMENT 2: Memory vs Batch Size/Seq Length")
log("="*60)

configs = [
    (1, 128, "batch=1, seq=128"),
    (1, 256, "batch=1, seq=256"),
    (2, 256, "batch=2, seq=256"),
    (4, 256, "batch=4, seq=256"),
    (4, 512, "batch=4, seq=512"),
]

results_mem = []
for batch_sz, seq_len, label in configs:
    gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
    
    model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32,
        tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"],
        fast=False)
    model = model.to(dev)
    
    ids = torch.randint(0, 32000, (batch_sz, seq_len), device=dev)
    
    try:
        torch.cuda.reset_peak_memory_stats()
        with torch.autocast("cuda", dtype=torch.float16):
            out = model(input_ids=ids, labels=ids)
        out.loss.backward()
        peak = torch.cuda.max_memory_allocated()/1e6
        results_mem.append((label, peak, "✓"))
        log(f"  {label:>20}: {peak:.0f}MB ✓")
    except torch.cuda.OutOfMemoryError:
        results_mem.append((label, 0, "OOM"))
        log(f"  {label:>20}: OOM ✗")
    
    del model; gc.collect(); torch.cuda.empty_cache()

log(f"\n  FINDING: Memory scales with batch Γ— seq_len")
log(f"  For T4 (16GB): batch=2, seq=256 is the sweet spot for FigQuant lowram")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT 3: Run CogMemBench on TinyLlama
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log("  EXPERIMENT 3: CogMemBench on TinyLlama")
log("="*60)

from cogmembench import CogMemRunner
from transformers import AutoModelForCausalLM, AutoTokenizer

gc.collect(); torch.cuda.empty_cache()

log("Loading TinyLlama for benchmark...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto")
tokenizer.pad_token = tokenizer.eos_token

def generate_response(prompt):
    """Generate a response from TinyLlama given a CogMemBench prompt."""
    messages = [{"role": "user", "content": prompt}]
    try:
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:  # tokenizer may lack a chat template; fall back to TinyLlama's format
        text = f"<|user|>\n{prompt}\n<|assistant|>\n"
    
    enc = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=150, do_sample=False,
                            pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)
    return response
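
# Quick smoke test of the wrapper before the full run (hypothetical prompt;
# the reply is only eyeballed here, not scored).
log(f"Smoke test reply: {generate_response('Name three primary colors.')[:80]!r}")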

# Run on a subset (full 1000 would take too long)
runner = CogMemRunner(seed=42, per_axis=20)  # 100 total cases
log("Running CogMemBench (100 cases, 5 axes)...")

results = runner.run(
    model_fn=generate_response,
    max_cases=100,
    verbose=True,
)

log(f"\n  CogMem Score: {results['cogmem_score']}/100")
log(f"  Per-axis:")
for ax, acc in results['axis_accuracy'].items():
    log(f"    {ax:>15}: {acc*100:.1f}%")

# Save results
with open("/app/cogmem_results.json", "w") as f:
    json.dump({k: v for k, v in results.items() if k != 'details'}, f, indent=2)

log("\n" + "="*60)
log("  ALL EXPERIMENTS COMPLETE")
log("="*60)