| |
| """ |
| Two experiments: |
| 1. Research GPU memory reduction for FigQuant (figcache mode on GPU) |
| 2. Run CogMemBench on TinyLlama |
| """ |
| import os, sys, subprocess, time, gc, json |
| import numpy as np |
|
|
| subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", |
| "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"]) |
| subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"]) |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"]) |
| sys.path.insert(0, "/app/littlefig/src") |
| sys.path.insert(0, "/app/littlefig") |
|
|
| import torch |
|
|
| def log(msg): print(f"[EXP] {msg}", flush=True) |
|
|
| log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}") |
| if torch.cuda.is_available(): |
| log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)") |
|
|
| |
| |
| |
| log("\n" + "="*60) |
| log(" EXPERIMENT 1: GPU Memory Profiling") |
| log("="*60) |
|
|
| from little_fig.engine import FigModel |
| from little_fig.engine.tier import TrainingTier |
|
|
| MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" |
| gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats() |
|
|
| |
| log("\n Memory at each stage (lowram mode):") |
|
|
| |
| model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32, |
| tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"], |
| fast=False) |
|
|
| log(f" After load (CPU): GPU={torch.cuda.memory_allocated()/1e6:.0f}MB") |
|
|
| |
| dev = torch.device("cuda") |
| model = model.to(dev) |
| torch.cuda.synchronize() |
| after_move = torch.cuda.memory_allocated()/1e6 |
| log(f" After .to(cuda): GPU={after_move:.0f}MB") |
|
|
| |
| tok = model.tokenizer |
| enc = tok("Hello world", return_tensors="pt", max_length=64, truncation=True, padding="max_length") |
| enc = {k: v.to(dev) for k, v in enc.items()} |
|
|
| torch.cuda.reset_peak_memory_stats() |
| with torch.autocast("cuda", dtype=torch.float16): |
| out = model(input_ids=enc["input_ids"], labels=enc["input_ids"]) |
| after_fwd = torch.cuda.max_memory_allocated()/1e6 |
| log(f" After forward: GPU={after_fwd:.0f}MB (peak)") |
|
|
| |
| torch.cuda.reset_peak_memory_stats() |
| out.loss.backward() |
| after_bwd = torch.cuda.max_memory_allocated()/1e6 |
| log(f" After backward: GPU={after_bwd:.0f}MB (peak)") |
|
|
| log(f"\n ANALYSIS:") |
| log(f" Model on GPU: {after_move:.0f}MB") |
| log(f" Forward peak: {after_fwd:.0f}MB (+{after_fwd-after_move:.0f}MB activations)") |
| log(f" Backward peak: {after_bwd:.0f}MB (+{after_bwd-after_fwd:.0f}MB gradients)") |
| log(f" Total training: {after_bwd:.0f}MB") |
|
|
| |
| |
| |
|
|
| |
| int4_bytes = 0 |
| fp32_bytes = 0 |
| for name, param in model.named_parameters(): |
| if param.requires_grad: |
| fp32_bytes += param.numel() * param.element_size() |
| for name, buf in model.named_buffers(): |
| if buf is not None: |
| if buf.dtype == torch.uint8: |
| int4_bytes += buf.numel() |
| else: |
| fp32_bytes += buf.numel() * buf.element_size() |
|
|
| log(f"\n Weight breakdown:") |
| log(f" INT4 packed indices: {int4_bytes/1e6:.1f}MB") |
| log(f" FP32 params/buffers: {fp32_bytes/1e6:.1f}MB") |
| log(f" LoRA trainable: {sum(p.numel()*4 for p in model.parameters() if p.requires_grad)/1e6:.1f}MB") |
|
|
| |
| |
| |
|
|
| log(f"\n ROOT CAUSE: Each forward dequantizes 88 layers Γ ~4MB each = ~350MB temp tensors") |
| log(f" Plus activations for 1.1B model at seq_len=512 = ~several GB") |
| log(f" SOLUTIONS:") |
| log(f" 1. Gradient checkpointing (already used β recompute activations)") |
| log(f" 2. Smaller batch size (reduce activation memory)") |
| log(f" 3. Shorter sequence length") |
| log(f" 4. FP16 dequant instead of FP32 (our dtype fix helps)") |
| log(f" 5. Layer-wise gradient accumulation (dequant only active layer)") |
|
|
| del model; gc.collect(); torch.cuda.empty_cache() |
|
|
| |
| |
| |
| log("\n" + "="*60) |
| log(" EXPERIMENT 2: Memory vs Batch Size/Seq Length") |
| log("="*60) |
|
|
| configs = [ |
| (1, 128, "batch=1, seq=128"), |
| (1, 256, "batch=1, seq=256"), |
| (2, 256, "batch=2, seq=256"), |
| (4, 256, "batch=4, seq=256"), |
| (4, 512, "batch=4, seq=512"), |
| ] |
|
|
| results_mem = [] |
| for batch_sz, seq_len, label in configs: |
| gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats() |
| |
| model = FigModel.from_pretrained(MODEL, lora_r=16, lora_alpha=32, |
| tier=TrainingTier.STREAMING_LORA, target_modules=["q_proj","k_proj","v_proj","o_proj"], |
| fast=False) |
| model = model.to(dev) |
| |
| ids = torch.randint(0, 32000, (batch_sz, seq_len), device=dev) |
| |
| try: |
| torch.cuda.reset_peak_memory_stats() |
| with torch.autocast("cuda", dtype=torch.float16): |
| out = model(input_ids=ids, labels=ids) |
| out.loss.backward() |
| peak = torch.cuda.max_memory_allocated()/1e6 |
| results_mem.append((label, peak, "β")) |
| log(f" {label:>20}: {peak:.0f}MB β") |
| except torch.cuda.OutOfMemoryError: |
| results_mem.append((label, 0, "OOM")) |
| log(f" {label:>20}: OOM β") |
| |
| del model; gc.collect(); torch.cuda.empty_cache() |
|
|
| log(f"\n FINDING: Memory scales with batch Γ seq_len") |
| log(f" For T4 (16GB): batch=2, seq=256 is the sweet spot for FigQuant lowram") |
|
|
| |
| |
| |
| log("\n" + "="*60) |
| log(" EXPERIMENT 3: CogMemBench on TinyLlama") |
| log("="*60) |
|
|
| from cogmembench import CogMemGenerator, CogMemScorer, CogMemRunner |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
| gc.collect(); torch.cuda.empty_cache() |
|
|
| log("Loading TinyLlama for benchmark...") |
| tokenizer = AutoTokenizer.from_pretrained(MODEL) |
| model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto") |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
| def generate_response(prompt): |
| """Generate a response from TinyLlama given a CogMemBench prompt.""" |
| messages = [{"role": "user", "content": prompt}] |
| try: |
| text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
| except: |
| text = f"<|user|>\n{prompt}\n<|assistant|>\n" |
| |
| enc = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda") |
| with torch.no_grad(): |
| out = model.generate(**enc, max_new_tokens=150, do_sample=False, |
| pad_token_id=tokenizer.eos_token_id) |
| response = tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=True) |
| return response |
|
|
| |
| runner = CogMemRunner(seed=42, per_axis=20) |
| log("Running CogMemBench (100 cases, 5 axes)...") |
|
|
| results = runner.run( |
| model_fn=generate_response, |
| max_cases=100, |
| verbose=True, |
| ) |
|
|
| log(f"\n CogMem Score: {results['cogmem_score']}/100") |
| log(f" Per-axis:") |
| for ax, acc in results['axis_accuracy'].items(): |
| log(f" {ax:>15}: {acc*100:.1f}%") |
|
|
| |
| with open("/app/cogmem_results.json", "w") as f: |
| json.dump({k: v for k, v in results.items() if k != 'details'}, f, indent=2) |
|
|
| log("\n" + "="*60) |
| log(" ALL EXPERIMENTS COMPLETE") |
| log("="*60) |
|
|