#!/usr/bin/env python3
r'''
LiveCodeBench v6 Evaluation Script
==================================

Compare the base model against the SSD-trained model on LiveCodeBench v6.
Expected: a 10+ point improvement on hard-difficulty problems.

Usage:
    pip install humanevalpack lm-evaluation-harness
    python evaluate_lcb.py --num_samples 100 --difficulty hard

Note: Full evaluation requires an execution sandbox (livecodebench package).
'''
import os

# Reduce CUDA memory fragmentation; must be set before torch initializes CUDA.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from peft import PeftModel
from transformers import AutoModelForImageTextToText, AutoTokenizer

BASE_MODEL = 'google/gemma-4-E2B-it'
ADAPTER_REPO = 'ludsvick/gemma-4-E2B-it-SSD'
TEMP = 0.6  # Per the SSD paper: evaluate at low temperature


def main():
    print('Loading model...')
    model = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL, torch_dtype=torch.bfloat16, device_map='auto'
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print('Merging LoRA adapter...')
    model = PeftModel.from_pretrained(model, ADAPTER_REPO)
    model = model.merge_and_unload()
    model.eval()

    print(f'Model ready: https://huggingface.co/{ADAPTER_REPO}')
    print('To run the full LiveCodeBench evaluation, install livecodebench and run:')
    print('  python -m livecodebench.eval --model <path-to-merged-model> --dataset v6')


if __name__ == '__main__':
    main()
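
# ---------------------------------------------------------------------------
# Sketch: sampling a completion at the paper's eval temperature.
#
# The script defines TEMP but never generates with it, so here is a minimal,
# hedged sketch of how one candidate solution might be sampled from the
# merged model. The prompt format, the default `max_new_tokens`, and the
# function name `generate_solution` are illustrative assumptions, not part of
# the LiveCodeBench harness; only the standard transformers generate() API
# is relied on.
# ---------------------------------------------------------------------------
def generate_solution(model, tokenizer, problem_statement, max_new_tokens=1024):
    '''Sample one candidate solution for a problem at temperature TEMP.'''
    messages = [{'role': 'user', 'content': problem_statement}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors='pt'
    ).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            do_sample=True,
            temperature=TEMP,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Strip the prompt tokens and return only the newly generated text.
    return tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)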
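
# ---------------------------------------------------------------------------
# Sketch: smoke-testing a candidate program against sample I/O.
#
# Full scoring requires the livecodebench execution sandbox (see the note in
# the module docstring). For a quick local check, a stdin/stdout candidate
# program can be run in a subprocess with a timeout. This is an unsandboxed
# stand-in, not the benchmark's grader: `run_sample_tests` and the
# (stdin, expected_stdout) test format are assumptions for illustration.
# ---------------------------------------------------------------------------
import subprocess
import sys


def run_sample_tests(code, tests, timeout=10):
    '''Return True if `code` passes every (stdin, expected_stdout) pair.'''
    for stdin, expected in tests:
        try:
            result = subprocess.run(
                [sys.executable, '-c', code],
                input=stdin,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return False  # Treat a hang as a failed test.
        if result.returncode != 0 or result.stdout.strip() != expected.strip():
            return False
    return True

# Example (hypothetical): run_sample_tests(candidate, [('1 2\n', '3')])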