# evaluate_lcb.py — from the Hugging Face repo ludsvick/gemma-4-E2B-it-SSD
# (uploaded by ludsvick, commit 05c3c25, verified)
#!/usr/bin/env python3
r'''
LiveCodeBench v6 Evaluation Script
==================================
Compare base model vs SSD-trained model on LiveCodeBench v6.
Expected: 10+ point improvement on hard-difficulty problems.
Usage:
pip install humanevalpack lm-evaluation-harness
python evaluate_lcb.py --num_samples 100 --difficulty hard
Note: Full evaluation requires execution sandbox (livecodebench package).
'''
import os, sys  # NOTE(review): `sys` is not used in this file — confirm before removing
# Allow the CUDA caching allocator to grow segments instead of failing with
# fragmentation OOMs; must be set before torch initializes CUDA.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import torch
from transformers import AutoModelForImageTextToText, AutoTokenizer
from peft import PeftModel, PeftConfig  # NOTE(review): PeftConfig is not used in this file
# Base checkpoint the LoRA adapter was trained on.
BASE_MODEL = 'google/gemma-4-E2B-it'
# Hub repo holding the SSD LoRA adapter weights.
ADAPTER_REPO = 'ludsvick/gemma-4-E2B-it-SSD'
TEMP = 0.6 # Per SSD paper: eval at low temperature
# NOTE(review): TEMP is never referenced in this script — presumably intended
# for the external livecodebench invocation; verify.
def main():
    """Prepare the merged SSD model for LiveCodeBench v6 evaluation.

    Loads the base Gemma model, attaches the SSD LoRA adapter from the Hub,
    folds the adapter weights into the base model, and prints the command to
    run the actual benchmark (which requires the external ``livecodebench``
    package and an execution sandbox).

    No return value; all output goes to stdout.
    """
    print('Loading model...')
    model = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL, torch_dtype=torch.bfloat16, device_map='auto'
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        # Some tokenizers ship without a pad token; reuse EOS so batched
        # generation does not fail.
        tokenizer.pad_token = tokenizer.eos_token
    print('Merging LoRA adapter...')
    model = PeftModel.from_pretrained(model, ADAPTER_REPO)
    # Fold the LoRA weights into the base model so it can be used standalone.
    # (The original called model.eval() both before and after merging; the
    # pre-merge call was redundant — merge_and_unload() returns a new module.)
    model = model.merge_and_unload()
    model.eval()
    print(f'Model ready: https://huggingface.co/{ADAPTER_REPO}')
    print('To run full LiveCodeBench evaluation, install livecodebench and run:')
    print(' python -m livecodebench.eval --model <your-merged-model> --dataset v6')


if __name__ == '__main__':
    main()