r'''
LiveCodeBench v6 Evaluation Script
==================================
Compare the base model against the SSD-trained (LoRA) model on LiveCodeBench v6.
Expected: 10+ point improvement on hard-difficulty problems.

Usage:
    pip install humanevalpack lm-evaluation-harness
    python evaluate_lcb.py --num_samples 100 --difficulty hard

Note: full evaluation requires an execution sandbox (the livecodebench package).
This script itself only loads and merges the model; the CLI flags shown above
appear to be intended for the external evaluation harness, as they are not
parsed here.
'''
|
|
import os
import sys

# Set before torch is imported so the CUDA caching allocator picks this up;
# expandable segments reduce fragmentation while loading/merging large models.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from transformers import AutoModelForImageTextToText, AutoTokenizer
from peft import PeftModel, PeftConfig

# Hugging Face Hub identifiers: base checkpoint and the LoRA adapter to merge.
BASE_MODEL = 'google/gemma-4-E2B-it'
ADAPTER_REPO = 'ludsvick/gemma-4-E2B-it-SSD'
# NOTE(review): TEMP is never read in this script — presumably a sampling
# temperature meant for the downstream eval harness; confirm before removing.
TEMP = 0.6
|
|
|
|
def main():
    """Load the base model, merge the SSD LoRA adapter into it, and print
    instructions for running the full LiveCodeBench evaluation.

    Side effects: downloads the base checkpoint and adapter weights from the
    Hugging Face Hub and places them on available devices (device_map='auto').
    Returns nothing.
    """
    print('Loading model...')
    model = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL, torch_dtype=torch.bfloat16, device_map='auto'
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    # Some tokenizers ship without a pad token; reuse EOS so padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print('Merging LoRA adapter...')
    model = PeftModel.from_pretrained(model, ADAPTER_REPO)
    # merge_and_unload() folds the LoRA deltas into the base weights and
    # discards the PEFT wrapper, so a single eval() on the merged model
    # suffices (the original called eval() redundantly before the merge).
    model = model.merge_and_unload()
    model.eval()

    print(f'Model ready: https://huggingface.co/{ADAPTER_REPO}')
    print('To run full LiveCodeBench evaluation, install livecodebench and run:')
    print(' python -m livecodebench.eval --model <your-merged-model> --dataset v6')
|
|
|
|
# Script entry point: run the load-and-merge workflow when executed directly.
if __name__ == '__main__':
    main()