#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HumanEval Evaluation Script
============================
Evaluate your fine-tuned Code LLM on the HumanEval benchmark.
Usage:
python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b
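    python eval_humaneval.py --model ./output_code --skip_humaneval   # manual syntax tests only, default local path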
Requirements:
    pip install transformers peft bitsandbytes accelerate
    pip install git+https://github.com/openai/human-eval.git
"""
import argparse
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
def load_model(model_path, base_model="Qwen/Qwen2.5-Coder-3B"):
print(f"๐Ÿ“ฅ ่ผ‰ๅ…ฅๆจกๅž‹: {model_path}")
print(f" ๅŸบ็คŽๆจกๅž‹: {base_model}")
tokenizer = AutoTokenizer.from_pretrained(base_model)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
)
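    # NF4 4-bit quantization with bfloat16 compute keeps the base model's memory
    # footprint low enough for a single consumer GPU; matmuls run in bf16.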
base = AutoModelForCausalLM.from_pretrained(
base_model, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
)
try:
model = PeftModel.from_pretrained(base, model_path)
print("โœ… LoRA adapter ๅทฒ่ผ‰ๅ…ฅ")
except Exception:
model = base
print("โš ๏ธ ๆœชๆ‰พๅˆฐ adapter๏ผŒไฝฟ็”จๅŸบ็คŽๆจกๅž‹")
model.eval()
return model, tokenizer
def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding; sampling knobs (temperature/top_p) are not used
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
generated = outputs[0][inputs["input_ids"].shape[1]:]
completion = tokenizer.decode(generated, skip_special_tokens=True)
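    # Keep completion lines up to the first new top-level statement (a non-empty,
    # non-indented line), i.e. stop at the end of the generated function body.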
lines = completion.split("\n")
result_lines = []
for line in lines:
if result_lines and line.strip() and not line.startswith(" ") and not line.startswith("\t"):
break
result_lines.append(line)
return "\n".join(result_lines)
def run_manual_eval(model, tokenizer):
print("\n" + "="*60)
print(" MANUAL CODE GENERATION TEST")
print("="*60)
test_cases = [
{"name": "Two Sum", "prompt": 'def two_sum(nums: list[int], target: int) -> list[int]:\n """Given an array of integers nums and an integer target, return indices of the two numbers that add up to target."""\n'},
{"name": "Fibonacci", "prompt": 'def fibonacci(n: int) -> int:\n """Return the nth Fibonacci number."""\n'},
{"name": "Binary Search", "prompt": 'def binary_search(arr: list[int], target: int) -> int:\n """Return the index of target in sorted array arr, or -1 if not found."""\n'},
{"name": "Reverse Linked List", "prompt": 'class ListNode:\n def __init__(self, val=0, next=None):\n self.val = val\n self.next = next\n\ndef reverse_linked_list(head: ListNode) -> ListNode:\n """Reverse a singly linked list and return the new head."""\n'},
{"name": "Merge Sort", "prompt": 'def merge_sort(arr: list[int]) -> list[int]:\n """Sort an array using merge sort algorithm."""\n'},
]
results = []
for i, tc in enumerate(test_cases):
print(f"\n{'โ”€'*60}")
print(f"๐Ÿ“ Test {i+1}/{len(test_cases)}: {tc['name']}")
print(f"{'โ”€'*60}")
completion = generate_completion(model, tokenizer, tc["prompt"])
full_code = tc["prompt"] + completion
print(f"Generated:\n{full_code}")
try:
compile(full_code, "<string>", "exec")
print("โœ… ่ชžๆณ•ๆญฃ็ขบ")
results.append(True)
except SyntaxError as e:
print(f"โŒ ่ชžๆณ•้Œฏ่ชค: {e}")
results.append(False)
passed = sum(results)
print(f"\n{'='*60}")
print(f" ็ตๆžœ: {passed}/{len(results)} ่ชžๆณ•ๆญฃ็ขบ ({100*passed/len(results):.0f}%)")
print(f"{'='*60}")
return passed, len(results)
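# The compile() check above verifies syntax only. A minimal sketch of a functional
# spot-check for one problem with a known answer (illustrative helper, not called by
# the script): run the generated code in a scratch namespace and assert a known value.
def spot_check_fibonacci(full_code: str) -> bool:
    """Return True if the generated fibonacci() gives fib(10) == 55."""
    namespace = {}
    try:
        exec(full_code, namespace)  # defines fibonacci() in the scratch namespace
        return namespace["fibonacci"](10) == 55
    except Exception:
        return False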
def run_humaneval(model, tokenizer):
try:
from human_eval.data import read_problems
from human_eval.evaluation import evaluate_functional_correctness
import tempfile, json
except ImportError:
print("\nโš ๏ธ human_eval ๆœชๅฎ‰่ฃ๏ผŒ่ทณ้Ž HumanEval ๅŸบๆบ–ๆธฌ่ฉฆ")
print(" ๅฎ‰่ฃๆ–นๅผ: pip install git+https://github.com/openai/human-eval.git")
return None
print("\n" + "="*60)
print(" HUMANEVAL BENCHMARK")
print("="*60)
problems = read_problems()
print(f"๐Ÿ“Š ๅ…ฑ {len(problems)} ้“้กŒ็›ฎ")
samples = []
    for i, (task_id, problem) in enumerate(problems.items(), start=1):
        completion = generate_completion(model, tokenizer, problem["prompt"], max_new_tokens=512)
        samples.append({"task_id": task_id, "completion": completion})
        if i % 20 == 0:
            print(f"  Generated {i}/{len(problems)}...")
with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
for sample in samples:
f.write(json.dumps(sample) + "\n")
tmp_path = f.name
print("๐Ÿ”„ ๅŸท่กŒๅŠŸ่ƒฝๆ€งๆญฃ็ขบๆ€งๆธฌ่ฉฆ...")
results = evaluate_functional_correctness(tmp_path)
pass_at_1 = results.get("pass@1", 0)
print(f"\n๐ŸŽฏ HumanEval pass@1: {pass_at_1*100:.1f}%")
os.unlink(tmp_path)
return pass_at_1
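# For reference: HumanEval's metric is the unbiased pass@k estimator from the Codex paper,
# pass@k = E[1 - C(n-c, k) / C(n, k)], where n completions are sampled per problem and c of
# them pass the unit tests. human_eval computes this internally; with greedy decoding and a
# single sample per task, pass@1 reduces to the plain pass rate. A sketch of the estimator:
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased per-problem pass@k, computed as a numerically stable running product."""
    if n - c < k:
        return 1.0
    prod = 1.0
    for i in range(n - c + 1, n + 1):
        prod *= 1.0 - k / i
    return 1.0 - prod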
def main():
parser = argparse.ArgumentParser(description="Evaluate Code LLM")
parser.add_argument("--model", type=str, default="./output_code", help="Model path or HF model ID")
parser.add_argument("--base_model", type=str, default="Qwen/Qwen2.5-Coder-3B", help="Base model name")
parser.add_argument("--skip_humaneval", action="store_true", help="Skip HumanEval, only run manual tests")
args = parser.parse_args()
print("""
โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
โ•‘ Code LLM - Evaluation โ•‘
โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
""")
model, tokenizer = load_model(args.model, args.base_model)
passed, total = run_manual_eval(model, tokenizer)
if not args.skip_humaneval:
run_humaneval(model, tokenizer)
else:
print("\nโญ๏ธ ่ทณ้Ž HumanEval ๅŸบๆบ–ๆธฌ่ฉฆ")
print("\n" + "="*60)
print(" EVALUATION COMPLETE")
print("="*60)
if __name__ == "__main__":
main()