#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HumanEval Evaluation Script
============================
Evaluate your fine-tuned Code LLM on the HumanEval benchmark.
Usage:
python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b
Requirements:
pip install transformers peft bitsandbytes accelerate human_eval
"""
import argparse
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel


def load_model(model_path, base_model="Qwen/Qwen2.5-Coder-3B"):
    print(f"🔥 Loading model: {model_path}")
    print(f"   Base model: {base_model}")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # 4-bit NF4 quantization keeps the 3B model within a small GPU memory budget.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
    )
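    # Note: device_map="auto" above lets accelerate place layers across the
    # available GPUs (with CPU offload if needed) without manual configuration.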
    try:
        model = PeftModel.from_pretrained(base, model_path)
        print("✅ LoRA adapter loaded")
    except Exception:
        model = base
        print("⚠️ No adapter found; using the base model")
    model.eval()
    return model, tokenizer
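# Example use of load_model (paths are illustrative; they match main()'s defaults):
#   model, tokenizer = load_model("./output_code", base_model="Qwen/Qwen2.5-Coder-3B")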


def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding; temperature/top_p are omitted because they have no
        # effect when do_sample=False.
        outputs = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False,
            pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    completion = tokenizer.decode(generated, skip_special_tokens=True)
    # Truncate at the first non-indented, non-empty line after the body starts.
    lines = completion.split("\n")
    result_lines = []
    for line in lines:
        if result_lines and line.strip() and not line.startswith(" ") and not line.startswith("\t"):
            break
        result_lines.append(line)
    return "\n".join(result_lines)


def run_manual_eval(model, tokenizer):
    print("\n" + "="*60)
    print(" MANUAL CODE GENERATION TEST")
    print("="*60)
    test_cases = [
        {"name": "Two Sum", "prompt": 'def two_sum(nums: list[int], target: int) -> list[int]:\n    """Given an array of integers nums and an integer target, return indices of the two numbers that add up to target."""\n'},
        {"name": "Fibonacci", "prompt": 'def fibonacci(n: int) -> int:\n    """Return the nth Fibonacci number."""\n'},
        {"name": "Binary Search", "prompt": 'def binary_search(arr: list[int], target: int) -> int:\n    """Return the index of target in sorted array arr, or -1 if not found."""\n'},
        {"name": "Reverse Linked List", "prompt": 'class ListNode:\n    def __init__(self, val=0, next=None):\n        self.val = val\n        self.next = next\n\ndef reverse_linked_list(head: ListNode) -> ListNode:\n    """Reverse a singly linked list and return the new head."""\n'},
        {"name": "Merge Sort", "prompt": 'def merge_sort(arr: list[int]) -> list[int]:\n    """Sort an array using merge sort algorithm."""\n'},
    ]
    results = []
    for i, tc in enumerate(test_cases):
        print(f"\n{'─'*60}")
        print(f"Test {i+1}/{len(test_cases)}: {tc['name']}")
        print(f"{'─'*60}")
        completion = generate_completion(model, tokenizer, tc["prompt"])
        full_code = tc["prompt"] + completion
        print(f"Generated:\n{full_code}")
        try:
            # compile() checks syntax only; the generated code is never executed here.
            compile(full_code, "<string>", "exec")
            print("✅ Syntax OK")
            results.append(True)
        except SyntaxError as e:
            print(f"❌ Syntax error: {e}")
            results.append(False)
    passed = sum(results)
    print(f"\n{'='*60}")
    print(f" Result: {passed}/{len(results)} syntactically valid ({100*passed/len(results):.0f}%)")
    print(f"{'='*60}")
    return passed, len(results)
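# compile() is only a syntax gate. A stricter (hypothetical) follow-up check
# could execute the candidate and assert a known value -- note this runs
# untrusted model output, so sandbox it first:
#   ns = {}
#   exec(full_code, ns)
#   assert ns["fibonacci"](10) == 55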


def run_humaneval(model, tokenizer):
    try:
        from human_eval.data import read_problems
        from human_eval.evaluation import evaluate_functional_correctness
    except ImportError:
        print("\n⚠️ human_eval is not installed; skipping the HumanEval benchmark")
        print("   Install with: pip install git+https://github.com/openai/human-eval.git")
        return None
    import json
    import tempfile
    print("\n" + "="*60)
    print(" HUMANEVAL BENCHMARK")
    print("="*60)
    problems = read_problems()
    print(f"{len(problems)} problems in total")
    samples = []
    for task_id, problem in problems.items():
        completion = generate_completion(model, tokenizer, problem["prompt"], max_new_tokens=512)
        samples.append({"task_id": task_id, "completion": completion})
        # Task ids look like "HumanEval/42"; the numeric suffix drives progress logging.
        idx = int(task_id.split("/")[-1])
        if (idx + 1) % 20 == 0:
            print(f"  Generated {idx + 1}/{len(problems)}...")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for sample in samples:
            f.write(json.dumps(sample) + "\n")
        tmp_path = f.name
    print("Running functional correctness checks...")
    # human-eval executes the generated completions; the package ships with the
    # actual exec call disabled, and you must enable it in human_eval/execution.py
    # after reading its safety disclaimer.
    results = evaluate_functional_correctness(tmp_path)
    pass_at_1 = results.get("pass@1", 0)
    print(f"\n🎯 HumanEval pass@1: {pass_at_1*100:.1f}%")
    os.unlink(tmp_path)
    return pass_at_1
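# run_humaneval() above reports pass@1 from one greedy sample per task. To
# estimate pass@10 or pass@100, generate several sampled completions per task
# (do_sample=True) and pass k=[1, 10, 100] to evaluate_functional_correctness.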


def main():
    parser = argparse.ArgumentParser(description="Evaluate Code LLM")
    parser.add_argument("--model", type=str, default="./output_code", help="Model path or HF model ID")
    parser.add_argument("--base_model", type=str, default="Qwen/Qwen2.5-Coder-3B", help="Base model name")
    parser.add_argument("--skip_humaneval", action="store_true", help="Skip HumanEval, only run manual tests")
    args = parser.parse_args()
    print("""
    ╔════════════════════════════════════════════════════════════╗
    ║                    Code LLM - Evaluation                    ║
    ╚════════════════════════════════════════════════════════════╝
    """)
    model, tokenizer = load_model(args.model, args.base_model)
    passed, total = run_manual_eval(model, tokenizer)
    if not args.skip_humaneval:
        run_humaneval(model, tokenizer)
    else:
        print("\nSkipping the HumanEval benchmark")
    print("\n" + "="*60)
    print(" EVALUATION COMPLETE")
    print("="*60)


if __name__ == "__main__":
    main()
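# Example invocations (model paths are illustrative):
#   python eval_humaneval.py --model ./output_code --skip_humaneval
#   python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b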