Justin-lee committed
Commit b86c96c · verified · 1 Parent(s): 28bf24b

Add HumanEval evaluation script

Files changed (1):
  1. eval_humaneval.py +171 -0
eval_humaneval.py ADDED
@@ -0,0 +1,171 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ HumanEval Evaluation Script
+ ===========================
+ Evaluate your fine-tuned Code LLM on the HumanEval benchmark.
+
+ Usage:
+     python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b
+
+ Requirements:
+     pip install transformers peft bitsandbytes accelerate human_eval
+ """
+
+ import argparse
+ import os
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ from peft import PeftModel
+
+
+ def load_model(model_path, base_model="Qwen/Qwen2.5-Coder-3B"):
+     print(f"📥 Loading model: {model_path}")
+     print(f"   Base model: {base_model}")
+
+     tokenizer = AutoTokenizer.from_pretrained(base_model)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # 4-bit NF4 quantization so the 3B model fits on a small GPU
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
+     )
+
+     base = AutoModelForCausalLM.from_pretrained(
+         base_model, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
+     )
+
+     try:
+         model = PeftModel.from_pretrained(base, model_path)
+         print("✅ LoRA adapter loaded")
+     except Exception:
+         model = base
+         print("⚠️ No adapter found, using the base model")
+
+     model.eval()
+     return model, tokenizer
+
+
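+ # Hypothetical variation (not part of this commit): if the LoRA weights had
+ # already been merged into the base checkpoint, the quantized base + adapter
+ # load above could be replaced by a single call, e.g.:
+ #     model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
+
+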
+ def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+     with torch.no_grad():
+         # Greedy decoding, so results are deterministic and comparable across runs
+         outputs = model.generate(
+             **inputs, max_new_tokens=max_new_tokens, do_sample=False,
+             pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
+         )
+     generated = outputs[0][inputs["input_ids"].shape[1]:]
+     completion = tokenizer.decode(generated, skip_special_tokens=True)
+
+     # Keep only the function body: stop at the first non-empty, unindented line
+     lines = completion.split("\n")
+     result_lines = []
+     for line in lines:
+         if result_lines and line.strip() and not line.startswith(" ") and not line.startswith("\t"):
+             break
+         result_lines.append(line)
+     return "\n".join(result_lines)
+
+
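+ # Example of the truncation above (illustrative, not from this commit):
+ # a raw completion of "    return a + b\nprint('done')" is cut down to
+ # "    return a + b", since the first non-empty unindented line ends the body.
+
+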
+ def run_manual_eval(model, tokenizer):
+     print("\n" + "="*60)
+     print("  MANUAL CODE GENERATION TEST")
+     print("="*60)
+
+     test_cases = [
+         {"name": "Two Sum", "prompt": 'def two_sum(nums: list[int], target: int) -> list[int]:\n    """Given an array of integers nums and an integer target, return indices of the two numbers that add up to target."""\n'},
+         {"name": "Fibonacci", "prompt": 'def fibonacci(n: int) -> int:\n    """Return the nth Fibonacci number."""\n'},
+         {"name": "Binary Search", "prompt": 'def binary_search(arr: list[int], target: int) -> int:\n    """Return the index of target in sorted array arr, or -1 if not found."""\n'},
+         {"name": "Reverse Linked List", "prompt": 'class ListNode:\n    def __init__(self, val=0, next=None):\n        self.val = val\n        self.next = next\n\ndef reverse_linked_list(head: ListNode) -> ListNode:\n    """Reverse a singly linked list and return the new head."""\n'},
+         {"name": "Merge Sort", "prompt": 'def merge_sort(arr: list[int]) -> list[int]:\n    """Sort an array using the merge sort algorithm."""\n'},
+     ]
+
+     results = []
+     for i, tc in enumerate(test_cases):
+         print(f"\n{'─'*60}")
+         print(f"📝 Test {i+1}/{len(test_cases)}: {tc['name']}")
+         print(f"{'─'*60}")
+
+         completion = generate_completion(model, tokenizer, tc["prompt"])
+         full_code = tc["prompt"] + completion
+         print(f"Generated:\n{full_code}")
+
+         # compile() only checks syntax, not functional correctness
+         try:
+             compile(full_code, "<string>", "exec")
+             print("✅ Syntax OK")
+             results.append(True)
+         except SyntaxError as e:
+             print(f"❌ Syntax error: {e}")
+             results.append(False)
+
+     passed = sum(results)
+     print(f"\n{'='*60}")
+     print(f"  Result: {passed}/{len(results)} syntactically correct ({100*passed/len(results):.0f}%)")
+     print(f"{'='*60}")
+     return passed, len(results)
+
+
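+ # A possible extension (illustrative only; it executes untrusted model output,
+ # so run it in a sandbox): smoke-test the generated function as well, e.g.:
+ #     ns = {}
+ #     exec(full_code, ns)
+ #     assert ns["two_sum"]([2, 7, 11, 15], 9) == [0, 1]
+
+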
+ def run_humaneval(model, tokenizer):
+     try:
+         from human_eval.data import read_problems
+         from human_eval.evaluation import evaluate_functional_correctness
+         import tempfile, json
+     except ImportError:
+         print("\n⚠️ human_eval is not installed, skipping the HumanEval benchmark")
+         print("   Install with: pip install git+https://github.com/openai/human-eval.git")
+         return None
+
+     print("\n" + "="*60)
+     print("  HUMANEVAL BENCHMARK")
+     print("="*60)
+
+     problems = read_problems()
+     print(f"📊 {len(problems)} problems in total")
+
+     samples = []
+     for task_id, problem in problems.items():
+         completion = generate_completion(model, tokenizer, problem["prompt"], max_new_tokens=512)
+         samples.append({"task_id": task_id, "completion": completion})
+         if len(samples) % 20 == 0:
+             print(f"   Generated {len(samples)}/{len(problems)}...")
+
+     # Write one greedy sample per task to a temporary JSONL file
+     with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+         for sample in samples:
+             f.write(json.dumps(sample) + "\n")
+         tmp_path = f.name
+
+     print("🔄 Running functional correctness checks...")
+     results = evaluate_functional_correctness(tmp_path)
+     pass_at_1 = results.get("pass@1", 0)
+     print(f"\n🎯 HumanEval pass@1: {pass_at_1*100:.1f}%")
+     os.unlink(tmp_path)
+     return pass_at_1
+
+
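+ # Note: the openai/human-eval package ships with code execution disabled for
+ # safety; evaluate_functional_correctness only scores samples after you
+ # uncomment the exec call in human_eval/execution.py, ideally in a sandbox.
+
+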
+ def main():
+     parser = argparse.ArgumentParser(description="Evaluate Code LLM")
+     parser.add_argument("--model", type=str, default="./output_code", help="Model path or HF model ID")
+     parser.add_argument("--base_model", type=str, default="Qwen/Qwen2.5-Coder-3B", help="Base model name")
+     parser.add_argument("--skip_humaneval", action="store_true", help="Skip HumanEval, only run manual tests")
+     args = parser.parse_args()
+
+     print("""
+     ╔════════════════════════════════════════════════════════════╗
+     ║                   Code LLM - Evaluation                    ║
+     ╚════════════════════════════════════════════════════════════╝
+     """)
+
+     model, tokenizer = load_model(args.model, args.base_model)
+     passed, total = run_manual_eval(model, tokenizer)
+
+     if not args.skip_humaneval:
+         run_humaneval(model, tokenizer)
+     else:
+         print("\n⏭️ Skipping the HumanEval benchmark")
+
+     print("\n" + "="*60)
+     print("  EVALUATION COMPLETE")
+     print("="*60)
+
+
+ if __name__ == "__main__":
+     main()
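
The script above reports greedy pass@1 only. A minimal sketch of extending it to sampled pass@k, reusing load_model from this commit; the sampling hyperparameters (temperature 0.8, top-p 0.95), the file name samples.jsonl, and the companion-script name are assumptions, not part of this commit:

    # pass_at_k_sketch.py — hypothetical companion script, not part of this commit
    import json
    import torch
    from human_eval.data import read_problems
    from human_eval.evaluation import evaluate_functional_correctness
    from eval_humaneval import load_model

    model, tokenizer = load_model("./output_code")
    problems = read_problems()

    k = 10  # pass@10 needs at least 10 samples per task
    with open("samples.jsonl", "w") as f:
        for task_id, problem in problems.items():
            inputs = tokenizer(problem["prompt"], return_tensors="pt").to(model.device)
            for _ in range(k):
                with torch.no_grad():
                    out = model.generate(**inputs, max_new_tokens=512, do_sample=True,
                                         temperature=0.8, top_p=0.95,  # assumed values
                                         pad_token_id=tokenizer.pad_token_id)
                completion = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:],
                                              skip_special_tokens=True)
                f.write(json.dumps({"task_id": task_id, "completion": completion}) + "\n")

    print(evaluate_functional_correctness("samples.jsonl", k=[1, 10]))

The same body-truncation heuristic used in generate_completion should be applied to each sampled completion before scoring, otherwise trailing top-level text can fail otherwise-correct solutions.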