#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HumanEval Evaluation Script
============================
Evaluate your fine-tuned Code LLM on the HumanEval benchmark.

Usage:
    python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b

Requirements:
    pip install transformers peft bitsandbytes accelerate
    pip install git+https://github.com/openai/human-eval.git
"""
import argparse
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
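
# load_model: load the base checkpoint in 4-bit NF4 quantization (bitsandbytes)
# so evaluation fits on a single consumer GPU, then try to attach a LoRA
# adapter from `model_path` via PEFT. If no adapter exists at that path, the
# script falls back to the plain base model, which makes it easy to compare
# against the un-finetuned baseline.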
def load_model(model_path, base_model="Qwen/Qwen2.5-Coder-3B"):
    print(f"📥 Loading model: {model_path}")
    print(f"   Base model: {base_model}")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
    )
    try:
        model = PeftModel.from_pretrained(base, model_path)
        print("✅ LoRA adapter loaded")
    except Exception:
        model = base
        print("⚠️ Adapter not found; using the base model")
    model.eval()
    return model, tokenizer
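
# generate_completion: greedy-decode a continuation of the prompt, then keep
# only the function body. HumanEval prompts end inside a function definition,
# so the first non-empty line with no leading whitespace marks the start of a
# new top-level statement, and everything from there on is discarded.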
def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for reproducible benchmark scores
            pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    completion = tokenizer.decode(generated, skip_special_tokens=True)
    lines = completion.split("\n")
    result_lines = []
    for line in lines:
        # Stop at the first non-empty, non-indented line.
        if result_lines and line.strip() and not line.startswith(" ") and not line.startswith("\t"):
            break
        result_lines.append(line)
    return "\n".join(result_lines)
def run_manual_eval(model, tokenizer):
    print("\n" + "="*60)
    print("  MANUAL CODE GENERATION TEST")
    print("="*60)
    test_cases = [
        {"name": "Two Sum", "prompt": 'def two_sum(nums: list[int], target: int) -> list[int]:\n    """Given an array of integers nums and an integer target, return indices of the two numbers that add up to target."""\n'},
        {"name": "Fibonacci", "prompt": 'def fibonacci(n: int) -> int:\n    """Return the nth Fibonacci number."""\n'},
        {"name": "Binary Search", "prompt": 'def binary_search(arr: list[int], target: int) -> int:\n    """Return the index of target in sorted array arr, or -1 if not found."""\n'},
        {"name": "Reverse Linked List", "prompt": 'class ListNode:\n    def __init__(self, val=0, next=None):\n        self.val = val\n        self.next = next\n\ndef reverse_linked_list(head: ListNode) -> ListNode:\n    """Reverse a singly linked list and return the new head."""\n'},
        {"name": "Merge Sort", "prompt": 'def merge_sort(arr: list[int]) -> list[int]:\n    """Sort an array using the merge sort algorithm."""\n'},
    ]
    results = []
    for i, tc in enumerate(test_cases):
        print(f"\n{'─'*60}")
        print(f"🧪 Test {i+1}/{len(test_cases)}: {tc['name']}")
        print(f"{'─'*60}")
        completion = generate_completion(model, tokenizer, tc["prompt"])
        full_code = tc["prompt"] + completion
        print(f"Generated:\n{full_code}")
        try:
            # compile() parses the code without executing it: a syntax-only check.
            compile(full_code, "<string>", "exec")
            print("✅ Syntax OK")
            results.append(True)
        except SyntaxError as e:
            print(f"❌ Syntax error: {e}")
            results.append(False)
    passed = sum(results)
    print(f"\n{'='*60}")
    print(f"  Result: {passed}/{len(results)} syntactically valid ({100*passed/len(results):.0f}%)")
    print(f"{'='*60}")
    return passed, len(results)
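
# run_humaneval: generate one completion per task, dump them as JSONL
# ({"task_id": ..., "completion": ...} per line), and score with OpenAI's
# human-eval harness. Note that the harness executes model-generated code;
# the upstream repo ships with execution disabled as a safety measure, so you
# may need to enable it per the instructions in human_eval/execution.py.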
def run_humaneval(model, tokenizer):
    try:
        from human_eval.data import read_problems
        from human_eval.evaluation import evaluate_functional_correctness
    except ImportError:
        print("\n⚠️ human_eval is not installed; skipping the HumanEval benchmark")
        print("   Install with: pip install git+https://github.com/openai/human-eval.git")
        return None
    import json
    import tempfile
    print("\n" + "="*60)
    print("  HUMANEVAL BENCHMARK")
    print("="*60)
    problems = read_problems()
    print(f"📊 {len(problems)} problems in total")
    samples = []
    for task_id, problem in problems.items():
        completion = generate_completion(model, tokenizer, problem["prompt"], max_new_tokens=512)
        samples.append({"task_id": task_id, "completion": completion})
        # Task ids look like "HumanEval/42"; use the numeric suffix for progress reporting.
        idx = int(task_id.split("/")[-1])
        if (idx + 1) % 20 == 0:
            print(f"  Generated {idx + 1}/{len(problems)}...")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for sample in samples:
            f.write(json.dumps(sample) + "\n")
        tmp_path = f.name
    print("🔍 Running functional correctness checks...")
    results = evaluate_functional_correctness(tmp_path)
    pass_at_1 = results.get("pass@1", 0)
    print(f"\n🎯 HumanEval pass@1: {pass_at_1*100:.1f}%")
    os.unlink(tmp_path)
    return pass_at_1

def main():
    parser = argparse.ArgumentParser(description="Evaluate Code LLM")
    parser.add_argument("--model", type=str, default="./output_code", help="Model path or HF model ID")
    parser.add_argument("--base_model", type=str, default="Qwen/Qwen2.5-Coder-3B", help="Base model name")
    parser.add_argument("--skip_humaneval", action="store_true", help="Skip HumanEval, only run manual tests")
    args = parser.parse_args()
    print("""
╔══════════════════════════════════════════════════════════════╗
║                    Code LLM - Evaluation                     ║
╚══════════════════════════════════════════════════════════════╝
""")
    model, tokenizer = load_model(args.model, args.base_model)
    passed, total = run_manual_eval(model, tokenizer)
    if not args.skip_humaneval:
        run_humaneval(model, tokenizer)
    else:
        print("\n⏭️ Skipping the HumanEval benchmark")
    print("\n" + "="*60)
    print("  EVALUATION COMPLETE")
    print("="*60)


if __name__ == "__main__":
    main()