#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HumanEval Evaluation Script
============================
Evaluate your fine-tuned Code LLM on the HumanEval benchmark.

Usage:
    python eval_humaneval.py --model YOUR_USERNAME/code-qwen2.5-coder-3b

Requirements:
    pip install transformers peft bitsandbytes accelerate
    pip install git+https://github.com/openai/human-eval.git
"""

import argparse
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel


def load_model(model_path, base_model="Qwen/Qwen2.5-Coder-3B"):
    print(f"๐Ÿ“ฅ ่ผ‰ๅ…ฅๆจกๅž‹: {model_path}")
    print(f"   ๅŸบ็คŽๆจกๅž‹: {base_model}")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the base model in 4-bit NF4 so evaluation fits on a single consumer GPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16,
    )

    base = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb_config, device_map="auto", trust_remote_code=True,
    )

    try:
        model = PeftModel.from_pretrained(base, model_path)
        print("✅ LoRA adapter loaded")
    except Exception:
        model = base
        print("⚠️  No adapter found; using the base model")

    model.eval()
    return model, tokenizer
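

# Optional export path (a sketch, not called by the evaluation flow): merge
# the LoRA adapter into the base weights for adapter-free serving. The base
# is reloaded in full precision here because merging into 4-bit quantized
# weights is lossy; `adapter_path` and `out_dir` are illustrative names.
def merge_and_save(adapter_path, base_model="Qwen/Qwen2.5-Coder-3B", out_dir="./merged"):
    base = AutoModelForCausalLM.from_pretrained(
        base_model, torch_dtype=torch.bfloat16, trust_remote_code=True,
    )
    merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()
    merged.save_pretrained(out_dir)
    AutoTokenizer.from_pretrained(base_model).save_pretrained(out_dir)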


def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for deterministic, reproducible scores
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    completion = tokenizer.decode(generated, skip_special_tokens=True)

    # Heuristic truncation: keep the indented function body and stop at the
    # first top-level (unindented, non-empty) line the model emits after it.
    lines = completion.split("\n")
    result_lines = []
    for line in lines:
        if result_lines and line.strip() and not line.startswith(" ") and not line.startswith("\t"):
            break
        result_lines.append(line)
    return "\n".join(result_lines)


def run_manual_eval(model, tokenizer):
    print("\n" + "="*60)
    print("  MANUAL CODE GENERATION TEST")
    print("="*60)

    test_cases = [
        {"name": "Two Sum", "prompt": 'def two_sum(nums: list[int], target: int) -> list[int]:\n    """Given an array of integers nums and an integer target, return indices of the two numbers that add up to target."""\n'},
        {"name": "Fibonacci", "prompt": 'def fibonacci(n: int) -> int:\n    """Return the nth Fibonacci number."""\n'},
        {"name": "Binary Search", "prompt": 'def binary_search(arr: list[int], target: int) -> int:\n    """Return the index of target in sorted array arr, or -1 if not found."""\n'},
        {"name": "Reverse Linked List", "prompt": 'class ListNode:\n    def __init__(self, val=0, next=None):\n        self.val = val\n        self.next = next\n\ndef reverse_linked_list(head: ListNode) -> ListNode:\n    """Reverse a singly linked list and return the new head."""\n'},
        {"name": "Merge Sort", "prompt": 'def merge_sort(arr: list[int]) -> list[int]:\n    """Sort an array using merge sort algorithm."""\n'},
    ]

    results = []
    for i, tc in enumerate(test_cases):
        print(f"\n{'โ”€'*60}")
        print(f"๐Ÿ“ Test {i+1}/{len(test_cases)}: {tc['name']}")
        print(f"{'โ”€'*60}")

        completion = generate_completion(model, tokenizer, tc["prompt"])
        full_code = tc["prompt"] + completion
        print(f"Generated:\n{full_code}")

        # compile() only validates syntax; it does not execute the code
        try:
            compile(full_code, "<string>", "exec")
            print("✅ Syntax OK")
            results.append(True)
        except SyntaxError as e:
            print(f"❌ Syntax error: {e}")
            results.append(False)

    passed = sum(results)
    print(f"\n{'='*60}")
    print(f"  ็ตๆžœ: {passed}/{len(results)} ่ชžๆณ•ๆญฃ็ขบ ({100*passed/len(results):.0f}%)")
    print(f"{'='*60}")
    return passed, len(results)
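

# Stricter than compile(): actually run a generated function on a known
# input. A minimal sketch; it executes untrusted model output, so only use
# it in a sandboxed environment. `fn_name`, `args`, and `expected` are
# caller-supplied test fixtures, not part of the evaluation flow above.
def check_behavior(full_code: str, fn_name: str, args: tuple, expected) -> bool:
    ns: dict = {}
    exec(full_code, ns)  # runs model-generated code; sandbox this
    return ns[fn_name](*args) == expected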


def run_humaneval(model, tokenizer):
    try:
        from human_eval.data import read_problems
        from human_eval.evaluation import evaluate_functional_correctness
        import tempfile, json
    except ImportError:
        print("\nโš ๏ธ  human_eval ๆœชๅฎ‰่ฃ๏ผŒ่ทณ้Ž HumanEval ๅŸบๆบ–ๆธฌ่ฉฆ")
        print("   ๅฎ‰่ฃๆ–นๅผ: pip install git+https://github.com/openai/human-eval.git")
        return None

    print("\n" + "="*60)
    print("  HUMANEVAL BENCHMARK")
    print("="*60)

    problems = read_problems()
    print(f"๐Ÿ“Š ๅ…ฑ {len(problems)} ้“้กŒ็›ฎ")

    samples = []
    for i, (task_id, problem) in enumerate(problems.items(), start=1):
        completion = generate_completion(model, tokenizer, problem["prompt"], max_new_tokens=512)
        samples.append({"task_id": task_id, "completion": completion})
        if i % 20 == 0:
            print(f"   Generated {i}/{len(problems)}...")

    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        for sample in samples:
            f.write(json.dumps(sample) + "\n")
        tmp_path = f.name

    print("๐Ÿ”„ ๅŸท่กŒๅŠŸ่ƒฝๆ€งๆญฃ็ขบๆ€งๆธฌ่ฉฆ...")
    results = evaluate_functional_correctness(tmp_path)
    pass_at_1 = results.get("pass@1", 0)
    print(f"\n๐ŸŽฏ HumanEval pass@1: {pass_at_1*100:.1f}%")
    os.unlink(tmp_path)
    return pass_at_1
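

# For context: this script draws one greedy sample per task, so pass@1 is just
# the fraction of tasks whose sample passes. With n samples per task of which
# c pass, human_eval reports the unbiased estimator from the HumanEval paper,
# pass@k = 1 - C(n-c, k) / C(n, k). A standalone sketch of that formula:
def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    from math import comb
    if n - c < k:
        return 1.0  # every size-k draw must contain a passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)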


def main():
    parser = argparse.ArgumentParser(description="Evaluate Code LLM")
    parser.add_argument("--model", type=str, default="./output_code", help="Model path or HF model ID")
    parser.add_argument("--base_model", type=str, default="Qwen/Qwen2.5-Coder-3B", help="Base model name")
    parser.add_argument("--skip_humaneval", action="store_true", help="Skip HumanEval, only run manual tests")
    args = parser.parse_args()

    print("""
    โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
    โ•‘          Code LLM - Evaluation                            โ•‘
    โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
    """)

    model, tokenizer = load_model(args.model, args.base_model)
    passed, total = run_manual_eval(model, tokenizer)

    if not args.skip_humaneval:
        run_humaneval(model, tokenizer)
    else:
        print("\nโญ๏ธ  ่ทณ้Ž HumanEval ๅŸบๆบ–ๆธฌ่ฉฆ")

    print("\n" + "="*60)
    print("  EVALUATION COMPLETE")
    print("="*60)

if __name__ == "__main__":
    main()