| import sys |
| import os |
|
|
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
| from datasets import load_dataset |
| import torch |
| import numpy as np |
| import json |
| import time |
| from tqdm import tqdm |
| from model import Load_model |
|
|
| |
def load_benchmark_data_codealpaca(num_samples=20):
    """Load the CodeAlpaca-20k instruction-tuning dataset.

    Args:
        num_samples: If truthy and smaller than the dataset size, randomly
            subsample that many examples (without replacement). Otherwise
            the full split is returned.

    Returns:
        The "train" split, possibly subsampled.
    """
    data = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
    needs_subsample = bool(num_samples) and num_samples < len(data)
    if needs_subsample:
        # Random subset without replacement; no fixed seed, so each run
        # may draw a different sample.
        chosen = np.random.choice(len(data), num_samples, replace=False)
        data = data.select(chosen)
    return data
|
|
| |
def generate_solutions_codealpaca(model, tokenizer, dataset, max_tokens=512):
    """
    Generate code for each CodeAlpaca instruction.

    Args:
        model: A causal LM exposing ``.generate`` and ``.device``
            (HuggingFace-style — presumably; confirm against Load_model).
        tokenizer: The matching tokenizer.
        dataset: Iterable of dicts with "instruction" and "output" keys.
        max_tokens: Maximum number of NEW tokens to generate per prompt.

    Returns:
        A list of dicts with problem_id, prompt, generated_code, reference,
        generation_time.
    """
    results = []
    prompt_template = "### Instruction:\n{}\n\n### Response:\n"

    for item in tqdm(dataset, desc="Generating CodeAlpaca solutions"):
        instruction = item["instruction"]
        reference = item["output"]

        prompt = prompt_template.format(instruction)

        # Derive a filesystem-safe identifier from the instruction text.
        raw_id = instruction.strip().replace("\n", " ")
        problem_id = raw_id[:50].replace(" ", "_").replace("/", "_")

        start_time = time.time()
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                # Bug fix: max_length caps the TOTAL sequence (prompt +
                # completion), so long prompts got little or no generation
                # budget. max_new_tokens budgets the completion alone.
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.2,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id
            )
        generation_time = time.time() - start_time

        # Bug fix: decode only the newly generated tokens. The old
        # string-prefix strip (generated.startswith(prompt)) silently failed
        # whenever decoding normalized whitespace or special tokens, leaving
        # the prompt glued to the front of generated_code.
        prompt_len = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(
            output[0][prompt_len:], skip_special_tokens=True
        )

        results.append({
            "problem_id": problem_id,
            "prompt": prompt,
            "generated_code": generated,
            "reference": reference,
            "generation_time": generation_time
        })
    return results
|
|
| |
def evaluate_solutions_codealpaca(solutions):
    """
    Score generations by exact (whitespace-stripped) string match against
    the reference output.

    Args:
        solutions: List of dicts with "generated_code" and "reference" keys.

    Returns:
        Dict with total, correct_count, pass_rate, and the input list echoed
        back under "detailed_results".
    """
    total = len(solutions)

    correct_count = 0
    for entry in solutions:
        if entry["generated_code"].strip() == entry["reference"].strip():
            correct_count += 1

    # Guard against division by zero on an empty solution list.
    if total > 0:
        pass_rate = correct_count / total
    else:
        pass_rate = 0.0

    return {
        "total": total,
        "correct_count": correct_count,
        "pass_rate": pass_rate,
        "detailed_results": solutions,
    }
|
|
| |
def save_evaluation_results_codealpaca(model_name, results, solutions):
    """Save summary (with pass_rate) and detailed generations to JSON files.

    Args:
        model_name: Label embedded in the output file names.
        results: Dict from evaluate_solutions_codealpaca (must contain
            "total", "correct_count", "pass_rate", "detailed_results").
        solutions: List of dicts carrying "generation_time" per example.

    Returns:
        Tuple of (summary_file, detailed_file) paths.
    """
    results_dir = "code_evaluation_results"
    os.makedirs(results_dir, exist_ok=True)

    gen_times = [entry["generation_time"] for entry in solutions]
    summary = {
        "model": model_name,
        "benchmark": "codealpaca",
        "total_examples": results["total"],
        "correct_count": results["correct_count"],
        "pass_rate": results["pass_rate"],
        # float() converts np.float64 to a plain JSON-serializable float.
        "avg_generation_time": float(np.mean(gen_times)),
    }

    summary_file = os.path.join(results_dir, f"{model_name}_codealpaca_summary.json")
    detailed_file = os.path.join(results_dir, f"{model_name}_codealpaca_detailed.json")

    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=4)
    with open(detailed_file, "w") as f:
        json.dump(results["detailed_results"], f, indent=4)

    print(f"Saved summary to {summary_file}")
    print(f"Saved detailed outputs to {detailed_file}")
    return summary_file, detailed_file
|
|
| |
def run_codealpaca_evaluation(model_name, num_samples=20,
                              model_path="gg-cse476/gg-step2000"):
    """End-to-end CodeAlpaca evaluation: load model, generate, score, save.

    Args:
        model_name: Label used only for the output file names.
        num_samples: Number of CodeAlpaca examples to evaluate.
        model_path: Model id/path passed to Load_model. Previously this was
            hard-coded inside the function, so ``model_name`` had no effect
            on which model was actually evaluated — an easy-to-miss trap.
            The default preserves the old behavior for existing callers.

    Returns:
        The results dict from evaluate_solutions_codealpaca.
    """
    loader = Load_model(model_path)
    model, tokenizer = loader.get()

    dataset = load_benchmark_data_codealpaca(num_samples)
    solutions = generate_solutions_codealpaca(model, tokenizer, dataset)
    results = evaluate_solutions_codealpaca(solutions)
    save_evaluation_results_codealpaca(model_name, results, solutions)

    return results
|
|
if __name__ == "__main__":
    # Note: this label only names the output files; the evaluated model is
    # chosen inside run_codealpaca_evaluation.
    model_name = "Llama-3-SFT"

    results = run_codealpaca_evaluation(model_name=model_name, num_samples=20)

    total = results["total"]
    correct = results["correct_count"]
    rate = results["pass_rate"]

    print("\n=== CodeAlpaca Evaluation Summary ===")
    print(f"Total examples : {total}")
    print(f"Correct (exact) : {correct}")
    print(f"Pass@1 (exact) : {rate:.2%}")
|