import sys
import os

# Add the parent directory to the path so Backend can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from datasets import load_dataset
import torch
import numpy as np
import json
import time
from tqdm import tqdm

from model import Load_model


# === Step 1: Load CodeAlpaca Data ===
def load_benchmark_data_codealpaca(num_samples=20):
    """Load the CodeAlpaca-20k instruction-tuning dataset."""
    dataset = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
    if num_samples and num_samples < len(dataset):
        indices = np.random.choice(len(dataset), num_samples, replace=False)
        dataset = dataset.select(indices)
    return dataset


# === Step 2: Generate Solutions (and collect reference) ===
def generate_solutions_codealpaca(model, tokenizer, dataset, max_tokens=512):
    """
    Generate code for each CodeAlpaca instruction.

    Returns a list of dicts with problem_id, prompt, generated_code,
    reference, and generation_time.
    """
    results = []
    prompt_template = "### Instruction:\n{}\n\n### Response:\n"

    for item in tqdm(dataset, desc="Generating CodeAlpaca solutions"):
        instruction = item["instruction"]
        reference = item["output"]
        prompt = prompt_template.format(instruction)

        # Derive a safe problem_id from the first 50 chars of the instruction.
        raw_id = instruction.strip().replace("\n", " ")
        problem_id = raw_id[:50].replace(" ", "_").replace("/", "_")

        # Generation. max_new_tokens caps the continuation itself; max_length
        # would count the prompt too and could silently truncate long prompts.
        start_time = time.time()
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.2,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id,
            )
        generation_time = time.time() - start_time

        generated = tokenizer.decode(output[0], skip_special_tokens=True)
        # Strip the prompt prefix from the generation.
        if generated.startswith(prompt):
            generated = generated[len(prompt):]

        results.append({
            "problem_id": problem_id,
            "prompt": prompt,
            "generated_code": generated,
            "reference": reference,
            "generation_time": generation_time,
        })
    return results


# === Step 3: Evaluate Solutions by Exact-Match ===
def evaluate_solutions_codealpaca(solutions):
    """
    Count how many generations exactly match the reference.

    Returns total, correct_count, pass_rate, plus all details.
    """
    total = len(solutions)
    correct_count = sum(
        1 for s in solutions
        if s["generated_code"].strip() == s["reference"].strip()
    )
    pass_rate = correct_count / total if total > 0 else 0.0
    return {
        "total": total,
        "correct_count": correct_count,
        "pass_rate": pass_rate,
        "detailed_results": solutions,
    }


# === Step 4: Save Results ===
def save_evaluation_results_codealpaca(model_name, results, solutions):
    """Save summary (with pass_rate) and detailed generations to JSON files."""
    results_dir = "code_evaluation_results"
    os.makedirs(results_dir, exist_ok=True)

    summary = {
        "model": model_name,
        "benchmark": "codealpaca",
        "total_examples": results["total"],
        "correct_count": results["correct_count"],
        "pass_rate": results["pass_rate"],
        "avg_generation_time": float(np.mean([s["generation_time"] for s in solutions])),
    }

    summary_file = os.path.join(results_dir, f"{model_name}_codealpaca_summary.json")
    detailed_file = os.path.join(results_dir, f"{model_name}_codealpaca_detailed.json")
    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=4)
    with open(detailed_file, "w") as f:
        json.dump(results["detailed_results"], f, indent=4)

    print(f"Saved summary to {summary_file}")
    print(f"Saved detailed outputs to {detailed_file}")
    return summary_file, detailed_file


# === Step 5: Run the CodeAlpaca Evaluation ===
def run_codealpaca_evaluation(model_name, num_samples=20):
    # model_name only labels the output files; the checkpoint loaded below
    # is fixed to this path.
    loader = Load_model("gg-cse476/gg-step2000")
    model, tokenizer = loader.get()

    dataset = load_benchmark_data_codealpaca(num_samples)
    solutions = generate_solutions_codealpaca(model, tokenizer, dataset)
    results = evaluate_solutions_codealpaca(solutions)
    save_evaluation_results_codealpaca(model_name, results, solutions)
    return results


if __name__ == "__main__":
    model_name = "Llama-3-SFT"
    # e.g. evaluate on 20 random CodeAlpaca examples
    results = run_codealpaca_evaluation(model_name=model_name, num_samples=20)

    print("\n=== CodeAlpaca Evaluation Summary ===")
    print(f"Total examples  : {results['total']}")
    print(f"Correct (exact) : {results['correct_count']}")
    print(f"Pass@1 (exact)  : {results['pass_rate']:.2%}")