import sys
import os
# Add the parent directory to the path so Backend can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from datasets import load_dataset
import torch
import numpy as np
import json
import time
from tqdm import tqdm
from model import Load_model
# === Step 1: Load CodeAlpaca Data ===
def load_benchmark_data_codealpaca(num_samples=20):
"""Load the CodeAlpaca-20k instruction-tuning dataset."""
dataset = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
if num_samples and num_samples < len(dataset):
indices = np.random.choice(len(dataset), num_samples, replace=False)
dataset = dataset.select(indices)
return dataset
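# Minimal usage sketch (assumes network access to the Hugging Face Hub; the
# "instruction" and "output" keys are the ones consumed below, values shown
# are made up):
#
#   sample = load_benchmark_data_codealpaca(num_samples=1)[0]
#   print(sample["instruction"])  # e.g. "Write a function that reverses a string."
#   print(sample["output"])       # reference solution used for exact-match scoring
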
# === Step 2: Generate Solutions (and collect reference) ===
def generate_solutions_codealpaca(model, tokenizer, dataset, max_tokens=512):
"""
Generate code for each CodeAlpaca instruction.
Returns a list of dicts with problem_id, prompt, generated_code, reference, generation_time.
"""
results = []
prompt_template = "### Instruction:\n{}\n\n### Response:\n"
for item in tqdm(dataset, desc="Generating CodeAlpaca solutions"):
instruction = item["instruction"]
reference = item["output"]
prompt = prompt_template.format(instruction)
# derive a safe problem_id from first 50 chars of instruction
raw_id = instruction.strip().replace("\n", " ")
problem_id = raw_id[:50].replace(" ", "_").replace("/", "_")
# generation
start_time = time.time()
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
output = model.generate(
**inputs,
max_length=max_tokens,
do_sample=True,
temperature=0.2,
top_p=0.95,
pad_token_id=tokenizer.eos_token_id
)
generation_time = time.time() - start_time
generated = tokenizer.decode(output[0], skip_special_tokens=True)
# strip the prompt prefix from the generation
if generated.startswith(prompt):
generated = generated[len(prompt):]
results.append({
"problem_id": problem_id,
"prompt": prompt,
"generated_code": generated,
"reference": reference,
"generation_time": generation_time
})
return results
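# With the prompt_template above, an instruction such as "Reverse a string in
# Python" is wrapped like this before being fed to the model (illustrative):
#
#   ### Instruction:
#   Reverse a string in Python
#
#   ### Response:
#
# and the model is expected to continue the text after "### Response:".
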
# === Step 3: Evaluate Solutions by Exact-Match ===
def evaluate_solutions_codealpaca(solutions):
"""
Count how many generations exactly match the reference.
Returns total, correct_count, pass_rate, plus all details.
"""
total = len(solutions)
correct_count = sum(
1 for s in solutions
if s["generated_code"].strip() == s["reference"].strip()
)
pass_rate = correct_count / total if total > 0 else 0.0
return {
"total": total,
"correct_count": correct_count,
"pass_rate": pass_rate,
"detailed_results": solutions
}
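# Illustrative helper (not part of the original pipeline and never called;
# the name and toy data are made up). It shows how the exact-match scorer
# behaves: whitespace is stripped before comparing, so a trailing newline does
# not count against a generation, but any other difference does.
def _exact_match_demo():
    toy = [
        {"problem_id": "toy_1", "prompt": "", "generation_time": 0.0,
         "generated_code": "print('hi')\n", "reference": "print('hi')"},
        {"problem_id": "toy_2", "prompt": "", "generation_time": 0.0,
         "generated_code": "print('hello')", "reference": "print('hi')"},
    ]
    stats = evaluate_solutions_codealpaca(toy)
    # -> total=2, correct_count=1, pass_rate=0.5
    return stats
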
# === Step 4: Save Results ===
def save_evaluation_results_codealpaca(model_name, results, solutions):
"""Save summary (with pass_rate) and detailed generations to JSON files."""
results_dir = "code_evaluation_results"
os.makedirs(results_dir, exist_ok=True)
summary = {
"model": model_name,
"benchmark": "codealpaca",
"total_examples": results["total"],
"correct_count": results["correct_count"],
"pass_rate": results["pass_rate"],
"avg_generation_time": float(np.mean([s["generation_time"] for s in solutions]))
}
summary_file = os.path.join(results_dir, f"{model_name}_codealpaca_summary.json")
detailed_file = os.path.join(results_dir, f"{model_name}_codealpaca_detailed.json")
with open(summary_file, "w") as f:
json.dump(summary, f, indent=4)
with open(detailed_file, "w") as f:
json.dump(results["detailed_results"], f, indent=4)
print(f"Saved summary to {summary_file}")
print(f"Saved detailed outputs to {detailed_file}")
return summary_file, detailed_file
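# For reference, the summary JSON written above looks roughly like this
# (values are illustrative, not real results):
#
#   {
#       "model": "Llama-3-SFT",
#       "benchmark": "codealpaca",
#       "total_examples": 20,
#       "correct_count": 1,
#       "pass_rate": 0.05,
#       "avg_generation_time": 7.3
#   }
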
# === Step 5: Run the CodeAlpaca Evaluation ===
def run_codealpaca_evaluation(model_name, num_samples=20):
    # model_name only labels the output files; the checkpoint loaded here is fixed.
    loader = Load_model("gg-cse476/gg-step2000")
    model, tokenizer = loader.get()
    dataset = load_benchmark_data_codealpaca(num_samples)
    solutions = generate_solutions_codealpaca(model, tokenizer, dataset)
    results = evaluate_solutions_codealpaca(solutions)
    save_evaluation_results_codealpaca(model_name, results, solutions)
    return results

if __name__ == "__main__":
model_name = "Llama-3-SFT"
# e.g. evaluate on 20 random CodeAlpaca examples
results = run_codealpaca_evaluation(model_name=model_name, num_samples=20)
print("\n=== CodeAlpaca Evaluation Summary ===")
print(f"Total examples : {results['total']}")
print(f"Correct (exact) : {results['correct_count']}")
print(f"Pass@1 (exact) : {results['pass_rate']:.2%}")