jpark284 commited on
Commit
6b8f7da
·
0 Parent(s):

Clean upload with all adapters

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ checkpoint-*
2
+ runs/
README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GG Team Instruction-Tuned Adapters (LLaMA 3.2-3B)
2
+
3
+ This repository provides a collection of PEFT adapters (LoRA) trained on various instruction-tuning datasets using the base model **LLaMA 3.2-3B**. These adapters are developed by **GG Team - CSE476 @ Arizona State University**.
4
+
5
+ ## Adapter Variants
6
+
7
+ | Folder | Dataset(s) Used | Description |
8
+ |--------|------------------|-------------|
9
+ | `llama-3.2-3B-sft` | Alpaca | Fine-tuned only on the original Alpaca dataset |
10
+ | `llama-3.2-3B-sft-dolly` | Alpaca + Dolly | Fine-tuned on Alpaca plus Databricks' Dolly dataset |
11
+ | `llama-3.2-3B-sft-FLAN` | Alpaca + Dolly + FLAN | Fine-tuned on FLAN and Alpaca mixed |
12
+ | `sft_a_d` | Alpaca + Dolly | Combined dataset fine-tuning (Alpaca + Dolly) |
13
+ | `sft_a_d1` | Alpaca (cleaned) + Dolly | Combined dataset fine-tuning (cleaned Alpaca + Dolly) |
14
+
15
+ ---
16
+
17
+ ## 🛠️ Usage (with `peft`)
18
+
19
+ Here's an example of loading one of the adapters using 🤗 Transformers and PEFT:
20
+
21
+ ```python
22
+ from peft import PeftModel
23
+ from transformers import AutoTokenizer, AutoModelForCausalLM
24
+
25
+ # Load base model
26
+ base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
27
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
28
+
29
+ # Load adapter (choose one)
30
+ model = PeftModel.from_pretrained(base_model, "gg-cse476/gg", subfolder="sft_a_d")
31
+
32
+ # Inference
33
+ prompt = "Explain how a rocket works in simple terms."
34
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
35
+ outputs = model.generate(**inputs, max_new_tokens=100)
36
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
adapter_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a397f1a6ee965478e249f2b6142ac0da696267baaca2dc9446d1e104bc8d5d21
3
+ size 856
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9c181229a5f1c9a089843e7c619927962920a711def219f9b83a1c5ea9e28ef
3
+ size 97307544
ajay.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # Add the parent directory to the path so Backend can be imported
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+
7
+ from datasets import load_dataset
8
+ import torch
9
+ import numpy as np
10
+ import json
11
+ import time
12
+ from tqdm import tqdm
13
+ from model import Load_model
14
+
15
# === Step 1: Load CodeAlpaca Data ===
def load_benchmark_data_codealpaca(num_samples=20):
    """Load the CodeAlpaca-20k train split, optionally down-sampled.

    Args:
        num_samples: number of examples to keep; a falsy value or a value
            >= the dataset size returns the full split.

    Returns:
        A ``datasets.Dataset`` with at most ``num_samples`` random examples
        (sampled without replacement via the global NumPy RNG — unseeded,
        so selection differs across runs).
    """
    full_split = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
    # Guard clause: nothing to subsample.
    if not num_samples or num_samples >= len(full_split):
        return full_split
    chosen = np.random.choice(len(full_split), num_samples, replace=False)
    return full_split.select(chosen)
23
+
24
# === Step 2: Generate Solutions (and collect reference) ===
def generate_solutions_codealpaca(model, tokenizer, dataset, max_tokens=512):
    """Generate code for each CodeAlpaca instruction.

    Args:
        model: causal LM exposing ``generate`` and ``device``.
        tokenizer: matching tokenizer used for encoding and decoding.
        dataset: iterable of dicts with ``"instruction"`` and ``"output"`` keys.
        max_tokens: maximum number of NEW tokens to sample per example.

    Returns:
        A list of dicts with problem_id, prompt, generated_code, reference,
        generation_time.
    """
    results = []
    prompt_template = "### Instruction:\n{}\n\n### Response:\n"

    for item in tqdm(dataset, desc="Generating CodeAlpaca solutions"):
        instruction = item["instruction"]
        reference = item["output"]

        prompt = prompt_template.format(instruction)

        # derive a safe problem_id from first 50 chars of instruction
        raw_id = instruction.strip().replace("\n", " ")
        problem_id = raw_id[:50].replace(" ", "_").replace("/", "_")

        # generation
        start_time = time.time()
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                # BUG FIX: was max_length=max_tokens. max_length caps the
                # TOTAL sequence (prompt + completion), so long instructions
                # left little or no room to generate. max_new_tokens budgets
                # only the sampled continuation.
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.2,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id
            )
        generation_time = time.time() - start_time

        # Strip the prompt by token count rather than string-prefix matching:
        # decode(skip_special_tokens=True) need not reproduce the prompt text
        # verbatim, so the old startswith() check could silently fail and
        # leave the prompt embedded in the "generated" output.
        prompt_len = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

        results.append({
            "problem_id": problem_id,
            "prompt": prompt,
            "generated_code": generated,
            "reference": reference,
            "generation_time": generation_time
        })
    return results
70
+
71
# === Step 3: Evaluate Solutions by Exact-Match ===
def evaluate_solutions_codealpaca(solutions):
    """Score generations by whitespace-trimmed exact match against references.

    Args:
        solutions: list of dicts with "generated_code" and "reference" keys.

    Returns:
        Dict with ``total``, ``correct_count``, ``pass_rate`` (0.0 when the
        input is empty), and the input records under ``detailed_results``.
    """
    total = len(solutions)

    correct_count = 0
    for record in solutions:
        if record["generated_code"].strip() == record["reference"].strip():
            correct_count += 1

    return {
        "total": total,
        "correct_count": correct_count,
        "pass_rate": (correct_count / total) if total else 0.0,
        "detailed_results": solutions,
    }
90
+
91
# === Step 4: Save Results ===
def save_evaluation_results_codealpaca(model_name, results, solutions):
    """Persist the evaluation outcome as two JSON files.

    Writes ``<model_name>_codealpaca_summary.json`` (aggregate metrics) and
    ``<model_name>_codealpaca_detailed.json`` (per-example generations) under
    ``code_evaluation_results/``, creating the directory if needed.

    Args:
        model_name: label used in the output filenames and summary payload.
        results: dict produced by ``evaluate_solutions_codealpaca``.
        solutions: per-example records carrying "generation_time".

    Returns:
        Tuple of (summary_path, detailed_path).
    """
    out_dir = "code_evaluation_results"
    os.makedirs(out_dir, exist_ok=True)

    timings = [entry["generation_time"] for entry in solutions]
    summary_payload = {
        "model": model_name,
        "benchmark": "codealpaca",
        "total_examples": results["total"],
        "correct_count": results["correct_count"],
        "pass_rate": results["pass_rate"],
        "avg_generation_time": float(np.mean(timings)),
    }

    summary_path = os.path.join(out_dir, f"{model_name}_codealpaca_summary.json")
    detailed_path = os.path.join(out_dir, f"{model_name}_codealpaca_detailed.json")

    # Same serialization settings for both artifacts.
    for path, payload in ((summary_path, summary_payload),
                          (detailed_path, results["detailed_results"])):
        with open(path, "w") as fh:
            json.dump(payload, fh, indent=4)

    print(f"Saved summary to {summary_path}")
    print(f"Saved detailed outputs to {detailed_path}")
    return summary_path, detailed_path
117
+
118
# === Step 5: Run the CodeAlpaca Evaluation ===
def run_codealpaca_evaluation(model_name, num_samples=20):
    """Run the full pipeline: load model, sample data, generate, score, save.

    Args:
        model_name: label used only for naming the saved result files.
        num_samples: number of random CodeAlpaca examples to evaluate.

    Returns:
        The evaluation dict from ``evaluate_solutions_codealpaca``.
    """
    # NOTE(review): the loaded checkpoint is hard-coded to
    # "gg-cse476/gg-step2000" and ignores `model_name` — confirm this is
    # intentional rather than a leftover from a debugging session.
    loader = Load_model("gg-cse476/gg-step2000")
    model, tokenizer = loader.get()

    dataset = load_benchmark_data_codealpaca(num_samples)
    solutions = generate_solutions_codealpaca(model, tokenizer, dataset)
    results = evaluate_solutions_codealpaca(solutions)
    save_evaluation_results_codealpaca(model_name, results, solutions)

    return results
129
+
130
if __name__ == "__main__":
    # Script entry point: evaluate the adapter on a small random
    # CodeAlpaca slice and print the aggregate metrics.
    chosen_name = "Llama-3-SFT"
    summary = run_codealpaca_evaluation(model_name=chosen_name, num_samples=20)

    print("\n=== CodeAlpaca Evaluation Summary ===")
    print(f"Total examples : {summary['total']}")
    print(f"Correct (exact) : {summary['correct_count']}")
    print(f"Pass@1 (exact) : {summary['pass_rate']:.2%}")
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849070cae53bd45439e64ce5b1ddd650a66081b1bd47895c5a58939a05055579
3
+ size 335
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52716f60c3ad328509fa37cdded9a2f1196ecae463f5480f5d38c66a25e7a7dc
3
+ size 17210019
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb0b184bfd935cbe6f8290f1af424c17814fd24dfc5aaac3be9b0b674fe40631
3
+ size 50560
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab0d3f35c51da985f6c4b5c45b6b6b5eeb42eafaf2e6d58a442c1a853e20f24e
3
+ size 5304