Model description
More information needed
Intended uses & limitations
Run on a consumer-grade GPU
GPU
Tesla M60 16GB VRAM
Training hyperparameters
The following hyperparameters were used during training:
- per_device_train_batch_size: 2
- per_device_eval_batch_size: 2
- gradient_accumulation_steps: 16
- learning_rate: 2e-4
- weight_decay: 0.01
- warmup_ratio: 0.03
- logging_steps: 1
- save_steps: 200
- eval_steps: 200
- max_seq_length: 1024
- num_train_epochs: 1
- max_grad_norm: 0.3
- num_epochs: 5
Training results
Step Training Loss Validation Loss Entropy Num Tokens Mean Token Accuracy 50 1.399500 1.379406 1.427977 166273.000000 0.684534 100 1.350000 1.272701 1.238351 331643.000000 0.698206 150 1.391500 1.252361 1.240065 497339.000000 0.701490 200 1.175000 1.243332 1.248332 664364.000000 0.701699 250 1.357100 1.235908 1.209817 830792.000000 0.703880 300 1.341700 1.226673 1.196961 995955.000000 0.705412 350 1.211000 1.223105 1.219540 1161755.000000 0.705137 400 1.414100 1.219148 1.218188 1330892.000000 0.706035 450 1.088200 1.214209 1.244467 1494009.000000 0.707179 500 1.302800 1.210984 1.203838 1659876.000000 0.707986 550 1.192800 1.208378 1.201593 1828355.000000 0.708459 600 1.302300 1.206382 1.212914 1989352.000000 0.708516 650 1.177800 1.205050 1.245975 2155580.000000 0.708198 700 1.156600 1.201754 1.201212 2323534.000000 0.709032 750 1.271000 1.201216 1.218800 2488415.000000 0.708988 800 1.264100 1.198175 1.182730 2655756.000000 0.710219 850 1.324600 1.196617 1.189218 2822068.000000 0.710231 900 1.159400 1.198235 1.207774 2988438.000000 0.708831 950 1.294200 1.194295 1.211270 3153113.000000 0.709955 1000 1.370000 1.192295 1.215226 3321550.000000 0.710322 1050 1.157300 1.190316 1.214881 3485313.000000 0.710768 1100 1.124000 1.189019 1.210650 3651712.000000 0.711739 1150 1.139700 1.188874 1.209716 3815535.000000 0.711151 1200 1.293600 1.187840 1.198137 3980373.000000 0.710808 1250 1.199800 1.186739 1.226214 4146077.000000 0.711442 ... XXXX 7700 steps XXXX
How to use
Here is how to use this model with the Transformers `AutoModelForCausalLM` and PEFT APIs:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import time
import os
# ---- Inference configuration ----
BASE_MODEL = "Qwen/Qwen3-0.6B" # change this for Qwen / Phi / MPT
ADAPTER_PATH = "Sidharthkr/Qwen3_0.6B_alpaca_lora" # your LoRA output_dir
DEVICE_MAP = "auto" # or "cuda" if single-GPU
DTYPE = torch.float32 # NOTE(review): original comment said "fp16, NOT bf16" but the code uses fp32 — Maxwell-era M60 has slow fp16, so fp32 is presumably intentional; confirm before changing
def create_alpaca_prompt(instruction: str, inp: str = "") -> str:
    """Return an Alpaca-format prompt string.

    When *inp* contains non-whitespace text, the prompt includes an
    "### Input:" section; otherwise the shorter no-input template is used.
    Both *instruction* and *inp* are whitespace-stripped before insertion.
    """
    if inp.strip():
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{instruction.strip()}\n\n"
            f"### Input:\n{inp.strip()}\n\n"
            "### Response:\n"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction.strip()}\n\n"
        "### Response:\n"
    )
def load_model_and_tokenizer():
    """Load base model + tokenizer, attach the LoRA adapter, and merge it.

    Returns:
        (model, tokenizer): the merged model in eval mode and its tokenizer.
    """
    print(f"Loading base model: {BASE_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

    # Decoder-only models often ship without a pad token; reuse EOS and
    # pad on the left so batched generation lines up at the sequence end.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=DTYPE,
        device_map=DEVICE_MAP,
    )

    print(f"Loading LoRA adapter from {ADAPTER_PATH}")
    peft_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

    print("Merging LoRA weights into base model for speed...")
    merged_model = peft_model.merge_and_unload()
    merged_model.eval()

    # Precaution for older GPUs: keep matmuls in full fp32, no TF32.
    torch.backends.cuda.matmul.allow_tf32 = False

    return merged_model, tokenizer
@torch.no_grad()
def generate_single(
    model,
    tokenizer,
    instruction: str,
    inp: str = "",
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """Greedy-decode one response for a single instruction.

    NOTE: `temperature` and `top_p` are accepted for API compatibility but
    unused — decoding is deterministic (do_sample=False), which also avoids
    multinomial sampling on older GPUs.
    """
    encoded = tokenizer(
        create_alpaca_prompt(instruction, inp),
        return_tensors="pt",
    ).to(model.device)

    generated = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,   # greedy decoding → no multinomial
        temperature=None,  # ignored when do_sample=False
        top_p=None,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep only the text after the response marker, if present.
    marker = "### Response:"
    if marker in decoded:
        return decoded.split(marker)[-1].strip()
    return decoded.strip()
@torch.no_grad()
def generate_batch(
    model,
    tokenizer,
    instructions,
    inputs=None,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """Greedy-decode responses for a batch of instructions.

    Args:
        model: causal LM (already on device, eval mode).
        tokenizer: matching tokenizer; must have a pad token and
            left-side padding configured (done in load_model_and_tokenizer).
        instructions: list of instruction strings.
        inputs: optional list of context strings, same length as
            `instructions`; defaults to empty contexts.
        max_new_tokens: cap on generated tokens per prompt.
        temperature, top_p: accepted for API compatibility but unused —
            decoding is deterministic (do_sample=False).

    Returns:
        list[str]: one response per instruction, in order.
    """
    if inputs is None:
        inputs = [""] * len(instructions)

    prompts = [
        create_alpaca_prompt(inst, inp)
        for inst, inp in zip(instructions, inputs)
    ]

    # BUG FIX: padding was commented out, but a batch of variable-length
    # prompts cannot be stacked into a single tensor without it — the
    # tokenizer raises for any batch > 1 with unequal lengths. padding=True
    # (combined with the left padding_side set at load time) fixes this.
    tokenized = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    output_ids = model.generate(
        **tokenized,
        max_new_tokens=max_new_tokens,
        do_sample=False,   # greedy decoding → no multinomial
        temperature=None,  # ignored when do_sample=False
        top_p=None,
        pad_token_id=tokenizer.eos_token_id,
    )

    responses = []
    for row in output_ids:
        full_text = tokenizer.decode(row, skip_special_tokens=True)
        # Keep only the generated part after the response marker.
        if "### Response:" in full_text:
            responses.append(full_text.split("### Response:")[-1].strip())
        else:
            responses.append(full_text.strip())
    return responses
# ---- Demo: load the model once, then time one greedy generation ----
model, tokenizer = load_model_and_tokenizer()

start = time.time()
demo_instruction = "Explain what a GPU is to a 15 year old."
demo_input = ""
response = generate_single(model, tokenizer, demo_instruction, demo_input, max_new_tokens=512)
elapsed = time.time() - start

print(f"Total time: {elapsed:.2f} seconds")
print("=== Single prediction ===")
print(response)
>>> Total time: 4.42 seconds
=== Single prediction ===
A GPU (Graphics Processing Unit) is a type of computer processor used to generate images and videos. It is used in computers and other devices to create visual content, such as games and movies. It is much faster than a CPU (Central Processing Unit) and can process more data in less time.