import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def run_pin_inference(prompt, model_id="LH-Tech-AI/Pin-Tiny", subfolder="Pin-25M"):
    # 1. Device setup
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # 2. Load tokenizer and model. The GPT-2 tokenizer has no pad token
    # by default, so reuse the EOS token for padding.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id, subfolder=subfolder).to(device)

    # 3. Format the prompt with the [INST] ... [/INST] chat template
    formatted_prompt = f"[INST] {prompt} [/INST]"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    # 4. Generate. Stopping on the "[" token keeps the model from
    # starting a new [INST] turn of its own.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.encode("[")[0],
        )

    # 5. Decode and clean up
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "[/INST]" in full_text:
        # Take the text after the last [/INST], then drop any new [INST]
        # turn and the lone "[" stop token that generate() leaves in the
        # output sequence.
        response = full_text.split("[/INST]")[-1].split("[INST]")[0]
        response = response.strip().rstrip("[").strip()
    else:
        response = full_text.strip()
    return response


# --- Sample test ---
if __name__ == "__main__":
    user_query = "What is the weather like today?"
    answer = run_pin_inference(user_query, model_id="LH-Tech-AI/Pin", subfolder="Pin-25M")
    print(f"\nUser: {user_query}")
    print(f"Pin: {answer}")
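

# --- Optional: cached loading (a sketch, not part of the original script) ---
# run_pin_inference() above reloads the tokenizer and the model weights on
# every call, which dominates latency for repeated queries. One common fix is
# to memoize the loader; the helper name _load_pin below is hypothetical.
from functools import lru_cache


@lru_cache(maxsize=2)
def _load_pin(model_id, subfolder):
    # Load once per (model_id, subfolder) pair and reuse across calls.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id, subfolder=subfolder).to(device)
    return tokenizer, model, device


# A cached variant of run_pin_inference() would then start with:
#     tokenizer, model, device = _load_pin(model_id, subfolder)
# and skip the per-call from_pretrained() calls entirely.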